diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 1ca97d5c153e..45eb9c783659 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1,9909 +1,9911 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Nexenta Systems, Inc.
  * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
  * Copyright (c) 2015, 2017, Intel Corporation.
  * Copyright (c) 2020 Datto Inc.
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  *
  * [1] Portions of this software were developed by Allan Jude
  *     under sponsorship from the FreeBSD Foundation.
  * Copyright (c) 2021 Allan Jude
  * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2023, 2024, Klara Inc.
  * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
  */
 
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <getopt.h>
 #include <openssl/evp.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_sa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/dbuf.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_traverse.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
 #include <sys/arc_impl.h>
 #include <sys/ddt.h>
 #include <sys/ddt_impl.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_crypt.h>
 #include <sys/dsl_scan.h>
 #include <sys/btree.h>
 #include <sys/brt.h>
 #include <sys/brt_impl.h>
 #include <zfs_comutil.h>
 #include <sys/zstd/zstd.h>
 #include <sys/backtrace.h>
 
 #include <libnvpair.h>
 #include <libzutil.h>
 #include <libzfs_core.h>
 
 #include <libzdb.h>
 
 #include "zdb.h"
 
 
 /* Variables defined outside this file that zdb reads or adjusts. */
 extern int reference_tracking_enable;
 extern int zfs_recover;
 extern uint_t zfs_vdev_async_read_max_active;
 extern boolean_t spa_load_verify_dryrun;
 extern boolean_t spa_mode_readable_spacemaps;
 extern uint_t zfs_reconstruct_indirect_combinations_max;
 extern uint_t zfs_btree_verify_intensity;
 
 /* Program name used as the prefix of usage and error output. */
 static const char cmdname[] = "zdb";
 /* Option state indexed by option character, e.g. dump_opt['G']. */
 uint8_t dump_opt[256];
 
 /* Signature shared by the per-object-type dump_*() viewers in this file. */
 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
 static uint64_t *zopt_metaslab = NULL;
 static unsigned zopt_metaslab_args = 0;
 
 
 static zopt_object_range_t *zopt_object_ranges = NULL;
 static unsigned zopt_object_args = 0;
 
 static int flagbits[256];
 
 
 static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
 static int leaked_objects = 0;
 static zfs_range_tree_t *mos_refd_objs;
 static spa_t *spa;
 static objset_t *os;
 static boolean_t kernel_init_done;
 
 /* Forward declarations for static helpers used before their definitions. */
 static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
     boolean_t);
 static void mos_obj_refd(uint64_t);
 static void mos_obj_refd_multiple(uint64_t);
 static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
     dmu_tx_t *tx);
 
 
 
 static void zdb_print_blkptr(const blkptr_t *bp, int flags);
 static void zdb_exit(int reason);
 
 /*
  * Node of the sv_pair B-Tree: one block pointer seen in a sublivelist
  * together with the number of its FREE entries that have not yet been
  * matched by an ALLOC entry.
  */
 typedef struct sublivelist_verify_block_refcnt {
 	/* block pointer entry in livelist being verified */
 	blkptr_t svbr_blk;
 
 	/*
 	 * Refcount gets incremented to 1 when we encounter the first
 	 * FREE entry for the svbr_blk block pointer and a node for it
 	 * is created in our ZDB verification/tracking metadata.
 	 *
 	 * As we encounter more FREE entries we increment this counter
 	 * and similarly decrement it whenever we find the respective
 	 * ALLOC entries for this block.
 	 *
 	 * When the refcount gets to 0 it means that all the FREE and
 	 * ALLOC entries of this block have paired up and we no longer
 	 * need to track it in our verification logic (e.g. the node
 	 * containing this struct in our verification data structure
 	 * should be freed).
 	 *
 	 * [refer to sublivelist_verify_blkptr() for the actual code]
 	 */
 	uint32_t svbr_refcnt;
 } sublivelist_verify_block_refcnt_t;
 
 static int
 sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
 {
 	const sublivelist_verify_block_refcnt_t *l = larg;
 	const sublivelist_verify_block_refcnt_t *r = rarg;
 	return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
 }
 
 /*
  * Per-entry callback used while walking a sublivelist's bpobj.
  *
  * FREE entries are counted per block pointer in sv->sv_pair. An ALLOC entry
  * either cancels one previously seen FREE of the same block pointer, or,
  * when there is none, has each of its non-empty DVAs recorded in
  * sv->sv_leftover along with the block's logical birth TXG.
  *
  * Always returns 0.
  */
 static int
 sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
     dmu_tx_t *tx)
 {
 	ASSERT3P(tx, ==, NULL);
 	struct sublivelist_verify *sv = arg;
 	zfs_btree_index_t where;
 
 	/*
 	 * The refcount starts at 1 in case this turns out to be the first
 	 * FREE entry; it is ignored by the B-Tree comparison.
 	 */
 	sublivelist_verify_block_refcnt_t current = {
 		.svbr_blk = *bp,
 		.svbr_refcnt = 1,
 	};
 	sublivelist_verify_block_refcnt_t *pair =
 	    zfs_btree_find(&sv->sv_pair, &current, &where);
 
 	if (free) {
 		if (pair != NULL) {
 			pair->svbr_refcnt++;
 		} else {
 			/* first free entry for this block pointer */
 			zfs_btree_add(&sv->sv_pair, &current);
 		}
 		return (0);
 	}
 
 	if (pair != NULL) {
 		/* alloc matches a free entry */
 		pair->svbr_refcnt--;
 		if (pair->svbr_refcnt == 0) {
 			/* all allocs and frees have been matched */
 			zfs_btree_remove_idx(&sv->sv_pair, &where);
 		}
 		return (0);
 	}
 
 	/* block that is currently marked as allocated */
 	for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
 		const dva_t *dva = &bp->blk_dva[i];
 
 		if (DVA_IS_EMPTY(dva))
 			break;
 
 		sublivelist_verify_block_t svb = {
 		    .svb_dva = *dva,
 		    .svb_allocated_txg = BP_GET_LOGICAL_BIRTH(bp)
 		};
 
 		/* 'where' is reused here as the insertion point in sv_leftover */
 		if (zfs_btree_find(&sv->sv_leftover, &svb, &where) == NULL)
 			zfs_btree_add_idx(&sv->sv_leftover, &svb, &where);
 	}
 
 	return (0);
 }
 
 /*
  * dsl_deadlist_iterate() callback that verifies one sublivelist.
  *
  * A temporary sv_pair tree is built, every entry of the sublivelist's bpobj
  * is fed through sublivelist_verify_blkptr(), and whatever remains in the
  * tree afterwards (FREEs without a matching ALLOC) is reported. The tree is
  * torn down before returning.
  *
  * Returns the result of bpobj_iterate_nofree().
  */
 static int
 sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
 {
 	int err;
 	struct sublivelist_verify *sv = args;
 
 	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
 	    sizeof (sublivelist_verify_block_refcnt_t));
 
 	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
 	    sv, NULL);
 
 	sublivelist_verify_block_refcnt_t *e;
 	zfs_btree_index_t *cookie = NULL;
 	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
 		    &e->svbr_blk, B_TRUE);
 		/* svbr_refcnt is a uint32_t, so print it as unsigned */
 		(void) printf("\tERROR: %u unmatched FREE(s): %s\n",
 		    e->svbr_refcnt, blkbuf);
 	}
 	zfs_btree_destroy(&sv->sv_pair);
 
 	return (err);
 }
 
 static int
 livelist_block_compare(const void *larg, const void *rarg)
 {
 	const sublivelist_verify_block_t *l = larg;
 	const sublivelist_verify_block_t *r = rarg;
 
 	if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
 		return (-1);
 	else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
 		return (+1);
 
 	if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
 		return (-1);
 	else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
 		return (+1);
 
 	if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
 		return (-1);
 	else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
 		return (+1);
 
 	return (0);
 }
 
 /*
  * Verify every sublivelist of the given livelist. Unfreed ALLOCs found
  * along the way accumulate in the sublivelist_verify_t passed as arg
  * (sv->sv_leftover).
  */
 static void
 livelist_verify(dsl_deadlist_t *dl, void *arg)
 {
 	dsl_deadlist_iterate(dl, sublivelist_verify_func,
 	    (sublivelist_verify_t *)arg);
 }
 
 /*
  * Verify a single livelist entry using throw-away state: the leftover
  * ALLOC tree is created just for this call and discarded afterwards, so
  * only the errors printed by sublivelist_verify_func() survive.
  */
 static int
 sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
 {
 	sublivelist_verify_t scratch;
 	int err;
 
 	(void) args;
 
 	zfs_btree_create(&scratch.sv_leftover, livelist_block_compare, NULL,
 	    sizeof (sublivelist_verify_block_t));
 
 	err = sublivelist_verify_func(&scratch, dle);
 
 	zfs_btree_clear(&scratch.sv_leftover);
 	zfs_btree_destroy(&scratch.sv_leftover);
 
 	return (err);
 }
 
 /*
  * Per-metaslab state used while cross-checking the spacemaps against the
  * leftover livelist ALLOCs (see livelist_metaslab_validate()).
  */
 typedef struct metaslab_verify {
 	/*
 	 * Tree containing all the leftover ALLOCs from the livelists
 	 * that are part of this metaslab.
 	 */
 	zfs_btree_t mv_livelist_allocs;
 
 	/*
 	 * Metaslab information: id of the owning vdev, id of the metaslab,
 	 * and the [mv_start, mv_end) offset range it covers.
 	 */
 	uint64_t mv_vdid;
 	uint64_t mv_msid;
 	uint64_t mv_start;
 	uint64_t mv_end;
 
 	/*
 	 * What's currently allocated for this metaslab.
 	 */
 	zfs_range_tree_t *mv_allocated;
 } metaslab_verify_t;
 
 /* Callback applied to a livelist (deadlist) with a caller-supplied argument. */
 typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
 
 /*
  * Callback invoked for each entry of a log spacemap; txg is the TXG of the
  * log spacemap the entry was read from.
  */
 typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
     void *arg);
 
 /*
  * Context handed to space_map_iterate() by iterate_through_spacemap_logs()
  * so the generic spacemap callback can forward to a zdb_log_sm_cb_t.
  */
 typedef struct unflushed_iter_cb_arg {
 	spa_t *uic_spa;		/* pool being examined */
 	uint64_t uic_txg;	/* TXG of the log spacemap being walked */
 	void *uic_arg;		/* caller's opaque argument */
 	zdb_log_sm_cb_t uic_cb;	/* caller's callback */
 } unflushed_iter_cb_arg_t;
 
 /*
  * space_map_iterate() adapter: unpacks the unflushed_iter_cb_arg_t context
  * and forwards the entry to the user's zdb_log_sm_cb_t, returning its
  * result.
  */
 static int
 iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
 {
 	unflushed_iter_cb_arg_t *ctx = arg;
 	zdb_log_sm_cb_t cb = ctx->uic_cb;
 
 	return (cb(ctx->uic_spa, sme, ctx->uic_txg, ctx->uic_arg));
 }
 
 /*
  * Invoke cb on every entry of every log spacemap of the pool, walking the
  * logs in spa_sm_logs_by_txg order. Does nothing when the log spacemap
  * feature is not active. The SCL_CONFIG lock is held as reader for the
  * duration of the walk; any failure to open or iterate a spacemap is fatal.
  */
 static void
 iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
 {
 	avl_tree_t *logs;
 	spa_log_sm_t *sls;
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	logs = &spa->spa_sm_logs_by_txg;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	sls = avl_first(logs);
 	while (sls != NULL) {
 		space_map_t *sm = NULL;
 		unflushed_iter_cb_arg_t uic;
 
 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
 		uic.uic_spa = spa;
 		uic.uic_txg = sls->sls_txg;
 		uic.uic_arg = arg;
 		uic.uic_cb = cb;
 
 		VERIFY0(space_map_iterate(sm, space_map_length(sm),
 		    iterate_through_spacemap_logs_cb, &uic));
 		space_map_close(sm);
 
 		sls = AVL_NEXT(logs, sls);
 	}
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 /*
  * Given a range [offset, offset + size) freed in a spacemap at the given
  * TXG, report every leftover livelist ALLOC of this metaslab that starts
  * inside that range and was allocated at or before that TXG.
  */
 static void
 verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
     uint64_t offset, uint64_t size)
 {
 	zfs_btree_t *allocs = &mv->mv_livelist_allocs;
 	uint64_t end_offset = offset + size;
 	zfs_btree_index_t where;
 
 	/* search key describing the spacemap entry itself */
 	sublivelist_verify_block_t svb = {{{0}}};
 	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
 	DVA_SET_OFFSET(&svb.svb_dva, offset);
 	DVA_SET_ASIZE(&svb.svb_dva, size);
 
 	/*
 	 * Start at the exact match if there is one; otherwise at the first
 	 * livelist entry after where the key would have been. The spacemap
 	 * entry may have been condensed, so keep scanning while entries
 	 * still begin inside its range.
 	 */
 	sublivelist_verify_block_t *found = zfs_btree_find(allocs, &svb, &where);
 	if (found == NULL)
 		found = zfs_btree_next(allocs, &where, &where);
 
 	while (found != NULL) {
 		if (DVA_GET_VDEV(&found->svb_dva) != mv->mv_vdid ||
 		    DVA_GET_OFFSET(&found->svb_dva) >= end_offset)
 			break;
 
 		if (found->svb_allocated_txg <= txg) {
 			(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
 			    "from TXG %llx FREED at TXG %llx\n",
 			    (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
 			    (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
 			    (u_longlong_t)found->svb_allocated_txg,
 			    (u_longlong_t)txg);
 		}
 
 		found = zfs_btree_next(allocs, &where, &where);
 	}
 }
 
 /*
  * space_map_iterate() callback that replays one spacemap entry against
  * mv->mv_allocated. An ALLOC of a range already present, or a FREE of a
  * range not present, is reported and otherwise ignored. Every FREE is
  * additionally checked against the leftover livelist ALLOCs.
  *
  * Always returns 0 so that iteration continues past errors.
  */
 static int
 metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
 {
 	metaslab_verify_t *mv = arg;
 	uint64_t offset = sme->sme_offset;
 	uint64_t size = sme->sme_run;
 	uint64_t txg = sme->sme_txg;
 	boolean_t tracked =
 	    zfs_range_tree_contains(mv->mv_allocated, offset, size);
 
 	if (sme->sme_type == SM_ALLOC) {
 		if (tracked) {
 			(void) printf("ERROR: DOUBLE ALLOC: "
 			    "%llu [%llx:%llx] "
 			    "%llu:%llu LOG_SM\n",
 			    (u_longlong_t)txg, (u_longlong_t)offset,
 			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
 			    (u_longlong_t)mv->mv_msid);
 		} else {
 			zfs_range_tree_add(mv->mv_allocated, offset, size);
 		}
 		return (0);
 	}
 
 	if (tracked) {
 		zfs_range_tree_remove(mv->mv_allocated, offset, size);
 	} else {
 		(void) printf("ERROR: DOUBLE FREE: "
 		    "%llu [%llx:%llx] "
 		    "%llu:%llu LOG_SM\n",
 		    (u_longlong_t)txg, (u_longlong_t)offset,
 		    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
 		    (u_longlong_t)mv->mv_msid);
 	}
 
 	/*
 	 * If something is freed in the spacemap, verify that
 	 * it is not listed as allocated in the livelist.
 	 */
 	verify_livelist_allocs(mv, txg, offset, size);
 
 	return (0);
 }
 
 /*
  * Log spacemap callback: filter out entries that do not belong to the
  * metaslab being verified (wrong or non-concrete vdev, different metaslab,
  * or a TXG older than the metaslab's unflushed TXG) and hand the rest to
  * metaslab_spacemap_validation_cb().
  */
 static int
 spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	metaslab_verify_t *mv = arg;
 	uint64_t vdev_id = sme->sme_vdev;
 	uint64_t offset = sme->sme_offset;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 
 	/* skip indirect vdevs, and vdevs other than the one being checked */
 	if (!vdev_is_concrete(vd) || vdev_id != mv->mv_vdid)
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	/* skip other metaslabs and entries older than the unflushed TXG */
 	if (ms->ms_id != mv->mv_msid || txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	ASSERT3U(txg, ==, sme->sme_txg);
 	return (metaslab_spacemap_validation_cb(sme, mv));
 }
 
 /*
  * Replay the pool's log spacemap entries that belong to the metaslab
  * described by mv.
  */
 static void
 spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
 {
 	zdb_log_sm_cb_t cb = spacemap_check_sm_log_cb;
 
 	iterate_through_spacemap_logs(spa, cb, mv);
 }
 
 /*
  * Replay a metaslab's own spacemap through
  * metaslab_spacemap_validation_cb(). A NULL spacemap is a no-op; an
  * iteration failure is fatal.
  */
 static void
 spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
 {
 	if (sm != NULL) {
 		VERIFY0(space_map_iterate(sm, space_map_length(sm),
 		    metaslab_spacemap_validation_cb, mv));
 	}
 }
 
 /* Forward declaration; needed by livelist_metaslab_validate() below. */
 static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
 
 /*
  * Move the blocks of sv_leftover that lie entirely inside this metaslab
  * (same vdev, within [mv_start, mv_end)) into mv_livelist_allocs. Blocks
  * that straddle either metaslab boundary are reported and left where they
  * are.
  */
 static void
 mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
 {
 	zfs_btree_index_t where;
 	sublivelist_verify_block_t *svb;
 
 	ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
 
 	/* pass 1: copy every block that belongs to this metaslab */
 	svb = zfs_btree_first(&sv->sv_leftover, &where);
 	for (; svb != NULL;
 	    svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
 		uint64_t vdev = DVA_GET_VDEV(&svb->svb_dva);
 		uint64_t off = DVA_GET_OFFSET(&svb->svb_dva);
 		uint64_t asize = DVA_GET_ASIZE(&svb->svb_dva);
 
 		if (vdev != mv->mv_vdid)
 			continue;
 
 		if (off < mv->mv_start) {
 			/* starts before the metaslab; complain if it reaches in */
 			if (off + asize > mv->mv_start) {
 				(void) printf("ERROR: Found block that crosses "
 				    "metaslab boundary: <%llu:%llx:%llx>\n",
 				    (u_longlong_t)vdev,
 				    (u_longlong_t)off,
 				    (u_longlong_t)asize);
 			}
 			continue;
 		}
 
 		if (off >= mv->mv_end)
 			continue;
 
 		if (off + asize > mv->mv_end) {
 			(void) printf("ERROR: Found block that crosses "
 			    "metaslab boundary: <%llu:%llx:%llx>\n",
 			    (u_longlong_t)vdev,
 			    (u_longlong_t)off,
 			    (u_longlong_t)asize);
 			continue;
 		}
 
 		zfs_btree_add(&mv->mv_livelist_allocs, svb);
 	}
 
 	/* pass 2: drop everything that was copied from the source tree */
 	svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
 	while (svb != NULL) {
 		zfs_btree_remove(&sv->sv_leftover, svb);
 		svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
 	}
 }
 
 /*
  * [Livelist Check]
  * Iterate through all the sublivelists and:
  * - report leftover frees (**)
  * - record leftover ALLOCs together with their TXG [see Cross Check]
  *
  * (**) Note: Double ALLOCs are valid in datasets that have dedup
  *      enabled. Similarly double FREEs are allowed as well but
  *      only if they pair up with a corresponding ALLOC entry once
  *      we are done with our sublivelist iteration.
  *
  * [Spacemap Check]
  * for each metaslab:
  * - iterate over spacemap and then the metaslab's entries in the
  *   spacemap log, then report any double FREEs and ALLOCs (do not
  *   blow up).
  *
  * [Cross Check]
  * After finishing the Livelist Check phase and while being in the
  * Spacemap Check phase, we find all the recorded leftover ALLOCs
  * of the livelist check that are part of the metaslab that we are
  * currently looking at in the Spacemap Check. We report any entries
  * that are marked as ALLOCs in the livelists but have been actually
  * freed (and potentially allocated again) after their TXG stamp in
  * the spacemaps. Also report any ALLOCs from the livelists that
  * belong to indirect vdevs (e.g. their vdev completed removal).
  *
  * Note that this will miss Log Spacemap entries that cancelled each other
  * out before being flushed to the metaslab, so we are not guaranteed
  * to match all erroneous ALLOCs.
  */
 static void
 livelist_metaslab_validate(spa_t *spa)
 {
 	(void) printf("Verifying deleted livelist entries\n");
 
 	sublivelist_verify_t sv;
 	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
 	    sizeof (sublivelist_verify_block_t));
 	iterate_deleted_livelists(spa, livelist_verify, &sv);
 
 	(void) printf("Verifying metaslab entries\n");
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		if (!vdev_is_concrete(vd))
 			continue;
 
 		for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
 			metaslab_t *m = vd->vdev_ms[mid];
 
 			/*
 			 * Progress goes to stderr with a leading '\r' so
 			 * each update overwrites the previous one. The
 			 * values are unsigned, hence the u_longlong_t casts
 			 * to match %llu.
 			 */
 			(void) fprintf(stderr,
 			    "\rverifying concrete vdev %llu, "
 			    "metaslab %llu of %llu ...",
 			    (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)mid,
 			    (u_longlong_t)vd->vdev_ms_count);
 
 			uint64_t shift, start;
 			zfs_range_seg_type_t type =
 			    metaslab_calculate_range_tree_type(vd, m,
 			    &start, &shift);
 			metaslab_verify_t mv;
 			mv.mv_allocated = zfs_range_tree_create(NULL,
 			    type, NULL, start, shift);
 			mv.mv_vdid = vd->vdev_id;
 			mv.mv_msid = m->ms_id;
 			mv.mv_start = m->ms_start;
 			mv.mv_end = m->ms_start + m->ms_size;
 			zfs_btree_create(&mv.mv_livelist_allocs,
 			    livelist_block_compare, NULL,
 			    sizeof (sublivelist_verify_block_t));
 
 			mv_populate_livelist_allocs(&mv, &sv);
 
 			/* metaslab's own spacemap first, then the log */
 			spacemap_check_ms_sm(m->ms_sm, &mv);
 			spacemap_check_sm_log(spa, &mv);
 
 			zfs_range_tree_vacate(mv.mv_allocated, NULL, NULL);
 			zfs_range_tree_destroy(mv.mv_allocated);
 			zfs_btree_clear(&mv.mv_livelist_allocs);
 			zfs_btree_destroy(&mv.mv_livelist_allocs);
 		}
 	}
 	(void) fprintf(stderr, "\n");
 
 	/*
 	 * If there are any segments in the leftover tree after we walked
 	 * through all the metaslabs in the concrete vdevs then this means
 	 * that we have segments in the livelists that belong to indirect
 	 * vdevs and are marked as allocated.
 	 */
 	if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
 		zfs_btree_destroy(&sv.sv_leftover);
 		return;
 	}
 	(void) printf("ERROR: Found livelist blocks marked as allocated "
 	    "for indirect vdevs:\n");
 
 	zfs_btree_index_t *where = NULL;
 	sublivelist_verify_block_t *svb;
 	while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
 	    NULL) {
 		int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
 		ASSERT3U(vdev_id, <, rvd->vdev_children);
 		vdev_t *vd = rvd->vdev_child[vdev_id];
 		ASSERT(!vdev_is_concrete(vd));
 		(void) printf("<%d:%llx:%llx> TXG %llx\n",
 		    vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
 		    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
 		    (u_longlong_t)svb->svb_allocated_txg);
 	}
 	(void) printf("\n");
 	zfs_btree_destroy(&sv.sv_leftover);
 }
 
 /*
  * The libumem hooks in this file (this one and _umem_logging_init())
  * provide a reasonable set of defaults for the allocator's debugging
  * facilities. This hook supplies the $UMEM_DEBUG setting.
  */
 const char *
 _umem_debug_init(void)
 {
 	static const char umem_debug_defaults[] = "default,verbose";
 
 	return (umem_debug_defaults);
 }
 
 /* libumem hook: supplies the default $UMEM_LOGGING setting. */
 const char *
 _umem_logging_init(void)
 {
 	static const char umem_logging_defaults[] = "fail,contents";
 
 	return (umem_logging_defaults);
 }
 
 /*
  * Print the command synopsis followed by the per-option summary to stderr,
  * then terminate through zdb_exit(1).
  */
 static void
 usage(void)
 {
 	/* one %s per synopsis line below: twelve in total */
 	(void) fprintf(stderr,
 	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
 	    "[-I <inflight I/Os>]\n"
 	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 	    "\t\t[-K <key>]\n"
 	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
 	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
 	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
 	    "\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"
 	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 	    "\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
 	    "\t%s [-v] <bookmark>\n"
 	    "\t%s -C [-A] [-U <cache>] [<poolname>]\n"
 	    "\t%s -l [-Aqu] <device>\n"
 	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
 	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
 	    "\t%s -O [-K <key>] <dataset> <path>\n"
 	    "\t%s -r [-K <key>] <dataset> <path> <destination>\n"
 	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
 	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
 	    "\t%s -E [-A] word0:word1:...:word15\n"
 	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
 	    "<poolname>\n\n",
 	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
 	    cmdname, cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "    Dataset name must include at least one "
 	    "separator character '/' or '@'\n");
 	(void) fprintf(stderr, "    If dataset name is specified, only that "
 	    "dataset is dumped\n");
 	(void) fprintf(stderr,  "    If object numbers or object number "
 	    "ranges are specified, only those\n"
 	    "    objects or ranges are dumped.\n\n");
 	(void) fprintf(stderr,
 	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
 	    "        start    Starting object number\n"
 	    "        end      Ending object number, or -1 for no upper bound\n"
 	    "        flags    Optional flags to select object types:\n"
 	    "            A     All objects (this is the default)\n"
 	    "            d     ZFS directories\n"
 	    "            f     ZFS files \n"
 	    "            m     SPA space maps\n"
 	    "            z     ZAPs\n"
 	    "            -     Negate effect of next flag\n\n");
 	(void) fprintf(stderr, "    Options to control amount of output:\n");
 	(void) fprintf(stderr, "        -b --block-stats             "
 	    "block statistics\n");
 	(void) fprintf(stderr, "        -B --backup                  "
 	    "backup stream\n");
 	(void) fprintf(stderr, "        -c --checksum                "
 	    "checksum all metadata (twice for all data) blocks\n");
 	(void) fprintf(stderr, "        -C --config                  "
 	    "config (or cachefile if alone)\n");
 	(void) fprintf(stderr, "        -d --datasets                "
 	    "dataset(s)\n");
 	(void) fprintf(stderr, "        -D --dedup-stats             "
 	    "dedup statistics\n");
 	(void) fprintf(stderr, "        -E --embedded-block-pointer=INTEGER\n"
 	    "                                     decode and display block "
 	    "from an embedded block pointer\n");
 	(void) fprintf(stderr, "        -h --history                 "
 	    "pool history\n");
 	(void) fprintf(stderr, "        -i --intent-logs             "
 	    "intent logs\n");
 	(void) fprintf(stderr, "        -l --label                   "
 	    "read label contents\n");
 	(void) fprintf(stderr, "        -k --checkpointed-state      "
 	    "examine the checkpointed state of the pool\n");
 	(void) fprintf(stderr, "        -L --disable-leak-tracking   "
 	    "disable leak tracking (do not load spacemaps)\n");
 	(void) fprintf(stderr, "        -m --metaslabs               "
 	    "metaslabs\n");
 	(void) fprintf(stderr, "        -M --metaslab-groups         "
 	    "metaslab groups\n");
 	(void) fprintf(stderr, "        -O --object-lookups          "
 	    "perform object lookups by path\n");
 	(void) fprintf(stderr, "        -r --copy-object             "
 	    "copy an object by path to file\n");
 	(void) fprintf(stderr, "        -R --read-block              "
 	    "read and display block from a device\n");
 	(void) fprintf(stderr, "        -s --io-stats                "
 	    "report stats on zdb's I/O\n");
 	(void) fprintf(stderr, "        -S --simulate-dedup          "
 	    "simulate dedup to measure effect\n");
 	(void) fprintf(stderr, "        -v --verbose                 "
 	    "verbose (applies to all others)\n");
 	(void) fprintf(stderr, "        -y --livelist                "
 	    "perform livelist and metaslab validation on any livelists being "
 	    "deleted\n\n");
 	(void) fprintf(stderr, "    Below options are intended for use "
 	    "with other options:\n");
 	(void) fprintf(stderr, "        -A --ignore-assertions       "
 	    "ignore assertions (-A), enable panic recovery (-AA) or both "
 	    "(-AAA)\n");
 	(void) fprintf(stderr, "        -e --exported                "
 	    "pool is exported/destroyed/has altroot/not in a cachefile\n");
 	(void) fprintf(stderr, "        -F --automatic-rewind        "
 	    "attempt automatic rewind within safe range of transaction "
 	    "groups\n");
 	(void) fprintf(stderr, "        -G --dump-debug-msg          "
 	    "dump zfs_dbgmsg buffer before exiting\n");
 	(void) fprintf(stderr, "        -I --inflight=INTEGER        "
 	    "specify the maximum number of checksumming I/Os "
 	    "[default is 200]\n");
 	(void) fprintf(stderr, "        -K --key=KEY                 "
 	    "decryption key for encrypted dataset\n");
 	(void) fprintf(stderr, "        -o --option=\"OPTION=INTEGER\" "
 	    "set global variable to an unsigned 32-bit integer\n");
 	(void) fprintf(stderr, "        -p --path=PATH               "
 	    "use one or more with -e to specify path to vdev dir\n");
 	(void) fprintf(stderr, "        -P --parseable               "
 	    "print numbers in parseable form\n");
 	(void) fprintf(stderr, "        -q --skip-label              "
 	    "don't print label contents\n");
 	(void) fprintf(stderr, "        -t --txg=INTEGER             "
 	    "highest txg to use when searching for uberblocks\n");
 	(void) fprintf(stderr, "        -T --brt-stats               "
 	    "BRT statistics\n");
 	(void) fprintf(stderr, "        -u --uberblock               "
 	    "uberblock\n");
 	(void) fprintf(stderr, "        -U --cachefile=PATH          "
 	    "use alternate cachefile\n");
 	(void) fprintf(stderr, "        -V --verbatim                "
 	    "do verbatim import\n");
 	(void) fprintf(stderr, "        -x --dump-blocks=PATH        "
 	    "dump all read blocks into specified directory\n");
 	(void) fprintf(stderr, "        -X --extreme-rewind          "
 	    "attempt extreme rewind (does not work with dataset)\n");
 	(void) fprintf(stderr, "        -Y --all-reconstruction      "
 	    "attempt all reconstruction combinations for split blocks\n");
 	(void) fprintf(stderr, "        -Z --zstd-headers            "
 	    "show ZSTD headers \n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
 	zdb_exit(1);
 }
 
 /*
  * When -G was given, emit a newline followed by the zfs_dbgmsg buffer on
  * stderr; otherwise do nothing.
  */
 static void
 dump_debug_buffer(void)
 {
 	if (dump_opt['G']) {
 		/*
 		 * We use write() instead of printf() so that this function
 		 * is safe to call from a signal handler.
 		 */
 		ssize_t ret __attribute__((unused)) =
 		    write(STDERR_FILENO, "\n", 1);
 
 		zfs_dbgmsg_print(STDERR_FILENO, "zdb");
 	}
 }
 
 static void sig_handler(int signo)
 {
 	struct sigaction action;
 
 	libspl_backtrace(STDERR_FILENO);
 	dump_debug_buffer();
 
 	/*
 	 * Restore default action and re-raise signal so SIGSEGV and
 	 * SIGABRT can trigger a core dump.
 	 */
 	action.sa_handler = SIG_DFL;
 	sigemptyset(&action.sa_mask);
 	action.sa_flags = 0;
 	(void) sigaction(signo, &action, NULL);
 	raise(signo);
 }
 
 /*
  * Called for usage errors that are discovered after a call to spa_open(),
  * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
  *
  * Prints "<cmdname>: <formatted message>\n" to stderr, dumps the debug
  * buffer if it was requested, and exits through zdb_exit(1).
  */
 
 static void
 fatal(const char *fmt, ...)
 {
 	va_list args;
 
 	(void) fprintf(stderr, "%s: ", cmdname);
 
 	va_start(args, fmt);
 	(void) vfprintf(stderr, fmt, args);
 	va_end(args);
 
 	(void) fprintf(stderr, "\n");
 
 	dump_debug_buffer();
 
 	zdb_exit(1);
 }
 
 /*
  * Object viewer for packed nvlists. The first 64-bit word of 'data' gives
  * the packed size; that many bytes are read from the start of the object,
  * unpacked, and printed with dump_nvlist(). Read or unpack failures are
  * fatal.
  */
 static void
 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	const uint64_t *sizep = data;
 	size_t nvsize = *sizep;
 	nvlist_t *nv;
 	char *packed;
 
 	(void) size;
 
 	packed = umem_alloc(nvsize, UMEM_NOFAIL);
 
 	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
 
 	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
 
 	/* the packed copy is no longer needed once unpacking succeeded */
 	umem_free(packed, nvsize);
 
 	dump_nvlist(nv, 8);
 
 	nvlist_free(nv);
 }
 
 /*
  * Object viewer for the pool history bonus buffer: prints each field of
  * the spa_history_phys_t found in 'data'. A NULL 'data' prints nothing.
  */
 static void
 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	const spa_history_phys_t *hist = data;
 
 	(void) os;
 	(void) object;
 	(void) size;
 
 	if (hist == NULL)
 		return;
 
 	(void) printf("\t\tpool_create_len = %llu\n",
 	    (u_longlong_t)hist->sh_pool_create_len);
 	(void) printf("\t\tphys_max_off = %llu\n",
 	    (u_longlong_t)hist->sh_phys_max_off);
 	(void) printf("\t\tbof = %llu\n", (u_longlong_t)hist->sh_bof);
 	(void) printf("\t\teof = %llu\n", (u_longlong_t)hist->sh_eof);
 	(void) printf("\t\trecords_lost = %llu\n",
 	    (u_longlong_t)hist->sh_records_lost);
 }
 
 static void
 zdb_nicenum(uint64_t num, char *buf, size_t buflen)
 {
 	if (dump_opt['P'])
 		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
 	else
 		nicenum(num, buf, buflen);
 }
 
 static void
 zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)
 {
 	if (dump_opt['P'])
 		(void) snprintf(buf, buflen, "%llu", (longlong_t)bytes);
 	else
 		zfs_nicebytes(bytes, buf, buflen);
 }
 
 /* Row of '*' characters; dump_histogram() prints a suffix of it as a bar. */
 static const char histo_stars[] = "****************************************";
 /* Longest possible bar: the number of stars, not counting the NUL. */
 static const uint64_t histo_width = sizeof (histo_stars) - 1;
 
 /*
  * Print a histogram as rows of "<index>: <count> <bar of stars>".
  *
  *   histo   array of 'size' bucket counts
  *   size    number of buckets; nothing is printed when it is <= 0
  *   offset  value added to the bucket index when it is printed
  *
  * Only the span between the first and the last non-empty bucket is shown.
  * Bars are scaled so that the largest count fills histo_width stars; when
  * the largest count is below histo_width each unit gets one star.
  */
 static void
 dump_histogram(const uint64_t *histo, int size, int offset)
 {
 	int i;
 	int minidx = size - 1;
 	int maxidx = 0;
 	uint64_t max = 0;
 
 	/*
 	 * With no buckets minidx would start at -1 and the print loop
 	 * below would read histo[-1].
 	 */
 	if (size <= 0)
 		return;
 
 	for (i = 0; i < size; i++) {
 		if (histo[i] == 0)
 			continue;
 		if (histo[i] > max)
 			max = histo[i];
 		if (i > maxidx)
 			maxidx = i;
 		if (i < minidx)
 			minidx = i;
 	}
 
 	/* also guarantees a non-zero divisor below */
 	if (max < histo_width)
 		max = histo_width;
 
 	for (i = minidx; i <= maxidx; i++) {
 		/*
 		 * A larger count yields a smaller starting index into
 		 * histo_stars and therefore a longer bar.
 		 */
 		(void) printf("\t\t\t%3u: %6llu %s\n",
 		    (unsigned int)(i + offset), (u_longlong_t)histo[i],
 		    &histo_stars[(max - histo[i]) * histo_width / max]);
 	}
 }
 
 /*
  * Print statistics for a ZAP object. Nothing is printed if the stats
  * cannot be obtained. A ZAP whose pointer table length is zero gets a
  * one-line microzap summary; otherwise the fat ZAP pointer table, totals
  * and histograms are printed.
  */
 static void
 dump_zap_stats(objset_t *os, uint64_t object)
 {
 	zap_stats_t zs;
 
 	if (zap_get_stats(os, object, &zs) != 0)
 		return;
 
 	if (zs.zs_ptrtbl_len == 0) {
 		ASSERT(zs.zs_num_blocks == 1);
 		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
 		    (u_longlong_t)zs.zs_blocksize,
 		    (u_longlong_t)zs.zs_num_entries);
 		return;
 	}
 
 	(void) printf("\tFat ZAP stats:\n");
 
 	/* pointer table */
 	(void) printf("\t\tPointer table:\n");
 	(void) printf("\t\t\t%llu elements\n",
 	    (u_longlong_t)zs.zs_ptrtbl_len);
 	(void) printf("\t\t\tzt_blk: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
 	(void) printf("\t\t\tzt_numblks: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
 	(void) printf("\t\t\tzt_shift: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
 	(void) printf("\t\t\tzt_blks_copied: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
 	(void) printf("\t\t\tzt_nextblk: %llu\n",
 	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
 
 	/* totals and header fields */
 	(void) printf("\t\tZAP entries: %llu\n",
 	    (u_longlong_t)zs.zs_num_entries);
 	(void) printf("\t\tLeaf blocks: %llu\n",
 	    (u_longlong_t)zs.zs_num_leafs);
 	(void) printf("\t\tTotal blocks: %llu\n",
 	    (u_longlong_t)zs.zs_num_blocks);
 	(void) printf("\t\tzap_block_type: 0x%llx\n",
 	    (u_longlong_t)zs.zs_block_type);
 	(void) printf("\t\tzap_magic: 0x%llx\n",
 	    (u_longlong_t)zs.zs_magic);
 	(void) printf("\t\tzap_salt: 0x%llx\n",
 	    (u_longlong_t)zs.zs_salt);
 
 	/* histograms, each preceded by its heading */
 	const struct {
 		const char *title;
 		const uint64_t *counts;
 	} histograms[] = {
 		{ "\t\tLeafs with 2^n pointers:\n",
 		    zs.zs_leafs_with_2n_pointers },
 		{ "\t\tBlocks with n*5 entries:\n",
 		    zs.zs_blocks_with_n5_entries },
 		{ "\t\tBlocks n/10 full:\n",
 		    zs.zs_blocks_n_tenths_full },
 		{ "\t\tEntries with n chunks:\n",
 		    zs.zs_entries_using_n_chunks },
 		{ "\t\tBuckets with n entries:\n",
 		    zs.zs_buckets_with_n_entries },
 	};
 
 	for (size_t h = 0;
 	    h < sizeof (histograms) / sizeof (histograms[0]); h++) {
 		(void) printf("%s", histograms[h].title);
 		dump_histogram(histograms[h].counts, ZAP_HISTOGRAM_SIZE, 0);
 	}
 }
 
 /* Object viewer that prints nothing; every argument is ignored. */
 static void
 dump_none(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 }
 
 /*
  * Object viewer for unrecognized object types: ignores its arguments and
  * prints a fixed marker line.
  */
 static void
 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 
 	(void) printf("\tUNKNOWN OBJECT TYPE\n");
 }
 
 /* Object viewer that currently prints nothing; all arguments are unused. */
 static void
 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 }
 
 /*
  * Object viewer that prints an object's contents as an array of 64-bit
  * hex words, four per line.  It does nothing unless dump_opt['d'] is at
  * least 6.  If data is NULL the contents are read from the object itself
  * into a temporary buffer; otherwise data/size are printed directly.
  * At most 1 MiB is printed; a trailing ", ... " marks a truncated dump.
  */
 static void
 dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	const boolean_t own_buf = (data == NULL);
 	uint64_t *words = data;
 	uint64_t shown;
 
 	if (dump_opt['d'] < 6)
 		return;
 
 	if (own_buf) {
 		dmu_object_info_t doi;
 
 		VERIFY0(dmu_object_info(os, object, &doi));
 		size = doi.doi_max_offset;
 	}
 
 	/*
 	 * Cap at 1 mebibyte: this bounds the allocation when we read the
 	 * object ourselves and bounds the amount printed in either case.
 	 */
 	shown = MIN(size, 1 << 20);
 
 	if (own_buf) {
 		words = kmem_alloc(shown, KM_SLEEP);
 
 		int err = dmu_read(os, object, 0, shown, words, 0);
 		if (err != 0) {
 			(void) printf("got error %u from dmu_read\n", err);
 			kmem_free(words, shown);
 			return;
 		}
 	}
 
 	if (size == 0) {
 		(void) printf("\t\t[]\n");
 	} else {
 		(void) printf("\t\t[%0llx", (u_longlong_t)words[0]);
 		for (size_t w = 1; w * sizeof (uint64_t) < shown; w++) {
 			/* Start a new output line after every fourth word. */
 			const char *sep = (w % 4 != 0) ? ", " : ",\n\t\t";
 
 			(void) printf("%s%0llx", sep,
 			    (u_longlong_t)words[w]);
 		}
 		if (shown != size)
 			(void) printf(", ... ");
 		(void) printf("]\n");
 	}
 
 	if (own_buf)
 		kmem_free(words, shown);
 }
 
 /*
  * Object viewer for ZAP objects: print the ZAP statistics, then one
  * "name = value" line per entry.  For string-keyed entries with 1-byte
  * integers, a fixed list of names (the DSL_CRYPTO_KEY_* fields and
  * DMU_POOL_CHECKSUM_SALT) is hex-dumped and everything else is printed
  * as text.  All other values are printed as space-separated integers.
  * The data and size arguments are unused.
  */
 static void
 dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	zap_cursor_t zc;
 	zap_attribute_t *attrp = zap_attribute_long_alloc();
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	for (zap_cursor_init(&zc, os, object);
 	    zap_cursor_retrieve(&zc, attrp) == 0;
 	    zap_cursor_advance(&zc)) {
 		boolean_t key64 =
 		    !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY);
 		uint64_t nints = attrp->za_num_integers;
 		size_t propsz;
 		void *prop;
 
 		/*
 		 * A uint64 key is shown in hex to match its "0x" prefix
 		 * (PRIu64 followed by a literal 'x' printed decimal digits
 		 * and a stray 'x').
 		 */
 		if (key64)
 			(void) printf("\t\t0x%010" PRIx64 " = ",
 			    *(uint64_t *)attrp->za_name);
 		else
 			(void) printf("\t\t%s = ", attrp->za_name);
 
 		if (nints == 0) {
 			(void) printf("\n");
 			continue;
 		}
 
 		propsz = nints * attrp->za_integer_length;
 		prop = umem_zalloc(propsz, UMEM_NOFAIL);
 
 		/*
 		 * Lookup failures are deliberately ignored: the buffer is
 		 * zero-filled, so a failed lookup prints zeros or an empty
 		 * string.
 		 */
 		if (key64)
 			(void) zap_lookup_uint64(os, object,
 			    (const uint64_t *)attrp->za_name, 1,
 			    attrp->za_integer_length, nints, prop);
 		else
 			(void) zap_lookup(os, object, attrp->za_name,
 			    attrp->za_integer_length, nints, prop);
 
 		if (attrp->za_integer_length == 1 && !key64) {
 			if (strcmp(attrp->za_name,
 			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
 			    strcmp(attrp->za_name,
 			    DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
 			    strcmp(attrp->za_name, DSL_CRYPTO_KEY_IV) == 0 ||
 			    strcmp(attrp->za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
 			    strcmp(attrp->za_name,
 			    DMU_POOL_CHECKSUM_SALT) == 0) {
 				const uint8_t *u8 = prop;
 
 				for (uint64_t i = 0; i < nints; i++)
 					(void) printf("%02x", u8[i]);
 			} else {
 				/*
 				 * The buffer holds exactly nints bytes and
 				 * need not be NUL-terminated, so bound the
 				 * read by the value length.
 				 */
 				(void) printf("%.*s", (int)nints,
 				    (const char *)prop);
 			}
 		} else {
 			for (uint64_t i = 0; i < nints; i++) {
 				switch (attrp->za_integer_length) {
 				case 1:
 					(void) printf("%u ",
 					    ((uint8_t *)prop)[i]);
 					break;
 				case 2:
 					(void) printf("%u ",
 					    ((uint16_t *)prop)[i]);
 					break;
 				case 4:
 					(void) printf("%u ",
 					    ((uint32_t *)prop)[i]);
 					break;
 				case 8:
 					/* %lld is signed; cast to match. */
 					(void) printf("%lld ",
 					    (longlong_t)((int64_t *)prop)[i]);
 					break;
 				}
 			}
 		}
 		(void) printf("\n");
 		umem_free(prop, propsz);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(attrp);
 }
 
 /*
  * Object viewer for a bpobj header passed in data (size bytes).  Prints
  * the counters that the given size covers and, when dump_opt['d'] is at
  * least 5, reads and prints every block pointer stored in the object.
  * Does nothing if data is NULL.
  */
 static void
 dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	bpobj_phys_t *bpop = data;
 	char bytes[32], comp[32], uncomp[32];
 
 	/* the buffers must be able to hold any nicenum() result */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
 	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
 
 	if (bpop == NULL)
 		return;
 
 	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
 	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
 	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
 
 	(void) printf("\t\tnum_blkptrs = %llu\n",
 	    (u_longlong_t)bpop->bpo_num_blkptrs);
 	(void) printf("\t\tbytes = %s\n", bytes);
 
 	/* Later fields are only shown if the header is large enough. */
 	if (size >= BPOBJ_SIZE_V1) {
 		(void) printf("\t\tcomp = %s\n", comp);
 		(void) printf("\t\tuncomp = %s\n", uncomp);
 	}
 	if (size >= BPOBJ_SIZE_V2) {
 		(void) printf("\t\tsubobjs = %llu\n",
 		    (u_longlong_t)bpop->bpo_subobjs);
 		(void) printf("\t\tnum_subobjs = %llu\n",
 		    (u_longlong_t)bpop->bpo_num_subobjs);
 	}
 	if (size >= sizeof (*bpop)) {
 		(void) printf("\t\tnum_freed = %llu\n",
 		    (u_longlong_t)bpop->bpo_num_freed);
 	}
 
 	if (dump_opt['d'] >= 5) {
 		for (uint64_t n = 0; n < bpop->bpo_num_blkptrs; n++) {
 			char blkbuf[BP_SPRINTF_LEN];
 			blkptr_t bp;
 			int err;
 
 			err = dmu_read(os, object, n * sizeof (bp),
 			    sizeof (bp), &bp, 0);
 			if (err != 0) {
 				(void) printf("got error %u from dmu_read\n",
 				    err);
 				break;
 			}
 			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
 			    BP_GET_FREE(&bp));
 			(void) printf("\t%s\n", blkbuf);
 		}
 	}
 }
 
 /*
  * Object viewer for a bpobj sub-object array: reads the whole object as
  * 64-bit words and prints every word up to and including the last
  * non-zero one, one per line.  The data and size arguments are unused.
  */
 static void
 dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	dmu_object_info_t doi;
 
 	VERIFY0(dmu_object_info(os, object, &doi));
 
 	uint64_t nbytes = doi.doi_max_offset;
 	uint64_t *subobjs = kmem_alloc(nbytes, KM_SLEEP);
 
 	int err = dmu_read(os, object, 0, nbytes, subobjs, 0);
 	if (err != 0) {
 		(void) printf("got error %u from dmu_read\n", err);
 	} else {
 		/* Walk back from the end to drop the trailing zero words. */
 		int64_t last_nonzero = (int64_t)(nbytes / 8) - 1;
 
 		while (last_nonzero >= 0 && subobjs[last_nonzero] == 0)
 			last_nonzero--;
 
 		for (int64_t n = 0; n <= last_nonzero; n++)
 			(void) printf("\t%llu\n", (u_longlong_t)subobjs[n]);
 	}
 
 	kmem_free(subobjs, nbytes);
 }
 
 /*
  * Object viewer for DDT ZAP objects: prints only the ZAP statistics.
  * The data and size arguments are unused.
  */
 static void
 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data;
 	(void) size;
 
 	/* the entries themselves are printed elsewhere, properly decoded */
 	dump_zap_stats(os, object);
 }
 
 /*
  * Object viewer for the SA attribute registration ZAP: prints the ZAP
  * statistics, then for each entry its name, its first integer in hex and
  * the ATTR_LENGTH / ATTR_BSWAP / ATTR_NUM fields decoded from it.
  * The data and size arguments are unused.
  */
 static void
 dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	zap_cursor_t zc;
 	zap_attribute_t *attrp = zap_attribute_alloc();
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&zc, os, object);
 	while (zap_cursor_retrieve(&zc, attrp) == 0) {
 		(void) printf("\t\t%s = ", attrp->za_name);
 
 		if (attrp->za_num_integers != 0) {
 			uint64_t reg = attrp->za_first_integer;
 
 			(void) printf(" %llx : [%d:%d:%d]\n",
 			    (u_longlong_t)reg,
 			    (int)ATTR_LENGTH(reg),
 			    (int)ATTR_BSWAP(reg),
 			    (int)ATTR_NUM(reg));
 		} else {
 			(void) printf("\n");
 		}
 
 		zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(attrp);
 }
 
 /*
  * Object viewer for the SA layout ZAP: prints the ZAP statistics, then
  * for each entry its name followed by the bracketed list of 16-bit values
  * stored under it.  The data and size arguments are unused.
  */
 static void
 dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	zap_cursor_t zc;
 	zap_attribute_t *attrp = zap_attribute_alloc();
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&zc, os, object);
 	while (zap_cursor_retrieve(&zc, attrp) == 0) {
 		(void) printf("\t\t%s = [", attrp->za_name);
 
 		if (attrp->za_num_integers == 0) {
 			(void) printf("\n");
 		} else {
 			uint16_t *layout_attrs;
 
 			/* Layout entries are arrays of 2-byte integers. */
 			VERIFY(attrp->za_integer_length == 2);
 			layout_attrs = umem_zalloc(attrp->za_num_integers *
 			    attrp->za_integer_length, UMEM_NOFAIL);
 
 			VERIFY(zap_lookup(os, object, attrp->za_name,
 			    attrp->za_integer_length,
 			    attrp->za_num_integers, layout_attrs) == 0);
 
 			for (unsigned n = 0; n != attrp->za_num_integers; n++)
 				(void) printf(" %d ", (int)layout_attrs[n]);
 			(void) printf("]\n");
 
 			umem_free(layout_attrs,
 			    attrp->za_num_integers * attrp->za_integer_length);
 		}
 
 		zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(attrp);
 }
 
 /*
  * Object viewer for ZPL directories: prints the ZAP statistics, then one
  * line per directory entry giving its name, the object number taken from
  * ZFS_DIRENT_OBJ() and a type name looked up by ZFS_DIRENT_TYPE().
  * The data and size arguments are unused.
  */
 static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	/* Indexed by the value ZFS_DIRENT_TYPE() extracts from an entry. */
 	static const char *const typenames[] = {
 		/* 0 */ "not specified",
 		/* 1 */ "FIFO",
 		/* 2 */ "Character Device",
 		/* 3 */ "3 (invalid)",
 		/* 4 */ "Directory",
 		/* 5 */ "5 (invalid)",
 		/* 6 */ "Block Device",
 		/* 7 */ "7 (invalid)",
 		/* 8 */ "Regular File",
 		/* 9 */ "9 (invalid)",
 		/* 10 */ "Symbolic Link",
 		/* 11 */ "11 (invalid)",
 		/* 12 */ "Socket",
 		/* 13 */ "Door",
 		/* 14 */ "Event Port",
 		/* 15 */ "15 (invalid)",
 	};
 	zap_cursor_t zc;
 	zap_attribute_t *attrp = zap_attribute_long_alloc();
 
 	dump_zap_stats(os, object);
 	(void) printf("\n");
 
 	zap_cursor_init(&zc, os, object);
 	while (zap_cursor_retrieve(&zc, attrp) == 0) {
 		(void) printf("\t\t%s = %lld (type: %s)\n",
 		    attrp->za_name, ZFS_DIRENT_OBJ(attrp->za_first_integer),
 		    typenames[ZFS_DIRENT_TYPE(attrp->za_first_integer)]);
 		zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(attrp);
 }
 
 /*
  * Count, over the vdev tree rooted at vd, the leaf vdevs whose DTL space
  * map exists and has a dbuf exactly sizeof (space_map_phys_t) in size.
  */
 static int
 get_dtl_refcount(vdev_t *vd)
 {
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		int total = 0;
 
 		for (unsigned c = 0; c < vd->vdev_children; c++)
 			total += get_dtl_refcount(vd->vdev_child[c]);
 		return (total);
 	}
 
 	space_map_t *sm = vd->vdev_dtl_sm;
 
 	if (sm == NULL)
 		return (0);
 	return ((sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) ? 1 : 0);
 }
 
 /*
  * Count, over the vdev tree rooted at vd, the metaslabs of top-level
  * vdevs whose space map exists and has a dbuf exactly
  * sizeof (space_map_phys_t) in size.
  */
 static int
 get_metaslab_refcount(vdev_t *vd)
 {
 	int total = 0;
 
 	if (vd->vdev_top == vd) {
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
 
 			if (sm == NULL)
 				continue;
 			if (sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
 				total++;
 		}
 	}
 
 	for (unsigned c = 0; c < vd->vdev_children; c++)
 		total += get_metaslab_refcount(vd->vdev_child[c]);
 
 	return (total);
 }
 
 /*
  * Count, over the vdev tree rooted at vd, the top-level vdevs that have
  * an obsolete space map object whose bonus buffer is exactly
  * sizeof (space_map_phys_t) in size.
  */
 static int
 get_obsolete_refcount(vdev_t *vd)
 {
 	uint64_t sm_obj;
 	int total = 0;
 
 	VERIFY0(vdev_obsolete_sm_object(vd, &sm_obj));
 
 	if (vd->vdev_top == vd && sm_obj != 0) {
 		dmu_object_info_t doi;
 
 		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
 		    sm_obj, &doi));
 		if (doi.doi_bonus_size == sizeof (space_map_phys_t))
 			total++;
 	} else {
 		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
 		ASSERT3U(sm_obj, ==, 0);
 	}
 
 	for (unsigned c = 0; c < vd->vdev_children; c++)
 		total += get_obsolete_refcount(vd->vdev_child[c]);
 
 	return (total);
 }
 
 /*
  * Return 1 if the pool records a previous obsolete space map object
  * (scip_prev_obsolete_sm_object) whose bonus buffer is exactly
  * sizeof (space_map_phys_t) in size, and 0 otherwise.
  */
 static int
 get_prev_obsolete_spacemap_refcount(spa_t *spa)
 {
 	dmu_object_info_t doi;
 	uint64_t prev_obj =
 	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
 
 	if (prev_obj == 0)
 		return (0);
 
 	VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
 
 	return ((doi.doi_bonus_size == sizeof (space_map_phys_t)) ? 1 : 0);
 }
 
 /*
  * Count, over the vdev tree rooted at vd, the top-level vdevs whose
  * top-level ZAP contains a VDEV_TOP_ZAP_POOL_CHECKPOINT_SM entry.
  */
 static int
 get_checkpoint_refcount(vdev_t *vd)
 {
 	int total = 0;
 
 	/* Only query the ZAP for a top-level vdev that actually has one. */
 	if (vd->vdev_top == vd && vd->vdev_top_zap != 0) {
 		if (zap_contains(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
 			total++;
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		total += get_checkpoint_refcount(vd->vdev_child[c]);
 
 	return (total);
 }
 
 /* Return the number of nodes in the pool's spa_sm_logs_by_txg AVL tree. */
 static int
 get_log_spacemap_refcount(spa_t *spa)
 {
 	int nlogs = avl_numnodes(&spa->spa_sm_logs_by_txg);
 
 	return (nlogs);
 }
 
 /*
  * Compare the SPA_FEATURE_SPACEMAP_HISTOGRAM feature refcount against the
  * sum of the per-category counts gathered by the get_*_refcount()
  * helpers.  Returns 0 when they agree; otherwise prints both values and
  * returns 2.
  */
 static int
 verify_spacemap_refcounts(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t expected = 0;
 	uint64_t actual = 0;
 
 	/* If the lookup fails, the expected count simply stays zero. */
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
 	    &expected);
 
 	actual += get_dtl_refcount(rvd);
 	actual += get_metaslab_refcount(rvd);
 	actual += get_obsolete_refcount(rvd);
 	actual += get_prev_obsolete_spacemap_refcount(spa);
 	actual += get_checkpoint_refcount(rvd);
 	actual += get_log_spacemap_refcount(spa);
 
 	if (expected == actual)
 		return (0);
 
 	(void) printf("space map refcount mismatch: expected %lld != "
 	    "actual %lld\n",
 	    (longlong_t)expected,
 	    (longlong_t)actual);
 	return (2);
 }
 
 /*
  * Print a space map.  The object number, smp_length and smp_alloc are
  * always printed (nothing is printed for a NULL sm).  When dump_opt['d']
  * is at least 6 or dump_opt['m'] is at least 4, every on-disk entry is
  * also read from os and printed in decoded form, and the net allocation
  * computed from the entries is checked against space_map_allocated().
  */
 static void
 dump_spacemap(objset_t *os, space_map_t *sm)
 {
 	/* Indexed by the action field decoded from a debug entry. */
 	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 	    "INVALID", "INVALID", "INVALID", "INVALID" };
 
 	if (sm == NULL)
 		return;
 
 	(void) printf("space map object %llu:\n",
 	    (longlong_t)sm->sm_object);
 	(void) printf("  smp_length = 0x%llx\n",
 	    (longlong_t)sm->sm_phys->smp_length);
 	(void) printf("  smp_alloc = 0x%llx\n",
 	    (longlong_t)sm->sm_phys->smp_alloc);
 
 	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
 		return;
 
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
 	uint8_t mapshift = sm->sm_shift;
 	/* Running net allocation: 'A' entries add, 'F' entries subtract. */
 	int64_t alloc = 0;
 	uint64_t word, entry_id = 0;
 	/*
 	 * The map is read one 64-bit word at a time.  Two-word entries
 	 * advance offset by one extra word inside the loop body.
 	 */
 	for (uint64_t offset = 0; offset < space_map_length(sm);
 	    offset += sizeof (word)) {
 
 		VERIFY0(dmu_read(os, space_map_object(sm), offset,
 		    sizeof (word), &word, DMU_READ_PREFETCH));
 
 		/* Debug entries carry a txg and sync pass, not a range. */
 		if (sm_entry_is_debug(word)) {
 			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
 			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
 			/* A debug entry with txg 0 is reported as padding. */
 			if (de_txg == 0) {
 				(void) printf(
 				    "\t    [%6llu] PADDING\n",
 				    (u_longlong_t)entry_id);
 			} else {
 				(void) printf(
 				    "\t    [%6llu] %s: txg %llu pass %llu\n",
 				    (u_longlong_t)entry_id,
 				    ddata[SM_DEBUG_ACTION_DECODE(word)],
 				    (u_longlong_t)de_txg,
 				    (u_longlong_t)de_sync_pass);
 			}
 			entry_id++;
 			continue;
 		}
 
 		uint8_t words;
 		char entry_type;
 		/* Single-word entries carry no vdev id; keep the default. */
 		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
 
 		if (sm_entry_is_single_word(word)) {
 			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
 			    'A' : 'F';
 			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
 			    sm->sm_start;
 			entry_run = SM_RUN_DECODE(word) << mapshift;
 			words = 1;
 		} else {
 			/* it is a two-word entry so we read another word */
 			ASSERT(sm_entry_is_double_word(word));
 
 			uint64_t extra_word;
 			offset += sizeof (extra_word);
 			VERIFY0(dmu_read(os, space_map_object(sm), offset,
 			    sizeof (extra_word), &extra_word,
 			    DMU_READ_PREFETCH));
 
 			ASSERT3U(offset, <=, space_map_length(sm));
 
 			/*
 			 * Run length and vdev come from the first word;
 			 * type and offset come from the second.
 			 */
 			entry_run = SM2_RUN_DECODE(word) << mapshift;
 			entry_vdev = SM2_VDEV_DECODE(word);
 			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
 			    'A' : 'F';
 			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
 			    mapshift) + sm->sm_start;
 			words = 2;
 		}
 
 		(void) printf("\t    [%6llu]    %c  range:"
 		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
 		    (u_longlong_t)entry_id,
 		    entry_type, (u_longlong_t)entry_off,
 		    (u_longlong_t)(entry_off + entry_run),
 		    (u_longlong_t)entry_run,
 		    (u_longlong_t)entry_vdev, words);
 
 		if (entry_type == 'A')
 			alloc += entry_run;
 		else
 			alloc -= entry_run;
 		entry_id++;
 	}
 	/*
 	 * The first value printed is space_map_allocated(sm); the second is
 	 * the total accumulated from the entries above.
 	 */
 	if (alloc != space_map_allocated(sm)) {
 		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
 		    "with space map summary (%lld)\n",
 		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
 	}
 }
 
 /*
  * Print a one-line summary of a metaslab's allocatable space (segment
  * count, largest allocatable size and percentage free) followed by the
  * histogram of its ms_allocatable range tree.
  */
 static void
 dump_metaslab_stats(metaslab_t *msp)
 {
 	zfs_range_tree_t *allocatable = msp->ms_allocatable;
 	zfs_btree_t *by_size = &msp->ms_allocatable_by_size;
 	int free_pct = zfs_range_tree_space(allocatable) * 100 / msp->ms_size;
 	char maxbuf[32];
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");
 
 	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
 
 	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
 	    "segments", zfs_btree_numnodes(by_size), "maxsize", maxbuf,
 	    "freepct", free_pct);
 
 	(void) printf("\tIn-memory histogram:\n");
 	dump_histogram(allocatable->rt_histogram,
 	    ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print one metaslab: a summary row, then, depending on dump_opt['m'] and
  * dump_opt['L'], its in-memory statistics and its on-disk histogram,
  * followed by its space map and, if the log space map feature is active,
  * its unflushed txg.
  */
 static void
 dump_metaslab(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	space_map_t *sm = msp->ms_sm;
 	const int mlevel = dump_opt['m'];
 	char freebuf[32];
 
 	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
 	    sizeof (freebuf));
 
 	(void) printf(
 	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
 	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
 	    (u_longlong_t)space_map_object(sm), freebuf);
 
 	/* In-memory stats need the metaslab loaded; skipped with 'L'. */
 	if (mlevel > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
 		VERIFY0(metaslab_load(msp));
 		zfs_range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
 	}
 
 	if (mlevel > 1 && sm != NULL &&
 	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		/*
 		 * Buckets of the space map histogram are in units of
 		 * sm_shift (bucket 0 refers to 2^sm_shift), so sm_shift
 		 * is passed as the index offset.
 		 */
 		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
 		    (u_longlong_t)msp->ms_fragmentation);
 		dump_histogram(sm->sm_phys->smp_histogram,
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}
 
 	/* dRAID metaslabs may be smaller than 2^vdev_ms_shift. */
 	if (vd->vdev_ops == &vdev_draid_ops) {
 		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
 	} else {
 		ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);
 	}
 
 	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
 		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
 		    (u_longlong_t)metaslab_unflushed_txg(msp));
 	}
 }
 
 /*
  * Print the per-vdev heading for a metaslab listing: the vdev id, its
  * allocation bias name (if any), the ms_unflushed_phys object number when
  * one is recorded in the vdev's top-level ZAP, and the column titles.
  */
 static void
 print_vdev_metaslab_header(vdev_t *vd)
 {
 	const char *bias_str;
 	uint64_t flush_obj = 0;
 
 	if (vd->vdev_alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog)
 		bias_str = VDEV_ALLOC_BIAS_LOG;
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL)
 		bias_str = VDEV_ALLOC_BIAS_SPECIAL;
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
 		bias_str = VDEV_ALLOC_BIAS_DEDUP;
 	else
 		bias_str = "";
 
 	if (vd->vdev_top_zap != 0) {
 		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
 		    sizeof (uint64_t), 1, &flush_obj);
 
 		/* A missing entry is fine; any other error is not. */
 		if (error != ENOENT) {
 			ASSERT0(error);
 		}
 	}
 
 	(void) printf("\tvdev %10llu   %s",
 	    (u_longlong_t)vd->vdev_id, bias_str);
 
 	if (flush_obj != 0) {
 		(void) printf("   ms_unflushed_phys object %llu",
 		    (u_longlong_t)flush_obj);
 	}
 
 	(void) printf("\n\t%-10s%5llu   %-19s   %-15s   %-12s\n",
 	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
 	    "offset", "spacemap", "free");
 	(void) printf("\t%15s   %19s   %15s   %12s\n",
 	    "---------------", "-------------------",
 	    "---------------", "------------");
 }
 
 /*
  * For each top-level vdev whose metaslab group belongs to the normal
  * class (or, if show_special is set, the special class), print its
  * fragmentation and histogram; then print the pool-wide fragmentation
  * and histogram of the normal class.
  */
 static void
 dump_metaslab_groups(spa_t *spa, boolean_t show_special)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	metaslab_class_t *special = spa_special_class(spa);
 
 	metaslab_class_histogram_verify(normal);
 
 	for (unsigned c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		if (mg == NULL)
 			continue;
 
 		boolean_t wanted = (mg->mg_class == normal) ||
 		    (show_special && mg->mg_class == special);
 		if (!wanted)
 			continue;
 
 		metaslab_group_histogram_verify(mg);
 		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
 		    "fragmentation",
 		    (u_longlong_t)tvd->vdev_id,
 		    (u_longlong_t)tvd->vdev_ms_count);
 		if (mg->mg_fragmentation != ZFS_FRAG_INVALID) {
 			(void) printf("%3llu%%\n",
 			    (u_longlong_t)mg->mg_fragmentation);
 		} else {
 			(void) printf("%3s\n", "-");
 		}
 		dump_histogram(mg->mg_histogram,
 		    ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
 	}
 
 	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
 
 	uint64_t pool_frag = metaslab_class_fragmentation(normal);
 	if (pool_frag != ZFS_FRAG_INVALID)
 		(void) printf("\t%3llu%%\n", (u_longlong_t)pool_frag);
 	else
 		(void) printf("\t%3s\n", "-");
 	dump_histogram(normal->mc_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 /*
  * Print the indirect births and indirect mapping summaries of a vdev
  * that has an indirect mapping (nothing is printed otherwise).  When
  * dump_opt['d'] exceeds 5 or dump_opt['m'] exceeds 3, every mapping entry
  * is printed with its obsolete count, followed by the vdev's obsolete
  * space map if it has one.
  */
 static void
 print_vdev_indirect(vdev_t *vd)
 {
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	vdev_indirect_births_t *vib = vd->vdev_indirect_births;
 	uint64_t n;
 
 	if (vim == NULL) {
 		ASSERT3P(vib, ==, NULL);
 		return;
 	}
 
 	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
 	    vic->vic_mapping_object);
 	ASSERT3U(vdev_indirect_births_object(vib), ==,
 	    vic->vic_births_object);
 
 	(void) printf("indirect births obj %llu:\n",
 	    (longlong_t)vic->vic_births_object);
 	(void) printf("    vib_count = %llu\n",
 	    (longlong_t)vdev_indirect_births_count(vib));
 	n = 0;
 	while (n < vdev_indirect_births_count(vib)) {
 		vdev_indirect_birth_entry_phys_t *birth =
 		    &vib->vib_entries[n];
 
 		(void) printf("\toffset %llx -> txg %llu\n",
 		    (longlong_t)birth->vibe_offset,
 		    (longlong_t)birth->vibe_phys_birth_txg);
 		n++;
 	}
 	(void) printf("\n");
 
 	(void) printf("indirect mapping obj %llu:\n",
 	    (longlong_t)vic->vic_mapping_object);
 	(void) printf("    vim_max_offset = 0x%llx\n",
 	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
 	(void) printf("    vim_bytes_mapped = 0x%llx\n",
 	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
 	(void) printf("    vim_count = %llu\n",
 	    (longlong_t)vdev_indirect_mapping_num_entries(vim));
 
 	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
 		return;
 
 	/*
 	 * NOTE(review): counts is not released anywhere in this function;
 	 * confirm whether vdev_indirect_mapping_load_obsolete_counts()
 	 * expects its caller to free the array.
 	 */
 	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
 
 	n = 0;
 	while (n < vdev_indirect_mapping_num_entries(vim)) {
 		vdev_indirect_mapping_entry_phys_t *vimep =
 		    &vim->vim_entries[n];
 
 		/* Source and destination are both shown with the dst asize. */
 		(void) printf("\t<%llx:%llx:%llx> -> "
 		    "<%llx:%llx:%llx> (%x obsolete)\n",
 		    (longlong_t)vd->vdev_id,
 		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
 		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
 		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
 		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
 		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
 		    counts[n]);
 		n++;
 	}
 	(void) printf("\n");
 
 	uint64_t obsolete_sm_object;
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object == 0)
 		return;
 
 	(void) printf("obsolete space map object %llu:\n",
 	    (u_longlong_t)obsolete_sm_object);
 	ASSERT(vd->vdev_obsolete_sm != NULL);
 	ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
 	    obsolete_sm_object);
 	dump_spacemap(vd->vdev_spa->spa_meta_objset, vd->vdev_obsolete_sm);
 	(void) printf("\n");
 }
 
 /*
  * Print the "Metaslabs:" section.  Without dump_opt['d'] and with
  * metaslab arguments given, zopt_metaslab[0] selects a single top-level
  * vdev and any further arguments select individual metaslabs of it;
  * otherwise every top-level vdev and all of its metaslabs are printed.
  */
 static void
 dump_metaslabs(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t first = 0;
 	uint64_t end = rvd->vdev_children;
 
 	(void) printf("\nMetaslabs:\n");
 
 	if (!dump_opt['d'] && zopt_metaslab_args > 0) {
 		first = zopt_metaslab[0];
 
 		if (first >= end)
 			(void) fatal("bad vdev id: %llu", (u_longlong_t)first);
 
 		if (zopt_metaslab_args > 1) {
 			/* Specific metaslabs of one vdev were requested. */
 			vdev_t *vd = rvd->vdev_child[first];
 
 			print_vdev_metaslab_header(vd);
 
 			for (uint64_t a = 1; a < zopt_metaslab_args; a++) {
 				uint64_t msid = zopt_metaslab[a];
 
 				if (msid >= vd->vdev_ms_count) {
 					(void) fprintf(stderr, "bad metaslab "
 					    "number %llu\n",
 					    (u_longlong_t)msid);
 					continue;
 				}
 				dump_metaslab(vd->vdev_ms[msid]);
 			}
 			(void) printf("\n");
 			return;
 		}
 
 		/* Only the selected vdev is walked below. */
 		end = first + 1;
 	}
 
 	for (uint64_t c = first; c < end; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		print_vdev_metaslab_header(vd);
 		print_vdev_indirect(vd);
 
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++)
 			dump_metaslab(vd->vdev_ms[m]);
 		(void) printf("\n");
 	}
 }
 
 /*
  * If the log space map feature is active, open, print and close each
  * space map recorded in the pool's spa_sm_logs_by_txg tree.
  */
 static void
 dump_log_spacemaps(spa_t *spa)
 {
 	avl_tree_t *logs = &spa->spa_sm_logs_by_txg;
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	(void) printf("\nLog Space Maps in Pool:\n");
 
 	spa_log_sm_t *sls = avl_first(logs);
 	while (sls != NULL) {
 		space_map_t *sm = NULL;
 
 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
 		(void) printf("Log Spacemap object %llu txg %llu\n",
 		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
 		dump_spacemap(spa->spa_meta_objset, sm);
 		space_map_close(sm);
 
 		sls = AVL_NEXT(logs, sls);
 	}
 	(void) printf("\n");
 }
 
 /*
  * Print one line for every phys slot of a DDT entry that has a non-zero
  * birth: the caller-supplied index, the slot's refcount, the slot number
  * and the block pointer reconstructed for it.
  */
 static void
 dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
     uint64_t index)
 {
 	const ddt_key_t *ddk = &ddlwe->ddlwe_key;
 	const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
 
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
 		/* Slots with a zero birth are skipped. */
 		if (ddt_phys_birth(ddp, v) != 0) {
 			char blkbuf[BP_SPRINTF_LEN];
 			blkptr_t blk;
 
 			ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
 			snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
 			(void) printf("index %llx refcnt %llu phys %d %s\n",
 			    (u_longlong_t)index,
 			    (u_longlong_t)ddt_phys_refcnt(ddp, v),
 			    p, blkbuf);
 		}
 	}
 }
 
 /*
  * Print the dedup, compress and copies ratios derived from a DDT stat
  * record, plus their combination.  Nothing is printed when the record
  * counts no blocks.
  */
 static void
 dump_dedup_ratio(const ddt_stat_t *dds)
 {
 	if (dds->dds_blocks == 0)
 		return;
 
 	const double ref_lsize = (double)dds->dds_ref_lsize;
 	const double ref_psize = (double)dds->dds_ref_psize;
 	const double ref_dsize = (double)dds->dds_ref_dsize;
 	const double dsize = (double)dds->dds_dsize;
 
 	const double dedup = ref_dsize / dsize;
 	const double compress = ref_lsize / ref_psize;
 	const double copies = ref_dsize / ref_psize;
 
 	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
 	    "dedup * compress / copies = %.2f\n\n",
 	    dedup, compress, copies, dedup * compress / copies);
 }
 
 /*
  * Print both DDT logs of a table.  Returns without output unless the
  * table is DDT_VERSION_FDT and has DDT_FLAG_LOG set.  For each log a
  * summary line is printed, then the checkpoint key if the log has
  * DDL_FLAG_CHECKPOINT set, and, when dump_opt['D'] is at least 4, every
  * entry in the log's tree.
  */
 static void
 dump_ddt_log(ddt_t *ddt)
 {
 	if (ddt->ddt_version != DDT_VERSION_FDT ||
 	    !(ddt->ddt_flags & DDT_FLAG_LOG))
 		return;
 
 	for (int n = 0; n < 2; n++) {
 		ddt_log_t *ddl = &ddt->ddt_log[n];
 
 		/*
 		 * Build " [NAME NAME ...]".  Each name is appended with a
 		 * leading space starting at index 1; the first of those
 		 * spaces is then overwritten with '[' and a closing ']' is
 		 * added.  The string stays empty when no flags are set.
 		 */
 		char flagstr[64] = {0};
 		if (ddl->ddl_flags > 0) {
 			flagstr[0] = ' ';
 			int c = 1;
 			if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
 				c += strlcpy(&flagstr[c], " FLUSHING",
 				    sizeof (flagstr) - c);
 			if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT)
 				c += strlcpy(&flagstr[c], " CHECKPOINT",
 				    sizeof (flagstr) - c);
 			/* Any bit outside the two known flags. */
 			if (ddl->ddl_flags &
 			    ~(DDL_FLAG_FLUSHING|DDL_FLAG_CHECKPOINT))
 				c += strlcpy(&flagstr[c], " UNKNOWN",
 				    sizeof (flagstr) - c);
 			flagstr[1] = '[';
 			flagstr[c++] = ']';
 		}
 
 		uint64_t count = avl_numnodes(&ddl->ddl_tree);
 
 		/*
 		 * DMU_POOL_DDT_LOG supplies the leading part of the format
 		 * and consumes the first two arguments (checksum name and
 		 * log number).
 		 */
 		printf(DMU_POOL_DDT_LOG ": flags=0x%02x%s; obj=%llu; "
 		    "len=%llu; txg=%llu; entries=%llu\n",
 		    zio_checksum_table[ddt->ddt_checksum].ci_name, n,
 		    ddl->ddl_flags, flagstr,
 		    (u_longlong_t)ddl->ddl_object,
 		    (u_longlong_t)ddl->ddl_length,
 		    (u_longlong_t)ddl->ddl_first_txg, (u_longlong_t)count);
 
 		if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) {
 			const ddt_key_t *ddk = &ddl->ddl_checkpoint;
 			/* Four checksum words followed by ddk_prop. */
 			printf("    checkpoint: "
 			    "%016llx:%016llx:%016llx:%016llx:%016llx\n",
 			    (u_longlong_t)ddk->ddk_cksum.zc_word[0],
 			    (u_longlong_t)ddk->ddk_cksum.zc_word[1],
 			    (u_longlong_t)ddk->ddk_cksum.zc_word[2],
 			    (u_longlong_t)ddk->ddk_cksum.zc_word[3],
 			    (u_longlong_t)ddk->ddk_prop);
 		}
 
 		/* Entries are only listed at verbosity 4 and above. */
 		if (count == 0 || dump_opt['D'] < 4)
 			continue;
 
 		ddt_lightweight_entry_t ddlwe;
 		/* index is a running position within this log's tree. */
 		uint64_t index = 0;
 		for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
 		    ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) {
 			DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
 			dump_ddt_entry(ddt, &ddlwe, index++);
 		}
 	}
 }
 
 /*
  * Print one DDT object, identified by (type, class).  Nothing is printed
  * if the object does not exist or holds no entries.  How much is shown
  * depends on dump_opt['D']: a space summary always, the object number
  * and histogram at 3, and the entries themselves at 4 (at 5 for
  * DDT_CLASS_UNIQUE).
  */
 static void
 dump_ddt_object(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
 {
 	const int dlevel = dump_opt['D'];
 	char name[DDT_NAMELEN];
 	ddt_lightweight_entry_t ddlwe;
 	dmu_object_info_t doi;
 	uint64_t count, dspace, mspace;
 	uint64_t cursor = 0;
 	int error;
 
 	error = ddt_object_info(ddt, type, class, &doi);
 	if (error == ENOENT)
 		return;
 	ASSERT(error == 0);
 
 	error = ddt_object_count(ddt, type, class, &count);
 	ASSERT(error == 0);
 	if (count == 0)
 		return;
 
 	/* doi_physical_blocks_512 counts 512-byte units. */
 	dspace = doi.doi_physical_blocks_512 << 9;
 	mspace = doi.doi_fill_count * doi.doi_data_block_size;
 
 	ddt_object_name(ddt, type, class, name);
 
 	(void) printf("%s: dspace=%llu; mspace=%llu; entries=%llu\n", name,
 	    (u_longlong_t)dspace, (u_longlong_t)mspace, (u_longlong_t)count);
 
 	if (dlevel < 3)
 		return;
 
 	(void) printf("%s: object=%llu\n", name,
 	    (u_longlong_t)ddt->ddt_object[type][class]);
 	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
 
 	/* The unique class needs one more level before it is listed. */
 	if (dlevel < 4 || (dlevel < 5 && class == DDT_CLASS_UNIQUE))
 		return;
 
 	(void) printf("%s contents:\n\n", name);
 
 	for (;;) {
 		error = ddt_object_walk(ddt, type, class, &cursor, &ddlwe);
 		if (error != 0)
 			break;
 		dump_ddt_entry(ddt, &ddlwe, cursor);
 	}
 
 	ASSERT3U(error, ==, ENOENT);
 
 	(void) printf("\n");
 }
 
 /*
  * Print one dedup table: a header line with its checksum name, version,
  * flags and directory object, then every (type, class) object and
  * finally its logs.  A NULL or unconfigured table prints nothing.
  */
 static void
 dump_ddt(ddt_t *ddt)
 {
 	if (ddt == NULL || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
 		return;
 
 	/*
 	 * Build " [NAME NAME ...]": names are appended with a leading
 	 * space from index 1, then the first of those spaces is replaced
 	 * by '[' and ']' is placed after the last name.
 	 */
 	char flagstr[64] = {0};
 	if (ddt->ddt_flags > 0) {
 		int pos = 1;
 
 		flagstr[0] = ' ';
 		if (ddt->ddt_flags & DDT_FLAG_FLAT) {
 			pos += strlcpy(&flagstr[pos], " FLAT",
 			    sizeof (flagstr) - pos);
 		}
 		if (ddt->ddt_flags & DDT_FLAG_LOG) {
 			pos += strlcpy(&flagstr[pos], " LOG",
 			    sizeof (flagstr) - pos);
 		}
 		if (ddt->ddt_flags & ~DDT_FLAG_MASK) {
 			pos += strlcpy(&flagstr[pos], " UNKNOWN",
 			    sizeof (flagstr) - pos);
 		}
 		flagstr[1] = '[';
 		flagstr[pos] = ']';
 	}
 
 	const char *vername;
 	if (ddt->ddt_version == 0)
 		vername = "LEGACY";
 	else if (ddt->ddt_version == 1)
 		vername = "FDT";
 	else
 		vername = "UNKNOWN";
 
 	printf("DDT-%s: version=%llu [%s]; flags=0x%02llx%s; rootobj=%llu\n",
 	    zio_checksum_table[ddt->ddt_checksum].ci_name,
 	    (u_longlong_t)ddt->ddt_version, vername,
 	    (u_longlong_t)ddt->ddt_flags, flagstr,
 	    (u_longlong_t)ddt->ddt_dir_object);
 
 	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 		for (ddt_class_t class = 0; class < DDT_CLASSES; class++)
 			dump_ddt_object(ddt, type, class);
 	}
 
 	dump_ddt_log(ddt);
 }
 
 /*
  * Dump every per-checksum dedup table of the pool, then the pool-wide
  * dedup statistics: the aggregated histogram when dump_opt['D'] > 1, the
  * dedup ratio, and optionally an age histogram of unique entries.
  */
 static void
 dump_all_ddts(spa_t *spa)
 {
 	ddt_stat_t dds_total = {0};
 	ddt_histogram_t ddh_total = {{{0}}};
 
 	for (enum zio_checksum cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS;
 	    cksum++) {
 		dump_ddt(spa->spa_ddt[cksum]);
 	}
 
 	ddt_get_dedup_stats(spa, &dds_total);
 	if (dds_total.dds_blocks == 0) {
 		(void) printf("All DDTs are empty\n");
 		return;
 	}
 
 	(void) printf("\n");
 
 	if (dump_opt['D'] > 1) {
 		(void) printf("DDT histogram (aggregated over all DDTs):\n");
 		ddt_get_dedup_histogram(spa, &ddh_total);
 		zpool_dump_ddt(&dds_total, &ddh_total);
 	}
 
 	dump_dedup_ratio(&dds_total);
 
 	/*
 	 * The unique-entry age histogram is opt-in: dump_opt['D'] must be
 	 * exactly 3 and ZDB_DDT_UNIQUE_AGE_HIST must be set in the
 	 * environment.
 	 */
 	if (dump_opt['D'] != 3 || getenv("ZDB_DDT_UNIQUE_AGE_HIST") == NULL)
 		return;
 
 	ddt_age_histo_t histo;
 
 	(void) printf("DDT walk unique, building age histogram...\n");
 	ddt_prune_walk(spa, 0, &histo);
 
 	/* Print one row per bin, with each bin's share of all entries. */
 	if (histo.dah_entries > 0) {
 		(void) printf("%5s  %9s  %4s\n", "age", "blocks", "amnt");
 		(void) printf("%5s  %9s  %4s\n",
 		    "-----", "---------", "----");
 		for (int bin = 0; bin < HIST_BINS; bin++) {
 			(void) printf("%5d  %9d %4d%%\n", 1 << bin,
 			    (int)histo.dah_age_histo[bin],
 			    (int)((histo.dah_age_histo[bin] * 100) /
 			    histo.dah_entries));
 		}
 	}
 }
 
 /*
  * Dump block reference table (BRT) information.  The amount of detail is
  * selected by dump_opt['T']:
  *   always - pool-wide used/saved space and ratio
  *   >= 2   - one summary line per vdev
  *   == 3   - per-vdev histogram of reference counts
  *   >= 4   - every entry, as DVA and reference count
  */
 static void
 dump_brt(spa_t *spa)
 {
 	char nice_count[32], nice_used[32], nice_saved[32];
 	char dva[64];
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		printf("BRT: unsupported on this pool\n");
 		return;
 	}
 	if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		printf("BRT: empty\n");
 		return;
 	}
 
 	/* Pool-wide totals; the ratio is printed as ratio/100 "x". */
 	zdb_nicebytes(brt_get_used(spa), nice_used, sizeof (nice_used));
 	zdb_nicebytes(brt_get_saved(spa), nice_saved, sizeof (nice_saved));
 	uint64_t ratio = brt_get_ratio(spa);
 	printf("BRT: used %s; saved %s; ratio %llu.%02llux\n",
 	    nice_used, nice_saved,
 	    (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100));
 
 	if (dump_opt['T'] < 2)
 		return;
 
 	/* Per-vdev summary. */
 	for (uint64_t id = 0; id < spa->spa_brt_nvdevs; id++) {
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[id];
 
 		if (brtvd->bv_initiated) {
 			zdb_nicenum(brtvd->bv_totalcount, nice_count,
 			    sizeof (nice_count));
 			zdb_nicebytes(brtvd->bv_usedspace, nice_used,
 			    sizeof (nice_used));
 			zdb_nicebytes(brtvd->bv_savedspace, nice_saved,
 			    sizeof (nice_saved));
 			printf("BRT: vdev %" PRIu64
 			    ": refcnt %s; used %s; saved %s\n",
 			    id, nice_count, nice_used, nice_saved);
 		} else {
 			printf("BRT: vdev %" PRIu64 ": empty\n", id);
 		}
 	}
 
 	if (dump_opt['T'] < 3)
 		return;
 
 	/* Exactly 3: histogram per vdev; anything higher: list entries. */
 	const boolean_t histo_only = (dump_opt['T'] == 3);
 
 	if (!histo_only)
 		printf("\n%-16s %-10s\n", "DVA", "REFCNT");
 
 	for (uint64_t id = 0; id < spa->spa_brt_nvdevs; id++) {
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[id];
 		uint64_t counts[64] = {0};
 		zap_attribute_t *za;
 		zap_cursor_t zc;
 
 		if (!brtvd->bv_initiated)
 			continue;
 
 		/* Walk this vdev's entries ZAP in the MOS. */
 		za = zap_attribute_alloc();
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    brtvd->bv_mos_entries);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t refcnt;
 
 			VERIFY0(zap_lookup_uint64(spa->spa_meta_objset,
 			    brtvd->bv_mos_entries,
 			    (const uint64_t *)za->za_name, 1,
 			    za->za_integer_length, za->za_num_integers,
 			    &refcnt));
 
 			if (histo_only) {
 				counts[highbit64(refcnt)]++;
 				continue;
 			}
 
 			/* The entry's key holds the offset as a uint64_t. */
 			uint64_t offset = *(const uint64_t *)za->za_name;
 
 			snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx",
 			    id, (u_longlong_t)offset);
 			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
 		}
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 
 		if (histo_only) {
 			printf("\nBRT: vdev %" PRIu64
 			    ": DVAs with 2^n refcnts:\n", id);
 			dump_histogram(counts, 64, 0);
 		}
 	}
 }
 
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
 	char *prefix = arg;
 
 	(void) printf("%s [%llu,%llu) length %llu\n",
 	    prefix,
 	    (u_longlong_t)start,
 	    (u_longlong_t)(start + size),
 	    (u_longlong_t)(size));
 }
 
 /*
  * Recursively print the dirty time logs of `vd` and all of its children.
  * `indent` is the current indentation; the top-level call (indent == 0)
  * also prints the section heading.  Children are printed 4 columns deeper.
  */
 static void
 dump_dtl(vdev_t *vd, int indent)
 {
 	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
 		"outage" };
 	spa_t *spa = vd->vdev_spa;
 	char prefix[256];
 	const char *label;
 	boolean_t required;
 
 	/* vdev_dtl_required() is evaluated between state enter/exit. */
 	spa_vdev_state_enter(spa, SCL_NONE);
 	required = vdev_dtl_required(vd);
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");
 
 	/* Prefer the device path, then the vdev type, then the pool name. */
 	if (vd->vdev_path)
 		label = vd->vdev_path;
 	else if (vd->vdev_parent)
 		label = vd->vdev_ops->vdev_op_type;
 	else
 		label = spa_name(spa);
 
 	(void) printf("\t%*s%s [%s]\n", indent, "", label,
 	    required ? "DTL-required" : "DTL-expendable");
 
 	for (int type = 0; type < DTL_TYPES; type++) {
 		zfs_range_tree_t *rt = vd->vdev_dtl[type];
 
 		/* Empty logs are not shown. */
 		if (zfs_range_tree_space(rt) == 0)
 			continue;
 
 		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
 		    indent + 2, "", name[type]);
 		zfs_range_tree_walk(rt, dump_dtl_seg, prefix);
 
 		if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
 			dump_spacemap(spa->spa_meta_objset,
 			    vd->vdev_dtl_sm);
 		}
 	}
 
 	for (unsigned child = 0; child < vd->vdev_children; child++)
 		dump_dtl(vd->vdev_child[child], indent + 4);
 }
 
 /*
  * Print the pool history.
  *
  * The history is read in SPA_OLD_MAXBLOCKSIZE chunks with
  * spa_history_get() and unpacked into an array of nvlists with
  * zpool_history_unpack().  Each record is then printed according to which
  * ZPOOL_HIST_* key it carries (command, legacy internal event, named
  * internal event, or ioctl).  When dump_opt['h'] > 1 the raw nvlist of
  * every record is dumped as well, and records that matched none of the
  * known keys are flagged as unrecognized.
  *
  * NOTE(review): the unpacked `events` array is never released here;
  * confirm whether that is intentional.
  */
 static void
 dump_history(spa_t *spa)
 {
 	nvlist_t **events = NULL;
 	char *buf;
 	uint64_t resid, len, off = 0;
 	uint_t num = 0;
 	int error;
 	char tbuf[30];
 
 	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
 		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
 		    __func__);
 		return;
 	}
 
 	do {
 		len = SPA_OLD_MAXBLOCKSIZE;
 
 		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
 			(void) fprintf(stderr, "Unable to read history: "
 			    "error %d\n", error);
 			free(buf);
 			return;
 		}
 
 		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
 			break;
 
 		/* Re-read the bytes the unpacker did not consume. */
 		off -= resid;
 	} while (len != 0);
 
 	(void) printf("\nHistory:\n");
 	for (unsigned i = 0; i < num; i++) {
 		boolean_t printed = B_FALSE;
 
 		if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) {
 			time_t tsec;
 			struct tm t;
 
 			tsec = fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TIME);
 			/*
 			 * localtime_r() returns NULL when the value cannot
 			 * be converted; print no timestamp in that case
 			 * rather than formatting an uninitialized struct tm.
 			 */
 			if (localtime_r(&tsec, &t) != NULL) {
 				(void) strftime(tbuf, sizeof (tbuf), "%F.%T",
 				    &t);
 			} else {
 				tbuf[0] = '\0';
 			}
 		} else {
 			tbuf[0] = '\0';
 		}
 
 		if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) {
 			(void) printf("%s %s\n", tbuf,
 			    fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD));
 		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) {
 			uint64_t ievent;
 
 			ievent = fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_INT_EVENT);
 			/* Only legacy event numbers index the name table. */
 			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
 				goto next;
 
 			/*
 			 * The txg is a uint64_t; cast it so the argument
 			 * type matches the conversion on every platform.
 			 */
 			(void) printf(" %s [internal %s txg:%llu] %s\n",
 			    tbuf,
 			    zfs_history_event_names[ievent],
 			    (u_longlong_t)fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR));
 		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) {
 			(void) printf("%s [txg:%llu] %s", tbuf,
 			    (u_longlong_t)fnvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG),
 			    fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_NAME));
 
 			if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) {
 				(void) printf(" %s (%llu)",
 				    fnvlist_lookup_string(events[i],
 				    ZPOOL_HIST_DSNAME),
 				    (u_longlong_t)fnvlist_lookup_uint64(
 				    events[i],
 				    ZPOOL_HIST_DSID));
 			}
 
 			(void) printf(" %s\n", fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR));
 		} else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) {
 			(void) printf("%s ioctl %s\n", tbuf,
 			    fnvlist_lookup_string(events[i],
 			    ZPOOL_HIST_IOCTL));
 
 			if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) {
 				(void) printf("    input:\n");
 				dump_nvlist(fnvlist_lookup_nvlist(events[i],
 				    ZPOOL_HIST_INPUT_NVL), 8);
 			}
 			if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) {
 				(void) printf("    output:\n");
 				dump_nvlist(fnvlist_lookup_nvlist(events[i],
 				    ZPOOL_HIST_OUTPUT_NVL), 8);
 			}
 			if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) {
 				(void) printf("    errno: %lld\n",
 				    (longlong_t)fnvlist_lookup_int64(events[i],
 				    ZPOOL_HIST_ERRNO));
 			}
 		} else {
 			goto next;
 		}
 
 		printed = B_TRUE;
 next:
 		/* Raw dump of the record, flagged if nothing matched. */
 		if (dump_opt['h'] > 1) {
 			if (!printed)
 				(void) printf("unrecognized record:\n");
 			dump_nvlist(events[i], 2);
 		}
 	}
 	free(buf);
 }
 
 /*
  * Intentionally empty: every argument is ignored and nothing is printed.
  * NOTE(review): presumably a placeholder sharing the signature of the
  * other per-object dump routines -- confirm against its callers.
  */
 static void
 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	(void) data;
 	(void) size;
 }
 
 /*
  * Convert the block id in bookmark `zb` into an offset.
  *
  * With a dnode, the block id is scaled by the span of one block at
  * zb_level (derived from dn_indblkshift) and by the data block size
  * (dn_datablkszsec sectors of 1 << SPA_MINBLOCKSHIFT bytes).
  *
  * Without a dnode (zb_level is expected to be negative), object 0 yields
  * the block id unchanged and any other object yields the block id
  * multiplied by the logical size of `bp`.
  */
 static uint64_t
 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb)
 {
 	if (dnp != NULL) {
 		ASSERT(zb->zb_level >= 0);
 
 		/* '*' binds tighter than '<<'; grouping made explicit. */
 		return (((zb->zb_blkid <<
 		    (zb->zb_level *
 		    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
 		    dnp->dn_datablkszsec) << SPA_MINBLOCKSHIFT);
 	}
 
 	ASSERT(zb->zb_level < 0);
 
 	if (zb->zb_object != 0)
 		return (zb->zb_blkid * BP_GET_LSIZE(bp));
 
 	return (zb->zb_blkid);
 }
 
 /*
  * Append a description of a block's ZSTD header (compressed size,
  * version and level) to the string already held in `blkbuf`, whose total
  * capacity is `buflen`.
  *
  * Nothing is appended for blocks that are not ZSTD-compressed or that are
  * holes.  Embedded blocks are decoded from the block pointer itself; all
  * others are read from the pool without being decompressed.  A failed
  * read is reported on stderr and leaves `blkbuf` untouched.
  */
 static void
 snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
     const blkptr_t *bp)
 {
 	/* I/O buffer, allocated on first use and kept for later calls. */
 	static abd_t *pabd = NULL;
 	zfs_zstdhdr_t zstd_hdr;
 	size_t used;
 	void *buf;
 
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD || BP_IS_HOLE(bp))
 		return;
 
 	/* Everything below appends after the existing text. */
 	used = strlen(blkbuf);
 
 	if (BP_IS_EMBEDDED(bp)) {
 		buf = malloc(SPA_MAXBLOCKSIZE);
 		if (buf == NULL) {
 			(void) fprintf(stderr, "out of memory\n");
 			zdb_exit(1);
 		}
 		decode_embedded_bp_compressed(bp, buf);
 		memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
 		free(buf);
 
 		/* Header fields are stored big-endian. */
 		zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
 		zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
 		(void) snprintf(blkbuf + used, buflen - used,
 		    " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
 		    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
 		    zfs_get_hdrlevel(&zstd_hdr));
 		return;
 	}
 
 	if (pabd == NULL)
 		pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
 
 	/* Decrypt but don't decompress so we can read the compression header */
 	zio_t *root = zio_root(spa, NULL, NULL, 0);
 	zio_nowait(zio_read(root, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
 	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
 	    NULL));
 
 	int error = zio_wait(root);
 	if (error != 0) {
 		(void) fprintf(stderr, "read failed: %d\n", error);
 		return;
 	}
 
 	buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
 	memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
 	zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
 	zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
 
 	(void) snprintf(blkbuf + used, buflen - used,
 	    " ZSTD:size=%u:version=%u:level=%u:NORMAL",
 	    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
 	    zfs_get_hdrlevel(&zstd_hdr));
 
 	abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
 }
 
 /*
  * Format block pointer `bp` into `blkbuf` (capacity `buflen`).
  *
  * - dump_opt['b'] >= 6: the full snprintf_blkptr() form, with " FREE"
  *   appended when `bp_freed` is set.
  * - embedded block pointer: embedded type, logical/physical size and
  *   logical birth only.
  * - otherwise: one vdev:offset:asize group per DVA (all DVAs when
  *   dump_opt['d'] > 5, else just the first), followed by sizes and birth
  *   and, for non-holes, fill count, optional " FREE" and the checksum.
  *
  * Every write is bounded by `buflen`.
  */
 static void
 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
     boolean_t bp_freed)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
 	int i;
 
 	if (dump_opt['b'] >= 6) {
 		snprintf_blkptr(blkbuf, buflen, bp);
 		if (bp_freed) {
 			(void) snprintf(blkbuf + strlen(blkbuf),
 			    buflen - strlen(blkbuf), " %s", "FREE");
 		}
 		return;
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		/* Bounded write: never trust the caller's buffer size. */
 		(void) snprintf(blkbuf, buflen,
 		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
 		    (int)BPE_GET_ETYPE(bp),
 		    (u_longlong_t)BPE_GET_LSIZE(bp),
 		    (u_longlong_t)BPE_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 		return;
 	}
 
 	/* Start from an empty string; the pieces below are appended. */
 	blkbuf[0] = '\0';
 
-	for (i = 0; i < ndvas; i++)
+	for (i = 0; i < ndvas; i++) {
 		(void) snprintf(blkbuf + strlen(blkbuf),
-		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
+		    buflen - strlen(blkbuf), "%llu:%llx:%llx%s ",
 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
-		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]),
+		    (DVA_GET_GANG(&dva[i]) ? "G" : ""));
+	}
 
 	if (BP_IS_HOLE(bp)) {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL B=%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
 	} else {
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    "%llxL/%llxP F=%llu B=%llu/%llu",
 		    (u_longlong_t)BP_GET_LSIZE(bp),
 		    (u_longlong_t)BP_GET_PSIZE(bp),
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),
 		    (u_longlong_t)BP_GET_BIRTH(bp));
 		if (bp_freed)
 			(void) snprintf(blkbuf + strlen(blkbuf),
 			    buflen - strlen(blkbuf), " %s", "FREE");
 		(void) snprintf(blkbuf + strlen(blkbuf),
 		    buflen - strlen(blkbuf),
 		    " cksum=%016llx:%016llx:%016llx:%016llx",
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],
 		    (u_longlong_t)bp->blk_cksum.zc_word[1],
 		    (u_longlong_t)bp->blk_cksum.zc_word[2],
 		    (u_longlong_t)bp->blk_cksum.zc_word[3]);
 	}
 }
 
 /*
  * Print one line for block pointer `bp` of the object described by `dnp`:
  * its offset, a level marker, and the compact block pointer text.  When
  * dump_opt['Z'] is set and the block is ZSTD-compressed, the ZSTD header
  * details are appended as well.
  */
 static void
 print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
     const dnode_phys_t *dnp)
 {
 	char line[BP_SPRINTF_LEN];
 
 	if (!BP_IS_EMBEDDED(bp)) {
 		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
 		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
 	}
 
 	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
 
 	ASSERT(zb->zb_level >= 0);
 
 	/*
 	 * One column per level, from the top level down to -1: the column
 	 * matching this block's level holds "L<level>", the rest a space.
 	 */
 	for (int lvl = dnp->dn_nlevels - 1; lvl >= -1; lvl--) {
 		if (lvl != zb->zb_level)
 			(void) printf(" ");
 		else
 			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
 	}
 
 	snprintf_blkptr_compact(line, sizeof (line), bp, B_FALSE);
 	if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
 		snprintf_zstd_header(spa, line, sizeof (line), bp);
 
 	(void) printf("%s\n", line);
 }
 
 /*
  * Print block pointer `bp` and, if it is a non-hole indirect block, read
  * it and recurse into each of the block pointers it contains.
  *
  * Block pointers with a logical birth of 0 are skipped entirely.
  * Returns 0 on success, or the first error from arc_read() or from a
  * recursive visit, in which case the remaining children are not visited.
  */
 static int
 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
     blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	if (BP_GET_LOGICAL_BIRTH(bp) == 0)
 		return (0);
 
 	print_indirect(spa, bp, zb, dnp);
 
 	/* Only non-hole blocks above level 0 have children. */
 	if (BP_GET_LEVEL(bp) <= 0 || BP_IS_HOLE(bp))
 		return (0);
 
 	arc_flags_t flags = ARC_FLAG_WAIT;
 	/* Number of block pointers held by this indirect block. */
 	int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 	arc_buf_t *buf;
 	uint64_t fill = 0;
 	int err;
 
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 	if (err != 0)
 		return (err);
 	ASSERT(buf->b_data);
 
 	/* recursively visit blocks below this */
 	blkptr_t *children = buf->b_data;
 	for (int i = 0; i < epb; i++) {
 		blkptr_t *cbp = &children[i];
 		zbookmark_phys_t czb;
 
 		SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 		    zb->zb_level - 1,
 		    zb->zb_blkid * epb + i);
 		err = visit_indirect(spa, dnp, cbp, &czb);
 		if (err != 0)
 			break;
 		fill += BP_GET_FILL(cbp);
 	}
 
 	/* The children's fill counts must add up to the parent's. */
 	if (err == 0) {
 		ASSERT3U(fill, ==, BP_GET_FILL(bp));
 	}
 	arc_buf_destroy(buf, &buf);
 
 	return (err);
 }
 
 /*
  * Print the block tree of dnode `dn`: every block pointer stored in the
  * dnode itself is visited at the top level (dn_nlevels - 1) through
  * visit_indirect().  Errors from the walk are ignored.
  */
 static void
 dump_indirect(dnode_t *dn)
 {
 	dnode_phys_t *dnp = dn->dn_phys;
 	zbookmark_phys_t zb;
 
 	(void) printf("Indirect blocks:\n");
 
 	SET_BOOKMARK(&zb, dmu_objset_id(dn->dn_objset),
 	    dn->dn_object, dnp->dn_nlevels - 1, 0);
 
 	for (int idx = 0; idx < dnp->dn_nblkptr; idx++) {
 		/* Same bookmark for every root pointer, only blkid moves. */
 		zb.zb_blkid = idx;
 		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
 		    &dnp->dn_blkptr[idx], &zb);
 	}
 
 	(void) printf("\n");
 }
 
 /*
  * Print the fields of a dsl_dir_phys_t held in `data` (`size` bytes).
  * `os` and `object` are unused.  A NULL `data` prints nothing.
  */
 static void
 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	dsl_dir_phys_t *dd = data;
 	char nice[32];
 	time_t created;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated");
 
 	if (dd == NULL)
 		return;
 
 	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
 
 	/* ctime() supplies the trailing newline for this line. */
 	created = dd->dd_creation_time;
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 
 	/* Object references. */
 	(void) printf("\t\thead_dataset_obj = %llu\n",
 	    (u_longlong_t)dd->dd_head_dataset_obj);
 	(void) printf("\t\tparent_dir_obj = %llu\n",
 	    (u_longlong_t)dd->dd_parent_obj);
 	(void) printf("\t\torigin_obj = %llu\n",
 	    (u_longlong_t)dd->dd_origin_obj);
 	(void) printf("\t\tchild_dir_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_child_dir_zapobj);
 
 	/* Space accounting, in human-readable form. */
 	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
 	(void) printf("\t\tused_bytes = %s\n", nice);
 	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
 	(void) printf("\t\tcompressed_bytes = %s\n", nice);
 	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
 	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
 	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
 	(void) printf("\t\tquota = %s\n", nice);
 	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
 	(void) printf("\t\treserved = %s\n", nice);
 
 	(void) printf("\t\tprops_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_props_zapobj);
 	(void) printf("\t\tdeleg_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_deleg_zapobj);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)dd->dd_flags);
 
 	/* One line per DD_USED_* slot of the used-space breakdown. */
 #define	PRINT_BREAKDOWN(which) \
 	do { \
 		zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], \
 		    nice, sizeof (nice)); \
 		(void) printf("\t\tused_breakdown[" #which "] = %s\n", \
 		    nice); \
 	} while (0)
 	PRINT_BREAKDOWN(HEAD);
 	PRINT_BREAKDOWN(SNAP);
 	PRINT_BREAKDOWN(CHILD);
 	PRINT_BREAKDOWN(CHILD_RSRV);
 	PRINT_BREAKDOWN(REFRSRV);
 #undef PRINT_BREAKDOWN
 
 	(void) printf("\t\tclones = %llu\n",
 	    (u_longlong_t)dd->dd_clones);
 }
 
 /*
  * Print the fields of a dsl_dataset_phys_t held in `data` (`size`
  * bytes), including its block pointer.  `os` and `object` are unused.
  * A NULL `data` prints nothing.
  */
 static void
 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os;
 	(void) object;
 	dsl_dataset_phys_t *ds = data;
 	char used[32], compressed[32], uncompressed[32], unique[32];
 	char blkbuf[BP_SPRINTF_LEN];
 	time_t created;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated");
 	_Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ,
 	    "compressed truncated");
 	_Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ,
 	    "uncompressed truncated");
 	_Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated");
 
 	if (ds == NULL)
 		return;
 
 	ASSERT(size == sizeof (*ds));
 
 	/* Snapshot chain and bookkeeping objects. */
 	(void) printf("\t\tdir_obj = %llu\n",
 	    (u_longlong_t)ds->ds_dir_obj);
 	(void) printf("\t\tprev_snap_obj = %llu\n",
 	    (u_longlong_t)ds->ds_prev_snap_obj);
 	(void) printf("\t\tprev_snap_txg = %llu\n",
 	    (u_longlong_t)ds->ds_prev_snap_txg);
 	(void) printf("\t\tnext_snap_obj = %llu\n",
 	    (u_longlong_t)ds->ds_next_snap_obj);
 	(void) printf("\t\tsnapnames_zapobj = %llu\n",
 	    (u_longlong_t)ds->ds_snapnames_zapobj);
 	(void) printf("\t\tnum_children = %llu\n",
 	    (u_longlong_t)ds->ds_num_children);
 	(void) printf("\t\tuserrefs_obj = %llu\n",
 	    (u_longlong_t)ds->ds_userrefs_obj);
 
 	/* ctime() supplies the trailing newline for this line. */
 	created = ds->ds_creation_time;
 	(void) printf("\t\tcreation_time = %s", ctime(&created));
 	(void) printf("\t\tcreation_txg = %llu\n",
 	    (u_longlong_t)ds->ds_creation_txg);
 	(void) printf("\t\tdeadlist_obj = %llu\n",
 	    (u_longlong_t)ds->ds_deadlist_obj);
 
 	/* Space accounting, in human-readable form. */
 	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
 	(void) printf("\t\tused_bytes = %s\n", used);
 	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
 	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
 	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
 	    sizeof (uncompressed));
 	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
 	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
 	(void) printf("\t\tunique = %s\n", unique);
 
 	(void) printf("\t\tfsid_guid = %llu\n",
 	    (u_longlong_t)ds->ds_fsid_guid);
 	(void) printf("\t\tguid = %llu\n",
 	    (u_longlong_t)ds->ds_guid);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)ds->ds_flags);
 	(void) printf("\t\tnext_clones_obj = %llu\n",
 	    (u_longlong_t)ds->ds_next_clones_obj);
 	(void) printf("\t\tprops_obj = %llu\n",
 	    (u_longlong_t)ds->ds_props_obj);
 
 	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
 	(void) printf("\t\tbp = %s\n", blkbuf);
 }
 
 /*
  * bptree_iterate() callback used by dump_bptree(): print the block
  * pointer on its own tab-indented line, skipping block pointers whose
  * logical birth is 0.  `arg` and `tx` are unused.  Always returns 0.
  */
 static int
 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	(void) arg;
 	(void) tx;
 	char text[BP_SPRINTF_LEN];
 
 	if (BP_GET_LOGICAL_BIRTH(bp) == 0)
 		return (0);
 
 	snprintf_blkptr(text, sizeof (text), bp);
 	(void) printf("\t%s\n", text);
 
 	return (0);
 }
 
 /*
  * Print a summary of the bptree stored in object `obj` of `os`, labelled
  * with `name`: the number of datasets (bt_end - bt_begin) and the byte
  * total.  Prints nothing when dump_opt['d'] < 3; when dump_opt['d'] >= 5
  * every block pointer in the tree is listed as well.
  */
 static void
 dump_bptree(objset_t *os, uint64_t obj, const char *name)
 {
 	char nice_bytes[32];
 	bptree_phys_t *phys;
 	dmu_buf_t *bonus;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (nice_bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	/* The bptree header is read from the object's bonus buffer. */
 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &bonus));
 	phys = bonus->db_data;
 	zdb_nicenum(phys->bt_bytes, nice_bytes, sizeof (nice_bytes));
 	(void) printf("\n    %s: %llu datasets, %s\n",
 	    name, (unsigned long long)(phys->bt_end - phys->bt_begin),
 	    nice_bytes);
 	dmu_buf_rele(bonus, FTAG);
 
 	if (dump_opt['d'] >= 5) {
 		(void) printf("\n");
 		(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb,
 		    NULL, NULL);
 	}
 }
 
 /*
  * bpobj iteration callback used by dump_full_bpobj(): print the block
  * pointer in compact form on its own tab-indented line, marked as freed
  * when `bp_freed` is set.  `arg` and `tx` are unused.  Always returns 0.
  */
 static int
 dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 {
 	(void) arg;
 	(void) tx;
 	char text[BP_SPRINTF_LEN];
 
 	ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0);
 
 	snprintf_blkptr_compact(text, sizeof (text), bp, bp_freed);
 	(void) printf("\t%s\n", text);
 
 	return (0);
 }
 
 /*
  * Print a summary of bpobj `bpo`, labelled with `name` and indented by
  * `indent` levels, recursing into each of its sub-objects.  Prints
  * nothing when dump_opt['d'] < 3.  When dump_opt['d'] >= 5 the top-level
  * call (indent == 0) also lists every block pointer.
  */
 static void
 dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
 {
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
 	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	boolean_t has_subobjs =
 	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0);
 
 	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
 
 	if (!has_subobjs) {
 		/* Leaf bpobj: only its own block pointers to report. */
 		if (bpo->bpo_havefreed) {
 			(void) printf("    %*s: object %llu, %llu blkptrs, "
 			    "%llu freed, %s\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
 			    bytes);
 		} else {
 			(void) printf("    %*s: object %llu, %llu blkptrs, "
 			    "%s\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    bytes);
 		}
 	} else {
 		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
 		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
 
 		if (bpo->bpo_havefreed) {
 			(void) printf("    %*s: object %llu, %llu local "
 			    "blkptrs, %llu freed, %llu subobjs in object %llu, "
 			    "%s (%s/%s comp)\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
 			    bytes, comp, uncomp);
 		} else {
 			(void) printf("    %*s: object %llu, %llu local "
 			    "blkptrs, %llu subobjs in object %llu, "
 			    "%s (%s/%s comp)\n",
 			    indent * 8, name,
 			    (u_longlong_t)bpo->bpo_object,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
 			    bytes, comp, uncomp);
 		}
 
 		/*
 		 * The sub-object ids are stored as an array of uint64_t in
 		 * the bpo_subobjs object; read them one at a time.
 		 */
 		for (uint64_t idx = 0; idx < bpo->bpo_phys->bpo_num_subobjs;
 		    idx++) {
 			uint64_t subobj;
 			bpobj_t subbpo;
 			int error;
 
 			VERIFY0(dmu_read(bpo->bpo_os,
 			    bpo->bpo_phys->bpo_subobjs,
 			    idx * sizeof (subobj), sizeof (subobj),
 			    &subobj, 0));
 
 			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
 			if (error != 0) {
 				/* Report and carry on with the next one. */
 				(void) printf("ERROR %u while trying to open "
 				    "subobj id %llu\n",
 				    error, (u_longlong_t)subobj);
 				continue;
 			}
 			dump_full_bpobj(&subbpo, "subobj", indent + 1);
 			bpobj_close(&subbpo);
 		}
 	}
 
 	/* Block pointers are listed once, from the top-level call. */
 	if (dump_opt['d'] >= 5 && indent == 0) {
 		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
 		(void) printf("\n");
 	}
 }
 
 /*
  * Print bookmark `name` (of the form "dataset#bookmark").
  *
  * The basic properties are always printed.  If `print_redact` is set and
  * the bookmark has a redaction object, the redaction progress, snapshot
  * list and entry count follow; if `print_list` is also set, every entry
  * of the redaction list is printed too.  `print_list` implies
  * `print_redact`.
  *
  * Returns 0 on success, or the error from dsl_bookmark_lookup() or
  * dmu_read().
  */
 static int
 dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
     boolean_t print_list)
 {
 	objset_t *mos = dp->dp_spa->spa_meta_objset;
 	zfs_bookmark_phys_t bm;
 	int err;
 
 	err = dsl_bookmark_lookup(dp, name, NULL, &bm);
 	if (err != 0)
 		return (err);
 
 	/* Only the part after '#' is shown as the bookmark's name. */
 	(void) printf("\t#%s: ", strchr(name, '#') + 1);
 	(void) printf("{guid: %llx creation_txg: %llu creation_time: "
 	    "%llu redaction_obj: %llu}\n", (u_longlong_t)bm.zbm_guid,
 	    (u_longlong_t)bm.zbm_creation_txg,
 	    (u_longlong_t)bm.zbm_creation_time,
 	    (u_longlong_t)bm.zbm_redaction_obj);
 
 	IMPLY(print_list, print_redact);
 	if (!print_redact || bm.zbm_redaction_obj == 0)
 		return (0);
 
 	redaction_list_t *rl;
 	VERIFY0(dsl_redaction_list_hold_obj(dp,
 	    bm.zbm_redaction_obj, FTAG, &rl));
 
 	redaction_list_phys_t *phys = rl->rl_phys;
 
 	/* UINT64_MAX in both "last" fields marks a finished redaction. */
 	(void) printf("\tRedacted:\n\t\tProgress: ");
 	if (phys->rlp_last_object == UINT64_MAX &&
 	    phys->rlp_last_blkid == UINT64_MAX) {
 		(void) printf("complete\n");
 	} else {
 		(void) printf("%llu %llu (incomplete)\n",
 		    (u_longlong_t)phys->rlp_last_object,
 		    (u_longlong_t)phys->rlp_last_blkid);
 	}
 
 	(void) printf("\t\tSnapshots: [");
 	for (unsigned int s = 0; s < phys->rlp_num_snaps; s++) {
 		if (s > 0)
 			(void) printf(", ");
 		(void) printf("%0llu",
 		    (u_longlong_t)phys->rlp_snaps[s]);
 	}
 	(void) printf("]\n\t\tLength: %llu\n",
 	    (u_longlong_t)phys->rlp_num_entries);
 
 	/*
 	 * Done unless the entries were asked for and there are some; an
 	 * empty list is still shown when the entries were requested.
 	 */
 	if (!print_list || phys->rlp_num_entries == 0) {
 		dsl_redaction_list_rele(rl, FTAG);
 		if (print_list)
 			(void) printf("\t\tRedaction List: []\n\n");
 		return (0);
 	}
 
 	/* Read the whole redaction object into memory. */
 	dmu_object_info_t doi;
 	VERIFY0(dmu_object_info(mos, bm.zbm_redaction_obj, &doi));
 
 	uint64_t bufsize = doi.doi_max_offset;
 	redact_block_phys_t *entries = kmem_alloc(bufsize, KM_SLEEP);
 
 	err = dmu_read(mos, bm.zbm_redaction_obj, 0, bufsize,
 	    entries, 0);
 	if (err != 0) {
 		dsl_redaction_list_rele(rl, FTAG);
 		kmem_free(entries, bufsize);
 		return (err);
 	}
 
 	/* The first entry opens the list; the rest start on a new line. */
 	for (size_t e = 0; e < phys->rlp_num_entries; e++) {
 		if (e == 0) {
 			(void) printf("\t\tRedaction List: [{object: %llx, "
 			    "offset: "
 			    "%llx, blksz: %x, count: %llx}",
 			    (u_longlong_t)entries[e].rbp_object,
 			    (u_longlong_t)entries[e].rbp_blkid,
 			    (uint_t)(redact_block_get_size(&entries[e])),
 			    (u_longlong_t)redact_block_get_count(
 			    &entries[e]));
 		} else {
 			(void) printf(",\n\t\t{object: %llx, offset: %llx, "
 			    "blksz: %x, count: %llx}",
 			    (u_longlong_t)entries[e].rbp_object,
 			    (u_longlong_t)entries[e].rbp_blkid,
 			    (uint_t)(redact_block_get_size(&entries[e])),
 			    (u_longlong_t)redact_block_get_count(
 			    &entries[e]));
 		}
 	}
 
 	dsl_redaction_list_rele(rl, FTAG);
 	kmem_free(entries, bufsize);
 	(void) printf("]\n\n");
 
 	return (0);
 }
 
 /*
  * Print every bookmark of the dataset behind `os`.  Nothing is printed
  * below verbosity 4; verbosity >= 5 adds redaction details and
  * verbosity >= 6 adds the full redaction list (see dump_bookmark()).
  * Errors from individual bookmarks are ignored.
  */
 static void
 dump_bookmarks(objset_t *os, int verbosity)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t *attr;
 	dsl_dataset_t *ds = dmu_objset_ds(os);
 	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
 	objset_t *mos = os->os_spa->spa_meta_objset;
 
 	if (verbosity < 4)
 		return;
 
 	attr = zap_attribute_alloc();
 	dsl_pool_config_enter(dp, FTAG);
 
 	/* Each entry of the bookmarks ZAP names one bookmark. */
 	zap_cursor_init(&cursor, mos, ds->ds_bookmarks_obj);
 	while (zap_cursor_retrieve(&cursor, attr) == 0) {
 		char osname[ZFS_MAX_DATASET_NAME_LEN];
 		char fullname[ZFS_MAX_DATASET_NAME_LEN];
 		int len;
 
 		/* Build "dataset#bookmark"; it must fit without truncation. */
 		dmu_objset_name(os, osname);
 		len = snprintf(fullname, sizeof (fullname), "%s#%s", osname,
 		    attr->za_name);
 		VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN);
 
 		(void) dump_bookmark(dp, fullname, verbosity >= 5,
 		    verbosity >= 6);
 
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 
 	dsl_pool_config_exit(dp, FTAG);
 	zap_attribute_free(attr);
 }
 
 /*
  * Record with mos_obj_refd() that `bpo`, its sub-object array and,
  * recursively, every sub-bpobj are referenced.  Sub-objects that cannot
  * be opened are reported and skipped.
  */
 static void
 bpobj_count_refd(bpobj_t *bpo)
 {
 	mos_obj_refd(bpo->bpo_object);
 
 	/* Nothing more to do for a bpobj without sub-objects. */
 	if (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
 		return;
 
 	mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
 
 	for (uint64_t idx = 0; idx < bpo->bpo_phys->bpo_num_subobjs; idx++) {
 		uint64_t subobj;
 		bpobj_t subbpo;
 		int error;
 
 		/* Sub-object ids are stored as an array of uint64_t. */
 		VERIFY0(dmu_read(bpo->bpo_os,
 		    bpo->bpo_phys->bpo_subobjs,
 		    idx * sizeof (subobj), sizeof (subobj), &subobj, 0));
 
 		error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
 		if (error != 0) {
 			(void) printf("ERROR %u while trying to open "
 			    "subobj id %llu\n",
 			    error, (u_longlong_t)subobj);
 			continue;
 		}
 		bpobj_count_refd(&subbpo);
 		bpobj_close(&subbpo);
 	}
 }
 
 /*
  * Deadlist iteration callback: count the references of the entry's
  * bpobj unless it is the pool's shared empty bpobj.  `arg` is the
  * spa_t.  Always returns 0.
  */
 static int
 dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
 {
 	spa_t *spa = arg;
 	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
 
 	if (dle->dle_bpobj.bpo_object == empty_bpobj)
 		return (0);
 
 	bpobj_count_refd(&dle->dle_bpobj);
 	return (0);
 }
 
 /*
  * Deadlist iteration callback: print one deadlist entry as
  * "mintxg <txg> -> obj <object>".  When dump_opt['d'] >= 5 that text is
  * used as the label for a full dump of the entry's bpobj instead of
  * being printed on its own.  `arg` must be NULL.  Always returns 0.
  */
 static int
 dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
 {
 	ASSERT(arg == NULL);
 
 	/*
 	 * Both values are printed with %llu, so they are cast to an
 	 * unsigned type to match the conversion.
 	 */
 	if (dump_opt['d'] >= 5) {
 		char buf[128];
 		(void) snprintf(buf, sizeof (buf),
 		    "mintxg %llu -> obj %llu",
 		    (unsigned long long)dle->dle_mintxg,
 		    (unsigned long long)dle->dle_bpobj.bpo_object);
 
 		dump_full_bpobj(&dle->dle_bpobj, buf, 0);
 	} else {
 		(void) printf("mintxg %llu -> obj %llu\n",
 		    (unsigned long long)dle->dle_mintxg,
 		    (unsigned long long)dle->dle_bpobj.bpo_object);
 	}
 	return (0);
 }
 
 /*
  * Account for and print deadlist `dl`, labelled with `name`.
  *
  * The MOS objects backing the list are always marked as referenced.
  * Output starts at dump_opt['d'] >= 3: old-format lists are dumped as a
  * single bpobj, new-format lists get a summary line and, at
  * dump_opt['d'] >= 4, one line (or bpobj dump) per entry.
  */
 static void
 dump_blkptr_list(dsl_deadlist_t *dl, const char *name)
 {
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
 	char entries[32];
 	spa_t *spa = dmu_objset_spa(dl->dl_os);
 	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
 	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
 	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
 	_Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated");
 
 	/* Reference counting happens regardless of verbosity. */
 	if (!dl->dl_oldfmt) {
 		mos_obj_refd(dl->dl_object);
 		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
 	} else if (dl->dl_bpobj.bpo_object != empty_bpobj) {
 		bpobj_count_refd(&dl->dl_bpobj);
 	}
 
 	if (dump_opt['d'] < 3)
 		return;
 
 	if (dl->dl_oldfmt) {
 		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
 		return;
 	}
 
 	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
 	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
 	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
 	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
 	(void) printf("\n    %s: %s (%s/%s comp), %s entries\n",
 	    name, bytes, comp, uncomp, entries);
 
 	if (dump_opt['d'] >= 4) {
 		(void) putchar('\n');
 		dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
 	}
 }
 
 /*
  * Check the livelist of the dataset behind `os` against the space
  * written since its origin.
  *
  * Returns 0 when the dataset has no open livelist or the accounting
  * agrees, and 1 (after printing both sets of numbers) when it does not.
  * `os` must not be a snapshot.
  */
 static int
 verify_dd_livelist(objset_t *os)
 {
 	uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
 	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 	dsl_dataset_t *origin_ds;
 
 	ASSERT(!dmu_objset_is_snapshot(os));
 	if (!dsl_deadlist_is_open(&dd->dd_livelist))
 		return (0);
 
 	/* Iterate through the livelist to check for duplicates */
 	dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
 	    NULL);
 
 	/* Gather both views of the space while holding the pool config. */
 	dsl_pool_config_enter(dp, FTAG);
 	dsl_deadlist_space(&dd->dd_livelist, &ll_used,
 	    &ll_comp, &ll_uncomp);
 
 	ASSERT(dsl_pool_config_held(dp));
 	VERIFY0(dsl_dataset_hold_obj(dp,
 	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
 	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
 	    &used, &comp, &uncomp));
 	dsl_dataset_rele(origin_ds, FTAG);
 	dsl_pool_config_exit(dp, FTAG);
 
 	/*
 	 * Used and compressed space must match exactly.  The dataset's
 	 * uncompressed space may legitimately exceed the livelist's,
 	 * because livelists do not track embedded block pointers.
 	 */
 	if (used == ll_used && comp == ll_comp && uncomp >= ll_uncomp)
 		return (0);
 
 	char nice_used[32], nice_comp[32], nice_uncomp[32];
 
 	(void) printf("Discrepancy in space accounting:\n");
 
 	zdb_nicenum(used, nice_used, sizeof (nice_used));
 	zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
 	zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
 	(void) printf("dir: used %s, comp %s, uncomp %s\n",
 	    nice_used, nice_comp, nice_uncomp);
 
 	zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
 	zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
 	zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
 	(void) printf("livelist: used %s, comp %s, uncomp %s\n",
 	    nice_used, nice_comp, nice_uncomp);
 
 	return (1);
 }
 
 /*
  * Key material string consumed by zdb_derive_key(): read as hex digits for
  * ZFS_KEYFORMAT_HEX and as a passphrase for ZFS_KEYFORMAT_PASSPHRASE.
  * Initially NULL; it is assigned outside this section of the file.
  */
 static char *key_material = NULL;
 
 /*
  * Derive the WRAPPING_KEY_LEN-byte wrapping key for 'dd' from the global
  * key_material string and store it in key_out.
  *
  * The keyformat property is read from the dir's crypto object:
  *  - ZFS_KEYFORMAT_HEX: key_material must begin with WRAPPING_KEY_LEN * 2
  *    hex digits, which are decoded pairwise into key_out.
  *  - ZFS_KEYFORMAT_PASSPHRASE: the salt and iteration count are read from
  *    the crypto object and the key is produced by PKCS5_PBKDF2_HMAC_SHA1().
  * Any other key format is fatal.
  *
  * Returns B_TRUE on success, B_FALSE if no key material is available, the
  * hex string is malformed, or the PBKDF2 call fails.
  */
 static boolean_t
 zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
 {
 	uint64_t keyformat, salt, iters;
 	int i;
 	unsigned char c;
 
 	/* Nothing to derive from; avoid dereferencing a NULL string below */
 	if (key_material == NULL)
 		return (B_FALSE);
 
 	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
 	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
 	    1, &keyformat));
 
 	switch (keyformat) {
 	case ZFS_KEYFORMAT_HEX:
 		for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) {
 			/*
 			 * The <ctype.h> functions require a value
 			 * representable as unsigned char, so plain char must
 			 * be cast first.  A short string stops here too: the
 			 * terminating NUL is not a hex digit, and the second
 			 * test is skipped when the first one fails.
 			 */
 			if (!isxdigit((unsigned char)key_material[i]) ||
 			    !isxdigit((unsigned char)key_material[i + 1]))
 				return (B_FALSE);
 			if (sscanf(&key_material[i], "%02hhx", &c) != 1)
 				return (B_FALSE);
 			key_out[i / 2] = c;
 		}
 		break;
 
 	case ZFS_KEYFORMAT_PASSPHRASE:
 		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
 		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
 		    sizeof (uint64_t), 1, &salt));
 		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
 		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
 		    sizeof (uint64_t), 1, &iters));
 
 		/* The 8 bytes of the stored salt value are used as-is */
 		if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material),
 		    ((uint8_t *)&salt), sizeof (uint64_t), iters,
 		    WRAPPING_KEY_LEN, key_out) != 1)
 			return (B_FALSE);
 
 		break;
 
 	default:
 		fatal("no support for key format %u\n",
 		    (unsigned int) keyformat);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Name of the encryption root whose wrapping key zdb_load_key() loaded, and
  * whether such a key is currently loaded.  zdb_unload_key() uses both to
  * unload the key again.
  */
 static char encroot[ZFS_MAX_DATASET_NAME_LEN];
 static boolean_t key_loaded = B_FALSE;
 
 /*
  * Load the wrapping key for the encryption root of 'os' into the keystore.
  * The root's name is recorded in encroot and key_loaded is set on success.
  * Any failure to derive or load the key is fatal.
  */
 static void
 zdb_load_key(objset_t *os)
 {
 	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 	dsl_dir_t *root_dd;
 	dsl_crypto_params_t *dcp;
 	nvlist_t *wkey_args;
 	uint64_t root_ddobj;
 	uint8_t wkey[WRAPPING_KEY_LEN];
 	int error;
 
 	/* Find the encryption root's name and derive the key under the lock */
 	dsl_pool_config_enter(dp, FTAG);
 	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
 	    DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &root_ddobj));
 	VERIFY0(dsl_dir_hold_obj(dd->dd_pool, root_ddobj, NULL, FTAG,
 	    &root_dd));
 	dsl_dir_name(root_dd, encroot);
 	dsl_dir_rele(root_dd, FTAG);
 
 	if (!zdb_derive_key(dd, wkey))
 		fatal("couldn't derive encryption key");
 
 	dsl_pool_config_exit(dp, FTAG);
 
 	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE);
 
 	/* Wrap the raw key bytes in the nvlist the keystore expects */
 	wkey_args = fnvlist_alloc();
 	fnvlist_add_uint8_array(wkey_args, "wkeydata",
 	    (uint8_t *)wkey, WRAPPING_KEY_LEN);
 	VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
 	    NULL, wkey_args, &dcp));
 	error = spa_keystore_load_wkey(encroot, dcp, B_FALSE);
 
 	dsl_crypto_params_free(dcp, (error != 0));
 	fnvlist_free(wkey_args);
 
 	if (error != 0) {
 		const char *reason = (error == ZFS_ERR_CRYPTO_NOTSUP) ?
 		    "crypto params not supported" : strerror(error);
 
 		fatal("couldn't load encryption key for %s: %s",
 		    encroot, reason);
 	}
 
 	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE);
 
 	printf("Unlocked encryption root: %s\n", encroot);
 	key_loaded = B_TRUE;
 }
 
 /*
  * Unload the wrapping key previously loaded by zdb_load_key(), if any.
  */
 static void
 zdb_unload_key(void)
 {
 	if (key_loaded) {
 		VERIFY0(spa_keystore_unload_wkey(encroot));
 		key_loaded = B_FALSE;
 	}
 }
 
 /*
  * FUID domain tables, created and loaded on demand by dump_uidgid() and
  * torn down by fuid_table_destroy(); fuid_table_loaded tracks their state.
  */
 static avl_tree_t idx_tree;
 static avl_tree_t domain_tree;
 static boolean_t fuid_table_loaded;
 /*
  * The objset currently opened through open_objset() and the SA attribute
  * table that sa_setup() produced for it; both are reset by close_objset().
  */
 static objset_t *sa_os = NULL;
 static sa_attr_type_t *sa_attr_table = NULL;
 
 /*
  * Hold the objset named by 'path' with 'tag', take a long hold on its
  * dataset, and return it in *osp (also recorded in sa_os).
  *
  * If decryption was requested (dump_opt['K']), the dataset is first held
  * briefly so that zdb_load_key() can load its wrapping key, and then held
  * again with DS_HOLD_FLAG_DECRYPT.  For DMU_OST_ZFS objsets that are either
  * unencrypted or have their key loaded, the SA attribute table is set up.
  *
  * Returns 0 on success, or the error from the failing hold or sa_setup()
  * call.  When sa_setup() fails the holds are dropped and *osp is set to
  * NULL.
  */
 static int
 open_objset(const char *path, const void *tag, objset_t **osp)
 {
 	int err;
 	uint64_t sa_attrs = 0;
 	uint64_t version = 0;
 
 	/* Only one objset may be open through this interface at a time */
 	VERIFY3P(sa_os, ==, NULL);
 
 	/*
 	 * We can't own an objset if it's redacted.  Therefore, we do this
 	 * dance: hold the objset, then acquire a long hold on its dataset, then
 	 * release the pool (which is held as part of holding the objset).
 	 */
 
 	if (dump_opt['K']) {
 		/* decryption requested, try to load keys */
 		err = dmu_objset_hold(path, tag, osp);
 		if (err != 0) {
 			(void) fprintf(stderr, "failed to hold dataset "
 			    "'%s': %s\n",
 			    path, strerror(err));
 			return (err);
 		}
 		dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
 		dsl_pool_rele(dmu_objset_pool(*osp), tag);
 
 		/* succeeds or dies */
 		zdb_load_key(*osp);
 
 		/* release it all */
 		dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
 		dsl_dataset_rele(dmu_objset_ds(*osp), tag);
 	}
 
 	/* Ask for decrypted access only when a key was actually loaded */
 	int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0;
 
 	err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp);
 	if (err != 0) {
 		(void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
 		    path, strerror(err));
 		return (err);
 	}
 	dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
 	dsl_pool_rele(dmu_objset_pool(*osp), tag);
 
 	if (dmu_objset_type(*osp) == DMU_OST_ZFS &&
 	    (key_loaded || !(*osp)->os_encrypted)) {
 		/*
 		 * Lookup failures are ignored here; version and sa_attrs
 		 * were initialized to 0 above and are used as they stand.
 		 */
 		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 		    8, 1, &version);
 		if (version >= ZPL_VERSION_SA) {
 			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
 			    8, 1, &sa_attrs);
 		}
 		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
 		    &sa_attr_table);
 		if (err != 0) {
 			/* Undo both holds and hand back no objset */
 			(void) fprintf(stderr, "sa_setup failed: %s\n",
 			    strerror(err));
 			dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
 			dsl_dataset_rele_flags(dmu_objset_ds(*osp),
 			    ds_hold_flags, tag);
 			*osp = NULL;
 		}
 	}
 	/* Remember the open objset (NULL if sa_setup() failed above) */
 	sa_os = *osp;
 
 	return (err);
 }
 
 /*
  * Undo open_objset(): tear down SA state if present, drop the long hold
  * and the dataset hold taken with 'tag', clear the cached SA globals and
  * unload any key that was loaded.
  */
 static void
 close_objset(objset_t *os, const void *tag)
 {
 	int rele_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0;
 
 	VERIFY3P(os, ==, sa_os);
 
 	if (os->os_sa != NULL)
 		sa_tear_down(os);
 
 	dsl_dataset_long_rele(dmu_objset_ds(os), tag);
 	dsl_dataset_rele_flags(dmu_objset_ds(os), rele_flags, tag);
 
 	sa_os = NULL;
 	sa_attr_table = NULL;
 
 	zdb_unload_key();
 }
 
 /*
  * Free the FUID index/domain tables if dump_uidgid() loaded them.
  */
 static void
 fuid_table_destroy(void)
 {
 	if (!fuid_table_loaded)
 		return;
 
 	zfs_fuid_table_destroy(&idx_tree, &domain_tree);
 	fuid_table_loaded = B_FALSE;
 }
 
 /*
  * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on
  * a live pool are normally cleaned up during ddt_sync(). We can't do that (and
  * wouldn't want to anyway), but if we don't clean up, the entries left on
  * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves.
  *
  * Note that this is not a particularly efficient way to do this, but
  * ddt_remove() is the only public method that can do the work we need, and it
  * requires the right locks, etc. to do the job. This is only ever called
  * during zdb shutdown so efficiency is not especially important.
  */
 static void
 zdb_ddt_cleanup(spa_t *spa)
 {
 	for (enum zio_checksum cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS;
 	    cksum++) {
 		ddt_t *ddt = spa->spa_ddt[cksum];
 
 		/* No dedup table exists for this checksum type */
 		if (ddt == NULL)
 			continue;
 
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		ddt_enter(ddt);
 		for (ddt_entry_t *entry = avl_first(&ddt->ddt_tree),
 		    *successor; entry != NULL; entry = successor) {
 			/* Fetch the successor before the entry goes away */
 			successor = AVL_NEXT(&ddt->ddt_tree, entry);
 			entry->dde_io = NULL;
 			ddt_remove(ddt, entry);
 		}
 		ddt_exit(ddt);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 }
 
 /*
  * Common exit path: clean up DDT state, close the open objset (or, when no
  * objset is open, the pool), free the FUID tables, shut down the userland
  * kernel environment if it was initialized, and exit with 'reason'.
  */
 static void
 zdb_exit(int reason)
 {
 	if (spa != NULL) {
 		zdb_ddt_cleanup(spa);
 
 		/* The pool is closed directly only when no objset is open */
 		if (os == NULL)
 			spa_close(spa, FTAG);
 	}
 
 	if (os != NULL)
 		close_objset(os, FTAG);
 
 	fuid_table_destroy();
 
 	if (kernel_init_done)
 		kernel_fini();
 
 	exit(reason);
 }
 
 /*
  * Print one uid or gid line, labelled with id_type.
  * An id without a FUID index is printed in decimal.  An id with a FUID
  * index is printed in hex, followed by its domain string (looked up in
  * idx_tree) and its RID.
  */
 static void
 print_idstr(uint64_t id, const char *id_type)
 {
 	if (!FUID_INDEX(id)) {
 		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
 		return;
 	}
 
 	const char *domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
 
 	(void) printf("\t%s     %llx [%s-%d]\n", id_type,
 	    (u_longlong_t)id, domain, (int)FUID_RID(id));
 }
 
 /*
  * Print the uid and gid of an object.  The FUID domain table is loaded from
  * 'os' the first time an id carrying a FUID index is seen.
  */
 static void
 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
 {
 	uint32_t uid_domain_idx = FUID_INDEX(uid);
 	uint32_t gid_domain_idx = FUID_INDEX(gid);
 
 	/* The domain table is only needed (once) for ids with an index */
 	if ((uid_domain_idx || gid_domain_idx) && !fuid_table_loaded) {
 		uint64_t fuid_obj;
 
 		/* The FUID table object is recorded in the master node */
 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
 		    8, 1, &fuid_obj) == 0);
 		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
 		(void) zfs_fuid_table_load(os, fuid_obj,
 		    &idx_tree, &domain_tree);
 		fuid_table_loaded = B_TRUE;
 	}
 
 	print_idstr(uid, "uid");
 	print_idstr(gid, "gid");
 }
 
 /*
  * Print the extended attributes stored in the ZPL_DXATTR system attribute
  * of 'hdl'.  The attribute holds a packed nvlist; each pair is printed as
  * "name = value".  Values are printed as text when every byte is printable
  * and -P was not given, otherwise as octal escapes.
  *
  * Returns silently if the attribute is absent or empty, or if it cannot be
  * read or unpacked.
  */
 static void
 dump_znode_sa_xattr(sa_handle_t *hdl)
 {
 	nvlist_t *sa_xattr;
 	nvpair_t *elem = NULL;
 	int sa_xattr_size = 0;
 	int sa_xattr_entries = 0;
 	int error;
 	char *sa_xattr_packed;
 
 	error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
 	if (error || sa_xattr_size == 0)
 		return;
 
 	sa_xattr_packed = malloc(sa_xattr_size);
 	if (sa_xattr_packed == NULL)
 		return;
 
 	error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
 	    sa_xattr_packed, sa_xattr_size);
 	if (error)
 		goto out;
 
 	error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
 	if (error)
 		goto out;
 
 	/* First pass counts the pairs; it leaves elem NULL for the second */
 	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
 		sa_xattr_entries++;
 
 	(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
 	    sa_xattr_size, sa_xattr_entries);
 	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
 		boolean_t can_print = !dump_opt['P'];
 		uchar_t *value = NULL;
 		uint_t cnt = 0, idx;
 
 		(void) printf("\t\t%s = ", nvpair_name(elem));
 
 		/*
 		 * A pair that is not a byte array leaves value/cnt unset,
 		 * so it must not be walked below.
 		 */
 		if (nvpair_value_byte_array(elem, &value, &cnt) != 0) {
 			(void) printf("<not a byte array>\n");
 			continue;
 		}
 
 		for (idx = 0; idx < cnt; ++idx) {
 			if (!isprint(value[idx])) {
 				can_print = B_FALSE;
 				break;
 			}
 		}
 
 		for (idx = 0; idx < cnt; ++idx) {
 			if (can_print)
 				(void) putchar(value[idx]);
 			else
 				(void) printf("\\%3.3o", value[idx]);
 		}
 		(void) putchar('\n');
 	}
 
 	nvlist_free(sa_xattr);
 out:
 	free(sa_xattr_packed);
 }
 
 /*
  * Print the symlink target stored in the ZPL_SYMLINK system attribute of
  * 'hdl'.  Nothing is printed if the attribute is missing or empty; a
  * target that does not fit in MAXPATHLEN is reported and skipped.
  */
 static void
 dump_znode_symlink(sa_handle_t *hdl)
 {
 	char target[MAXPATHLEN];
 	int target_len = 0;
 
 	if (sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &target_len) != 0 ||
 	    target_len == 0)
 		return;
 
 	/* Leave room for the terminating NUL added below */
 	if (target_len >= sizeof (target)) {
 		(void) printf("symlink size %d is too large\n",
 		    target_len);
 		return;
 	}
 
 	target[target_len] = '\0';
 	if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
 	    target, target_len) != 0)
 		return;
 
 	(void) printf("\ttarget	%s\n", target);
 }
 
 /*
  * Object viewer for znodes: obtain a private SA handle for 'object' in 'os',
  * bulk-read its core attributes and print them.  The 'data' and 'size'
  * arguments are unused.
  *
  * When dump_opt['d'] > 4 the object's path is also resolved and printed;
  * an object whose path cannot be found is counted in leaked_objects.
  * Optional attributes (projid, xattr, rdev) are printed only if their
  * lookup succeeds, and SA-based xattrs are dumped last.
  */
 static void
 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) data, (void) size;
 	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
 	sa_handle_t *hdl;
 	uint64_t xattr, rdev, gen;
 	uint64_t uid, gid, mode, fsize, parent, links;
 	uint64_t pflags;
 	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
 	time_t z_crtime, z_atime, z_mtime, z_ctime;
 	sa_bulk_attr_t bulk[12];
 	int idx = 0;
 	int error;
 
 	VERIFY3P(os, ==, sa_os);
 	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
 		(void) printf("Failed to get handle for SA znode\n");
 		return;
 	}
 
 	/* Twelve attributes are queued below, matching the size of bulk[] */
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
 	    &links, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
 	    &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
 	    NULL, &parent, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
 	    &fsize, 8);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
 	    acctm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
 	    modtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
 	    crtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
 	    chgtm, 16);
 	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
 	    &pflags, 8);
 
 	/* If the bulk lookup fails nothing is printed for this znode */
 	if (sa_bulk_lookup(hdl, bulk, idx)) {
 		(void) sa_handle_destroy(hdl);
 		return;
 	}
 
 	/* Only the first word of each two-word timestamp is printed */
 	z_crtime = (time_t)crtm[0];
 	z_atime = (time_t)acctm[0];
 	z_mtime = (time_t)modtm[0];
 	z_ctime = (time_t)chgtm[0];
 
 	if (dump_opt['d'] > 4) {
 		error = zfs_obj_to_path(os, object, path, sizeof (path));
 		if (error == ESTALE) {
 			(void) snprintf(path, sizeof (path), "on delete queue");
 		} else if (error != 0) {
 			leaked_objects++;
 			(void) snprintf(path, sizeof (path),
 			    "path not found, possibly leaked");
 		}
 		(void) printf("\tpath	%s\n", path);
 	}
 
 	if (S_ISLNK(mode))
 		dump_znode_symlink(hdl);
 	dump_uidgid(os, uid, gid);
 	/* ctime() output supplies the trailing newline for these lines */
 	(void) printf("\tatime	%s", ctime(&z_atime));
 	(void) printf("\tmtime	%s", ctime(&z_mtime));
 	(void) printf("\tctime	%s", ctime(&z_ctime));
 	(void) printf("\tcrtime	%s", ctime(&z_crtime));
 	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
 	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
 	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
 	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
 	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
 	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
 	if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
 		uint64_t projid;
 
 		if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
 		    sizeof (uint64_t)) == 0)
 			(void) printf("\tprojid	%llu\n", (u_longlong_t)projid);
 	}
 	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
 	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
 	    sizeof (uint64_t)) == 0)
 		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
 	dump_znode_sa_xattr(hdl);
 	sa_handle_destroy(hdl);
 }
 
 /*
  * Object viewer for ACL objects.  Intentionally prints nothing; all
  * arguments are unused.
  */
 static void
 dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object, (void) data, (void) size;
 }
 
 /*
  * Object viewer for DMU objset objects.  Intentionally prints nothing; all
  * arguments are unused.
  */
 static void
 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	(void) os, (void) object, (void) data, (void) size;
 }
 
 /*
  * Per-object-type viewer callbacks.  dump_object() indexes this table with
  * ZDB_OT_TYPE() of an object's type and of its bonus type.  The table has
  * DMU_OT_NUMTYPES + 1 slots; the extra, final slot (dump_unknown) handles
  * unknown types and must stay last.
  */
 static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
 	dump_none,		/* unallocated			*/
 	dump_zap,		/* object directory		*/
 	dump_uint64,		/* object array			*/
 	dump_none,		/* packed nvlist		*/
 	dump_packed_nvlist,	/* packed nvlist size		*/
 	dump_none,		/* bpobj			*/
 	dump_bpobj,		/* bpobj header			*/
 	dump_none,		/* SPA space map header		*/
 	dump_none,		/* SPA space map		*/
 	dump_none,		/* ZIL intent log		*/
 	dump_dnode,		/* DMU dnode			*/
 	dump_dmu_objset,	/* DMU objset			*/
 	dump_dsl_dir,		/* DSL directory		*/
 	dump_zap,		/* DSL directory child map	*/
 	dump_zap,		/* DSL dataset snap map		*/
 	dump_zap,		/* DSL props			*/
 	dump_dsl_dataset,	/* DSL dataset			*/
 	dump_znode,		/* ZFS znode			*/
 	dump_acl,		/* ZFS V0 ACL			*/
 	dump_uint8,		/* ZFS plain file		*/
 	dump_zpldir,		/* ZFS directory		*/
 	dump_zap,		/* ZFS master node		*/
 	dump_zap,		/* ZFS delete queue		*/
 	dump_uint8,		/* zvol object			*/
 	dump_zap,		/* zvol prop			*/
 	dump_uint8,		/* other uint8[]		*/
 	dump_uint64,		/* other uint64[]		*/
 	dump_zap,		/* other ZAP			*/
 	dump_zap,		/* persistent error log		*/
 	dump_uint8,		/* SPA history			*/
 	dump_history_offsets,	/* SPA history offsets		*/
 	dump_zap,		/* Pool properties		*/
 	dump_zap,		/* DSL permissions		*/
 	dump_acl,		/* ZFS ACL			*/
 	dump_uint8,		/* ZFS SYSACL			*/
 	dump_none,		/* FUID nvlist			*/
 	dump_packed_nvlist,	/* FUID nvlist size		*/
 	dump_zap,		/* DSL dataset next clones	*/
 	dump_zap,		/* DSL scrub queue		*/
 	dump_zap,		/* ZFS user/group/project used	*/
 	dump_zap,		/* ZFS user/group/project quota	*/
 	dump_zap,		/* snapshot refcount tags	*/
 	dump_ddt_zap,		/* DDT ZAP object		*/
 	dump_zap,		/* DDT statistics		*/
 	dump_znode,		/* SA object			*/
 	dump_zap,		/* SA Master Node		*/
 	dump_sa_attrs,		/* SA attribute registration	*/
 	dump_sa_layouts,	/* SA attribute layouts		*/
 	dump_zap,		/* DSL scrub translations	*/
 	dump_none,		/* fake dedup BP		*/
 	dump_zap,		/* deadlist			*/
 	dump_none,		/* deadlist hdr			*/
 	dump_zap,		/* dsl clones			*/
 	dump_bpobj_subobjs,	/* bpobj subobjs		*/
 	dump_unknown,		/* Unknown type, must be last	*/
 };
 
 /*
  * Decide whether an object of type obj_type is selected by the ZOR_FLAG_*
  * bits in 'flags'.  Directories, plain files and space maps each have a
  * dedicated flag, as do all types whose zdb_ot_name() is "zap".  Any other
  * type matches only when the user selected all types, possibly with some
  * supported flags negated.
  */
 static boolean_t
 match_object_type(dmu_object_type_t obj_type, uint64_t flags)
 {
 	switch (obj_type) {
 	case DMU_OT_DIRECTORY_CONTENTS:
 		return ((flags & ZOR_FLAG_DIRECTORY) ? B_TRUE : B_FALSE);
 	case DMU_OT_PLAIN_FILE_CONTENTS:
 		return ((flags & ZOR_FLAG_PLAIN_FILE) ? B_TRUE : B_FALSE);
 	case DMU_OT_SPACE_MAP:
 		return ((flags & ZOR_FLAG_SPACE_MAP) ? B_TRUE : B_FALSE);
 	default:
 		break;
 	}
 
 	if (strcmp(zdb_ot_name(obj_type), "zap") == 0)
 		return ((flags & ZOR_FLAG_ZAP) ? B_TRUE : B_FALSE);
 
 	/*
 	 * If all bits except some of the supported flags are set, the user
 	 * combined the all-types flag (A) with a negated flag to exclude
 	 * some types (e.g. A-f to show all object types except plain
 	 * files); types without a flag of their own then still match.
 	 */
 	if ((flags | ZOR_SUPPORTED_FLAGS) == ZOR_FLAG_ALL_TYPES)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Print a one-line summary of 'object' in 'os' and, depending on
  * 'verbosity', additional detail:
  *   > 3   a second line describing the bonus buffer (if it has a type)
  *   >= 4  dnode flags, maxblkid, and the type-specific object_viewer[]
  *         output for the bonus buffer and the object itself
  *   >= 5  the spill block pointer (if present), the indirect block tree
  *         and the list of allocated segments
  *
  * *print_header says whether the column header must be printed first; it
  * is cleared after printing and set again whenever verbose output was
  * emitted.  If dnode_slots_used is non-NULL it receives the number of
  * DNODE_MIN_SIZE slots the dnode occupies.  'flags' filters by object type
  * via match_object_type() unless it is 0 or ZOR_FLAG_ALL_TYPES.
  *
  * Failures to obtain the object's info or hold are fatal.
  */
 static void
 dump_object(objset_t *os, uint64_t object, int verbosity,
     boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
 {
 	dmu_buf_t *db = NULL;
 	dmu_object_info_t doi;
 	dnode_t *dn;
 	boolean_t dnode_held = B_FALSE;
 	void *bonus = NULL;
 	size_t bsize = 0;
 	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
 	char bonus_size[32];
 	char aux[50];
 	int error;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated");
 	_Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated");
 	_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated");
 	_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated");
 	_Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ,
 	    "bonus_size truncated");
 
 	if (*print_header) {
 		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %6s  %5s  %6s  %s\n",
 		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
 		    "lsize", "%full", "type");
 		*print_header = 0;
 	}
 
 	if (object == 0) {
 		/* Object 0 is the meta dnode; no hold is taken for it */
 		dn = DMU_META_DNODE(os);
 		dmu_object_info_from_dnode(dn, &doi);
 	} else {
 		/*
 		 * Encrypted datasets will have sensitive bonus buffers
 		 * encrypted. Therefore we cannot hold the bonus buffer and
 		 * must hold the dnode itself instead.
 		 */
 		error = dmu_object_info(os, object, &doi);
 		if (error)
 			fatal("dmu_object_info() failed, errno %u", error);
 
 		if (!key_loaded && os->os_encrypted &&
 		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
 			error = dnode_hold(os, object, FTAG, &dn);
 			if (error)
 				fatal("dnode_hold() failed, errno %u", error);
 			dnode_held = B_TRUE;
 		} else {
 			error = dmu_bonus_hold(os, object, FTAG, &db);
 			if (error)
 				fatal("dmu_bonus_hold(%llu) failed, errno %u",
 				    object, error);
 			bonus = db->db_data;
 			bsize = db->db_size;
 			dn = DB_DNODE((dmu_buf_impl_t *)db);
 		}
 	}
 
 	/*
 	 * Default to showing all object types if no flags were specified.
 	 */
 	if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
 	    !match_object_type(doi.doi_type, flags))
 		goto out;
 
 	if (dnode_slots_used)
 		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
 
 	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
 	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
 	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
 	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
 	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
 	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
 	/*
 	 * Percentage of the object that is filled.  For object 0 the fill
 	 * count is divided by DNODES_PER_BLOCK before being compared with
 	 * the maximum offset.
 	 */
 	(void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 *
 	    doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ?
 	    DNODES_PER_BLOCK : 1) / doi.doi_max_offset);
 
 	/* aux accumulates optional " (K=...)" / " (Z=...)" annotations */
 	aux[0] = '\0';
 
 	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
 	}
 
 	if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&
 	    ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {
 		const char *compname = NULL;
 		if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
 		    ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel),
 		    &compname) == 0) {
 			(void) snprintf(aux + strlen(aux),
 			    sizeof (aux) - strlen(aux), " (Z=inherit=%s)",
 			    compname);
 		} else {
 			(void) snprintf(aux + strlen(aux),
 			    sizeof (aux) - strlen(aux),
 			    " (Z=inherit=%s-unknown)",
 			    ZDB_COMPRESS_NAME(os->os_compress));
 		}
 	} else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));
 	} else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
 		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
 		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
 	}
 
 	(void) printf("%10lld  %3u  %5s  %5s  %5s  %6s  %5s  %6s  %s%s\n",
 	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
 	    asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);
 
 	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
 		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %5s  %6s  %s\n",
 		    "", "", "", "", "", "", bonus_size, "bonus",
 		    zdb_ot_name(doi.doi_bonus_type));
 	}
 
 	if (verbosity >= 4) {
 		(void) printf("\tdnode flags: %s%s%s%s\n",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
 		    "USED_BYTES " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
 		    "USERUSED_ACCOUNTED " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
 		    "USEROBJUSED_ACCOUNTED " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
 		    "SPILL_BLKPTR" : "");
 		(void) printf("\tdnode maxblkid: %llu\n",
 		    (longlong_t)dn->dn_phys->dn_maxblkid);
 
 		/*
 		 * dnode_held means the bonus buffer was not held because it
 		 * is encrypted, so there is nothing to hand to the viewer.
 		 */
 		if (!dnode_held) {
 			object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
 			    object, bonus, bsize);
 		} else {
 			(void) printf("\t\t(bonus encrypted)\n");
 		}
 
 		if (key_loaded ||
 		    (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) {
 			object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
 			    NULL, 0);
 		} else {
 			(void) printf("\t\t(object encrypted)\n");
 		}
 
 		/* Verbose output was printed; repeat the header next time */
 		*print_header = B_TRUE;
 	}
 
 	if (verbosity >= 5) {
 		if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			char blkbuf[BP_SPRINTF_LEN];
 			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
 			    DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE);
 			(void) printf("\nSpill block: %s\n", blkbuf);
 		}
 		dump_indirect(dn);
 	}
 
 	if (verbosity >= 5) {
 		/*
 		 * Report the list of segments that comprise the object.
 		 */
 		uint64_t start = 0;
 		uint64_t end;
 		uint64_t blkfill = 1;
 		int minlvl = 1;
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			minlvl = 0;
 			blkfill = DNODES_PER_BLOCK;
 		}
 
 		/*
 		 * Alternate between searching for the next data offset and
 		 * the next hole; each [start, end) pair is one segment.  The
 		 * segment is still printed when the hole search fails, and
 		 * the loop then ends.
 		 */
 		for (;;) {
 			char segsize[32];
 			/* make sure nicenum has enough space */
 			_Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ,
 			    "segsize truncated");
 			error = dnode_next_offset(dn,
 			    0, &start, minlvl, blkfill, 0);
 			if (error)
 				break;
 			end = start;
 			error = dnode_next_offset(dn,
 			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
 			zdb_nicenum(end - start, segsize, sizeof (segsize));
 			(void) printf("\t\tsegment [%016llx, %016llx)"
 			    " size %5s\n", (u_longlong_t)start,
 			    (u_longlong_t)end, segsize);
 			if (error)
 				break;
 			start = end;
 		}
 	}
 
 out:
 	/* Release whichever hold was taken above (none for object 0) */
 	if (db != NULL)
 		dmu_buf_rele(db, FTAG);
 	if (dnode_held)
 		dnode_rele(dn, FTAG);
 }
 
 /*
  * Mark the MOS objects referenced by a DSL directory as referenced.
  */
 static void
 count_dir_mos_objects(dsl_dir_t *dd)
 {
 	const uint64_t dir_objs[] = {
 		dd->dd_object,
 		dsl_dir_phys(dd)->dd_child_dir_zapobj,
 		dsl_dir_phys(dd)->dd_deleg_zapobj,
 		dsl_dir_phys(dd)->dd_props_zapobj,
 		dsl_dir_phys(dd)->dd_clones,
 	};
 
 	for (size_t n = 0; n < sizeof (dir_objs) / sizeof (dir_objs[0]); n++)
 		mos_obj_refd(dir_objs[n]);
 
 	/*
 	 * Several dsl_dir's may share one dd_crypto_obj, so references
 	 * beyond the first are ignored.
 	 */
 	mos_obj_refd_multiple(dd->dd_crypto_obj);
 }
 
 /*
  * Mark the MOS objects referenced by a dataset as referenced.  For a
  * dataset that is not a snapshot, its DSL directory's objects are marked
  * as well.
  */
 static void
 count_ds_mos_objects(dsl_dataset_t *ds)
 {
 	const uint64_t ds_objs[] = {
 		ds->ds_object,
 		dsl_dataset_phys(ds)->ds_next_clones_obj,
 		dsl_dataset_phys(ds)->ds_props_obj,
 		dsl_dataset_phys(ds)->ds_userrefs_obj,
 		dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 		ds->ds_bookmarks_obj,
 	};
 
 	for (size_t n = 0; n < sizeof (ds_objs) / sizeof (ds_objs[0]); n++)
 		mos_obj_refd(ds_objs[n]);
 
 	if (dsl_dataset_is_snapshot(ds))
 		return;
 
 	count_dir_mos_objects(ds->ds_dir);
 }
 
 /*
  * Printable names for objset types; dump_objset() indexes this by
  * dds_type after checking that it is below DMU_OST_NUMTYPES.
  */
 static const char *const objset_types[DMU_OST_NUMTYPES] = {
 	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
 
 /*
  * Parse a string denoting a range of object IDs of the form
  * <start>[:<end>[:flags]], and store the results in zor.
  *
  * A string without a colon names a single object: start and end are both
  * set to that ID and zor_flags is left untouched.  With a colon, start and
  * end are parsed separately; if the flags field is absent, zor_flags is set
  * to ZOR_FLAG_ALL_TYPES, otherwise each flag character sets its bit and a
  * character preceded by '-' clears it.
  *
  * In every successful case the start and end IDs are passed through
  * ZDB_MAP_OBJECT_ID() before being stored.
  *
  * Return 0 on success. On error, return 1 and update the msg
  * pointer to point to a descriptive error message.
  */
 static int
 parse_object_range(char *range, zopt_object_range_t *zor, const char **msg)
 {
 	uint64_t flags = 0;
 	char *p, *s, *dup, *flagstr, *tmp = NULL;
 	size_t len;
 	int i;
 	int rc = 0;
 
 	if (strchr(range, ':') == NULL) {
 		zor->zor_obj_start = strtoull(range, &p, 0);
 		if (*p != '\0') {
 			*msg = "Invalid characters in object ID";
 			rc = 1;
 		}
 		zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
 		zor->zor_obj_end = zor->zor_obj_start;
 		return (rc);
 	}
 
 	if (strchr(range, ':') == range) {
 		*msg = "Invalid leading colon";
 		rc = 1;
 		return (rc);
 	}
 
 	len = strlen(range);
 	if (range[len - 1] == ':') {
 		*msg = "Invalid trailing colon";
 		rc = 1;
 		return (rc);
 	}
 
 	/* strtok_r() modifies its input, so tokenize a private copy */
 	dup = strdup(range);
 	if (dup == NULL) {
 		*msg = "Out of memory";
 		rc = 1;
 		return (rc);
 	}
 
 	s = strtok_r(dup, ":", &tmp);
 	zor->zor_obj_start = strtoull(s, &p, 0);
 
 	if (*p != '\0') {
 		*msg = "Invalid characters in start object ID";
 		rc = 1;
 		goto out;
 	}
 
 	/*
 	 * The checks above guarantee a non-colon character after the first
 	 * colon, so a second token always exists here.
 	 */
 	s = strtok_r(NULL, ":", &tmp);
 	zor->zor_obj_end = strtoull(s, &p, 0);
 
 	if (*p != '\0') {
 		*msg = "Invalid characters in end object ID";
 		rc = 1;
 		goto out;
 	}
 
 	if (zor->zor_obj_start > zor->zor_obj_end) {
 		*msg = "Start object ID may not exceed end object ID";
 		rc = 1;
 		goto out;
 	}
 
 	s = strtok_r(NULL, ":", &tmp);
 	if (s == NULL) {
 		/* No flags field: select every type, but still map the IDs */
 		zor->zor_flags = ZOR_FLAG_ALL_TYPES;
 		goto map_ids;
 	} else if (strtok_r(NULL, ":", &tmp) != NULL) {
 		*msg = "Invalid colon-delimited field after flags";
 		rc = 1;
 		goto out;
 	}
 
 	flagstr = s;
 	for (i = 0; flagstr[i]; i++) {
 		int bit;
 		boolean_t negation = (flagstr[i] == '-');
 
 		if (negation) {
 			/* The operator applies to the character after it */
 			i++;
 			if (flagstr[i] == '\0') {
 				*msg = "Invalid trailing negation operator";
 				rc = 1;
 				goto out;
 			}
 		}
 		bit = flagbits[(uchar_t)flagstr[i]];
 		if (bit == 0) {
 			*msg = "Invalid flag";
 			rc = 1;
 			goto out;
 		}
 		if (negation)
 			flags &= ~bit;
 		else
 			flags |= bit;
 	}
 	zor->zor_flags = flags;
 
 map_ids:
 	/*
 	 * Apply the same object ID mapping as the single-object case,
 	 * whether or not a flags field was supplied.
 	 */
 	zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
 	zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end);
 
 out:
 	free(dup);
 	return (rc);
 }
 
 /*
  * Print a summary line for the objset 'os' and then its contents.
  *
  * If object ranges were given on the command line (zopt_object_args > 0),
  * only the objects in those ranges are dumped and the function returns.
  * Otherwise the intent log (with -i or verbosity >= 2), the deadlist,
  * livelist and remap deadlist, and the bookmarks are dumped; at verbosity
  * >= 2 and with a non-hole root block pointer, every object is dumped as
  * well, followed by dnode slot statistics and a leaked-object count.
  *
  * A livelist that fails verification is fatal, and an unexpected error from
  * dmu_object_next() aborts the program.
  */
 static void
 dump_objset(objset_t *os)
 {
 	dmu_objset_stats_t dds = { 0 };
 	uint64_t object, object_count;
 	uint64_t refdbytes, usedobjs, scratch;
 	char numbuf[32];
 	char blkbuf[BP_SPRINTF_LEN + 20];
 	char osname[ZFS_MAX_DATASET_NAME_LEN];
 	const char *type = "UNKNOWN";
 	int verbosity = dump_opt['d'];
 	boolean_t print_header;
 	unsigned i;
 	int error;
 	uint64_t total_slots_used = 0;
 	uint64_t max_slot_used = 0;
 	uint64_t dnode_slots;
 	uint64_t obj_start;
 	uint64_t obj_end;
 	uint64_t flags;
 
 	/* make sure nicenum has enough space */
 	_Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated");
 
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &dds);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
 	print_header = B_TRUE;
 
 	/* Types outside the table keep the "UNKNOWN" label */
 	if (dds.dds_type < DMU_OST_NUMTYPES)
 		type = objset_types[dds.dds_type];
 
 	if (dds.dds_type == DMU_OST_META) {
 		/* The MOS takes its numbers from the root bp and MOS dir */
 		dds.dds_creation_txg = TXG_INITIAL;
 		usedobjs = BP_GET_FILL(os->os_rootbp);
 		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
 		    dd_used_bytes;
 	} else {
 		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
 	}
 
 	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
 
 	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
 
 	if (verbosity >= 4) {
 		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
 		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
 		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
 	} else {
 		blkbuf[0] = '\0';
 	}
 
 	dmu_objset_name(os, osname);
 
 	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
 	    "%s, %llu objects%s%s\n",
 	    osname, type, (u_longlong_t)dmu_objset_id(os),
 	    (u_longlong_t)dds.dds_creation_txg,
 	    numbuf, (u_longlong_t)usedobjs, blkbuf,
 	    (dds.dds_inconsistent) ? " (inconsistent)" : "");
 
 	for (i = 0; i < zopt_object_args; i++) {
 		obj_start = zopt_object_ranges[i].zor_obj_start;
 		obj_end = zopt_object_ranges[i].zor_obj_end;
 		flags = zopt_object_ranges[i].zor_flags;
 
 		/*
 		 * Object 0 and single-object requests are dumped directly.
 		 * For other ranges, step back by one so that the
 		 * dmu_object_next() walk below begins its search just
 		 * before obj_start.
 		 */
 		object = obj_start;
 		if (object == 0 || obj_start == obj_end)
 			dump_object(os, object, verbosity, &print_header, NULL,
 			    flags);
 		else
 			object--;
 
 		while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&
 		    object <= obj_end) {
 			dump_object(os, object, verbosity, &print_header, NULL,
 			    flags);
 		}
 	}
 
 	/* Explicit object requests replace the full dump below */
 	if (zopt_object_args > 0) {
 		(void) printf("\n");
 		return;
 	}
 
 	if (dump_opt['i'] != 0 || verbosity >= 2)
 		dump_intent_log(dmu_objset_zil(os));
 
 	if (dmu_objset_ds(os) != NULL) {
 		dsl_dataset_t *ds = dmu_objset_ds(os);
 		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
 		if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
 		    !dmu_objset_is_snapshot(os)) {
 			dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
 			if (verify_dd_livelist(os) != 0)
 				fatal("livelist is incorrect");
 		}
 
 		if (dsl_dataset_remap_deadlist_exists(ds)) {
 			(void) printf("ds_remap_deadlist:\n");
 			dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
 		}
 		count_ds_mos_objects(ds);
 	}
 
 	if (dmu_objset_ds(os) != NULL)
 		dump_bookmarks(os, verbosity);
 
 	if (verbosity < 2)
 		return;
 
 	if (BP_IS_HOLE(os->os_rootbp))
 		return;
 
 	/* Dump the meta dnode and the accounting objects that are in use */
 	dump_object(os, 0, verbosity, &print_header, NULL, 0);
 	object_count = 0;
 	if (DMU_USERUSED_DNODE(os) != NULL &&
 	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
 		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
 		    NULL, 0);
 		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
 		    NULL, 0);
 	}
 
 	if (DMU_PROJECTUSED_DNODE(os) != NULL &&
 	    DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
 		dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
 		    &print_header, NULL, 0);
 
 	/*
 	 * Walk every object, accumulating the number of dnode slots in use
 	 * and the highest slot occupied by the most recent object.
 	 */
 	object = 0;
 	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
 		dump_object(os, object, verbosity, &print_header, &dnode_slots,
 		    0);
 		object_count++;
 		total_slots_used += dnode_slots;
 		max_slot_used = object + dnode_slots - 1;
 	}
 
 	(void) printf("\n");
 
 	(void) printf("    Dnode slots:\n");
 	(void) printf("\tTotal used:    %10llu\n",
 	    (u_longlong_t)total_slots_used);
 	(void) printf("\tMax used:      %10llu\n",
 	    (u_longlong_t)max_slot_used);
 	(void) printf("\tPercent empty: %10lf\n",
 	    (double)(max_slot_used - total_slots_used)*100 /
 	    (double)max_slot_used);
 	(void) printf("\n");
 
 	/* ESRCH is the only error the walk above is expected to end with */
 	if (error != ESRCH) {
 		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
 		abort();
 	}
 
 	ASSERT3U(object_count, ==, usedobjs);
 
 	if (leaked_objects != 0) {
 		(void) printf("%d potentially leaked objects detected\n",
 		    leaked_objects);
 		leaked_objects = 0;
 	}
 }
 
 /*
  * Print the contents of an uberblock to stdout.
  *
  * ub:     uberblock to print
  * header: optional text printed before the fields (may be NULL)
  * footer: optional text printed after the fields (may be NULL)
  *
  * The MMP fields are only printed when MMP_VALID(ub) holds, and the
  * "rootbp" line only when -u was given at least four times.
  */
 static void
 dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 {
 	time_t timestamp = ub->ub_timestamp;
 	/*
 	 * ctime() may return NULL when the timestamp cannot be represented
 	 * (e.g. garbage in a damaged uberblock).  Passing NULL to "%s" is
 	 * undefined, so substitute a placeholder that keeps the trailing
 	 * newline ctime() would normally have supplied.
 	 */
 	const char *timestr = ctime(&timestamp);
 	char blkbuf[BP_SPRINTF_LEN];
 
 	(void) printf("%s", header ? header : "");
 	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
 	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
 	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
 	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
 	(void) printf("\ttimestamp = %llu UTC = %s",
 	    (u_longlong_t)ub->ub_timestamp,
 	    timestr != NULL ? timestr : "(out of range)\n");
 
 	/* Format the root block pointer once; it is reused below. */
 	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
 	(void) printf("\tbp = %s\n", blkbuf);
 
 	(void) printf("\tmmp_magic = %016llx\n",
 	    (u_longlong_t)ub->ub_mmp_magic);
 	if (MMP_VALID(ub)) {
 		(void) printf("\tmmp_delay = %0llu\n",
 		    (u_longlong_t)ub->ub_mmp_delay);
 		if (MMP_SEQ_VALID(ub))
 			(void) printf("\tmmp_seq = %u\n",
 			    (unsigned int) MMP_SEQ(ub));
 		if (MMP_FAIL_INT_VALID(ub))
 			(void) printf("\tmmp_fail = %u\n",
 			    (unsigned int) MMP_FAIL_INT(ub));
 		if (MMP_INTERVAL_VALID(ub))
 			(void) printf("\tmmp_write = %u\n",
 			    (unsigned int) MMP_INTERVAL(ub));
 		/* After MMP_* to make summarize_uberblock_mmp cleaner */
 		(void) printf("\tmmp_valid = %x\n",
 		    (unsigned int) ub->ub_mmp_config & 0xFF);
 	}
 
 	/* blkbuf still holds the formatted ub_rootbp from above. */
 	if (dump_opt['u'] >= 4)
 		(void) printf("\trootbp = %s\n", blkbuf);
 
 	(void) printf("\tcheckpoint_txg = %llu\n",
 	    (u_longlong_t)ub->ub_checkpoint_txg);
 
 	/* The cast matches the %u conversion. */
 	(void) printf("\traidz_reflow state=%u off=%llu\n",
 	    (unsigned int)RRSS_GET_STATE(ub),
 	    (u_longlong_t)RRSS_GET_OFFSET(ub));
 
 	(void) printf("%s", footer ? footer : "");
 }
 
 /*
  * Print the pool configuration stored in the MOS config object.
  *
  * The packed size of the configuration is read from the object's bonus
  * buffer and handed, by address, to dump_packed_nvlist().  If the bonus
  * buffer cannot be held, a diagnostic is written to stderr instead.
  */
 static void
 dump_config(spa_t *spa)
 {
 	dmu_buf_t *db;
 	/*
 	 * The bonus buffer is read as a uint64_t, so keep it in a uint64_t:
 	 * a size_t would truncate the value where size_t is 32 bits wide.
 	 * NOTE(review): dump_packed_nvlist() is assumed to read the pointed-to
 	 * value as a 64-bit quantity -- confirm against its definition.
 	 */
 	uint64_t nvsize = 0;
 	int error = 0;
 
 	error = dmu_bonus_hold(spa->spa_meta_objset,
 	    spa->spa_config_object, FTAG, &db);
 
 	if (error == 0) {
 		nvsize = *(uint64_t *)db->db_data;
 		dmu_buf_rele(db, FTAG);
 
 		(void) printf("\nMOS Configuration:\n");
 		dump_packed_nvlist(spa->spa_meta_objset,
 		    spa->spa_config_object, (void *)&nvsize, 1);
 	} else {
 		/* Terminate the diagnostic with a newline. */
 		(void) fprintf(stderr,
 		    "dmu_bonus_hold(%llu) failed, errno %d\n",
 		    (u_longlong_t)spa->spa_config_object, error);
 	}
 }
 
 static void
 dump_cachefile(const char *cachefile)
 {
 	int fd;
 	struct stat64 statbuf;
 	char *buf;
 	nvlist_t *config;
 
 	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", cachefile,
 		    strerror(errno));
 		zdb_exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", cachefile,
 		    strerror(errno));
 		zdb_exit(1);
 	}
 
 	if ((buf = malloc(statbuf.st_size)) == NULL) {
 		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		zdb_exit(1);
 	}
 
 	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
 		(void) fprintf(stderr, "failed to read %llu bytes\n",
 		    (u_longlong_t)statbuf.st_size);
 		zdb_exit(1);
 	}
 
 	(void) close(fd);
 
 	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
 		(void) fprintf(stderr, "failed to unpack nvlist\n");
 		zdb_exit(1);
 	}
 
 	free(buf);
 
 	dump_nvlist(config, 0);
 
 	nvlist_free(config);
 }
 
 /*
  * ZFS label nvlist stats
  *
  * Accumulator filled in by collect_nvlist_stats() and reported by
  * dump_nvlist_stats().
  */
 typedef struct zdb_nvl_stats {
 	/* number of nvlists visited (one per collect_nvlist_stats() call) */
 	int		zns_list_count;
 	/* number of elements seen in nvlist arrays named "children" */
 	int		zns_leaf_count;
 	/* largest XDR-encoded size among those "children" elements */
 	size_t		zns_leaf_largest;
 	/* sum of the XDR-encoded sizes of those "children" elements */
 	size_t		zns_leaf_total;
 	/* every string pair encountered, gathered into one nvlist */
 	nvlist_t	*zns_string;
 	/* every uint64 pair encountered, gathered into one nvlist */
 	nvlist_t	*zns_uint64;
 	/* every boolean pair encountered, gathered into one nvlist */
 	nvlist_t	*zns_boolean;
 } zdb_nvl_stats_t;
 
 /*
  * Recursively walk an nvlist, sorting its string, uint64 and boolean
  * pairs into the per-type nvlists of 'stats' and descending into nested
  * nvlists and nvlist arrays.  Elements of arrays named "children" are
  * additionally sized and counted.  Pairs of any other type are reported
  * and skipped.
  */
 static void
 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
 {
 	nvpair_t *pair;
 
 	stats->zns_list_count++;
 
 	for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(nvl, pair)) {
 		const char *key = nvpair_name(pair);
 
 		switch (nvpair_type(pair)) {
 		case DATA_TYPE_STRING:
 			fnvlist_add_string(stats->zns_string, key,
 			    fnvpair_value_string(pair));
 			break;
 		case DATA_TYPE_UINT64:
 			fnvlist_add_uint64(stats->zns_uint64, key,
 			    fnvpair_value_uint64(pair));
 			break;
 		case DATA_TYPE_BOOLEAN:
 			fnvlist_add_boolean(stats->zns_boolean, key);
 			break;
 		case DATA_TYPE_NVLIST: {
 			nvlist_t *nested;
 
 			if (nvpair_value_nvlist(pair, &nested) == 0)
 				collect_nvlist_stats(nested, stats);
 			break;
 		}
 		case DATA_TYPE_NVLIST_ARRAY: {
 			nvlist_t **elems;
 			uint_t nelems;
 			int is_children;
 
 			if (nvpair_value_nvlist_array(pair, &elems,
 			    &nelems) != 0)
 				break;
 
 			/* The name is the same for every element. */
 			is_children = (strcmp(key, "children") == 0);
 
 			for (uint_t e = 0; e < nelems; e++) {
 				size_t encoded;
 
 				collect_nvlist_stats(elems[e], stats);
 
 				/* collect stats on leaf vdev */
 				if (!is_children)
 					continue;
 
 				(void) nvlist_size(elems[e], &encoded,
 				    NV_ENCODE_XDR);
 				stats->zns_leaf_total += encoded;
 				if (encoded > stats->zns_leaf_largest)
 					stats->zns_leaf_largest = encoded;
 				stats->zns_leaf_count++;
 			}
 			break;
 		}
 		default:
 			(void) printf("skip type %d!\n",
 			    (int)nvpair_type(pair));
 		}
 	}
 }
 
 /*
  * Print one per-type row of the label statistics: the number of pairs
  * in 'typed', their XDR-encoded size less the empty-list overhead
  * 'noise', and that size as a percentage of 'total'.  Returns the
  * adjusted size.
  */
 static size_t
 dump_nvlist_stats_row(const char *title, nvlist_t *typed, size_t noise,
     size_t total)
 {
 	size_t bytes;
 
 	VERIFY0(nvlist_size(typed, &bytes, NV_ENCODE_XDR));
 	bytes -= noise;
 	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", title,
 	    (int)fnvlist_num_pairs(typed),
 	    (int)bytes, 100.0 * bytes / total);
 
 	return (bytes);
 }
 
 /*
  * Print a breakdown of how the space of a label config nvlist is used.
  *
  * nvl: unpacked label configuration
  * cap: capacity of the on-disk buffer holding the packed nvlist
  *
  * Reports bytes used versus free, the share taken by integers, strings
  * and booleans, the remainder (attributed to nvlist overhead) and, when
  * any were counted, the average and largest size of the "children"
  * entries.
  */
 static void
 dump_nvlist_stats(nvlist_t *nvl, size_t cap)
 {
 	zdb_nvl_stats_t stats = { 0 };
 	size_t total, noise, overhead;
 	size_t accounted = 0;
 
 	/* requires nvlist with non-unique names for stat collection */
 	VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
 	VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
 	VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
 	/* Encoded size of a still-empty nvlist, subtracted from each row. */
 	VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));
 
 	(void) printf("\n\nZFS Label NVList Config Stats:\n");
 
 	VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
 	(void) printf("  %d bytes used, %d bytes free (using %4.1f%%)\n\n",
 	    (int)total, (int)(cap - total), 100.0 * total / cap);
 
 	collect_nvlist_stats(nvl, &stats);
 
 	accounted += dump_nvlist_stats_row("integers:", stats.zns_uint64,
 	    noise, total);
 	accounted += dump_nvlist_stats_row("strings:", stats.zns_string,
 	    noise, total);
 	accounted += dump_nvlist_stats_row("booleans:", stats.zns_boolean,
 	    noise, total);
 
 	/* treat remainder as nvlist overhead */
 	overhead = total - accounted;
 	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
 	    stats.zns_list_count, (int)overhead, 100.0 * overhead / total);
 
 	if (stats.zns_leaf_count > 0) {
 		size_t average = stats.zns_leaf_total / stats.zns_leaf_count;
 
 		(void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
 		    stats.zns_leaf_count, (int)average);
 		(void) printf("%24d bytes largest\n",
 		    (int)stats.zns_leaf_largest);
 
 		if (dump_opt['l'] >= 3 && average > 0)
 			(void) printf("  space for %d additional leaf vdevs\n",
 			    (int)((cap - total) / average));
 	}
 	(void) printf("\n");
 
 	nvlist_free(stats.zns_string);
 	nvlist_free(stats.zns_uint64);
 	nvlist_free(stats.zns_boolean);
 }
 
 /*
  * One distinct checksum value together with the set of labels in which
  * it was seen.  Records are kept in an AVL tree ordered by
  * cksum_record_compare().
  */
 typedef struct cksum_record {
 	zio_cksum_t cksum;		/* checksum; the tree key */
 	boolean_t labels[VDEV_LABELS];	/* labels[l] set if seen in label l */
 	avl_node_t link;		/* AVL tree linkage */
 } cksum_record_t;
 
 static int
 cksum_record_compare(const void *x1, const void *x2)
 {
 	const cksum_record_t *l = (cksum_record_t *)x1;
 	const cksum_record_t *r = (cksum_record_t *)x2;
 	int arraysize = ARRAY_SIZE(l->cksum.zc_word);
 	int difference = 0;
 
 	for (int i = 0; i < arraysize; i++) {
 		difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
 		if (difference)
 			break;
 	}
 
 	return (difference);
 }
 
 /*
  * Allocate a zeroed checksum record holding a copy of 'cksum' and mark
  * it as seen in label 'l'.
  */
 static cksum_record_t *
 cksum_record_alloc(zio_cksum_t *cksum, int l)
 {
 	cksum_record_t *fresh = umem_zalloc(sizeof (*fresh), UMEM_NOFAIL);
 
 	fresh->labels[l] = B_TRUE;
 	fresh->cksum = *cksum;
 
 	return (fresh);
 }
 
 /*
  * Look up the record for 'cksum' in 'tree'; returns whatever avl_find()
  * reports for a stack-allocated key carrying that checksum.
  */
 static cksum_record_t *
 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
 {
 	avl_index_t where;
 	cksum_record_t key = { .cksum = *cksum };
 	cksum_record_t *found = avl_find(tree, &key, &where);
 
 	return (found);
 }
 
 /*
  * Record that checksum 'cksum' was seen in label 'l'.  An existing
  * record for the checksum is updated; otherwise a new one is allocated
  * and added to 'tree'.  Returns the record in either case.
  */
 static cksum_record_t *
 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
 {
 	cksum_record_t *hit = cksum_record_lookup(tree, cksum);
 
 	if (hit != NULL) {
 		hit->labels[l] = B_TRUE;
 		return (hit);
 	}
 
 	hit = cksum_record_alloc(cksum, l);
 	avl_add(tree, hit);
 
 	return (hit);
 }
 
 /*
  * Return the lowest label index marked in 'rec', or -1 if none is.
  */
 static int
 first_label(cksum_record_t *rec)
 {
 	int idx = 0;
 
 	while (idx < VDEV_LABELS && !rec->labels[idx])
 		idx++;
 
 	return (idx < VDEV_LABELS ? idx : -1);
 }
 
 /*
  * Print 'prefix' followed by the index of every label marked in 'rec',
  * each followed by a space, and then a newline.
  */
 static void
 print_label_numbers(const char *prefix, const cksum_record_t *rec)
 {
 	int n;
 
 	fputs(prefix, stdout);
 	for (n = 0; n < VDEV_LABELS; n++) {
 		if (rec->labels[n] != B_TRUE)
 			continue;
 		printf("%d ", n);
 	}
 	putchar('\n');
 }
 
 /* Size of the per-label uberblocks[] array in zdb_label_t. */
 #define	MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)
 
 /*
  * Everything dump_label() keeps about one on-disk vdev label.
  */
 typedef struct zdb_label {
 	vdev_label_t label;		/* raw label as read from the device */
 	uint64_t label_offset;		/* device offset the label was read at */
 	nvlist_t *config_nv;		/* unpacked config, NULL if unpack failed */
 	cksum_record_t *config;		/* checksum record of the packed config */
 	/* checksum record per uberblock slot; NULL if it failed to verify */
 	cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
 	boolean_t header_printed;	/* "LABEL n" banner already emitted */
 	boolean_t read_failed;		/* the label could not be read */
 	boolean_t cksum_valid;		/* result of label_cksum_valid() */
 } zdb_label_t;
 
 /*
  * Print the "LABEL n" banner for a label, at most once per label and
  * never in quiet (-q) mode.  A label whose checksum did not verify is
  * flagged in the banner.
  */
 static void
 print_label_header(zdb_label_t *label, int l)
 {
 	const char *note;
 
 	if (dump_opt['q'] || label->header_printed == B_TRUE)
 		return;
 
 	note = label->cksum_valid ? "" : "(Bad label cksum)";
 
 	(void) printf("------------------------------------\n");
 	(void) printf("LABEL %d %s\n", l, note);
 	(void) printf("------------------------------------\n");
 
 	label->header_printed = B_TRUE;
 }
 
 /*
  * Print the banner that introduces the L2ARC device header section.
  */
 static void
 print_l2arc_header(void)
 {
 	(void) printf("------------------------------------\n"
 	    "L2ARC device header\n"
 	    "------------------------------------\n");
 }
 
 /*
  * Print the banner that introduces the L2ARC log block section.
  */
 static void
 print_l2arc_log_blocks(void)
 {
 	(void) printf("------------------------------------\n"
 	    "L2ARC device log blocks\n"
 	    "------------------------------------\n");
 }
 
 /*
  * Print every entry of one L2ARC log block.
  *
  * log_entries: number of entries of le[] to print
  * le:          the log block's entry array
  * i:           number of the owning log block; used only to label output
  */
 static void
 dump_l2arc_log_entries(uint64_t log_entries,
     l2arc_log_ent_phys_t *le, uint64_t i)
 {
 	/*
 	 * The index has the same type as the count.  An int index compared
 	 * against a uint64_t bound would overflow (undefined behaviour) for
 	 * counts above INT_MAX.
 	 */
 	for (uint64_t j = 0; j < log_entries; j++) {
 		const l2arc_log_ent_phys_t *ent = &le[j];
 		dva_t dva = ent->le_dva;
 
 		/* Entries are numbered from 1 in the output. */
 		(void) printf("lb[%4llu]\tle[%4llu]\tDVA asize: %llu, "
 		    "vdev: %llu, offset: %llu\n",
 		    (u_longlong_t)i, (u_longlong_t)(j + 1),
 		    (u_longlong_t)DVA_GET_ASIZE(&dva),
 		    (u_longlong_t)DVA_GET_VDEV(&dva),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva));
 		(void) printf("|\t\t\t\tbirth: %llu\n",
 		    (u_longlong_t)ent->le_birth);
 		(void) printf("|\t\t\t\tlsize: %llu\n",
 		    (u_longlong_t)L2BLK_GET_LSIZE((ent)->le_prop));
 		(void) printf("|\t\t\t\tpsize: %llu\n",
 		    (u_longlong_t)L2BLK_GET_PSIZE((ent)->le_prop));
 		(void) printf("|\t\t\t\tcompr: %llu\n",
 		    (u_longlong_t)L2BLK_GET_COMPRESS((ent)->le_prop));
 		(void) printf("|\t\t\t\tcomplevel: %llu\n",
 		    (u_longlong_t)(ent)->le_complevel);
 		(void) printf("|\t\t\t\ttype: %llu\n",
 		    (u_longlong_t)L2BLK_GET_TYPE((ent)->le_prop));
 		(void) printf("|\t\t\t\tprotected: %llu\n",
 		    (u_longlong_t)L2BLK_GET_PROTECTED((ent)->le_prop));
 		(void) printf("|\t\t\t\tprefetch: %llu\n",
 		    (u_longlong_t)L2BLK_GET_PREFETCH((ent)->le_prop));
 		(void) printf("|\t\t\t\taddress: %llu\n",
 		    (u_longlong_t)ent->le_daddr);
 		(void) printf("|\t\t\t\tARC state: %llu\n",
 		    (u_longlong_t)L2BLK_GET_STATE((ent)->le_prop));
 		(void) printf("|\n");
 	}
 	(void) printf("\n");
 }
 
 static void
 dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps)
 {
 	(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr);
 	(void) printf("|\t\tpayload_asize: %llu\n",
 	    (u_longlong_t)lbps->lbp_payload_asize);
 	(void) printf("|\t\tpayload_start: %llu\n",
 	    (u_longlong_t)lbps->lbp_payload_start);
 	(void) printf("|\t\tlsize: %llu\n",
 	    (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop));
 	(void) printf("|\t\tasize: %llu\n",
 	    (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop));
 	(void) printf("|\t\tcompralgo: %llu\n",
 	    (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop));
 	(void) printf("|\t\tcksumalgo: %llu\n",
 	    (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop));
 	(void) printf("|\n\n");
 }
 
 /*
  * Walk the chain of L2ARC log blocks on a cache device, starting from
  * the block pointers stored in the device header, and optionally print
  * each block pointer (-ll) and each log entry (-lll).
  *
  * fd:      open descriptor of the cache device
  * l2dhdr:  device header as read from disk
  * rebuild: accumulates the number and aligned size of the log blocks
  *          that were read and validated (dh_lb_count, dh_lb_asize)
  *
  * The walk stops at the first invalid block pointer, read error,
  * checksum or magic mismatch, decompression failure, or when the
  * overlap check against the evict hand says so.
  */
 static void
 dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
     l2arc_dev_hdr_phys_t *rebuild)
 {
 	l2arc_log_blk_phys_t this_lb;
 	uint64_t asize;
 	l2arc_log_blkptr_t lbps[2];
 	zio_cksum_t cksum;
 	int failed = 0;
 	l2arc_dev_t dev;
 	/*
 	 * dh_log_entries comes from disk and is used as the iteration count
 	 * over this_lb.lb_entries.  Clamp it to the array's capacity so that
 	 * a corrupt header cannot make us read past the end of this_lb.
 	 */
 	uint64_t log_entries = l2dhdr->dh_log_entries;
 	const uint64_t max_entries =
 	    sizeof (this_lb.lb_entries) / sizeof (this_lb.lb_entries[0]);
 
 	if (!dump_opt['q'])
 		print_l2arc_log_blocks();
 	memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));
 
 	dev.l2ad_evict = l2dhdr->dh_evict;
 	dev.l2ad_start = l2dhdr->dh_start;
 	dev.l2ad_end = l2dhdr->dh_end;
 
 	if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) {
 		/* no log blocks to read */
 		if (!dump_opt['q']) {
 			(void) printf("No log blocks to read\n");
 			(void) printf("\n");
 		}
 		return;
 	} else {
 		dev.l2ad_hand = lbps[0].lbp_daddr +
 		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 	}
 
 	dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
 
 	if (log_entries > max_entries) {
 		/* Only mention it when entries are actually dumped. */
 		if (dump_opt['l'] > 2 && !dump_opt['q']) {
 			(void) fprintf(stderr, "dh_log_entries %llu exceeds "
 			    "the %llu entries a log block can hold; "
 			    "truncating\n",
 			    (u_longlong_t)log_entries,
 			    (u_longlong_t)max_entries);
 		}
 		log_entries = max_entries;
 	}
 
 	for (;;) {
 		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
 			break;
 
 		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 		asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 		if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
 			if (!dump_opt['q']) {
 				(void) printf("Error while reading next log "
 				    "block\n\n");
 			}
 			break;
 		}
 
 		/* The checksum covers the block as stored on disk. */
 		fletcher_4_native_varsize(&this_lb, asize, &cksum);
 		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
 			failed++;
 			if (!dump_opt['q']) {
 				(void) printf("Invalid cksum\n");
 				dump_l2arc_log_blkptr(&lbps[0]);
 			}
 			break;
 		}
 
 		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
 		case ZIO_COMPRESS_OFF:
 			break;
 		default: {
 			/*
 			 * Copy the compressed bytes aside, then decompress
 			 * them back into this_lb in place.
 			 */
 			abd_t *abd = abd_alloc_linear(asize, B_TRUE);
 			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
 			abd_t dabd;
 			abd_get_from_buf_struct(&dabd, &this_lb,
 			    sizeof (this_lb));
 			int err = zio_decompress_data(L2BLK_GET_COMPRESS(
 			    (&lbps[0])->lbp_prop), abd, &dabd,
 			    asize, sizeof (this_lb), NULL);
 			abd_free(&dabd);
 			abd_free(abd);
 			if (err != 0) {
 				(void) printf("L2ARC block decompression "
 				    "failed\n");
 				goto out;
 			}
 			break;
 		}
 		}
 
 		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 			byteswap_uint64_array(&this_lb, sizeof (this_lb));
 		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
 			if (!dump_opt['q'])
 				(void) printf("Invalid log block magic\n\n");
 			break;
 		}
 
 		rebuild->dh_lb_count++;
 		rebuild->dh_lb_asize += asize;
 		if (dump_opt['l'] > 1 && !dump_opt['q']) {
 			(void) printf("lb[%4llu]\tmagic: %llu\n",
 			    (u_longlong_t)rebuild->dh_lb_count,
 			    (u_longlong_t)this_lb.lb_magic);
 			dump_l2arc_log_blkptr(&lbps[0]);
 		}
 
 		if (dump_opt['l'] > 2 && !dump_opt['q'])
 			dump_l2arc_log_entries(log_entries,
 			    this_lb.lb_entries,
 			    rebuild->dh_lb_count);
 
 		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
 		    lbps[0].lbp_payload_start, dev.l2ad_evict) &&
 		    !dev.l2ad_first)
 			break;
 
 		/* Step to the previous log block in the chain. */
 		lbps[0] = lbps[1];
 		lbps[1] = this_lb.lb_prev_lbp;
 	}
 out:
 	if (!dump_opt['q']) {
 		(void) printf("log_blk_count:\t %llu with valid cksum\n",
 		    (u_longlong_t)rebuild->dh_lb_count);
 		(void) printf("\t\t %d with invalid cksum\n", failed);
 		(void) printf("log_blk_asize:\t %llu\n\n",
 		    (u_longlong_t)rebuild->dh_lb_asize);
 	}
 }
 
 static int
 dump_l2arc_header(int fd)
 {
 	l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0};
 	int error = B_FALSE;
 
 	if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
 	    VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
 		error = B_TRUE;
 	} else {
 		if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
 			byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));
 
 		if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
 			error = B_TRUE;
 	}
 
 	if (error) {
 		(void) printf("L2ARC device header not found\n\n");
 		/* Do not return an error here for backward compatibility */
 		return (0);
 	} else if (!dump_opt['q']) {
 		print_l2arc_header();
 
 		(void) printf("    magic: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_magic);
 		(void) printf("    version: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_version);
 		(void) printf("    pool_guid: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_spa_guid);
 		(void) printf("    flags: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_flags);
 		(void) printf("    start_lbps[0]: %llu\n",
 		    (u_longlong_t)
 		    l2dhdr.dh_start_lbps[0].lbp_daddr);
 		(void) printf("    start_lbps[1]: %llu\n",
 		    (u_longlong_t)
 		    l2dhdr.dh_start_lbps[1].lbp_daddr);
 		(void) printf("    log_blk_ent: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_log_entries);
 		(void) printf("    start: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_start);
 		(void) printf("    end: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_end);
 		(void) printf("    evict: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_evict);
 		(void) printf("    lb_asize_refcount: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_lb_asize);
 		(void) printf("    lb_count_refcount: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_lb_count);
 		(void) printf("    trim_action_time: %llu\n",
 		    (u_longlong_t)l2dhdr.dh_trim_action_time);
 		(void) printf("    trim_state: %llu\n\n",
 		    (u_longlong_t)l2dhdr.dh_trim_state);
 	}
 
 	dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild);
 	/*
 	 * The total aligned size of log blocks and the number of log blocks
 	 * reported in the header of the device may be less than what zdb
 	 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
 	 * This happens because dump_l2arc_log_blocks() lacks the memory
 	 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
 	 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
 	 * and dh_lb_count will be lower to begin with than what exists on the
 	 * device. This is normal and zdb should not exit with an error. The
 	 * opposite case should never happen though, the values reported in the
 	 * header should never be higher than what dump_l2arc_log_blocks() and
 	 * l2arc_rebuild() report. If this happens there is a leak in the
 	 * accounting of log blocks.
 	 */
 	if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
 	    l2dhdr.dh_lb_count > rebuild.dh_lb_count)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Print the unpacked configuration of label 'l'.
  *
  * Nothing is printed in quiet mode.  Below -lll a configuration shared
  * by several labels is printed only for the first label that carries
  * it.  From -ll on, nvlist space statistics follow the configuration;
  * 'buflen' is the capacity passed on to them.
  */
 static void
 dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
 {
 	if (dump_opt['q'] ||
 	    (dump_opt['l'] < 3 && first_label(label->config) != l))
 		return;
 
 	print_label_header(label, l);
 	dump_nvlist(label->config_nv, 4);
 	print_label_numbers("    labels = ", label->config);
 
 	if (dump_opt['l'] < 2)
 		return;
 
 	dump_nvlist_stats(label->config_nv, buflen);
 }
 
 #define	ZDB_MAX_UB_HEADER_SIZE 32
 
 /*
  * Print the uberblocks found in one label.
  *
  * Slots without a checksum record are reported as invalid from -uu on.
  * Below -uuu an uberblock shared by several labels is printed only for
  * the first label holding it.  Below -uuuu, slots in the last
  * MMP_BLOCKS_PER_LABEL positions that carry the MMP magic and a
  * non-zero MMP delay are skipped.
  */
 static void
 dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
 {
 	char title[ZDB_MAX_UB_HEADER_SIZE];
 	vdev_t vd;
 
 	/* Minimal vdev carrying the ashift for the VDEV_UBERBLOCK_* macros. */
 	vd.vdev_ashift = ashift;
 	vd.vdev_top = &vd;
 
 	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
 		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
 		uberblock_t *ub = (void *)((char *)&label->label + uoff);
 		cksum_record_t *rec = label->uberblocks[i];
 
 		if (rec == NULL) {
 			if (dump_opt['u'] < 2)
 				continue;
 			print_label_header(label, label_num);
 			(void) printf("    Uberblock[%d] invalid\n", i);
 			continue;
 		}
 
 		if ((dump_opt['u'] < 3) && (first_label(rec) != label_num))
 			continue;
 
 		if ((dump_opt['u'] < 4) &&
 		    (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
 		    (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
 			continue;
 
 		print_label_header(label, label_num);
 		(void) snprintf(title, sizeof (title),
 		    "    Uberblock[%d]\n", i);
 		dump_uberblock(ub, title, "");
 		print_label_numbers("        labels = ", rec);
 	}
 }
 
 /* Path resolved so far; used in diagnostics by dump_path_impl(). */
 static char curpath[PATH_MAX];
 
 /*
  * Iterate through the path components, recursively passing
  * current one's obj and remaining path until we find the obj
  * for the last one.
  *
  * os:     objset the path lives in
  * obj:    object number of the directory to look 'name' up in
  * name:   remaining path; modified in place (the first '/' is replaced
  *         by a NUL to isolate the current component)
  * retobj: if non-NULL, receives the object number of the final
  *         component; if NULL, the final object is dumped instead
  *
  * Returns 0 on success, the zap_lookup() error if a component cannot be
  * found, or EINVAL for any other failure.
  */
 static int
 dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj)
 {
 	int err;
 	boolean_t header = B_TRUE;
 	uint64_t child_obj;
 	char *s;
 	dmu_buf_t *db;
 	dmu_object_info_t doi;
 
 	if ((s = strchr(name, '/')) != NULL)
 		*s = '\0';
 	err = zap_lookup(os, obj, name, 8, 1, &child_obj);
 
 	/* Appended before the error check so the message names the miss. */
 	(void) strlcat(curpath, name, sizeof (curpath));
 
 	if (err != 0) {
 		(void) fprintf(stderr, "failed to lookup %s: %s\n",
 		    curpath, strerror(err));
 		return (err);
 	}
 
 	child_obj = ZFS_DIRENT_OBJ(child_obj);
 	err = sa_buf_hold(os, child_obj, FTAG, &db);
 	if (err != 0) {
 		(void) fprintf(stderr,
 		    "failed to get SA dbuf for obj %llu: %s\n",
 		    (u_longlong_t)child_obj, strerror(err));
 		return (EINVAL);
 	}
 	dmu_object_info_from_db(db, &doi);
 	sa_buf_rele(db, FTAG);
 
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    doi.doi_bonus_type != DMU_OT_ZNODE) {
 		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
 		    doi.doi_bonus_type, (u_longlong_t)child_obj);
 		return (EINVAL);
 	}
 
 	if (dump_opt['v'] > 6) {
 		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
 		    (u_longlong_t)child_obj, curpath, doi.doi_type,
 		    doi.doi_bonus_type);
 	}
 
 	(void) strlcat(curpath, "/", sizeof (curpath));
 
 	switch (doi.doi_type) {
 	case DMU_OT_DIRECTORY_CONTENTS:
 		/* Descend only if something follows the separator. */
 		if (s != NULL && *(s + 1) != '\0')
 			return (dump_path_impl(os, child_obj, s + 1, retobj));
 		zfs_fallthrough;
 	case DMU_OT_PLAIN_FILE_CONTENTS:
 		if (retobj != NULL) {
 			*retobj = child_obj;
 		} else {
 			dump_object(os, child_obj, dump_opt['v'], &header,
 			    NULL, 0);
 		}
 		return (0);
 	default:
 		/*
 		 * doi describes child_obj, so report child_obj; naming the
 		 * parent directory here would point at the wrong object.
 		 */
 		(void) fprintf(stderr, "object %llu has non-file/directory "
 		    "type %d\n", (u_longlong_t)child_obj, doi.doi_type);
 		break;
 	}
 
 	return (EINVAL);
 }
 
 /*
  * Dump the blocks for the object specified by path inside the dataset.
  *
  * Returns the open_objset() error if the dataset cannot be opened,
  * EINVAL if the root znode cannot be looked up, and otherwise whatever
  * dump_path_impl() returns.
  */
 static int
 dump_path(char *ds, char *path, uint64_t *retobj)
 {
 	uint64_t root_obj;
 	objset_t *os;
 	int err;
 
 	if ((err = open_objset(ds, FTAG, &os)) != 0)
 		return (err);
 
 	err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
 	if (err == 0) {
 		/* Seed the diagnostic path before walking the components. */
 		(void) snprintf(curpath, sizeof (curpath),
 		    "dataset=%s path=/", ds);
 		err = dump_path_impl(os, root_obj, path, retobj);
 	} else {
 		(void) fprintf(stderr, "can't lookup root znode: %s\n",
 		    strerror(err));
 		err = EINVAL;
 	}
 
 	close_objset(os, FTAG);
 	return (err);
 }
 
 /*
  * Output callback for dump_backup(): write 'len' bytes of 'buf' to
  * standard output.  'os' and 'arg' are unused.
  *
  * Returns 0 once everything has been written, or the errno of a failed
  * write() other than EINTR.
  */
 static int
 dump_backup_bytes(objset_t *os, void *buf, int len, void *arg)
 {
 	const char *cursor = (const char *)buf;
 
 	(void) os;
 	(void) arg;
 
 	/* Write the data out, handling short writes and signals. */
 	for (;;) {
 		ssize_t done = write(STDOUT_FILENO, cursor, len);
 
 		if (done >= len)
 			break;
 
 		if (done < 0) {
 			if (errno != EINTR)
 				return (errno);
 			/* Interrupted: retry the same request. */
 			continue;
 		}
 
 		cursor += done;
 		len -= done;
 	}
 
 	return (0);
 }
 
 /*
  * Write a send stream for objset 'objset_id' of 'pool' to standard
  * output.
  *
  * 'flagstr' may be NULL or a string of option letters: 'e' (embed),
  * 'L' (large blocks), 'c' (compressed) and 'w' (raw).  An unknown
  * letter, or standard output being a terminal, is reported on stderr
  * and nothing is sent.
  */
 static void
 dump_backup(const char *pool, uint64_t objset_id, const char *flagstr)
 {
 	boolean_t embed = B_FALSE;
 	boolean_t large_block = B_FALSE;
 	boolean_t compress = B_FALSE;
 	boolean_t raw = B_FALSE;
 
 	for (const char *c = flagstr; c != NULL && *c != '\0'; c++) {
 		switch (*c) {
 		case 'e':
 			embed = B_TRUE;
 			break;
 		case 'L':
 			large_block = B_TRUE;
 			break;
 		case 'c':
 			compress = B_TRUE;
 			break;
 		case 'w':
 			raw = B_TRUE;
 			break;
 		default:
 			fprintf(stderr, "dump_backup: invalid flag "
 			    "'%c'\n", *c);
 			return;
 		}
 	}
 
 	if (isatty(STDOUT_FILENO)) {
 		fprintf(stderr, "dump_backup: stream cannot be written "
 		    "to a terminal\n");
 		return;
 	}
 
 	offset_t off = 0;
 	dmu_send_outparams_t out = {
 	    .dso_outfunc = dump_backup_bytes,
 	    .dso_dryrun  = B_FALSE,
 	};
 
 	int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed,
 	    large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO,
 	    &off, &out);
 	if (err == 0)
 		return;
 
 	fprintf(stderr, "dump_backup: dmu_send_obj: %s\n",
 	    strerror(err));
 }
 
 /*
  * Copy the contents of object 'srcobj' in 'os' to the file 'destfile',
  * which is created or truncated.
  *
  * The object's size is taken from its ZPL_SIZE system attribute and the
  * data is copied in chunks of at most 1 MiB.
  *
  * Returns 0 on success; EINVAL for a zero-length object; ENOMEM if the
  * copy buffer cannot be allocated; EIO if a write was short; otherwise
  * the error from the SA lookup, open(), dmu_read() or write().
  */
 static int
 zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)
 {
 	int err = 0;
 	uint64_t size, readsize, oursize, offset;
 	ssize_t writesize;
 	sa_handle_t *hdl;
 
 	(void) printf("Copying object %" PRIu64 " to file %s\n", srcobj,
 	    destfile);
 
 	VERIFY3P(os, ==, sa_os);
 	if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) {
 		(void) printf("Failed to get handle for SA znode\n");
 		return (err);
 	}
 	if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) {
 		(void) sa_handle_destroy(hdl);
 		return (err);
 	}
 	(void) sa_handle_destroy(hdl);
 
 	(void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj,
 	    size);
 	if (size == 0) {
 		return (EINVAL);
 	}
 
 	int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
 	if (fd == -1)
 		return (errno);
 	/*
 	 * We cap the size at 1 mebibyte here to prevent
 	 * allocation failures and nigh-infinite printing if the
 	 * object is extremely large.
 	 */
 	oursize = MIN(size, 1 << 20);
 	offset = 0;
 	char *buf = kmem_alloc(oursize, KM_NOSLEEP);
 	if (buf == NULL) {
 		(void) close(fd);
 		return (ENOMEM);
 	}
 
 	while (offset < size) {
 		readsize = MIN(size - offset, 1 << 20);
 		err = dmu_read(os, srcobj, offset, readsize, buf, 0);
 		if (err != 0) {
 			/* err is an int, so print it with %d. */
 			(void) printf("got error %d from dmu_read\n", err);
 			break;
 		}
 		if (dump_opt['v'] > 3) {
 			(void) printf("Read offset=%" PRIu64 " size=%" PRIu64
 			    " error=%d\n", offset, readsize, err);
 		}
 
 		writesize = write(fd, buf, readsize);
 		if (writesize < 0) {
 			err = errno;
 			break;
 		} else if (writesize != readsize) {
 			/*
 			 * Incomplete write.  The destination is truncated,
 			 * so report failure instead of returning the 0 left
 			 * in err by the successful dmu_read().
 			 */
 			(void) fprintf(stderr, "Short write, only wrote %llu of"
 			    " %" PRIu64 " bytes, exiting...\n",
 			    (u_longlong_t)writesize, readsize);
 			err = EIO;
 			break;
 		}
 
 		offset += readsize;
 	}
 
 	/* Single cleanup path; buf is known to be non-NULL here. */
 	(void) close(fd);
 	kmem_free(buf, oursize);
 
 	return (err);
 }
 
 /*
  * Verify the embedded checksum of a label's vdev_phys region.
  *
  * label:  the label as read from the device
  * offset: device offset the label was read from
  *
  * The checksum is recomputed with a verifier built from the device
  * offset of the vdev_phys region written into the checksum slot, and
  * compared with the stored value.  Note that the stored checksum in
  * 'label' is left overwritten by the verifier.
  *
  * Returns B_TRUE if the checksums match, B_FALSE otherwise.
  */
 static boolean_t
 label_cksum_valid(vdev_label_t *label, uint64_t offset)
 {
 	zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
 	char *phys = (char *)label + offsetof(vdev_label_t, vl_vdev_phys);
 	/* The embedded checksum occupies the tail of the phys region. */
 	zio_eck_t *eck = (zio_eck_t *)(phys + VDEV_PHYS_SIZE) - 1;
 	uint64_t phys_offset = offset + offsetof(vdev_label_t, vl_vdev_phys);
 	zio_cksum_t stored, computed, verifier;
 	int swapped;
 	abd_t *abd;
 
 	ZIO_SET_CHECKSUM(&verifier, phys_offset, 0, 0, 0);
 
 	/* A byte-swapped magic means the label has the other byte order. */
 	swapped = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
 	if (swapped)
 		byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 
 	stored = eck->zec_cksum;
 	eck->zec_cksum = verifier;
 
 	abd = abd_get_from_buf(phys, VDEV_PHYS_SIZE);
 	ci->ci_func[swapped](abd, VDEV_PHYS_SIZE, NULL, &computed);
 	abd_free(abd);
 
 	if (swapped)
 		byteswap_uint64_array(&stored, sizeof (zio_cksum_t));
 
 	return (ZIO_CHECKSUM_EQUAL(computed, stored) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Read, verify and print the vdev labels of a device.
  *
  * dev: device path, or a short name to be resolved to one
  *
  * All VDEV_LABELS labels are read first; identical configurations and
  * uberblocks are grouped by checksum so that duplicates across labels
  * can be reported once.  The labels are then printed, followed by the
  * L2ARC header if a label config has pool state POOL_STATE_L2CACHE.
  *
  * Returns 2 if no label configuration could be unpacked, 1 if the
  * device could not be found or any other error was noted, 0 otherwise.
  * Failure to open or stat the device ends in zdb_exit(1).
  */
 static int
 dump_label(const char *dev)
 {
 	char path[MAXPATHLEN];
 	zdb_label_t labels[VDEV_LABELS] = {{{{0}}}};
 	uint64_t psize, ashift, l2cache;
 	struct stat64 statbuf;
 	boolean_t config_found = B_FALSE;
 	boolean_t error = B_FALSE;
 	boolean_t read_l2arc_header = B_FALSE;
 	avl_tree_t config_tree;
 	avl_tree_t uberblock_tree;
 	void *node, *cookie;
 	int fd;
 
 	/*
 	 * Check if we were given absolute path and use it as is.
 	 * Otherwise if the provided vdev name doesn't point to a file,
 	 * try prepending expected disk paths and partition numbers.
 	 */
 	(void) strlcpy(path, dev, sizeof (path));
 	if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
 		/* NOTE: shadows the function-level boolean 'error'. */
 		int error;
 
 		error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
 		if (error == 0 && zfs_dev_is_whole_disk(path)) {
 			if (zfs_append_partition(path, MAXPATHLEN) == -1)
 				error = ENOENT;
 		}
 
 		if (error || (stat64(path, &statbuf) != 0)) {
 			(void) printf("failed to find device %s, try "
 			    "specifying absolute path instead\n", dev);
 			return (1);
 		}
 	}
 
 	if ((fd = open64(path, O_RDONLY)) < 0) {
 		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
 		zdb_exit(1);
 	}
 
 	if (fstat64_blk(fd, &statbuf) != 0) {
 		(void) printf("failed to stat '%s': %s\n", path,
 		    strerror(errno));
 		(void) close(fd);
 		zdb_exit(1);
 	}
 
 	/* A failed flush is reported but does not stop the dump. */
 	if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
 		(void) printf("failed to invalidate cache '%s' : %s\n", path,
 		    strerror(errno));
 
 	/* One tree per kind of record, both keyed by checksum. */
 	avl_create(&config_tree, cksum_record_compare,
 	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
 	avl_create(&uberblock_tree, cksum_record_compare,
 	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
 
 	/* Device size rounded down to a multiple of the label size. */
 	psize = statbuf.st_size;
 	psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t);
 	ashift = SPA_MINBLOCKSHIFT;
 
 	/*
 	 * 1. Read the label from disk
 	 * 2. Verify label cksum
 	 * 3. Unpack the configuration and insert in config tree.
 	 * 4. Traverse all uberblocks and insert in uberblock tree.
 	 */
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		zdb_label_t *label = &labels[l];
 		char *buf = label->label.vl_vdev_phys.vp_nvlist;
 		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
 		nvlist_t *config;
 		cksum_record_t *rec;
 		zio_cksum_t cksum;
 		vdev_t vd;
 
 		label->label_offset = vdev_label_offset(psize, l, 0);
 
 		if (pread64(fd, &label->label, sizeof (label->label),
 		    label->label_offset) != sizeof (label->label)) {
 			if (!dump_opt['q'])
 				(void) printf("failed to read label %d\n", l);
 			label->read_failed = B_TRUE;
 			error = B_TRUE;
 			continue;
 		}
 
 		label->read_failed = B_FALSE;
 		label->cksum_valid = label_cksum_valid(&label->label,
 		    label->label_offset);
 
 		if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
 			nvlist_t *vdev_tree = NULL;
 			size_t size;
 
 			/* Fall back to the minimum ashift if none is stored. */
 			if ((nvlist_lookup_nvlist(config,
 			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
 			    (nvlist_lookup_uint64(vdev_tree,
 			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
 				ashift = SPA_MINBLOCKSHIFT;
 
 			/* Checksum the whole buffer if the size is unknown. */
 			if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
 				size = buflen;
 
 			/* If the device is a cache device read the header. */
 			if (!read_l2arc_header) {
 				if (nvlist_lookup_uint64(config,
 				    ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
 				    l2cache == POOL_STATE_L2CACHE) {
 					read_l2arc_header = B_TRUE;
 				}
 			}
 
 			fletcher_4_native_varsize(buf, size, &cksum);
 			rec = cksum_record_insert(&config_tree, &cksum, l);
 
 			label->config = rec;
 			label->config_nv = config;
 			config_found = B_TRUE;
 		} else {
 			error = B_TRUE;
 		}
 
 		/* Minimal vdev for the VDEV_UBERBLOCK_* macros. */
 		vd.vdev_ashift = ashift;
 		vd.vdev_top = &vd;
 
 		for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
 			uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
 			uberblock_t *ub = (void *)((char *)label + uoff);
 
 			/* Slots that fail verification keep a NULL record. */
 			if (uberblock_verify(ub))
 				continue;
 
 			fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
 			rec = cksum_record_insert(&uberblock_tree, &cksum, l);
 
 			label->uberblocks[i] = rec;
 		}
 	}
 
 	/*
 	 * Dump the label and uberblocks.
 	 */
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		zdb_label_t *label = &labels[l];
 		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
 
 		if (label->read_failed == B_TRUE)
 			continue;
 
 		if (label->config_nv) {
 			dump_config_from_label(label, buflen, l);
 		} else {
 			if (!dump_opt['q'])
 				(void) printf("failed to unpack label %d\n", l);
 		}
 
 		/*
 		 * NOTE: 'ashift' here is whatever the read loop above left
 		 * behind, not necessarily this label's own value.
 		 */
 		if (dump_opt['u'])
 			dump_label_uberblocks(label, ashift, l);
 
 		nvlist_free(label->config_nv);
 	}
 
 	/*
 	 * Dump the L2ARC header, if existent.
 	 */
 	if (read_l2arc_header)
 		error |= dump_l2arc_header(fd);
 
 	/* Free every checksum record, then the trees themselves. */
 	cookie = NULL;
 	while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
 		umem_free(node, sizeof (cksum_record_t));
 
 	cookie = NULL;
 	while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
 		umem_free(node, sizeof (cksum_record_t));
 
 	avl_destroy(&config_tree);
 	avl_destroy(&uberblock_tree);
 
 	(void) close(fd);
 
 	/* A missing configuration takes precedence over other errors. */
 	return (config_found == B_FALSE ? 2 :
 	    (error == B_TRUE ? 1 : 0));
 }
 
 /*
  * Feature usage tallies. dump_one_objset() bumps dataset_feature_count[f]
  * once for every dataset on which feature f is active, bumps
  * global_feature_count[] for the bookmark and livelist features it detects,
  * and bumps remap_deadlist_count for every dataset that has a remap
  * deadlist. (Other code in this file may update or read them as well.)
  */
 static uint64_t dataset_feature_count[SPA_FEATURES];
 static uint64_t global_feature_count[SPA_FEATURES];
 static uint64_t remap_deadlist_count = 0;
 
 /*
  * Open the objset named 'dsname', tally the features in use by its dataset
  * and bookmarks into the file-level counters, then dump it.
  *
  * 'arg' is unused. Always returns 0, even when the objset cannot be opened.
  */
 static int
 dump_one_objset(const char *dsname, void *arg)
 {
 	(void) arg;
 	int error;
 	objset_t *os;
 	spa_feature_t f;
 
 	error = open_objset(dsname, FTAG, &os);
 	if (error != 0)
 		return (0);
 
 	/* Count every feature that is active on this dataset. */
 	for (f = 0; f < SPA_FEATURES; f++) {
 		if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
 			continue;
 		ASSERT(spa_feature_table[f].fi_flags &
 		    ZFEATURE_FLAG_PER_DATASET);
 		dataset_feature_count[f]++;
 	}
 
 	if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
 		remap_deadlist_count++;
 	}
 
 	/* Walk the dataset's bookmarks and count bookmark-related features. */
 	for (dsl_bookmark_node_t *dbn =
 	    avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
 	    dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
 		mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
 		if (dbn->dbn_phys.zbm_redaction_obj != 0) {
 			global_feature_count[
 			    SPA_FEATURE_REDACTION_BOOKMARKS]++;
 			objset_t *mos = os->os_spa->spa_meta_objset;
 			dnode_t *rl;
 			VERIFY0(dnode_hold(mos,
 			    dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));
 			if (rl->dn_have_spill) {
 				global_feature_count[
 				    SPA_FEATURE_REDACTION_LIST_SPILL]++;
 			}
 			/*
 			 * We only needed to peek at the dnode; drop the hold
 			 * taken above so it is not leaked.
 			 */
 			dnode_rele(rl, FTAG);
 		}
 		if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
 			global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
 	}
 
 	/* An open livelist on a non-snapshot counts toward the feature. */
 	if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
 	    !dmu_objset_is_snapshot(os)) {
 		global_feature_count[SPA_FEATURE_LIVELIST]++;
 	}
 
 	dump_objset(os);
 	close_objset(os, FTAG);
 	fuid_table_destroy();
 	return (0);
 }
 
 /*
  * Block statistics.
  *
  * One zdb_blkstats_t is kept per (level, object type) pair in
  * zdb_cb_t.zcb_type[][] and is updated by zdb_count_block().
  *
  * The psize histogram is indexed by psize in units of SPA_MINBLOCKSIZE
  * (index 0 .. SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE), with one extra
  * final bucket that collects every larger block.
  */
 #define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
 	/* Sums of BP_GET_ASIZE/LSIZE/PSIZE over the counted blocks. */
 	uint64_t zb_asize;
 	uint64_t zb_lsize;
 	uint64_t zb_psize;
 	/* Number of blocks counted. */
 	uint64_t zb_count;
 	/* Sum of BP_COUNT_GANG() over the counted blocks. */
 	uint64_t zb_gangs;
 	/* Blocks with 2 or 3 DVAs where at least two share a vdev. */
 	uint64_t zb_ditto_samevdev;
 	/* Of those, blocks where two such DVAs share a metaslab. */
 	uint64_t zb_ditto_same_ms;
 	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
 } zdb_blkstats_t;
 
 /*
  * Extended object types to report deferred frees and dedup auto-ditto blocks.
  * They are numbered immediately after the real DMU object types, and
  * ZDB_OT_TOTAL doubles as the index of the "all types" statistics column.
  */
 #define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
 #define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
 #define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
 #define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
 
 /*
  * Display names for the extended types, listed in the same order as the
  * ZDB_OT_* definitions above (presumably indexed by
  * type - DMU_OT_NUMTYPES; confirm against the users of this table).
  */
 static const char *zdb_ot_extname[] = {
 	"deferred free",
 	"dedup ditto",
 	"other",
 	"Total",
 };
 
 /*
  * ZB_TOTAL is the extra "all levels" row index of zdb_cb_t.zcb_type[][],
  * placed right after the real indirection levels.
  *
  * SPA_MAX_FOR_16M is the number of entries in the power-of-two size
  * histograms; those are indexed by highbit64(size) - 1, so
  * SPA_MAXBLOCKSHIFT + 1 entries cover sizes up to 2^SPA_MAXBLOCKSHIFT.
  */
 #define	ZB_TOTAL	DN_MAX_LEVELS
 #define	SPA_MAX_FOR_16M	(SPA_MAXBLOCKSHIFT+1)
 
 /*
  * Entry of zdb's private in-memory BRT (zdb_cb_t.zcb_brt), used by
  * zdb_count_block() to recognise repeat occurrences of cloned blocks.
  */
 typedef struct zdb_brt_entry {
 	/* First DVA of the cloned block; the only field set for lookups. */
 	dva_t		zbre_dva;
 	/* Refcount read from the real BRT, decremented per repeat seen. */
 	uint64_t	zbre_refcount;
 	/* Linkage in zcb_brt. */
 	avl_node_t	zbre_node;
 } zdb_brt_entry_t;
 
 /*
  * State accumulated while zdb walks and counts the blocks of a pool.
  * A pointer to it is passed as the callback argument to zdb_blkptr_cb()
  * and to the other block-accounting helpers.
  */
 typedef struct zdb_cb {
 	/* Per-(level, type) statistics; last row/column hold the totals. */
 	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
 	/* Space claimed on behalf of an in-progress device removal. */
 	uint64_t	zcb_removing_size;
 	/* Space excluded because it is recorded in checkpoint space maps. */
 	uint64_t	zcb_checkpoint_size;
 	/* asize and count of repeat occurrences of dedup'd blocks. */
 	uint64_t	zcb_dedup_asize;
 	uint64_t	zcb_dedup_blocks;
 	/* asize and count of repeat occurrences of cloned blocks. */
 	uint64_t	zcb_clone_asize;
 	uint64_t	zcb_clone_blocks;
 	/* Power-of-two histograms, indexed by highbit64(size) - 1. */
 	uint64_t	zcb_psize_count[SPA_MAX_FOR_16M];
 	uint64_t	zcb_lsize_count[SPA_MAX_FOR_16M];
 	uint64_t	zcb_asize_count[SPA_MAX_FOR_16M];
 	uint64_t	zcb_psize_len[SPA_MAX_FOR_16M];
 	uint64_t	zcb_lsize_len[SPA_MAX_FOR_16M];
 	uint64_t	zcb_asize_len[SPA_MAX_FOR_16M];
 	uint64_t	zcb_psize_total;
 	uint64_t	zcb_lsize_total;
 	uint64_t	zcb_asize_total;
 	/* Embedded block pointers, by embedded type (and by payload size). */
 	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
 	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
 	    [BPE_PAYLOAD_SIZE + 1];
 	/* Timing and size estimate used for the progress line. */
 	uint64_t	zcb_start;
 	hrtime_t	zcb_lastprint;
 	uint64_t	zcb_totalasize;
 	/* Read errors seen by zdb_blkptr_done(), indexed by error number. */
 	uint64_t	zcb_errors[256];
 	int		zcb_readfails;
 	int		zcb_haderrors;
 	spa_t		*zcb_spa;
 	/* Per-vdev obsolete counts, indexed by vdev id. */
 	uint32_t	**zcb_vd_obsolete_counts;
 	/* Private BRT of zdb_brt_entry_t, used only when the BRT is active. */
 	avl_tree_t	zcb_brt;
 	boolean_t	zcb_brt_is_active;
 } zdb_cb_t;
 
 /*
  * Report whether two DVA offsets on the same top-level vdev land in the same
  * metaslab, i.e. whether they are equal once shifted right by that vdev's
  * metaslab shift.
  */
 static boolean_t
 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
 {
 	vdev_t *top = vdev_lookup_top(spa, vdev);
 	uint64_t ms1 = off1 >> top->vdev_ms_shift;
 	uint64_t ms2 = off2 >> top->vdev_ms_shift;
 
 	return (ms1 == ms2);
 }
 
 /*
  * Used to simplify reporting of the histogram data.
  * dump_size_histograms() fills one of these per histogram it prints.
  */
 typedef struct one_histo {
 	/* Column group title ("psize", "lsize" or "asize"). */
 	const char *name;
 	/* Per-bin block counts. */
 	uint64_t *count;
 	/* Per-bin total lengths. */
 	uint64_t *len;
 	/* Running sum of len[] over the rows printed so far. */
 	uint64_t cumulative;
 } one_histo_t;
 
 /*
  * The number of separate histograms processed for psize, lsize and asize.
  */
 #define	NUM_HISTO 3
 
 /*
  * Print one fixed-column table that combines the psize, lsize and asize
  * histograms. There is one row per power-of-two block size, from
  * 2^SPA_MINBLOCKSHIFT up to (but not including) 2^SPA_MAX_FOR_16M, and for
  * each histogram the row shows the block count, the total length and the
  * cumulative length so far.
  *
  * By default the table is column-aligned and printed in nicenumber format
  * (e.g. 123K); if the '-P' parameter is specified the fields are
  * tab-separated and the full raw (parseable) number is printed.
  */
 static void
 dump_size_histograms(zdb_cb_t *zcb)
 {
 	/* Scratch buffer that zdb_nicenum() formats each number into. */
 	char numbuf[32];
 
 	/* Titles used in the two header lines of the table. */
 	const char blocksize_title1[] = "block";
 	const char blocksize_title2[] = "size";
 	const char count_title[] = "Count";
 	const char length_title[] = "Size";
 	const char cumulative_title[] = "Cum.";
 
 	/* One descriptor per histogram, in output order. */
 	one_histo_t parm_histo[NUM_HISTO] = {
 		{ "psize", zcb->zcb_psize_count, zcb->zcb_psize_len, 0 },
 		{ "lsize", zcb->zcb_lsize_count, zcb->zcb_lsize_len, 0 },
 		{ "asize", zcb->zcb_asize_count, zcb->zcb_asize_len, 0 },
 	};
 	const int parseable = (dump_opt['P'] != 0);
 
 	(void) printf("\nBlock Size Histogram\n");
 
 	/* First header line: "block" followed by the histogram names. */
 	(void) printf(parseable ? "\n%s\t" : "\n%7s   ", blocksize_title1);
 	for (int h = 0; h < NUM_HISTO; h++) {
 		const char *name = parm_histo[h].name;
 		/* The last name is printed without trailing padding. */
 		const int last = (h == NUM_HISTO - 1);
 
 		if (parseable) {
 			(void) printf(last ? "  %s" : "%s\t\t\t", name);
 		} else {
 			(void) printf(last ? "%s" : "%-7s              ",
 			    name);
 		}
 	}
 	(void) printf("\n");
 
 	/* Second header line: "size" followed by the per-histogram titles. */
 	(void) printf(parseable ? "%s\t" : "%7s ", blocksize_title2);
 	for (int h = 0; h < NUM_HISTO; h++) {
 		(void) printf(parseable ? "%s\t%s\t%s\t" : "%7s%7s%7s",
 		    count_title, length_title, cumulative_title);
 	}
 	(void) printf("\n");
 
 	/* One row per power-of-two block size. */
 	for (int bin = SPA_MINBLOCKSHIFT; bin < SPA_MAX_FOR_16M; bin++) {
 		/* Leading column: the block size of this row. */
 		zdb_nicenum((1ULL << bin), numbuf, sizeof (numbuf));
 		(void) printf(parseable ? "%s" : "%7s:", numbuf);
 
 		/* Count, length and cumulative length for each histogram. */
 		for (int h = 0; h < NUM_HISTO; h++) {
 			one_histo_t *histo = &parm_histo[h];
 
 			histo->cumulative += histo->len[bin];
 
 			const uint64_t fields[] = {
 				histo->count[bin],
 				histo->len[bin],
 				histo->cumulative,
 			};
 			const size_t nfields =
 			    sizeof (fields) / sizeof (fields[0]);
 
 			for (size_t f = 0; f < nfields; f++) {
 				zdb_nicenum(fields[f], numbuf,
 				    sizeof (numbuf));
 				(void) printf(parseable ? "\t%s" : "%7s",
 				    numbuf);
 			}
 		}
 		(void) printf("\n");
 	}
 }
 
 /*
  * Account for one block pointer in the statistics kept in 'zcb' and, when
  * leak tracking is enabled (no -L), claim the block.
  *
  * zcb:   accumulator for all counters touched here.
  * zilog: if non-NULL, the block is first added to the ZIL's bp tree and
  *        nothing is counted when zil_bp_tree_add() returns non-zero.
  * bp:    the block pointer to count.
  * type:  statistics column to charge; must be below ZDB_OT_TOTAL.
  *
  * Repeat occurrences of dedup'd and cloned blocks are tallied in
  * zcb_dedup_* / zcb_clone_* and are not claimed a second time.
  */
 static void
 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
 	int i;
 
 	ASSERT(type < ZDB_OT_TOTAL);
 
 	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
 		return;
 
 	/*
 	 * This flag controls if we will issue a claim for the block while
 	 * counting it, to ensure that all blocks are referenced in space maps.
 	 * We don't issue claims if we're not doing leak tracking, because it's
 	 * expensive if the user isn't interested. We also don't claim the
 	 * second or later occurrences of cloned or dedup'd blocks, because we
 	 * already claimed them the first time.
 	 */
 	boolean_t do_claim = !dump_opt['L'];
 
 	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
 
 	/* Scratch copy used when the DDT supplies the canonical DVAs. */
 	blkptr_t tempbp;
 	if (BP_GET_DEDUP(bp)) {
 		/*
 		 * Dedup'd blocks are special. We need to count them, so we can
 		 * later uncount them when reporting leaked space, and we must
 		 * only claim them once.
 		 *
 		 * We use the existing dedup system to track what we've seen.
 		 * The first time we see a block, we do a ddt_lookup() to see
 		 * if it exists in the DDT. If we're doing leak tracking, we
 		 * claim the block at this time.
 		 *
 		 * Each time we see a block, we reduce the refcount in the
 		 * entry by one, and add to the size and count of dedup'd
 		 * blocks to report at the end.
 		 */
 
 		ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
 
 		ddt_enter(ddt);
 
 		/*
 		 * Find the block. This will create the entry in memory, but
 		 * we'll know if that happened by its refcount.
 		 */
 		ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);
 
 		/*
 		 * ddt_lookup() can return NULL if this block didn't exist
 		 * in the DDT and creating it would take the DDT over its
 		 * quota. Since we got the block from disk, it must exist in
 		 * the DDT, so this can't happen. However, when unique entries
 		 * are pruned, the dedup bit can be set with no corresponding
 		 * entry in the DDT.
 		 */
 		if (dde == NULL) {
 			ddt_exit(ddt);
 			goto skipped;
 		}
 
 		/* Get the phys for this variant */
 		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
 
 		/*
 		 * This entry may have multiple sets of DVAs. We must claim
 		 * each set the first time we see them in a real block on disk,
 		 * or count them on subsequent occurrences. We don't have a
 		 * convenient way to track the first time we see each variant,
 		 * so we repurpose dde_io as a set of "seen" flag bits. We can
 		 * do this safely in zdb because it never writes, so it will
 		 * never have a writing zio for this block in that pointer.
 		 */
 		boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
 		if (!seen)
 			dde->dde_io =
 			    (void *)(((uintptr_t)dde->dde_io) | (1 << v));
 
 		/* Consume a reference for this block. */
 		if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
 			ddt_phys_decref(dde->dde_phys, v);
 
 		/*
 		 * If this entry has a single flat phys, it may have been
 		 * extended with additional DVAs at some time in its life.
 		 * This block might be from before it was fully extended, and
 		 * so have fewer DVAs.
 		 *
 		 * If this is the first time we've seen this block, and we
 		 * claimed it as-is, then we would miss the claim on some
 		 * number of DVAs, which would then be seen as leaked.
 		 *
 		 * In all cases, if we've had fewer DVAs, then the asize would
 		 * be too small, and would lead to the pool apparently using
 		 * more space than allocated.
 		 *
 		 * To handle this, we copy the canonical set of DVAs from the
 		 * entry back to the block pointer before we claim it.
 		 */
 		if (v == DDT_PHYS_FLAT) {
 			ASSERT3U(BP_GET_BIRTH(bp), ==,
 			    ddt_phys_birth(dde->dde_phys, v));
 			tempbp = *bp;
 			ddt_bp_fill(dde->dde_phys, v, &tempbp,
 			    BP_GET_BIRTH(bp));
 			bp = &tempbp;
 		}
 
 		if (seen) {
 			/*
 			 * The second or later time we see this block,
 			 * it's a duplicate and we count it.
 			 */
 			zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
 			zcb->zcb_dedup_blocks++;
 
 			/* Already claimed, don't do it again. */
 			do_claim = B_FALSE;
 		}
 
 		ddt_exit(ddt);
 	} else if (zcb->zcb_brt_is_active &&
 	    brt_maybe_exists(zcb->zcb_spa, bp)) {
 		/*
 		 * Cloned blocks are special. We need to count them, so we can
 		 * later uncount them when reporting leaked space, and we must
 		 * only claim them once.
 		 *
 		 * To do this, we keep our own in-memory BRT. For each block
 		 * we haven't seen before, we look it up in the real BRT and
 		 * if its there, we note it and its refcount then proceed as
 		 * normal. If we see the block again, we count it as a clone
 		 * and then give it no further consideration.
 		 */
 		zdb_brt_entry_t zbre_search, *zbre;
 		avl_index_t where;
 
 		zbre_search.zbre_dva = bp->blk_dva[0];
 		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
 		if (zbre == NULL) {
 			/* Not seen before; track it */
 			uint64_t refcnt =
 			    brt_entry_get_refcount(zcb->zcb_spa, bp);
 			if (refcnt > 0) {
 				zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
 				    UMEM_NOFAIL);
 				zbre->zbre_dva = bp->blk_dva[0];
 				zbre->zbre_refcount = refcnt;
 				avl_insert(&zcb->zcb_brt, zbre, where);
 			}
 		} else  {
 			/*
 			 * Second or later occurrence, count it and take a
 			 * refcount.
 			 */
 			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
 			zcb->zcb_clone_blocks++;
 
 			zbre->zbre_refcount--;
 			if (zbre->zbre_refcount == 0) {
 				avl_remove(&zcb->zcb_brt, zbre);
 				umem_free(zbre, sizeof (zdb_brt_entry_t));
 			}
 
 			/* Already claimed, don't do it again. */
 			do_claim = B_FALSE;
 		}
 	}
 
 skipped:
 	/*
 	 * Charge the block to four cells of zcb_type[][]:
 	 * (its level, all types), (its level, its type),
 	 * (all levels, all types) and (all levels, its type).
 	 */
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
 		int equal;
 		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
 
 		zb->zb_asize += BP_GET_ASIZE(bp);
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_count++;
 
 		/*
 		 * The histogram is only big enough to record blocks up to
 		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
 		 * "other", bucket.
 		 */
 		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
 		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
 		zb->zb_psize_histogram[idx]++;
 
 		zb->zb_gangs += BP_COUNT_GANG(bp);
 
 		/*
 		 * Ditto accounting: note blocks whose copies share a vdev,
 		 * and of those, ones where two copies share a metaslab.
 		 */
 		switch (BP_GET_NDVAS(bp)) {
 		case 2:
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) {
 				zb->zb_ditto_samevdev++;
 
 				if (same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[1])))
 					zb->zb_ditto_same_ms++;
 			}
 			break;
 		case 3:
 			/* Number of DVA pairs that live on the same vdev. */
 			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 			if (equal != 0) {
 				zb->zb_ditto_samevdev++;
 
 				/* At most one same-metaslab hit per block. */
 				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 				    DVA_GET_VDEV(&bp->blk_dva[1]) &&
 				    same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[1])))
 					zb->zb_ditto_same_ms++;
 				else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
 				    same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[0]),
 				    DVA_GET_OFFSET(&bp->blk_dva[2])))
 					zb->zb_ditto_same_ms++;
 				else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
 				    same_metaslab(zcb->zcb_spa,
 				    DVA_GET_VDEV(&bp->blk_dva[1]),
 				    DVA_GET_OFFSET(&bp->blk_dva[1]),
 				    DVA_GET_OFFSET(&bp->blk_dva[2])))
 					zb->zb_ditto_same_ms++;
 			}
 			break;
 		}
 	}
 
 	spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
 
 	/*
 	 * Embedded block pointers are tallied separately; they are left out
 	 * of the size histograms below and are never claimed.
 	 */
 	if (BP_IS_EMBEDDED(bp)) {
 		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
 		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
 		    [BPE_GET_PSIZE(bp)]++;
 		return;
 	}
 	/*
 	 * The binning histogram bins by powers of two up to
 	 * SPA_MAXBLOCKSIZE rather than creating bins for
 	 * every possible blocksize found in the pool.
 	 */
 	int bin = highbit64(BP_GET_PSIZE(bp)) - 1;
 
 	zcb->zcb_psize_count[bin]++;
 	zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
 	zcb->zcb_psize_total += BP_GET_PSIZE(bp);
 
 	bin = highbit64(BP_GET_LSIZE(bp)) - 1;
 
 	zcb->zcb_lsize_count[bin]++;
 	zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
 	zcb->zcb_lsize_total += BP_GET_LSIZE(bp);
 
 	bin = highbit64(BP_GET_ASIZE(bp)) - 1;
 
 	zcb->zcb_asize_count[bin]++;
 	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
 	zcb->zcb_asize_total += BP_GET_ASIZE(bp);
 
 	if (!do_claim)
 		return;
 
 	/* Leak tracking: claim the block; a failed claim is fatal. */
 	VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa,
 	    spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL,
 	    ZIO_FLAG_CANFAIL)));
 }
 
 /*
  * Completion callback for the reads issued by zdb_blkptr_cb().
  *
  * Gives the block's psize back to the in-flight byte budget
  * (spa_load_verify_bytes) and wakes any waiter, records and reports a read
  * error unless the I/O was speculative, and frees the data buffer.
  * zio->io_private is the zdb_cb_t passed when the read was issued.
  */
 static void
 zdb_blkptr_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	int ioerr = zio->io_error;
 	zdb_cb_t *zcb = zio->io_private;
 	zbookmark_phys_t *zb = &zio->io_bookmark;
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 
 	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 		char blkbuf[BP_SPRINTF_LEN];
 
 		zcb->zcb_haderrors = 1;
 		zcb->zcb_errors[ioerr]++;
 
 		/* Only include the block pointer text at -bb and above. */
 		if (dump_opt['b'] >= 2)
 			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		else
 			blkbuf[0] = '\0';
 
 		/*
 		 * zb_level is printed with %lld, so it must be passed as a
 		 * signed long long (as zdb_blkptr_cb() does).
 		 */
 		(void) printf("zdb_blkptr_cb: "
 		    "Got error %d reading "
 		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
 		    ioerr,
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)zb->zb_blkid,
 		    blkbuf);
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 
 	abd_free(zio->io_abd);
 }
 
 /*
  * Per-block-pointer callback; 'arg' is the zdb_cb_t being filled in.
  *
  * For every block pointer that is not a hole or redacted it counts the
  * block via zdb_count_block() and, when -c is given, issues an asynchronous
  * read of it (metadata only for -c, everything for -cc) whose completion is
  * handled by zdb_blkptr_done(). It also prints the block at -bbbbb and
  * emits a progress line on stderr. Always returns 0.
  */
 static int
 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zdb_cb_t *zcb = arg;
 	dmu_object_type_t type;
 	boolean_t is_metadata;
 
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 
 	if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 		(void) printf("objset %llu object %llu "
 		    "level %lld offset 0x%llx %s\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (u_longlong_t)blkid2offset(dnp, bp, zb),
 		    blkbuf);
 	}
 
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
 		return (0);
 
 	type = BP_GET_TYPE(bp);
 
 	/* Types with DMU_OT_NEWTYPE set are all lumped into "other". */
 	zdb_count_block(zcb, zilog, bp,
 	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
 
 	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
 
 	if (!BP_IS_EMBEDDED(bp) &&
 	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
 		size_t size = BP_GET_PSIZE(bp);
 		abd_t *abd = abd_alloc(size, B_FALSE);
 		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
 		/* If it's an intent log block, failure is expected. */
 		if (zb->zb_level == ZB_ZIL_LEVEL)
 			flags |= ZIO_FLAG_SPECULATIVE;
 
 		/*
 		 * Throttle: wait until the bytes in flight drop to
 		 * max_inflight_bytes or below; zdb_blkptr_done() gives the
 		 * bytes back and signals the condvar.
 		 */
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_load_verify_bytes > max_inflight_bytes)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_load_verify_bytes += size;
 		mutex_exit(&spa->spa_scrub_lock);
 
 		zio_nowait(zio_read(NULL, spa, bp, abd, size,
 		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
 	}
 
 	zcb->zcb_readfails = 0;
 
 	/* only call gethrtime() every 100 blocks */
 	static int iters;
 	if (++iters > 100)
 		iters = 0;
 	else
 		return (0);
 
 	/* Progress line: at most one per NANOSEC, suppressed at -bbbbb. */
 	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
 		uint64_t now = gethrtime();
 		char buf[10];
 		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
 		/* The "1 +" terms keep both divisions away from zero. */
 		uint64_t kb_per_sec =
 		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
 		uint64_t sec_remaining =
 		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
 
 		/* make sure nicenum has enough space */
 		_Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated");
 
 		zfs_nicebytes(bytes, buf, sizeof (buf));
 		(void) fprintf(stderr,
 		    "\r%5s completed (%4"PRIu64"MB/s) "
 		    "estimated time remaining: "
 		    "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec        ",
 		    buf, kb_per_sec / 1024,
 		    sec_remaining / 60 / 60,
 		    sec_remaining / 60 % 60,
 		    sec_remaining % 60);
 
 		zcb->zcb_lastprint = now;
 	}
 
 	return (0);
 }
 
 static void
 zdb_leak(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_t *vd = arg;
 
 	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
 }
 
 /*
  * Metaslab ops table with no allocator callback (the alloc slot is NULL).
  */
 static metaslab_ops_t zdb_metaslab_ops = {
 	NULL	/* alloc */
 };
 
 /*
  * Space map log callback that replays unflushed entries of the vdev being
  * removed into svr_allocd_segs. 'arg' is the spa_vdev_removal_t.
  *
  * Entries for other vdevs, and entries older than the owning metaslab's
  * unflushed txg, are ignored. Always returns 0.
  */
 static int
 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	spa_vdev_removal_t *removal = arg;
 	const uint64_t seg_start = sme->sme_offset;
 	const uint64_t seg_len = sme->sme_run;
 
 	/* Only the vdev under removal is of interest. */
 	if (sme->sme_vdev != removal->svr_vdev_id)
 		return (0);
 
 	vdev_t *top = vdev_lookup_top(spa, sme->sme_vdev);
 	metaslab_t *msp = top->vdev_ms[seg_start >> top->vdev_ms_shift];
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 
 	if (txg >= metaslab_unflushed_txg(msp)) {
 		if (sme->sme_type == SM_ALLOC) {
 			zfs_range_tree_add(removal->svr_allocd_segs,
 			    seg_start, seg_len);
 		} else {
 			zfs_range_tree_remove(removal->svr_allocd_segs,
 			    seg_start, seg_len);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Remap callback: claim the segment [offset, offset + size) on 'vd'.
  * 'inner_offset' and 'arg' are unused.
  */
 static void
 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner_offset;
 	(void) arg;
 
 	/*
 	 * We get here through a remap from a device that is being
 	 * removed, so the vdev we are handed must be a concrete one.
 	 */
 	ASSERT(vdev_is_concrete(vd));
 
 	uint64_t claim_txg = spa_min_claim_txg(vd->vdev_spa);
 
 	VERIFY0(metaslab_claim_impl(vd, offset, size, claim_txg));
 }
 
 /*
  * Range callback: remap one segment of the vdev passed in 'arg' through
  * the indirect vdev ops, claiming every destination piece via
  * claim_segment_impl_cb().
  */
 static void
 claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
 	vdev_t *removing_vd = arg;
 
 	vdev_indirect_ops.vdev_op_remap(removing_vd, offset, size,
 	    claim_segment_impl_cb, NULL);
 }
 
 /*
  * Directly referenced blocks have been accounted for by now, but blocks
  * referenced only from a partially complete (and therefore unused) indirect
  * mapping object may have been missed. Make a second pass over the
  * metaslabs already mapped for the vdev under removal and claim the
  * destination blocks.
  *
  * Does nothing when leak tracking is off (-L) or no removal is in progress.
  */
 static void
 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
 {
 	if (dump_opt['L'] || spa->spa_vdev_removal == NULL)
 		return;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa_vdev_removal_t *removal = spa->spa_vdev_removal;
 	vdev_t *removing_vd = vdev_lookup_top(spa, removal->svr_vdev_id);
 	vdev_indirect_mapping_t *mapping = removing_vd->vdev_indirect_mapping;
 
 	ASSERT0(zfs_range_tree_space(removal->svr_allocd_segs));
 
 	/* Load each metaslab's allocations and move them to the removal. */
 	zfs_range_tree_t *scratch = zfs_range_tree_create(NULL,
 	    ZFS_RANGE_SEG64, NULL, 0, 0);
 	for (uint64_t m = 0; m < removing_vd->vdev_ms_count; m++) {
 		metaslab_t *ms = removing_vd->vdev_ms[m];
 
 		ASSERT0(zfs_range_tree_space(scratch));
 		if (ms->ms_sm != NULL)
 			VERIFY0(space_map_load(ms->ms_sm, scratch, SM_ALLOC));
 		zfs_range_tree_vacate(scratch, zfs_range_tree_add,
 		    removal->svr_allocd_segs);
 	}
 	zfs_range_tree_destroy(scratch);
 
 	iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb,
 	    removal);
 
 	/*
 	 * Nothing beyond the synced part of the mapping has had mappings
 	 * allocated for it yet, so drop everything past that point.
 	 */
 	zfs_range_tree_clear(removal->svr_allocd_segs,
 	    vdev_indirect_mapping_max_offset(mapping),
 	    removing_vd->vdev_asize -
 	    vdev_indirect_mapping_max_offset(mapping));
 
 	zcb->zcb_removing_size +=
 	    zfs_range_tree_space(removal->svr_allocd_segs);
 	zfs_range_tree_vacate(removal->svr_allocd_segs, claim_segment_cb,
 	    removing_vd);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
 
 /*
  * Block pointer callback that adds the block's first DVA to the obsolete
  * counts kept in the zdb_cb_t ('arg') for the vdev that DVA lives on.
  * 'tx' is unused. Always returns 0.
  */
 static int
 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	(void) tx;
 	zdb_cb_t *cb = arg;
 	spa_t *spa = cb->zcb_spa;
 	const dva_t *first_dva = &bp->blk_dva[0];
 	vdev_t *top;
 
 	ASSERT(!bp_freed);
 	ASSERT(!dump_opt['L']);
 	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
 
 	/* Look up the top-level vdev under the vdev config lock. */
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	top = vdev_lookup_top(cb->zcb_spa, DVA_GET_VDEV(first_dva));
 	ASSERT3P(top, !=, NULL);
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	ASSERT(top->vdev_indirect_config.vic_mapping_object != 0);
 	ASSERT3P(cb->zcb_vd_obsolete_counts[top->vdev_id], !=, NULL);
 
 	vdev_indirect_mapping_increment_obsolete_count(
 	    top->vdev_indirect_mapping,
 	    DVA_GET_OFFSET(first_dva), DVA_GET_ASIZE(first_dva),
 	    cb->zcb_vd_obsolete_counts[top->vdev_id]);
 
 	return (0);
 }
 
 /*
  * Build the obsolete counts array for an indirect vdev: start from the
  * counts loaded from its mapping, then fold in the vdev's obsolete space
  * map (if it has one) and, if this vdev is the one recorded in the pool's
  * condensing state with a previous obsolete space map, that map as well.
  *
  * Returns the array from vdev_indirect_mapping_load_obsolete_counts().
  */
 static uint32_t *
 zdb_load_obsolete_counts(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_indirect_mapping_t *mapping = vd->vdev_indirect_mapping;
 	spa_condensing_indirect_phys_t *condensing =
 	    &spa->spa_condensing_indirect_phys;
 	uint64_t obsolete_sm_object;
 
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
 
 	uint32_t *obsolete_counts =
 	    vdev_indirect_mapping_load_obsolete_counts(mapping);
 
 	if (vd->vdev_obsolete_sm != NULL) {
 		vdev_indirect_mapping_load_obsolete_spacemap(mapping,
 		    obsolete_counts, vd->vdev_obsolete_sm);
 	}
 
 	if (condensing->scip_vdev == vd->vdev_id &&
 	    condensing->scip_prev_obsolete_sm_object != 0) {
 		space_map_t *prev_sm = NULL;
 
 		VERIFY0(space_map_open(&prev_sm, spa->spa_meta_objset,
 		    condensing->scip_prev_obsolete_sm_object, 0,
 		    vd->vdev_asize, 0));
 		vdev_indirect_mapping_load_obsolete_spacemap(mapping,
 		    obsolete_counts, prev_sm);
 		space_map_close(prev_sm);
 	}
 
 	return (obsolete_counts);
 }
 
 /*
  * Argument passed to checkpoint_sm_exclude_entry_cb() while iterating one
  * vdev's checkpoint space map.
  */
 typedef struct checkpoint_sm_exclude_entry_arg {
 	/* The vdev whose checkpoint space map is being iterated. */
 	vdev_t *cseea_vd;
 	/* Running total of sme_run over the entries processed. */
 	uint64_t cseea_checkpoint_size;
 } checkpoint_sm_exclude_entry_arg_t;
 
 /*
  * Space map iteration callback for a vdev's checkpoint space map: remove
  * the entry's range from the owning metaslab's ms_allocatable tree and add
  * its length to the running checkpoint size in 'arg'
  * (a checkpoint_sm_exclude_entry_arg_t). Always returns 0.
  */
 static int
 checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	checkpoint_sm_exclude_entry_arg_t *state = arg;
 	vdev_t *top = state->cseea_vd;
 	const uint64_t seg_start = sme->sme_offset;
 	const uint64_t seg_end = sme->sme_offset + sme->sme_run;
 	metaslab_t *msp = top->vdev_ms[seg_start >> top->vdev_ms_shift];
 
 	ASSERT(sme->sme_type == SM_FREE);
 
 	/*
 	 * The checkpoint space map lives at the vdev level while the
 	 * ms_sm space maps live at the metaslab level, so in theory a
 	 * checkpoint entry could straddle a metaslab boundary.
 	 *
 	 * In practice the way the checkpoint space maps are currently
 	 * populated and manipulated should never produce such an entry,
 	 * which is what the checks below verify.
 	 *
 	 * Nothing fundamental forbids metaslab-crossing entries, though,
 	 * so support for them could be added here if it is ever needed.
 	 */
 	VERIFY3U(seg_start, >=, msp->ms_start);
 	VERIFY3U(seg_end, <=, msp->ms_start + msp->ms_size);
 
 	/*
 	 * Removing the range from the allocated segments doubles as a
 	 * check that it was present in the first place.
 	 */
 	mutex_enter(&msp->ms_lock);
 	zfs_range_tree_remove(msp->ms_allocatable, sme->sme_offset,
 	    sme->sme_run);
 	mutex_exit(&msp->ms_lock);
 
 	state->cseea_checkpoint_size += sme->sme_run;
 	return (0);
 }
 
 /*
  * Exclude the space recorded in one top-level vdev's checkpoint space map
  * from its metaslabs' ms_allocatable trees, and add the amount excluded to
  * zcb->zcb_checkpoint_size. Does nothing if the vdev has no checkpoint
  * space map.
  */
 static void
 zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	/*
 	 * A vdev without a vdev_top_zap belongs to a pool whose version
 	 * predates the pool checkpoint feature.
 	 */
 	if (vd->vdev_top_zap == 0)
 		return;
 
 	/*
 	 * When the vdev_top_zap holds no reference to a
 	 * vdev_checkpoint_sm, one of these is true:
 	 *
 	 * 1] There is no checkpoint
 	 * 2] There is a checkpoint, but none of the checkpointed
 	 *    blocks have been freed yet
 	 * 3] The current vdev is indirect
 	 *
 	 * There is nothing to exclude in any of those cases.
 	 */
 	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
 		return;
 
 	uint64_t sm_object;
 	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
 	    &sm_object));
 
 	checkpoint_sm_exclude_entry_arg_t state = {
 		.cseea_vd = vd,
 		.cseea_checkpoint_size = 0,
 	};
 	space_map_t *sm = NULL;
 
 	VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
 	    sm_object, 0, vd->vdev_asize, vd->vdev_ashift));
 
 	VERIFY0(space_map_iterate(sm, space_map_length(sm),
 	    checkpoint_sm_exclude_entry_cb, &state));
 	space_map_close(sm);
 
 	zcb->zcb_checkpoint_size += state.cseea_checkpoint_size;
 }
 
 /*
  * Run zdb_leak_init_vdev_exclude_checkpoint() on every child of the root
  * vdev. Must only be called when leak tracking is enabled (no -L).
  */
 static void
 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
 {
 	ASSERT(!dump_opt['L']);
 
 	vdev_t *root = spa->spa_root_vdev;
 
 	for (uint64_t idx = 0; idx < root->vdev_children; idx++) {
 		vdev_t *child = root->vdev_child[idx];
 
 		ASSERT3U(idx, ==, child->vdev_id);
 		zdb_leak_init_vdev_exclude_checkpoint(child, zcb);
 	}
 }
 
 /*
  * Space map log callback that keeps a signed running total of unflushed
  * allocated space in the int64_t pointed to by 'arg': SM_ALLOC entries add
  * their run length and SM_FREE entries subtract it.
  *
  * Entries on non-concrete vdevs, and entries older than the owning
  * metaslab's unflushed txg, are ignored. Always returns 0.
  */
 static int
 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	int64_t *total = arg;
 	vdev_t *top = vdev_lookup_top(spa, sme->sme_vdev);
 
 	if (!vdev_is_concrete(top))
 		return (0);
 
 	metaslab_t *msp = top->vdev_ms[sme->sme_offset >> top->vdev_ms_shift];
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 
 	if (txg < metaslab_unflushed_txg(msp))
 		return (0);
 
 	if (sme->sme_type != SM_ALLOC)
 		*total -= sme->sme_run;
 	else
 		*total += sme->sme_run;
 
 	return (0);
 }
 
 /*
  * Return the net unflushed allocated space found in the pool's space map
  * logs, as summed by count_unflushed_space_cb(). Returns 0 without looking
  * at the logs when -L is given.
  */
 static int64_t
 get_unflushed_alloc_space(spa_t *spa)
 {
 	int64_t unflushed = 0;
 
 	if (!dump_opt['L']) {
 		iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
 		    &unflushed);
 	}
 
 	return (unflushed);
 }
 
 /*
  * Space map log callback that replays one unflushed entry into the owning
  * metaslab's ms_allocatable tree. 'arg' points at the maptype_t the trees
  * were loaded with: entries of that type are added to the tree, entries of
  * the other type are removed from it.
  *
  * Entries on non-concrete vdevs, and entries older than the owning
  * metaslab's unflushed txg, are ignored. Always returns 0.
  */
 static int
 load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
 {
 	maptype_t *wanted = arg;
 	const uint64_t seg_start = sme->sme_offset;
 	const uint64_t seg_len = sme->sme_run;
 	vdev_t *top = vdev_lookup_top(spa, sme->sme_vdev);
 
 	/* Indirect vdevs are not handled here. */
 	if (!vdev_is_concrete(top))
 		return (0);
 
 	metaslab_t *msp = top->vdev_ms[seg_start >> top->vdev_ms_shift];
 
 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 	ASSERT(*wanted == SM_ALLOC || *wanted == SM_FREE);
 
 	if (txg < metaslab_unflushed_txg(msp))
 		return (0);
 
 	if (*wanted != sme->sme_type)
 		zfs_range_tree_remove(msp->ms_allocatable, seg_start, seg_len);
 	else
 		zfs_range_tree_add(msp->ms_allocatable, seg_start, seg_len);
 
 	return (0);
 }
 
 /*
  * Replay the pool's space map logs into the metaslabs' ms_allocatable
  * trees via load_unflushed_cb(), treating 'maptype' as the entry type that
  * gets added to the trees.
  */
 static void
 load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
 {
 	iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
 }
 
 /*
  * For every top-level vdev that is not indirect, reload each metaslab's
  * ms_allocatable tree from its space map using 'maptype', mark the
  * metaslab loaded, and finally replay the unflushed space map log entries
  * on top. Progress is reported on stderr.
  */
 static void
 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *vd = rvd->vdev_child[i];
 
 		ASSERT3U(i, ==, vd->vdev_id);
 
 		if (vd->vdev_ops == &vdev_indirect_ops)
 			continue;
 
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 
 			/*
 			 * The values are unsigned and printed with %llu, so
 			 * cast them to the matching unsigned type.
 			 */
 			(void) fprintf(stderr,
 			    "\rloading concrete vdev %llu, "
 			    "metaslab %llu of %llu ...",
 			    (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)msp->ms_id,
 			    (u_longlong_t)vd->vdev_ms_count);
 
 			mutex_enter(&msp->ms_lock);
 			zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 
 			/*
 			 * We don't want to spend the CPU manipulating the
 			 * size-ordered tree, so clear the range_tree ops.
 			 */
 			msp->ms_allocatable->rt_ops = NULL;
 
 			if (msp->ms_sm != NULL) {
 				VERIFY0(space_map_load(msp->ms_sm,
 				    msp->ms_allocatable, maptype));
 			}
 			if (!msp->ms_loaded)
 				msp->ms_loaded = B_TRUE;
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 
 	load_unflushed_to_ms_allocatables(spa, maptype);
 }
 
 /*
  * Rebuild msp's ms_allocatable tree from the indirect mapping of vd:
  * the tree is emptied and every mapping entry whose source offset falls
  * inside this metaslab is added to it as a segment.
  *
  * vim_idxp is an in-out parameter: on entry it is the index in
  * vim_entries of the first entry in this metaslab, and on return it is
  * the index of the first entry after this metaslab.
  */
 static void
 load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
     uint64_t *vim_idxp)
 {
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 
 	mutex_enter(&msp->ms_lock);
 	zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 
 	/*
 	 * Keeping the size-ordered tree up to date would only cost CPU
 	 * here, so the range_tree ops are cleared.
 	 */
 	msp->ms_allocatable->rt_ops = NULL;
 
 	while (*vim_idxp < vdev_indirect_mapping_num_entries(vim)) {
 		vdev_indirect_mapping_entry_phys_t *entry =
 		    &vim->vim_entries[*vim_idxp];
 		uint64_t src_off = DVA_MAPPING_GET_SRC_OFFSET(entry);
 		uint64_t asize = DVA_GET_ASIZE(&entry->vimep_dst);
 
 		ASSERT3U(src_off, >=, msp->ms_start);
 
 		/* The first entry past this metaslab ends the walk. */
 		if (src_off >= msp->ms_start + msp->ms_size)
 			break;
 
 		/*
 		 * Mappings are created by walking the metaslabs, so an
 		 * entry never straddles a metaslab boundary.
 		 */
 		ASSERT3U(src_off + asize, <=,
 		    msp->ms_start + msp->ms_size);
 		zfs_range_tree_add(msp->ms_allocatable, src_off, asize);
 
 		(*vim_idxp)++;
 	}
 
 	msp->ms_loaded = B_TRUE;
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * For every indirect top-level vdev: load its obsolete counts into
  * zcb->zcb_vd_obsolete_counts, create a metaslab group and metaslabs for
  * it, and fill each metaslab's ms_allocatable tree from the vdev's
  * indirect mapping.  Must not be called when -L is set.
  *
  * A progress line is rewritten on stderr for each metaslab.
  */
 static void
 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
 {
 	ASSERT(!dump_opt['L']);
 
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		ASSERT3U(c, ==, vd->vdev_id);
 
 		if (vd->vdev_ops != &vdev_indirect_ops)
 			continue;
 
 		/*
 		 * Note: we don't check for mapping leaks on
 		 * removing vdevs because their ms_allocatable's
 		 * are used to look for leaks in allocated space.
 		 */
 		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
 
 		/*
 		 * Normally, indirect vdevs don't have any
 		 * metaslabs.  We want to set them up for
 		 * zio_claim().
 		 */
 		vdev_metaslab_group_create(vd);
 		VERIFY0(vdev_metaslab_init(vd, 0));
 
 		/* vim is only read by the assertion after the loop. */
 		vdev_indirect_mapping_t *vim __maybe_unused =
 		    vd->vdev_indirect_mapping;
 		uint64_t vim_idx = 0;
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 
 			/*
 			 * %llu takes an unsigned argument, so the values
 			 * are cast to u_longlong_t (not longlong_t).
 			 */
 			(void) fprintf(stderr,
 			    "\rloading indirect vdev %llu, "
 			    "metaslab %llu of %llu ...",
 			    (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)vd->vdev_ms[m]->ms_id,
 			    (u_longlong_t)vd->vdev_ms_count);
 
 			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
 			    &vim_idx);
 		}
 		/* Every mapping entry must have been consumed. */
 		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
 	}
 }
 
 /*
  * Set up zcb and the pool's metaslabs for leak detection.  zcb_spa is
  * always recorded; with -L nothing else is done.  Otherwise the
  * ms_allocatable trees are reloaded to hold allocated segments,
  * checkpointed space is excluded, and the obsolete bpobj (if open) is
  * walked with increment_indirect_mapping_cb().
  */
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
 	zcb->zcb_spa = spa;
 
 	if (dump_opt['L'])
 		return;
 
 	vdev_t *root = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
 
 	/*
 	 * The meaning of each metaslab's ms_allocatable is about to
 	 * change, so swap in zdb's own ops to keep the allocator from
 	 * using the tree.
 	 */
 	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
 	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
 	spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;
 
 	/* One obsolete-counts slot per top-level vdev, initially NULL. */
 	zcb->zcb_vd_obsolete_counts =
 	    umem_zalloc(root->vdev_children * sizeof (uint32_t *),
 	    UMEM_NOFAIL);
 
 	/*
 	 * Leak detection overloads the ms_allocatable trees so that
 	 * they hold allocated segments rather than free ones.  The
 	 * regular metaslab_load/unload interfaces therefore cannot be
 	 * used.
 	 */
 	zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
 	load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
 
 	/*
 	 * load_concrete_ms_allocatable_trees() put every allocated
 	 * entry of each ms_sm into the matching ms_allocatable.  When
 	 * the pool has a checkpoint, or is in the middle of discarding
 	 * one, some of those blocks may already have been freed while
 	 * their ms_sm was left untouched because the checkpoint still
 	 * references them.  To avoid false positives during leak
 	 * detection, walk each vdev's checkpoint space map and exclude
 	 * all of its entries from the relevant ms_allocatable.
 	 *
 	 * The space held by the checkpoint is aggregated into
 	 * zcb_checkpoint_size along the way.
 	 *
 	 * This step also verifies that every checkpoint_sm entry is
 	 * marked as allocated in the ms_sm of its metaslab.
 	 * [see comment in checkpoint_sm_exclude_entry_cb()]
 	 */
 	zdb_leak_init_exclude_checkpoint(spa, zcb);
 	ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
 
 	/* for cleaner progress output */
 	(void) fprintf(stderr, "\n");
 
 	if (!bpobj_is_open(&pool->dp_obsolete_bpobj))
 		return;
 
 	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL));
 	(void) bpobj_iterate_nofree(&pool->dp_obsolete_bpobj,
 	    increment_indirect_mapping_cb, zcb, NULL);
 }
 
 /*
  * For every entry of vd's indirect mapping, count the bytes of the entry
  * that are still present in the (overloaded) ms_allocatable tree and
  * compare that with the obsolete count stored for the entry in
  * zcb->zcb_vd_obsolete_counts.  Mismatches are reported on stdout.
  *
  * Returns B_TRUE only when a mismatch was found and the vdev's obsolete
  * counts are precise; imprecise counts produce an informational message
  * instead.  Before returning, the vdev's obsolete counts are freed and
  * its slot in zcb_vd_obsolete_counts is set to NULL.
  */
 static boolean_t
 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
 {
 	boolean_t leaks = B_FALSE;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	uint64_t total_leaked = 0;
 	boolean_t are_precise = B_FALSE;
 	uint64_t step = 1ULL << vd->vdev_ashift;
 
 	ASSERT(vim != NULL);
 
 	/*
 	 * Whether the counts are precise is a property of the vdev, not
 	 * of an individual mapping entry, so it is queried only once.
 	 */
 	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 
 	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
 		vdev_indirect_mapping_entry_phys_t *vimep =
 		    &vim->vim_entries[i];
 		uint64_t obsolete_bytes = 0;
 		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 		/*
 		 * This is not very efficient but it's easy to
 		 * verify correctness.
 		 */
 		for (uint64_t inner_offset = 0;
 		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
 		    inner_offset += step) {
 			if (zfs_range_tree_contains(msp->ms_allocatable,
 			    offset + inner_offset, step)) {
 				obsolete_bytes += step;
 			}
 		}
 
 		int64_t bytes_leaked = obsolete_bytes -
 		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
 		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
 		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
 
 		if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
 			(void) printf("obsolete indirect mapping count "
 			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
 			    (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
 			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
 			    (u_longlong_t)bytes_leaked);
 		}
 		total_leaked += ABS(bytes_leaked);
 	}
 
 	if (!are_precise && total_leaked > 0) {
 		int pct_leaked = total_leaked * 100 /
 		    vdev_indirect_mapping_bytes_mapped(vim);
 		/* Note the space separating "mapping" and "unreferenced". */
 		(void) printf("cannot verify obsolete indirect mapping "
 		    "counts of vdev %llu because precise feature was not "
 		    "enabled when it was removed: %d%% (%llx bytes) of mapping "
 		    "unreferenced\n",
 		    (u_longlong_t)vd->vdev_id, pct_leaked,
 		    (u_longlong_t)total_leaked);
 	} else if (total_leaked > 0) {
 		(void) printf("obsolete indirect mapping count mismatch "
 		    "for vdev %llu -- %llx total bytes mismatched\n",
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)total_leaked);
 		leaks = B_TRUE;
 	}
 
 	vdev_indirect_mapping_free_obsolete_counts(vim,
 	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
 	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
 
 	return (leaks);
 }
 
 /*
  * Finish leak detection: run the obsolete-count check on every vdev that
  * has obsolete counts loaded, empty every metaslab's ms_allocatable tree
  * (passing leftover segments on non-indirect vdevs to zdb_leak()), and
  * free zcb_vd_obsolete_counts.
  *
  * Returns B_TRUE if zdb_check_for_obsolete_leaks() reported leaks on any
  * vdev.  With -L nothing is done and B_FALSE is returned.
  */
 static boolean_t
 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 {
 	if (dump_opt['L'])
 		return (B_FALSE);
 
 	vdev_t *root = spa->spa_root_vdev;
 	boolean_t found_leaks = B_FALSE;
 
 	for (unsigned c = 0; c < root->vdev_children; c++) {
 		vdev_t *vd = root->vdev_child[c];
 
 		if (zcb->zcb_vd_obsolete_counts[c] != NULL)
 			found_leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
 
 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 			ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
 			    spa_embedded_log_class(spa)) ?
 			    vd->vdev_log_mg : vd->vdev_mg);
 
 			/*
 			 * ms_allocatable was overloaded to hold
 			 * allocated segments, and claimed blocks were
 			 * removed from it during the traversal.
 			 * Whatever is still in the tree is therefore
 			 * an allocated block that nothing claimed.
 			 * On indirect vdevs the leftovers are merely
 			 * unreferenced parts of the mapping, which is
 			 * not a bug, so they are dropped silently.
 			 */
 			if (vd->vdev_ops != &vdev_indirect_ops) {
 				zfs_range_tree_vacate(msp->ms_allocatable,
 				    zdb_leak, vd);
 			} else {
 				zfs_range_tree_vacate(msp->ms_allocatable,
 				    NULL, NULL);
 			}
 
 			msp->ms_loaded = B_FALSE;
 		}
 	}
 
 	umem_free(zcb->zcb_vd_obsolete_counts,
 	    root->vdev_children * sizeof (uint32_t *));
 	zcb->zcb_vd_obsolete_counts = NULL;
 
 	return (found_leaks);
 }
 
 /*
  * Block-pointer callback: count bp under the ZDB_OT_DEFERRED type in the
  * zdb_cb_t passed as arg.  When -b is given at least five times the
  * block pointer is also printed, tagged "deferred free".  tx is unused.
  * Always returns 0.
  */
 static int
 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	(void) tx;
 
 	if (dump_opt['b'] >= 5) {
 		char bp_text[BP_SPRINTF_LEN];
 
 		snprintf_blkptr(bp_text, sizeof (bp_text), bp);
 		(void) printf("[%s] %s\n",
 		    "deferred free", bp_text);
 	}
 
 	zdb_cb_t *zcb = arg;
 	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
 
 	return (0);
 }
 
 /*
  * Call func(livelist, arg) for every livelist that the user has
  * destroyed but that is still present in the MOS, waiting to be freed.
  * The livelists are found through the DMU_POOL_DELETED_CLONES ZAP; if
  * the pool directory has no such entry there is nothing to do.
  */
 static void
 iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t clones_zap;
 	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &clones_zap);
 	if (err == ENOENT)
 		return;
 	ASSERT0(err);
 
 	zap_cursor_t cursor;
 	zap_attribute_t *attr = zap_attribute_alloc();
 	dsl_deadlist_t livelist;
 
 	/* NULL out os prior to dsl_deadlist_open in case it's garbage */
 	livelist.dl_os = NULL;
 
 	zap_cursor_init(&cursor, mos, clones_zap);
 	while (zap_cursor_retrieve(&cursor, attr) == 0) {
 		VERIFY0(dsl_deadlist_open(&livelist, mos,
 		    attr->za_first_integer));
 		func(&livelist, arg);
 		dsl_deadlist_close(&livelist);
 
 		(void) zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 
 	zap_attribute_free(attr);
 }
 
 /*
  * bpobj iteration callback: forwards to count_block_cb() and returns its
  * result.  The block is asserted not to be a freed one.
  */
 static int
 bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	int rc;
 
 	ASSERT(!bp_freed);
 
 	rc = count_block_cb(arg, bp, tx);
 	return (rc);
 }
 
 /*
  * Deadlist-entry callback: count the blocks of one sub-livelist.  args is
  * the zdb_cb_t the blocks are counted into.  Always returns 0.
  */
 static int
 livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
 {
 	zdb_cb_t *zcb = args;
 	bplist_t live_blocks;
 
 	bplist_create(&live_blocks);
 
 	/* Collect the blocks that were alloc'd but not freed ... */
 	VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &live_blocks,
 	    NULL, NULL));
 
 	/* ... and count each of them. */
 	(void) bplist_iterate(&live_blocks, count_block_cb, zcb, NULL);
 
 	bplist_destroy(&live_blocks);
 
 	return (0);
 }
 
 /*
  * Livelist callback: run livelist_entry_count_blocks_cb() over every
  * entry of ll, passing arg through to it.
  */
 static void
 livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
 {
 	void *cb_arg = arg;
 
 	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, cb_arg);
 }
 
 /*
  * Count, into zbc, the blocks of every livelist that the user has
  * destroyed but that hasn't been freed yet.
  */
 static void
 deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
 {
 	void *cb_arg = zbc;
 
 	iterate_deleted_livelists(spa, livelist_count_blocks, cb_arg);
 }
 
 /*
  * Livelist callback used when dumping the MOS: bump the livelist feature
  * count, print the livelist under the "Deleted Livelist" heading, and
  * run sublivelist_verify_lightweight() over its entries.  arg must be
  * NULL.
  */
 static void
 dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
 {
 	ASSERT3P(arg, ==, NULL);
 
 	global_feature_count[SPA_FEATURE_LIVELIST] += 1;
 
 	dump_blkptr_list(ll, "Deleted Livelist");
 	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
 }
 
 /*
  * Print out, register object references to, and increment feature counts for
  * livelists that have been destroyed by the user but haven't yet been freed.
  *
  * Does nothing when the pool directory has no DMU_POOL_DELETED_CLONES
  * entry.
  */
 static void
 deleted_livelists_dump_mos(spa_t *spa)
 {
 	uint64_t zap_obj;
 	objset_t *mos = spa->spa_meta_objset;
 	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
 	if (err == ENOENT)
 		return;
 	/*
 	 * Any other failure leaves zap_obj uninitialized, so it must not
 	 * be used below.  This mirrors the check that
 	 * iterate_deleted_livelists() makes on the same lookup.
 	 */
 	ASSERT0(err);
 	mos_obj_refd(zap_obj);
 	iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
 }
 
 static int
 zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
 {
 	const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
 	const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
 	int cmp;
 
 	cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
 	if (cmp == 0)
 		cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));
 
 	return (cmp);
 }
 
 /*
  * Traverse every block in the pool, accumulate block statistics by type
  * and level, and print them on stdout.  Unless -L was given, the space
  * found by the traversal is also compared with the space the metaslab
  * classes report as allocated, and leftover segments are reported as
  * leaks.
  *
  * Returns 2 if no blocks were counted or zdb_leak_fini() reported leaks,
  * 3 if errors were recorded during the traversal, and 0 otherwise.
  */
 static int
 dump_block_stats(spa_t *spa)
 {
 	zdb_cb_t *zcb;
 	zdb_blkstats_t *zb, *tzb;
 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
 	boolean_t leaks = B_FALSE;
 	int e, c, err;
 	int ret = 0;
 	bp_embedded_type_t i;
 
 	ddt_prefetch_all(spa);
 
 	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
 		avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
 		    sizeof (zdb_brt_entry_t),
 		    offsetof(zdb_brt_entry_t, zbre_node));
 		zcb->zcb_brt_is_active = B_TRUE;
 	}
 
 	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
 	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
 	    (dump_opt['c'] == 1) ? "metadata " : "",
 	    dump_opt['c'] ? "checksums " : "",
 	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
 	    !dump_opt['L'] ? "nothing leaked " : "");
 
 	/*
 	 * When leak detection is enabled we load all space maps as SM_ALLOC
 	 * maps, then traverse the pool claiming each block we discover. If
 	 * the pool is perfectly consistent, the segment trees will be empty
 	 * when we're done. Anything left over is a leak; any block we can't
 	 * claim (because it's not part of any space map) is a double
 	 * allocation, reference to a freed block, or an unclaimed log block.
 	 *
 	 * When leak detection is disabled (-L option) we still traverse the
 	 * pool claiming each block we discover, but we skip opening any space
 	 * maps.
 	 */
 	zdb_leak_init(spa, zcb);
 
 	/*
 	 * If there's a deferred-free bplist, process that first.
 	 */
 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
 	    bpobj_count_block_cb, zcb, NULL);
 
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
 		    bpobj_count_block_cb, zcb, NULL);
 	}
 
 	zdb_claim_removing(spa, zcb);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
 		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
 		    zcb, NULL));
 	}
 
 	deleted_livelists_count_blocks(spa, zcb);
 
 	if (dump_opt['c'] > 1)
 		flags |= TRAVERSE_PREFETCH_DATA;
 
 	zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
 	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
 	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
 	zcb->zcb_totalasize +=
 	    metaslab_class_get_alloc(spa_embedded_log_class(spa));
 	zcb->zcb_start = zcb->zcb_lastprint = gethrtime();
 	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);
 
 	/*
 	 * If we've traversed the data blocks then we need to wait for those
 	 * I/Os to complete. We leverage "The Godfather" zio to wait on
 	 * all async I/Os to complete.
 	 */
 	if (dump_opt['c']) {
 		for (c = 0; c < max_ncpus; c++) {
 			(void) zio_wait(spa->spa_async_zio_root[c]);
 			spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_GODFATHER);
 		}
 	}
 	ASSERT0(spa->spa_load_verify_bytes);
 
 	/*
 	 * Done after zio_wait() since zcb_haderrors is modified in
 	 * zdb_blkptr_done()
 	 */
 	zcb->zcb_haderrors |= err;
 
 	if (zcb->zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
 		(void) printf("\t%5s  %s\n", "errno", "count");
 		for (e = 0; e < 256; e++) {
 			if (zcb->zcb_errors[e] != 0) {
 				(void) printf("\t%5d  %llu\n",
 				    e, (u_longlong_t)zcb->zcb_errors[e]);
 			}
 		}
 	}
 
 	/*
 	 * Report any leaked segments.
 	 */
 	leaks |= zdb_leak_fini(spa, zcb);
 
 	tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
 
 	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	norm_space = metaslab_class_get_space(spa_normal_class(spa));
 
 	total_alloc = norm_alloc +
 	    metaslab_class_get_alloc(spa_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
 	    metaslab_class_get_alloc(spa_special_class(spa)) +
 	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
 	    get_unflushed_alloc_space(spa);
 	total_found =
 	    tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
 	    zcb->zcb_removing_size + zcb->zcb_checkpoint_size;
 
 	/* The totals are only compared when leak detection is enabled. */
 	if (total_found == total_alloc && !dump_opt['L']) {
 		(void) printf("\n\tNo leaks (block sum matches space"
 		    " maps exactly)\n");
 	} else if (!dump_opt['L']) {
 		(void) printf("block traversal size %llu != alloc %llu "
 		    "(%s %lld)\n",
 		    (u_longlong_t)total_found,
 		    (u_longlong_t)total_alloc,
 		    "leaked",
 		    (longlong_t)(total_alloc - total_found));
 	}
 
 	/* Nothing was counted: skip the statistics and report failure. */
 	if (tzb->zb_count == 0) {
 		umem_free(zcb, sizeof (zdb_cb_t));
 		return (2);
 	}
 
 	(void) printf("\n");
 	(void) printf("\t%-16s %14llu\n", "bp count:",
 	    (u_longlong_t)tzb->zb_count);
 	(void) printf("\t%-16s %14llu\n", "ganged count:",
 	    (u_longlong_t)tzb->zb_gangs);
 	(void) printf("\t%-16s %14llu      avg: %6llu\n", "bp logical:",
 	    (u_longlong_t)tzb->zb_lsize,
 	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
 	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
 	    "bp physical:", (u_longlong_t)tzb->zb_psize,
 	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_psize);
 	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
 	    "bp allocated:", (u_longlong_t)tzb->zb_asize,
 	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
 	    (double)tzb->zb_lsize / tzb->zb_asize);
 	(void) printf("\t%-16s %14llu    ref>1: %6llu   deduplication: %6.2f\n",
 	    "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
 	    (u_longlong_t)zcb->zcb_dedup_blocks,
 	    (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
 	(void) printf("\t%-16s %14llu    count: %6llu\n",
 	    "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
 	    (u_longlong_t)zcb->zcb_clone_blocks);
 	(void) printf("\t%-16s %14llu     used: %5.2f%%\n", "Normal class:",
 	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
 	if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_special_class(spa));
 		uint64_t space = metaslab_class_get_space(
 		    spa_special_class(spa));
 
 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
 		    "Special class", (u_longlong_t)alloc,
 		    100.0 * alloc / space);
 	}
 
 	if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_dedup_class(spa));
 		uint64_t space = metaslab_class_get_space(
 		    spa_dedup_class(spa));
 
 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
 		    "Dedup class", (u_longlong_t)alloc,
 		    100.0 * alloc / space);
 	}
 
 	if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
 		uint64_t alloc = metaslab_class_get_alloc(
 		    spa_embedded_log_class(spa));
 		uint64_t space = metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
 
 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
 		    "Embedded log class", (u_longlong_t)alloc,
 		    100.0 * alloc / space);
 	}
 
 	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
 		if (zcb->zcb_embedded_blocks[i] == 0)
 			continue;
 		(void) printf("\n");
 		(void) printf("\tadditional, non-pointer bps of type %u: "
 		    "%10llu\n",
 		    i, (u_longlong_t)zcb->zcb_embedded_blocks[i]);
 
 		if (dump_opt['b'] >= 3) {
 			(void) printf("\t number of (compressed) bytes:  "
 			    "number of bps\n");
 			dump_histogram(zcb->zcb_embedded_histogram[i],
 			    sizeof (zcb->zcb_embedded_histogram[i]) /
 			    sizeof (zcb->zcb_embedded_histogram[i][0]), 0);
 		}
 	}
 
 	if (tzb->zb_ditto_samevdev != 0) {
 		(void) printf("\tDittoed blocks on same vdev: %llu\n",
 		    (u_longlong_t)tzb->zb_ditto_samevdev);
 	}
 	if (tzb->zb_ditto_same_ms != 0) {
 		(void) printf("\tDittoed blocks in same metaslab: %llu\n",
 		    (u_longlong_t)tzb->zb_ditto_same_ms);
 	}
 
 	for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
 		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 
 		if (vim == NULL) {
 			continue;
 		}
 
 		/*
 		 * zdb_nicenum() takes (number, buffer, buffer length):
 		 * format the mapping's size, bounded by the size of mem.
 		 */
 		char mem[32];
 		zdb_nicenum(vdev_indirect_mapping_size(vim),
 		    mem, sizeof (mem));
 
 		(void) printf("\tindirect vdev id %llu has %llu segments "
 		    "(%s in memory)\n",
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
 	}
 
 	if (dump_opt['b'] >= 2) {
 		int l, t, level;
 		char csize[32], lsize[32], psize[32], asize[32];
 		char avg[32], gang[32];
 		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
 		    "\t  avg\t comp\t%%Total\tType\n");
 
 		/* Running totals over all metadata blocks. */
 		zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t),
 		    UMEM_NOFAIL);
 
 		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
 			const char *typename;
 
 			/* make sure nicenum has enough space */
 			_Static_assert(sizeof (csize) >= NN_NUMBUF_SZ,
 			    "csize truncated");
 			_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ,
 			    "lsize truncated");
 			_Static_assert(sizeof (psize) >= NN_NUMBUF_SZ,
 			    "psize truncated");
 			_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ,
 			    "asize truncated");
 			_Static_assert(sizeof (avg) >= NN_NUMBUF_SZ,
 			    "avg truncated");
 			_Static_assert(sizeof (gang) >= NN_NUMBUF_SZ,
 			    "gang truncated");
 
 			if (t < DMU_OT_NUMTYPES)
 				typename = dmu_ot[t].ot_name;
 			else
 				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
 
 			/* Types with no allocated space get a row of dashes. */
 			if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) {
 				(void) printf("%6s\t%5s\t%5s\t%5s"
 				    "\t%5s\t%5s\t%6s\t%s\n",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    "-",
 				    typename);
 				continue;
 			}
 
 			/*
 			 * Walk the levels from the highest down to 0; the
 			 * final pass (l == -1) handles the ZB_TOTAL row.
 			 */
 			for (l = ZB_TOTAL - 1; l >= -1; l--) {
 				level = (l == -1 ? ZB_TOTAL : l);
 				zb = &zcb->zcb_type[level][t];
 
 				if (zb->zb_asize == 0)
 					continue;
 
 				if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES &&
 				    (level > 0 || DMU_OT_IS_METADATA(t))) {
 					mdstats->zb_count += zb->zb_count;
 					mdstats->zb_lsize += zb->zb_lsize;
 					mdstats->zb_psize += zb->zb_psize;
 					mdstats->zb_asize += zb->zb_asize;
 					mdstats->zb_gangs += zb->zb_gangs;
 				}
 
 				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
 					continue;
 
 				/* An L0 row equal to the total adds nothing. */
 				if (level == 0 && zb->zb_asize ==
 				    zcb->zcb_type[ZB_TOTAL][t].zb_asize)
 					continue;
 
 				zdb_nicenum(zb->zb_count, csize,
 				    sizeof (csize));
 				zdb_nicenum(zb->zb_lsize, lsize,
 				    sizeof (lsize));
 				zdb_nicenum(zb->zb_psize, psize,
 				    sizeof (psize));
 				zdb_nicenum(zb->zb_asize, asize,
 				    sizeof (asize));
 				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
 				    sizeof (avg));
 				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
 
 				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 				    "\t%5.2f\t%6.2f\t",
 				    csize, lsize, psize, asize, avg,
 				    (double)zb->zb_lsize / zb->zb_psize,
 				    100.0 * zb->zb_asize / tzb->zb_asize);
 
 				if (level == ZB_TOTAL)
 					(void) printf("%s\n", typename);
 				else
 					(void) printf("    L%d %s\n",
 					    level, typename);
 
 				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
 					(void) printf("\t number of ganged "
 					    "blocks: %s\n", gang);
 				}
 
 				if (dump_opt['b'] >= 4) {
 					(void) printf("psize "
 					    "(in 512-byte sectors): "
 					    "number of blocks\n");
 					dump_histogram(zb->zb_psize_histogram,
 					    PSIZE_HISTO_SIZE, 0);
 				}
 			}
 		}
 		zdb_nicenum(mdstats->zb_count, csize,
 		    sizeof (csize));
 		zdb_nicenum(mdstats->zb_lsize, lsize,
 		    sizeof (lsize));
 		zdb_nicenum(mdstats->zb_psize, psize,
 		    sizeof (psize));
 		zdb_nicenum(mdstats->zb_asize, asize,
 		    sizeof (asize));
 		/*
 		 * Guard the average against an empty metadata total: an
 		 * integer division by zero would crash zdb.
 		 */
 		zdb_nicenum(mdstats->zb_count != 0 ?
 		    mdstats->zb_asize / mdstats->zb_count : 0, avg,
 		    sizeof (avg));
 		zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang));
 
 		(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 		    "\t%5.2f\t%6.2f\t",
 		    csize, lsize, psize, asize, avg,
 		    (double)mdstats->zb_lsize / mdstats->zb_psize,
 		    100.0 * mdstats->zb_asize / tzb->zb_asize);
 		(void) printf("%s\n", "Metadata Total");
 
 		/* Output a table summarizing block sizes in the pool */
 		dump_size_histograms(zcb);
 
 		umem_free(mdstats, sizeof (zfs_blkstat_t));
 	}
 
 	(void) printf("\n");
 
 	/* Leaks take precedence over traversal errors in the exit code. */
 	if (leaks)
 		ret = 2;
 	else if (zcb->zcb_haderrors)
 		ret = 3;
 
 	umem_free(zcb, sizeof (zdb_cb_t));
 	return (ret);
 }
 
 /*
  * One node of the AVL tree built by zdb_ddt_add_cb() for the simulated
  * DDT histogram.  Each node is keyed by a ddt_key_t and accumulates
  * totals over every block pointer that mapped to that key.
  */
 typedef struct zdb_ddt_entry {
 	/* key must be first for ddt_key_compare */
 	ddt_key_t	zdde_key;
 	/* number of block pointers seen with this key */
 	uint64_t	zdde_ref_blocks;
 	/* sum of BP_GET_LSIZE() over those block pointers */
 	uint64_t	zdde_ref_lsize;
 	/* sum of BP_GET_PSIZE() over those block pointers */
 	uint64_t	zdde_ref_psize;
 	/* sum of bp_get_dsize_sync() over those block pointers */
 	uint64_t	zdde_ref_dsize;
 	/* AVL linkage; its offset is passed to avl_create() */
 	avl_node_t	zdde_node;
 } zdb_ddt_entry_t;
 
 /*
  * traverse_pool() callback for the simulated DDT: fold bp into the AVL
  * tree passed as arg, keyed by the ddt key filled in from bp.  Only
  * level-0, non-metadata blocks with a checksum are counted; dnode-level
  * visits, holes and embedded block pointers are skipped.  Always
  * returns 0.
  */
 static int
 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	(void) zilog, (void) dnp;
 	avl_tree_t *tree = arg;
 
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	/* With -SS, report progress each time an objset root is reached. */
 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
 		(void) printf("traversing objset %llu, %llu objects, "
 		    "%lu blocks so far\n",
 		    (u_longlong_t)zb->zb_objset,
 		    (u_longlong_t)BP_GET_FILL(bp),
 		    avl_numnodes(tree));
 	}
 
 	/* Holes were already rejected above. */
 	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF)
 		return (0);
 	if (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 		return (0);
 
 	zdb_ddt_entry_t search;
 	avl_index_t slot;
 
 	ddt_key_fill(&search.zdde_key, bp);
 
 	zdb_ddt_entry_t *entry = avl_find(tree, &search, &slot);
 	if (entry == NULL) {
 		/* First block with this key: start a zeroed entry. */
 		entry = umem_zalloc(sizeof (*entry), UMEM_NOFAIL);
 		entry->zdde_key = search.zdde_key;
 		avl_insert(tree, entry, slot);
 	}
 
 	entry->zdde_ref_blocks += 1;
 	entry->zdde_ref_lsize += BP_GET_LSIZE(bp);
 	entry->zdde_ref_psize += BP_GET_PSIZE(bp);
 	entry->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
 
 	return (0);
 }
 
 /*
  * Build and print a simulated DDT histogram for the pool: traverse the
  * pool collecting per-key totals with zdb_ddt_add_cb(), bucket each key
  * by its reference count, then print the histogram and the dedup ratio.
  */
 static void
 dump_simulated_ddt(spa_t *spa)
 {
 	avl_tree_t tree;
 	ddt_histogram_t ddh_total = {{{0}}};
 	ddt_stat_t dds_total = {0};
 
 	avl_create(&tree, ddt_key_compare,
 	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &tree);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/* Drain the tree, folding each entry into its histogram bucket. */
 	void *cookie = NULL;
 	for (;;) {
 		zdb_ddt_entry_t *entry = avl_destroy_nodes(&tree, &cookie);
 		if (entry == NULL)
 			break;
 
 		uint64_t refcnt = entry->zdde_ref_blocks;
 		ASSERT(refcnt != 0);
 
 		/* The bucket index comes from the refcount's highest bit. */
 		ddt_stat_t *bucket =
 		    &ddh_total.ddh_stat[highbit64(refcnt) - 1];
 
 		/* Per-copy figures: the referenced totals over refcnt. */
 		bucket->dds_blocks += entry->zdde_ref_blocks / refcnt;
 		bucket->dds_lsize += entry->zdde_ref_lsize / refcnt;
 		bucket->dds_psize += entry->zdde_ref_psize / refcnt;
 		bucket->dds_dsize += entry->zdde_ref_dsize / refcnt;
 
 		bucket->dds_ref_blocks += entry->zdde_ref_blocks;
 		bucket->dds_ref_lsize += entry->zdde_ref_lsize;
 		bucket->dds_ref_psize += entry->zdde_ref_psize;
 		bucket->dds_ref_dsize += entry->zdde_ref_dsize;
 
 		umem_free(entry, sizeof (*entry));
 	}
 
 	avl_destroy(&tree);
 
 	ddt_histogram_total(&dds_total, &ddh_total);
 
 	(void) printf("Simulated DDT histogram:\n");
 
 	zpool_dump_ddt(&dds_total, &ddh_total);
 
 	dump_dedup_ratio(&dds_total);
 }
 
 /*
  * Check the refcounts of the device_removal and obsolete_counts features
  * against what is found in the pool, printing the outcome of each check
  * on stdout.  If an indirect vdev is being condensed, that state and its
  * previous obsolete space map are printed first.
  *
  * Returns 0 if both refcounts match and 1 otherwise.
  */
 static int
 verify_device_removal_feature_counts(spa_t *spa)
 {
 	uint64_t dr_feature_refcount = 0;
 	uint64_t oc_feature_refcount = 0;
 	uint64_t indirect_vdev_count = 0;
 	uint64_t precise_vdev_count = 0;
 	uint64_t obsolete_sm_count = 0;
 	uint64_t obsolete_counts_count = 0;
 	uint64_t scip_count = 0;
 	uint64_t obsolete_bpobj_count = 0;
 	int ret = 0;
 
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 	if (scip->scip_next_mapping_object != 0) {
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
 		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
 		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		(void) printf("Condensing indirect vdev %llu: new mapping "
 		    "object %llu, prev obsolete sm %llu\n",
 		    (u_longlong_t)scip->scip_vdev,
 		    (u_longlong_t)scip->scip_next_mapping_object,
 		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
 		if (scip->scip_prev_obsolete_sm_object != 0) {
 			space_map_t *prev_obsolete_sm = NULL;
 			VERIFY0(space_map_open(&prev_obsolete_sm,
 			    spa->spa_meta_objset,
 			    scip->scip_prev_obsolete_sm_object,
 			    0, vd->vdev_asize, 0));
 			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
 			(void) printf("\n");
 			space_map_close(prev_obsolete_sm);
 		}
 
 		/* An in-progress condense adds two to the expected count. */
 		scip_count += 2;
 	}
 
 	/* Tally what each top-level vdev contributes. */
 	vdev_t *root = spa->spa_root_vdev;
 	for (uint64_t i = 0; i < root->vdev_children; i++) {
 		vdev_t *vd = root->vdev_child[i];
 		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 		boolean_t are_precise;
 		uint64_t obsolete_sm_object;
 
 		if (vic->vic_mapping_object != 0) {
 			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
 			    vd->vdev_removing);
 			indirect_vdev_count++;
 
 			if (vd->vdev_indirect_mapping->vim_havecounts)
 				obsolete_counts_count++;
 		}
 
 		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 		if (are_precise) {
 			ASSERT(vic->vic_mapping_object != 0);
 			precise_vdev_count++;
 		}
 
 		VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 		if (obsolete_sm_object != 0) {
 			ASSERT(vic->vic_mapping_object != 0);
 			obsolete_sm_count++;
 		}
 	}
 
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
 	    &dr_feature_refcount);
 	(void) feature_get_refcount(spa,
 	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
 	    &oc_feature_refcount);
 
 	if (dr_feature_refcount == indirect_vdev_count) {
 		(void) printf("Verified device_removal feature refcount "
 		    "of %llu is correct\n",
 		    (u_longlong_t)dr_feature_refcount);
 	} else {
 		ret = 1;
 		(void) printf("Number of indirect vdevs (%llu) "
 		    "does not match feature count (%llu)\n",
 		    (u_longlong_t)indirect_vdev_count,
 		    (u_longlong_t)dr_feature_refcount);
 	}
 
 	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
 		obsolete_bpobj_count++;
 	}
 
 	/* The expected obsolete_counts refcount is the sum of all tallies. */
 	uint64_t obsolete_counts_object_count = precise_vdev_count +
 	    obsolete_sm_count + obsolete_counts_count + scip_count +
 	    obsolete_bpobj_count + remap_deadlist_count;
 
 	if (oc_feature_refcount == obsolete_counts_object_count) {
 		(void) printf("Verified indirect_refcount feature refcount "
 		    "of %llu is correct\n",
 		    (u_longlong_t)oc_feature_refcount);
 	} else {
 		ret = 1;
 		(void) printf("Number of obsolete counts objects (%llu) "
 		    "does not match feature count (%llu)\n",
 		    (u_longlong_t)obsolete_counts_object_count,
 		    (u_longlong_t)oc_feature_refcount);
 		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
 		    "ob:%llu rd:%llu\n",
 		    (u_longlong_t)precise_vdev_count,
 		    (u_longlong_t)obsolete_sm_count,
 		    (u_longlong_t)obsolete_counts_count,
 		    (u_longlong_t)scip_count,
 		    (u_longlong_t)obsolete_bpobj_count,
 		    (u_longlong_t)remap_deadlist_count);
 	}
 
 	return (ret);
 }
 
 /*
  * Set ZFS_IMPORT_SKIP_MMP in the import flags of the in-core pool named
  * by target, so that the activity check is disabled and an active pool
  * can be examined. Does nothing when spa_lookup() finds no such pool.
  */
 static void
 zdb_set_skip_mmp(char *target)
 {
 	mutex_enter(&spa_namespace_lock);
 
 	spa_t *found = spa_lookup(target);
 	if (found != NULL)
 		found->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
 
 	mutex_exit(&spa_namespace_lock);
 }
 
 #define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
 /*
  * Import the checkpointed state of the pool specified by the target
  * parameter as readonly. The function also accepts a pool config
  * as an optional parameter, else it attempts to infer the config by
  * the name of the target pool.
  *
  * Note that the checkpointed state's pool name will be the name of
  * the original pool with the above suffix appended to it. In addition,
  * if the target is not a pool name (e.g. a path to a dataset) then
  * the new_path parameter is populated with the updated path to
  * reflect the fact that we are looking into the checkpointed state.
  *
  * The function returns a newly-allocated copy of the name of the
  * pool containing the checkpointed state. When this copy is no
  * longer needed it should be freed with free(3C). Same thing
  * applies to the new_path parameter if allocated.
  *
  * NULL is returned when one of the name strings cannot be allocated;
  * in that case *new_path (if requested for a dataset path) is not left
  * pointing at an allocation. Failures to read the config or to import
  * the pool are reported through fatal().
  */
 static char *
 import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
 {
 	int error = 0;
 	char *poolname, *bogus_name = NULL;
 	boolean_t freecfg = B_FALSE;
 
 	/* If the target is not a pool, then extract the pool name */
 	char *path_start = strchr(target, '/');
 	if (path_start != NULL) {
 		size_t poolname_len = path_start - target;
 		poolname = strndup(target, poolname_len);
 		/* Nothing else has been allocated yet, so just bail out. */
 		if (poolname == NULL)
 			return (NULL);
 	} else {
 		poolname = target;
 	}
 
 	if (cfg == NULL) {
 		zdb_set_skip_mmp(poolname);
 		error = spa_get_stats(poolname, &cfg, NULL, 0);
 		if (error != 0) {
 			fatal("Tried to read config of pool \"%s\" but "
 			    "spa_get_stats() failed with error %d\n",
 			    poolname, error);
 		}
 		/* We fetched the config ourselves, so we must release it. */
 		freecfg = B_TRUE;
 	}
 
 	if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
 		if (freecfg)
 			nvlist_free(cfg);
 		if (target != poolname)
 			free(poolname);
 		return (NULL);
 	}
 	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
 
 	error = spa_import(bogus_name, cfg, NULL,
 	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
 	    ZFS_IMPORT_SKIP_MMP);
 	if (freecfg)
 		nvlist_free(cfg);
 	if (error != 0) {
 		fatal("Tried to import pool \"%s\" but spa_import() failed "
 		    "with error %d\n", bogus_name, error);
 	}
 
 	if (new_path != NULL && path_start != NULL) {
 		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
 			/*
 			 * The value asprintf() leaves behind on failure is
 			 * not something the caller may free.
 			 */
 			*new_path = NULL;
 			free(bogus_name);
 			if (target != poolname)
 				free(poolname);
 			return (NULL);
 		}
 	}
 
 	if (target != poolname)
 		free(poolname);
 
 	return (bogus_name);
 }
 
 /*
  * Argument handed to verify_checkpoint_sm_entry_cb() for every space
  * map entry it visits.
  */
 typedef struct verify_checkpoint_sm_entry_cb_arg {
 	/* vdev whose metaslabs each entry is checked against */
 	vdev_t *vcsec_vd;
 
 	/* the following fields are only used for printing progress */
 	uint64_t vcsec_entryid;
 	uint64_t vcsec_num_entries;
 } verify_checkpoint_sm_entry_cb_arg_t;
 
 /* A progress line is printed once per this many space map entries. */
 #define	ENTRIES_PER_PROGRESS_UPDATE 10000
 
 /*
  * Space map iteration callback used by verify_checkpoint_vdev_spacemaps().
  *
  * sme is one entry of a vdev's checkpoint space map and arg is a
  * verify_checkpoint_sm_entry_cb_arg_t. The entry must lie entirely
  * within one metaslab of vcsec_vd and must not be present in that
  * metaslab's ms_allocatable tree. Always returns 0; violations trip a
  * VERIFY or the range tree check.
  */
 static int
 verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
 	vdev_t *vd = vcsec->vcsec_vd;
 	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
 	uint64_t end = sme->sme_offset + sme->sme_run;
 
 	ASSERT(sme->sme_type == SM_FREE);
 
 	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
 		/* %llu takes an unsigned argument, so cast accordingly. */
 		(void) fprintf(stderr,
 		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)vcsec->vcsec_entryid,
 		    (u_longlong_t)vcsec->vcsec_num_entries);
 	}
 	vcsec->vcsec_entryid++;
 
 	/*
 	 * See comment in checkpoint_sm_exclude_entry_cb()
 	 */
 	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
 	 * The entries in the vdev_checkpoint_sm should be marked as
 	 * allocated in the checkpointed state of the pool, therefore
 	 * their respective ms_allocatable trees should not contain them.
 	 */
 	mutex_enter(&ms->ms_lock);
 	zfs_range_tree_verify_not_present(ms->ms_allocatable,
 	    sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
 	return (0);
 }
 
 /*
  * Check that every segment recorded in a vdev's checkpoint space map
  * (vdev_checkpoint_sm) is allocated according to the checkpoint's
  * metaslab space maps, i.e. is absent from the checkpoint's
  * ms_allocatable trees.
  *
  * The checkpoint space maps are read from the current state of the
  * pool and compared against the metaslab space maps (ms_sm) of the
  * checkpointed state.
  *
  * Side effect: the ms_allocatable trees of the checkpoint spa_t are
  * reloaded with the free entries of their ms_sm space maps.
  */
 static void
 verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
 {
 	vdev_t *ckpoint_root = checkpoint->spa_root_vdev;
 	vdev_t *current_root = current->spa_root_vdev;
 	uint64_t idx;
 
 	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
 
 	for (idx = 0; idx < ckpoint_root->vdev_children; idx++) {
 		vdev_t *ckpoint_top = ckpoint_root->vdev_child[idx];
 		vdev_t *current_top = current_root->vdev_child[idx];
 
 		if (ckpoint_top->vdev_ops == &vdev_indirect_ops) {
 			/*
 			 * Device removal is not allowed while a checkpoint
 			 * exists, so any removed vdev must have been
 			 * removed before the checkpoint was taken.
 			 */
 			ASSERT3P(current_top->vdev_ops, ==, &vdev_indirect_ops);
 			continue;
 		}
 
 		/* No top-level ZAP means there is no checkpoint space map. */
 		if (current_top->vdev_top_zap == 0)
 			continue;
 
 		/*
 		 * Likewise, a missing checkpoint space map entry means
 		 * nothing on this vdev is checkpointed.
 		 */
 		if (zap_contains(spa_meta_objset(current),
 		    current_top->vdev_top_zap,
 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
 			continue;
 
 		uint64_t sm_object;
 		VERIFY0(zap_lookup(spa_meta_objset(current),
 		    current_top->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, &sm_object));
 
 		space_map_t *sm = NULL;
 		VERIFY0(space_map_open(&sm, spa_meta_objset(current),
 		    sm_object, 0, current_top->vdev_asize,
 		    current_top->vdev_ashift));
 
 		/* Entries are checked against the checkpoint's vdev. */
 		verify_checkpoint_sm_entry_cb_arg_t cb_arg;
 		cb_arg.vcsec_vd = ckpoint_top;
 		cb_arg.vcsec_entryid = 0;
 		cb_arg.vcsec_num_entries =
 		    space_map_length(sm) / sizeof (uint64_t);
 
 		VERIFY0(space_map_iterate(sm, space_map_length(sm),
 		    verify_checkpoint_sm_entry_cb, &cb_arg));
 
 		if (dump_opt['m'] > 3)
 			dump_spacemap(current->spa_meta_objset, sm);
 		space_map_close(sm);
 	}
 
 	/*
 	 * Top-level vdevs added after the checkpoint was taken must not
 	 * have a checkpoint space map. The loop body never runs when no
 	 * vdevs were added.
 	 */
 	for (idx = ckpoint_root->vdev_children;
 	    idx < current_root->vdev_children; idx++) {
 		vdev_t *added = current_root->vdev_child[idx];
 		VERIFY3P(added->vdev_checkpoint_sm, ==, NULL);
 	}
 
 	/* for cleaner progress output */
 	(void) fprintf(stderr, "\n");
 }
 
 /*
  * Verifies that all space that's allocated in the checkpoint is
  * still allocated in the current version, by checking that everything
  * in checkpoint's ms_allocatable (which is actually allocated, not
  * allocatable/free) is not present in current's ms_allocatable.
  *
  * Note that the function changes the state of the ms_allocatable
  * trees of both spas when called. The entries of all ms_allocatable
  * trees are cleared out and then repopulated from their respective
  * ms_sm space maps. In the checkpointed state we load the allocated
  * entries, and in the current state we load the free entries.
  */
 static void
 verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
 {
 	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
 	vdev_t *current_rvd = current->spa_root_vdev;
 
 	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
 	load_concrete_ms_allocatable_trees(current, SM_FREE);
 
 	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
 		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
 		vdev_t *current_vd = current_rvd->vdev_child[i];
 
 		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
 			/*
 			 * See comment in verify_checkpoint_vdev_spacemaps()
 			 */
 			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
 			continue;
 		}
 
 		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
 			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
 			metaslab_t *current_msp = current_vd->vdev_ms[m];
 
 			/* %llu takes an unsigned argument, so cast to match. */
 			(void) fprintf(stderr,
 			    "\rverifying vdev %llu of %llu, "
 			    "metaslab %llu of %llu ...",
 			    (u_longlong_t)current_vd->vdev_id,
 			    (u_longlong_t)current_rvd->vdev_children,
 			    (u_longlong_t)current_msp->ms_id,
 			    (u_longlong_t)current_vd->vdev_ms_count);
 
 			/*
 			 * We walk through the ms_allocatable trees that
 			 * are loaded with the allocated blocks from the
 			 * ms_sm spacemaps of the checkpoint. For each
 			 * one of these ranges we ensure that none of them
 			 * exists in the ms_allocatable trees of the
 			 * current state which are loaded with the ranges
 			 * that are currently free.
 			 *
 			 * This way we ensure that none of the blocks that
 			 * are part of the checkpoint were freed by mistake.
 			 */
 			zfs_range_tree_walk(ckpoint_msp->ms_allocatable,
 			    (zfs_range_tree_func_t *)
 			    zfs_range_tree_verify_not_present,
 			    current_msp->ms_allocatable);
 		}
 	}
 
 	/* for cleaner progress output */
 	(void) fprintf(stderr, "\n");
 }
 
 /*
  * Import the checkpointed state of spa under a different name and
  * cross-check its space maps against the current state of the pool.
  * Must not be called when the 'L' option is set.
  */
 static void
 verify_checkpoint_blocks(spa_t *spa)
 {
 	ASSERT(!dump_opt['L']);
 
 	spa_t *checkpoint_spa;
 	char *checkpoint_pool;
 	int error = 0;
 
 	/*
 	 * We import the checkpointed state of the pool (under a different
 	 * name) so we can do verification on it against the current state
 	 * of the pool.
 	 */
 	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
 	    NULL);
 	if (checkpoint_pool == NULL) {
 		/*
 		 * import_checkpointed_state() returns NULL when it could
 		 * not allocate the name; do not hand NULL to strcmp() or
 		 * spa_open() below.
 		 */
 		fatal("Tried to import the checkpointed state of pool "
 		    "\"%s\" but its name could not be allocated\n",
 		    spa->spa_name);
 	}
 	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
 
 	error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
 	if (error != 0) {
 		fatal("Tried to open pool \"%s\" but spa_open() failed with "
 		    "error %d\n", checkpoint_pool, error);
 	}
 
 	/*
 	 * Ensure that ranges in the checkpoint space maps of each vdev
 	 * are allocated according to the checkpointed state's metaslab
 	 * space maps.
 	 */
 	verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
 
 	/*
 	 * Ensure that allocated ranges in the checkpoint's metaslab
 	 * space maps remain allocated in the metaslab space maps of
 	 * the current state.
 	 */
 	verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
 
 	/*
 	 * Once we are done, we get rid of the checkpointed state.
 	 */
 	spa_close(checkpoint_spa, FTAG);
 	free(checkpoint_pool);
 }
 
 /*
  * For every top-level vdev that still has a checkpoint space map
  * recorded in its top-level ZAP, open that space map and dump it.
  */
 static void
 dump_leftover_checkpoint_blocks(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t idx = 0;
 
 	while (idx < root->vdev_children) {
 		vdev_t *top = root->vdev_child[idx++];
 
 		/* Skip vdevs with no top-level ZAP or no checkpoint entry. */
 		if (top->vdev_top_zap == 0 ||
 		    zap_contains(spa_meta_objset(spa), top->vdev_top_zap,
 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
 			continue;
 
 		uint64_t sm_object;
 		VERIFY0(zap_lookup(spa_meta_objset(spa), top->vdev_top_zap,
 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, &sm_object));
 
 		space_map_t *sm = NULL;
 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
 		    sm_object, 0, top->vdev_asize, top->vdev_ashift));
 		dump_spacemap(spa->spa_meta_objset, sm);
 		space_map_close(sm);
 	}
 }
 
 /*
  * Look up the checkpointed uberblock in the MOS, print it, and (unless
  * the 'L' option is set) verify the checkpoint's blocks.
  *
  * Returns 0 when the pool_checkpoint feature is inactive, when a
  * partially discarded checkpoint is found, or when verification was
  * run or skipped; the zap_lookup() error when the lookup fails for any
  * other reason; and 3 when the uberblock has no ub_checkpoint_txg.
  */
 static int
 verify_checkpoint(spa_t *spa)
 {
 	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 		return (0);
 
 	uberblock_t checkpoint;
 	int error = zap_lookup(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
 	    sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
 	    &checkpoint);
 
 	if (error != 0) {
 		if (error != ENOENT || dump_opt['L']) {
 			(void) printf("lookup error %d when looking for "
 			    "checkpointed uberblock in MOS\n", error);
 			return (error);
 		}
 
 		/*
 		 * The feature is active yet the uberblock is gone, so
 		 * the checkpoint must be in the middle of being discarded.
 		 */
 		(void) printf("\nPartially discarded checkpoint "
 		    "state found:\n");
 		if (dump_opt['m'] > 3)
 			dump_leftover_checkpoint_blocks(spa);
 		return (0);
 	}
 
 	dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
 
 	if (checkpoint.ub_checkpoint_txg == 0) {
 		(void) printf("\nub_checkpoint_txg not set in checkpointed "
 		    "uberblock\n");
 		return (3);
 	}
 
 	if (!dump_opt['L'])
 		verify_checkpoint_blocks(spa);
 
 	return (0);
 }
 
 /*
  * Range tree walk callback (see its use in dump_mos_leaks()): report
  * every MOS object number in the segment [start, start + size) as
  * referenced but not allocated. arg is unused.
  */
 static void
 mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
 {
 	(void) arg;
 
 	/*
 	 * The segment is given as a start and a length, so count over the
 	 * length instead of comparing object numbers against it.
 	 */
 	for (uint64_t i = 0; i < size; i++) {
 		(void) printf("MOS object %llu referenced but not allocated\n",
 		    (u_longlong_t)(start + i));
 	}
 }
 
 /*
  * Record MOS object obj as referenced by adding it to mos_refd_objs.
  * Object 0 is ignored, as is any call made while mos_refd_objs is NULL.
  */
 static void
 mos_obj_refd(uint64_t obj)
 {
 	if (obj == 0 || mos_refd_objs == NULL)
 		return;
 
 	zfs_range_tree_add(mos_refd_objs, obj, 1);
 }
 
 /*
  * Variant of mos_obj_refd() for a MOS object that may already have been
  * recorded: the object is only added when mos_refd_objs does not
  * contain it yet.
  */
 static void
 mos_obj_refd_multiple(uint64_t obj)
 {
 	if (obj == 0 || mos_refd_objs == NULL)
 		return;
 
 	if (zfs_range_tree_contains(mos_refd_objs, obj, 1))
 		return;
 
 	zfs_range_tree_add(mos_refd_objs, obj, 1);
 }
 
 /*
  * Mark the object stored under VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS in
  * vd's top-level ZAP as referenced. A missing entry (ENOENT) is not an
  * error; any other lookup failure trips an assertion.
  */
 static void
 mos_leak_vdev_top_zap(vdev_t *vd)
 {
 	uint64_t flush_data_obj;
 	int err;
 
 	err = zap_lookup(spa_meta_objset(vd->vdev_spa),
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
 	    sizeof (flush_data_obj), 1, &flush_data_obj);
 
 	if (err != ENOENT) {
 		ASSERT0(err);
 		mos_obj_refd(flush_data_obj);
 	}
 }
 
 /*
  * Mark every MOS object reachable from vd as referenced, then recurse
  * into each of vd's children.
  */
 static void
 mos_leak_vdev(vdev_t *vd)
 {
 	/* Objects recorded directly in the vdev. */
 	mos_obj_refd(vd->vdev_dtl_object);
 	mos_obj_refd(vd->vdev_ms_array);
 	mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
 	mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
 	mos_obj_refd(vd->vdev_leaf_zap);
 
 	/* Objects behind optional in-core structures. */
 	if (vd->vdev_checkpoint_sm != NULL)
 		mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
 	if (vd->vdev_indirect_mapping != NULL) {
 		mos_obj_refd(
 		    vd->vdev_indirect_mapping->vim_phys->vimp_counts_object);
 	}
 	if (vd->vdev_obsolete_sm != NULL)
 		mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
 
 	/* The space map object of each metaslab. */
 	uint64_t ms_idx = 0;
 	while (ms_idx < vd->vdev_ms_count) {
 		mos_obj_refd(space_map_object(vd->vdev_ms[ms_idx]->ms_sm));
 		ms_idx++;
 	}
 
 	/* mos_obj_refd() itself ignores object 0. */
 	mos_obj_refd(vd->vdev_root_zap);
 
 	if (vd->vdev_top_zap != 0) {
 		mos_obj_refd(vd->vdev_top_zap);
 		mos_leak_vdev_top_zap(vd);
 	}
 
 	uint64_t child_idx = 0;
 	while (child_idx < vd->vdev_children) {
 		mos_leak_vdev(vd->vdev_child[child_idx]);
 		child_idx++;
 	}
 }
 
 /*
  * Mark the log space map ZAP and the space map object of every entry
  * in spa_sm_logs_by_txg as referenced. Returns without doing anything
  * when the pool directory has no DMU_POOL_LOG_SPACEMAP_ZAP entry.
  */
 static void
 mos_leak_log_spacemaps(spa_t *spa)
 {
 	uint64_t log_zap;
 	int err;
 
 	err = zap_lookup(spa_meta_objset(spa),
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
 	    sizeof (log_zap), 1, &log_zap);
 	if (err == ENOENT)
 		return;
 	ASSERT0(err);
 
 	mos_obj_refd(log_zap);
 
 	spa_log_sm_t *entry = avl_first(&spa->spa_sm_logs_by_txg);
 	while (entry) {
 		mos_obj_refd(entry->sls_sm_obj);
 		entry = AVL_NEXT(&spa->spa_sm_logs_by_txg, entry);
 	}
 }
 
 /*
  * Walk every entry of the ZAP object errlog in mos and mark the object
  * number stored as each entry's first integer as referenced.
  */
 static void
 errorlog_count_refd(objset_t *mos, uint64_t errlog)
 {
 	zap_attribute_t *attr = zap_attribute_alloc();
 	zap_cursor_t cursor;
 
 	zap_cursor_init(&cursor, mos, errlog);
 	while (zap_cursor_retrieve(&cursor, attr) == 0) {
 		mos_obj_refd(attr->za_first_integer);
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 
 	zap_attribute_free(attr);
 }
 
 /*
  * Cross-check the set of MOS objects that zdb found references to
  * (accumulated in mos_refd_objs) against the objects actually allocated
  * in the MOS.
  *
  * The first part marks the objects referenced from the pool, DSL pool,
  * vdev tree, DDT and BRT structures. The second part walks every
  * allocated MOS object: an allocated object that was never marked is
  * printed as leaked, and anything still marked after the walk is
  * printed as referenced but not allocated.
  *
  * Returns 0 when both sets agree and 2 otherwise. mos_refd_objs is
  * emptied and destroyed before returning.
  */
 static int
 dump_mos_leaks(spa_t *spa)
 {
 	int rv = 0;
 	objset_t *mos = spa->spa_meta_objset;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	/* Visit and mark all referenced objects in the MOS */
 
 	mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
 	mos_obj_refd(spa->spa_pool_props_object);
 	mos_obj_refd(spa->spa_config_object);
 	mos_obj_refd(spa->spa_ddt_stat_object);
 	mos_obj_refd(spa->spa_feat_desc_obj);
 	mos_obj_refd(spa->spa_feat_enabled_txg_obj);
 	mos_obj_refd(spa->spa_feat_for_read_obj);
 	mos_obj_refd(spa->spa_feat_for_write_obj);
 	mos_obj_refd(spa->spa_history);
 	mos_obj_refd(spa->spa_errlog_last);
 	mos_obj_refd(spa->spa_errlog_scrub);
 
 	/*
 	 * With head_errlog enabled, the objects named by the error log
 	 * entries are marked as well.
 	 */
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		errorlog_count_refd(mos, spa->spa_errlog_last);
 		errorlog_count_refd(mos, spa->spa_errlog_scrub);
 	}
 
 	mos_obj_refd(spa->spa_all_vdev_zaps);
 	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
 	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
 	mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
 	bpobj_count_refd(&spa->spa_deferred_bpobj);
 	mos_obj_refd(dp->dp_empty_bpobj);
 	bpobj_count_refd(&dp->dp_obsolete_bpobj);
 	bpobj_count_refd(&dp->dp_free_bpobj);
 	mos_obj_refd(spa->spa_l2cache.sav_object);
 	mos_obj_refd(spa->spa_spares.sav_object);
 
 	if (spa->spa_syncing_log_sm != NULL)
 		mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
 	mos_leak_log_spacemaps(spa);
 
 	mos_obj_refd(spa->spa_condensing_indirect_phys.
 	    scip_next_mapping_object);
 	mos_obj_refd(spa->spa_condensing_indirect_phys.
 	    scip_prev_obsolete_sm_object);
 	/*
 	 * The next mapping object, when present, has a counts object of
 	 * its own; open the mapping just long enough to mark it.
 	 */
 	if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
 		vdev_indirect_mapping_t *vim =
 		    vdev_indirect_mapping_open(mos,
 		    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
 		mos_obj_refd(vim->vim_phys->vimp_counts_object);
 		vdev_indirect_mapping_close(vim);
 	}
 	deleted_livelists_dump_mos(spa);
 
 	if (dp->dp_origin_snap != NULL) {
 		dsl_dataset_t *ds;
 
 		/* Account for the dataset that follows the origin snapshot. */
 		dsl_pool_config_enter(dp, FTAG);
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
 		    FTAG, &ds));
 		count_ds_mos_objects(ds);
 		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
 		dsl_dataset_rele(ds, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 
 		count_ds_mos_objects(dp->dp_origin_snap);
 		dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
 	}
 	count_dir_mos_objects(dp->dp_mos_dir);
 	if (dp->dp_free_dir != NULL)
 		count_dir_mos_objects(dp->dp_free_dir);
 	if (dp->dp_leak_dir != NULL)
 		count_dir_mos_objects(dp->dp_leak_dir);
 
 	mos_leak_vdev(spa->spa_root_vdev);
 
 	for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
 		if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED)
 			continue;
 
 		/* DDT store objects */
 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
 			for (ddt_class_t class = 0; class < DDT_CLASSES;
 			    class++) {
 				mos_obj_refd(ddt->ddt_object[type][class]);
 			}
 		}
 
 		/* FDT container */
 		if (ddt->ddt_version == DDT_VERSION_FDT)
 			mos_obj_refd(ddt->ddt_dir_object);
 
 		/* FDT log objects */
 		if (ddt->ddt_flags & DDT_FLAG_LOG) {
 			mos_obj_refd(ddt->ddt_log[0].ddl_object);
 			mos_obj_refd(ddt->ddt_log[1].ddl_object);
 		}
 	}
 
 	/* BRT objects, for the per-vdev BRT structures that are initiated */
 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		if (brtvd->bv_initiated) {
 			mos_obj_refd(brtvd->bv_mos_brtvdev);
 			mos_obj_refd(brtvd->bv_mos_entries);
 		}
 	}
 
 	/*
 	 * Visit all allocated objects and make sure they are referenced.
 	 */
 	uint64_t object = 0;
 	while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
 		if (zfs_range_tree_contains(mos_refd_objs, object, 1)) {
 			/* Accounted for: drop it so only mismatches remain. */
 			zfs_range_tree_remove(mos_refd_objs, object, 1);
 		} else {
 			dmu_object_info_t doi;
 			const char *name;
 			VERIFY0(dmu_object_info(mos, object, &doi));
 			/* The type name comes from one of two tables. */
 			if (doi.doi_type & DMU_OT_NEWTYPE) {
 				dmu_object_byteswap_t bswap =
 				    DMU_OT_BYTESWAP(doi.doi_type);
 				name = dmu_ot_byteswap[bswap].ob_name;
 			} else {
 				name = dmu_ot[doi.doi_type].ot_name;
 			}
 
 			(void) printf("MOS object %llu (%s) leaked\n",
 			    (u_longlong_t)object, name);
 			rv = 2;
 		}
 	}
 	/* Whatever is still in the tree was marked but is not allocated. */
 	(void) zfs_range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
 	if (!zfs_range_tree_is_empty(mos_refd_objs))
 		rv = 2;
 	zfs_range_tree_vacate(mos_refd_objs, NULL, NULL);
 	zfs_range_tree_destroy(mos_refd_objs);
 	return (rv);
 }
 
 /*
  * Accumulator passed to log_spacemap_obsolete_stats_cb() while the
  * space map logs are iterated.
  */
 typedef struct log_sm_obsolete_stats_arg {
 	/* txg of the log currently being counted; 0 before the first entry */
 	uint64_t lsos_current_txg;
 
 	/* running totals across all logs */
 	uint64_t lsos_total_entries;
 	uint64_t lsos_valid_entries;
 
 	/* counters for the current log only; reset when the txg changes */
 	uint64_t lsos_sm_entries;
 	uint64_t lsos_valid_sm_entries;
 } log_sm_obsolete_stats_arg_t;
 
 /*
  * Per-entry callback for dump_log_spacemap_obsolete_stats().
  *
  * Counts the entry against the log identified by txg. When txg is newer
  * than the log being counted, that log's statistics are printed and its
  * counters reset first. An entry is counted as valid when its top-level
  * vdev is concrete and txg is not older than the unflushed txg of the
  * metaslab containing the entry. Always returns 0.
  */
 static int
 log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
     uint64_t txg, void *arg)
 {
 	log_sm_obsolete_stats_arg_t *stats = arg;
 	uint64_t entry_offset = sme->sme_offset;
 	uint64_t entry_vdev = sme->sme_vdev;
 	int first_log = (stats->lsos_current_txg == 0);
 
 	if (first_log || stats->lsos_current_txg < txg) {
 		if (!first_log) {
 			/* we just changed log - print stats and reset */
 			(void) printf("%-8llu valid entries out of %-8llu - "
 			    "txg %llu\n",
 			    (u_longlong_t)stats->lsos_valid_sm_entries,
 			    (u_longlong_t)stats->lsos_sm_entries,
 			    (u_longlong_t)stats->lsos_current_txg);
 			stats->lsos_valid_sm_entries = 0;
 			stats->lsos_sm_entries = 0;
 		}
 		stats->lsos_current_txg = txg;
 	}
 	ASSERT3U(stats->lsos_current_txg, ==, txg);
 
 	stats->lsos_sm_entries++;
 	stats->lsos_total_entries++;
 
 	vdev_t *top = vdev_lookup_top(spa, entry_vdev);
 	if (vdev_is_concrete(top)) {
 		metaslab_t *msp =
 		    top->vdev_ms[entry_offset >> top->vdev_ms_shift];
 		ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
 
 		if (txg >= metaslab_unflushed_txg(msp)) {
 			stats->lsos_valid_sm_entries++;
 			stats->lsos_valid_entries++;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Print, for each space map log and in total, how many entries are
  * still valid. Does nothing unless the log_spacemap feature is active.
  */
 static void
 dump_log_spacemap_obsolete_stats(spa_t *spa)
 {
 	log_sm_obsolete_stats_arg_t stats = {0};
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	(void) printf("Log Space Map Obsolete Entry Statistics:\n");
 
 	iterate_through_spacemap_logs(spa,
 	    log_spacemap_obsolete_stats_cb, &stats);
 
 	/*
 	 * The callback only prints a log's line once a newer log starts,
 	 * so the latest log is printed here.
 	 */
 	(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
 	    (u_longlong_t)stats.lsos_valid_sm_entries,
 	    (u_longlong_t)stats.lsos_sm_entries,
 	    (u_longlong_t)stats.lsos_current_txg);
 
 	/* grand total across all logs */
 	(void) printf("%-8llu valid entries out of %-8llu - total\n\n",
 	    (u_longlong_t)stats.lsos_valid_entries,
 	    (u_longlong_t)stats.lsos_total_entries);
 }
 
 /*
  * Pool-level dump and verification driver: runs the dumps selected
  * through dump_opt[] against spa and then the consistency checks.
  *
  * Each check is only run while rc is still 0. If any check leaves rc
  * non-zero the debug buffer is dumped and zdb_exit(rc) is called;
  * otherwise the function returns normally. With the 'S' option only
  * the simulated DDT is dumped and the function returns right away.
  */
 static void
 dump_zpool(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	int rc = 0;
 
 	if (dump_opt['y']) {
 		livelist_metaslab_validate(spa);
 	}
 
 	if (dump_opt['S']) {
 		dump_simulated_ddt(spa);
 		return;
 	}
 
 	if (!dump_opt['e'] && dump_opt['C'] > 1) {
 		(void) printf("\nCached configuration:\n");
 		dump_nvlist(spa->spa_config, 8);
 	}
 
 	if (dump_opt['C'])
 		dump_config(spa);
 
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
 
 	if (dump_opt['D'])
 		dump_all_ddts(spa);
 
 	if (dump_opt['T'])
 		dump_brt(spa);
 
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 	if (dump_opt['M'])
 		dump_metaslab_groups(spa, dump_opt['M'] > 1);
 	if (dump_opt['d'] > 2 || dump_opt['m']) {
 		dump_log_spacemaps(spa);
 		dump_log_spacemap_obsolete_stats(spa);
 	}
 
 	if (dump_opt['d'] || dump_opt['i']) {
 		spa_feature_t f;
 
 		/* Collects referenced MOS objects for dump_mos_leaks(). */
 		mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 		    NULL, 0, 0);
 		dump_objset(dp->dp_meta_objset);
 
 		if (dump_opt['d'] >= 3) {
 			dump_full_bpobj(&spa->spa_deferred_bpobj,
 			    "Deferred frees", 0);
 			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 				dump_full_bpobj(&dp->dp_free_bpobj,
 				    "Pool snapshot frees", 0);
 			}
 			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
 				ASSERT(spa_feature_is_enabled(spa,
 				    SPA_FEATURE_DEVICE_REMOVAL));
 				dump_full_bpobj(&dp->dp_obsolete_bpobj,
 				    "Pool obsolete blocks", 0);
 			}
 
 			if (spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY)) {
 				dump_bptree(spa->spa_meta_objset,
 				    dp->dp_bptree_obj,
 				    "Pool dataset frees");
 			}
 			dump_dtl(spa->spa_root_vdev, 0);
 		}
 
 		/*
 		 * UINT64_MAX marks a global feature whose consumers are
 		 * not counted; only the features reset to 0 below are
 		 * compared against their refcount.
 		 */
 		for (f = 0; f < SPA_FEATURES; f++)
 			global_feature_count[f] = UINT64_MAX;
 		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
 		global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0;
 		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
 		global_feature_count[SPA_FEATURE_LIVELIST] = 0;
 
 		(void) dmu_objset_find(spa_name(spa), dump_one_objset,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 
 		if (rc == 0 && !dump_opt['L'])
 			rc = dump_mos_leaks(spa);
 
 		for (f = 0; f < SPA_FEATURES; f++) {
 			uint64_t refcount;
 
 			uint64_t *arr;
 			if (!(spa_feature_table[f].fi_flags &
 			    ZFEATURE_FLAG_PER_DATASET)) {
 				if (global_feature_count[f] == UINT64_MAX)
 					continue;
 				if (!spa_feature_is_enabled(spa, f)) {
 					ASSERT0(global_feature_count[f]);
 					continue;
 				}
 				arr = global_feature_count;
 			} else {
 				if (!spa_feature_is_enabled(spa, f)) {
 					ASSERT0(dataset_feature_count[f]);
 					continue;
 				}
 				arr = dataset_feature_count;
 			}
 			if (feature_get_refcount(spa, &spa_feature_table[f],
 			    &refcount) == ENOTSUP)
 				continue;
 			if (arr[f] != refcount) {
 				(void) printf("%s feature refcount mismatch: "
 				    "%lld consumers != %lld refcount\n",
 				    spa_feature_table[f].fi_uname,
 				    (longlong_t)arr[f], (longlong_t)refcount);
 				rc = 2;
 			} else {
 				(void) printf("Verified %s feature refcount "
 				    "of %llu is correct\n",
 				    spa_feature_table[f].fi_uname,
 				    (u_longlong_t)refcount);
 			}
 		}
 
 		if (rc == 0)
 			rc = verify_device_removal_feature_counts(spa);
 	}
 
 	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
 		rc = dump_block_stats(spa);
 
 	if (rc == 0)
 		rc = verify_spacemap_refcounts(spa);
 
 	if (dump_opt['s'])
 		show_pool_stats(spa);
 
 	if (dump_opt['h'])
 		dump_history(spa);
 
 	if (rc == 0)
 		rc = verify_checkpoint(spa);
 
 	if (rc != 0) {
 		dump_debug_buffer();
 		zdb_exit(rc);
 	}
 }
 
 /*
  * Bit flags carried in the `flags` argument of the block dump helpers
  * below. Each flag is a distinct bit so they can be OR-ed together.
  */
 #define	ZDB_FLAG_CHECKSUM	0x0001
 #define	ZDB_FLAG_DECOMPRESS	0x0002
 #define	ZDB_FLAG_BSWAP		0x0004
 #define	ZDB_FLAG_GBH		0x0008
 #define	ZDB_FLAG_INDIRECT	0x0010
 #define	ZDB_FLAG_RAW		0x0020
 #define	ZDB_FLAG_PRINT_BLKPTR	0x0040
 #define	ZDB_FLAG_VERBOSE	0x0080
 
 /*
  * NOTE(review): presumably flagbits maps a flag character to its
  * ZDB_FLAG_* bit and flagbitstr lists the accepted characters; their
  * users are outside this part of the file - confirm.
  */
 static int flagbits[256];
 static char flagbitstr[16];
 
 /*
  * Print one block pointer on its own line of stdout. When
  * ZDB_FLAG_BSWAP is set in flags the block pointer is byte-swapped in
  * place before it is formatted.
  */
 static void
 zdb_print_blkptr(const blkptr_t *bp, int flags)
 {
 	char text[BP_SPRINTF_LEN];
 
 	if ((flags & ZDB_FLAG_BSWAP) != 0) {
 		/* The swap happens in the caller's buffer, not in a copy. */
 		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
 	}
 
 	snprintf_blkptr(text, sizeof (text), bp);
 	(void) printf("%s\n", text);
 }
 
 /*
  * Print the nbps block pointers of the array starting at bp, one per
  * line, passing flags on to zdb_print_blkptr().
  */
 static void
 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
 {
 	for (int idx = 0; idx < nbps; idx++) {
 		blkptr_t *entry = &bp[idx];
 
 		zdb_print_blkptr(entry, flags);
 	}
 }
 
 static void
 zdb_dump_gbh(void *buf, int flags)
 {
 	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
 }
 
 /*
  * Write size bytes of buf, unformatted, to the file descriptor behind
  * stdout. When ZDB_FLAG_BSWAP is set in flags the buffer is
  * byte-swapped in place first.
  *
  * write(2) may transfer fewer bytes than requested or be interrupted
  * by a signal, so keep writing until everything is out. Any other
  * failure, or a write that makes no progress, trips the VERIFY.
  */
 static void
 zdb_dump_block_raw(void *buf, uint64_t size, int flags)
 {
 	if (flags & ZDB_FLAG_BSWAP)
 		byteswap_uint64_array(buf, size);
 
 	const char *cursor = buf;
 	uint64_t remaining = size;
 	int fd = fileno(stdout);
 
 	while (remaining > 0) {
 		ssize_t written = write(fd, cursor, remaining);
 
 		/* Interrupted before anything was written: just retry. */
 		if (written < 0 && errno == EINTR)
 			continue;
 
 		VERIFY(written > 0);
 		cursor += written;
 		remaining -= (uint64_t)written;
 	}
 }
 
 /*
  * Print a hex dump of buf to stdout under the heading label: two 64-bit
  * words per line, each line prefixed with its byte offset and followed
  * by the same 16 bytes as characters, with non-printable bytes shown
  * as '.'.
  *
  * ZDB_FLAG_BSWAP in flags selects the column header and whether the
  * words are byte-swapped for display. The loop consumes words in pairs,
  * so size is expected to be a multiple of 16 bytes.
  */
 static void
 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
 {
 	uint64_t *d = (uint64_t *)buf;
 	unsigned nwords = size / sizeof (uint64_t);
 	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
 	unsigned i, j;
 	const char *hdr;
 	const unsigned char *c;
 
 
 	if (do_bswap)
 		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
 	else
 		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
 
 	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
 
 #ifdef _ZFS_LITTLE_ENDIAN
 	/* correct the endianness */
 	do_bswap = !do_bswap;
 #endif
 	for (i = 0; i < nwords; i += 2) {
 		(void) printf("%06llx:  %016llx  %016llx  ",
 		    (u_longlong_t)(i * sizeof (uint64_t)),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
 
 		/*
 		 * Read the bytes as unsigned char: isprint() is undefined
 		 * for negative values other than EOF, which a plain char
 		 * holding a byte >= 0x80 can be.
 		 */
 		c = (const unsigned char *)&d[i];
 		for (j = 0; j < 2 * sizeof (uint64_t); j++)
 			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
 		(void) printf("\n");
 	}
 }
 
 /*
  * There are two acceptable formats:
  *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
  *	child[.child]*    - For example: 0.1.1
  *
  * The second form can be used to specify arbitrary vdevs anywhere
  * in the hierarchy.  For example, in a pool with a mirror of
  * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
  *
  * Returns the matching vdev, or NULL when vdev is NULL, a child index
  * is out of range, or no vdev below vdev matches the name.
  */
 static vdev_t *
 zdb_vdev_lookup(vdev_t *vdev, const char *path)
 {
 	char *s, *p, *q;
 	unsigned long idx;
 
 	if (vdev == NULL)
 		return (NULL);
 
 	/*
 	 * First, assume the x.x.x.x format. The index is kept at full
 	 * width so an oversized number is rejected by the range check
 	 * below instead of being truncated to a valid index.
 	 */
 	idx = strtoul(path, &s, 10);
 	if (s == path || (*s != '.' && *s != '\0'))
 		goto name;
 	if (idx >= vdev->vdev_children)
 		return (NULL);
 
 	vdev = vdev->vdev_child[idx];
 	if (*s == '\0')
 		return (vdev);
 	return (zdb_vdev_lookup(vdev, s+1));
 
 name:
 	for (uint64_t i = 0; i < vdev->vdev_children; i++) {
 		vdev_t *vc = vdev->vdev_child[i];
 
 		/* A child without a path is searched recursively. */
 		if (vc->vdev_path == NULL) {
 			vc = zdb_vdev_lookup(vc, path);
 			if (vc == NULL)
 				continue;
 			else
 				return (vc);
 		}
 
 		/* p is the last path component of the child's device path */
 		p = strrchr(vc->vdev_path, '/');
 		p = p ? p + 1 : vc->vdev_path;
 
 		if (strcmp(vc->vdev_path, path) == 0)
 			return (vc);
 		if (strcmp(p, path) == 0)
 			return (vc);
 
 		/*
 		 * Also accept the name when the device path carries a
 		 * trailing "s0". Only look at the last two characters when
 		 * the path has that many.
 		 */
 		size_t pathlen = strlen(vc->vdev_path);
 		if (pathlen < 2)
 			continue;
 		q = &vc->vdev_path[pathlen - 2];
 		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
 			return (vc);
 	}
 
 	return (NULL);
 }
 
 /*
  * Write the name of the dataset whose object number is objset_id into
  * outstr. Returns 0 on success; when the dataset cannot be held, a
  * message is printed to stderr, outstr is left untouched and the error
  * from dsl_dataset_hold_obj() is returned.
  */
 static int
 name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
 {
 	dsl_dataset_t *ds;
 	int error;
 
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 
 	error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
 	    NULL, &ds);
 	if (error == 0) {
 		dsl_dataset_name(ds, outstr);
 		dsl_dataset_rele(ds, NULL);
 	} else {
 		(void) fprintf(stderr, "failed to hold objset %llu: %s\n",
 		    (u_longlong_t)objset_id, strerror(error));
 	}
 
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 	return (error);
 }
 
 /*
  * Parse a "lsize[/psize]" string of hexadecimal sizes.
  *
  * sizes is split in place at '/'. The first token is stored in *lsize;
  * the second, when present, in *psize, which otherwise takes the value
  * of *lsize. Returns B_FALSE, leaving the outputs untouched, when sizes
  * is NULL or holds no token. Otherwise the result is true only when
  * *psize is non-zero and not larger than *lsize.
  */
 static boolean_t
 zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
 {
 	char *save = NULL;
 	char *ltok, *ptok;
 
 	if (sizes == NULL)
 		return (B_FALSE);
 
 	ltok = strtok_r(sizes, "/", &save);
 	if (ltok == NULL)
 		return (B_FALSE);
 	ptok = strtok_r(NULL, "/", &save);
 
 	*lsize = strtoull(ltok, NULL, 16);
 	if (ptok != NULL)
 		*psize = strtoull(ptok, NULL, 16);
 	else
 		*psize = *lsize;
 
 	return (*psize > 0 && *psize <= *lsize);
 }
 
 /* 64-bit mask with only bit number ZIO_COMPRESS_<alg> set. */
 #define	ZIO_COMPRESS_MASK(alg)	(1ULL << (ZIO_COMPRESS_##alg))
 
 /*
  * Try to decompress psize bytes of pabd into lsize bytes using
  * compression function cfunc.
  *
  * lbuf and lbuf2 are scratch buffers of at least lsize bytes. Returns
  * B_TRUE when both decompression passes succeed and leave identical
  * contents in the two buffers, B_FALSE otherwise. With
  * ZDB_FLAG_VERBOSE set in flags, the attempt is logged to stderr.
  */
 static boolean_t
 try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
     int flags, int cfunc, void *lbuf, void *lbuf2)
 {
 	if (flags & ZDB_FLAG_VERBOSE) {
 		(void) fprintf(stderr,
 		    "Trying %05llx -> %05llx (%s)\n",
 		    (u_longlong_t)psize,
 		    (u_longlong_t)lsize,
 		    zio_compress_table[cfunc].ci_name);
 	}
 
 	/*
 	 * Pre-fill one buffer with zeros and the other with ones before
 	 * decompressing into both. Any byte the decompressor leaves
 	 * unwritten then differs between the buffers, so equal contents
 	 * mean the output filled exactly lsize bytes.
 	 */
 	memset(lbuf, 0x00, lsize);
 	memset(lbuf2, 0xff, lsize);
 
 	abd_t zeros_abd, ones_abd;
 	abd_get_from_buf_struct(&zeros_abd, lbuf, lsize);
 	abd_get_from_buf_struct(&ones_abd, lbuf2, lsize);
 
 	/* The second pass only runs when the first one succeeded. */
 	int err = zio_decompress_data(cfunc, pabd,
 	    &zeros_abd, psize, lsize, NULL);
 	if (err == 0) {
 		err = zio_decompress_data(cfunc, pabd,
 		    &ones_abd, psize, lsize, NULL);
 	}
 
 	boolean_t matched = B_FALSE;
 	if (err == 0 && memcmp(lbuf, lbuf2, lsize) == 0)
 		matched = B_TRUE;
 
 	abd_free(&ones_abd);
 	abd_free(&zeros_abd);
 
 	return (matched);
 }
 
 /*
  * Brute-force the decompression of the psize bytes held in pabd.
  *
  * The compression function is unknown, so each candidate function is tried
  * at each candidate logical size (in SPA_MINBLOCKSIZE steps).  When the
  * caller passes lsize == psize, every size from lsize + SPA_MINBLOCKSIZE up
  * to SPA_MAXBLOCKSIZE is tried; otherwise only the given lsize is tried.
  * ZLE is tried last, and only if the ZDB_NO_ZLE environment variable is
  * unset.
  *
  * On success the decompressed data is left in lbuf and the logical size
  * that worked is returned.  On failure (uint64_t)-1 is returned.  'buf' is
  * unused.
  */
 static uint64_t
 zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
     uint64_t psize, int flags)
 {
 	(void) buf;
 	const uint64_t first_lsize = lsize;
 	uint64_t lsize_limit = SPA_MAXBLOCKSIZE;
 	boolean_t zle_allowed = (getenv("ZDB_NO_ZLE") == NULL);
 	boolean_t matched = B_FALSE;
 	void *scratch = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 	/*
 	 * Zero-terminated list of functions to try.  LZ4 and LZJB go first;
 	 * every function not named in 'skip' is appended after them.
 	 */
 	int candidates[ZIO_COMPRESS_FUNCTIONS] = { 0 };
 	int ncand = 0;
 	/* The function that matched; 0 while nothing has matched. */
 	int selected = 0;
 
 	uint64_t skip = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
 	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
 	    ZIO_COMPRESS_MASK(ZLE);
 
 	candidates[ncand++] = ZIO_COMPRESS_LZ4;
 	candidates[ncand++] = ZIO_COMPRESS_LZJB;
 	skip |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
 
 	/*
 	 * Every gzip level has the same decompressor, no need to
 	 * run it 9 times per bruteforce attempt.
 	 */
 	skip |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3) |
 	    ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5) |
 	    ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7) |
 	    ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9);
 
 	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
 		if ((skip & (1ULL << c)) != 0)
 			continue;
 		candidates[ncand++] = c;
 	}
 
 	/*
 	 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
 	 * could take a while and we should let the user know
 	 * we are not stuck.  On the other hand, printing progress
 	 * info gets old after a while.  User can specify 'v' flag
 	 * to see the progression.
 	 */
 	if (lsize != psize)
 		lsize_limit = lsize;
 	else
 		lsize += SPA_MINBLOCKSIZE;
 
 	while (lsize <= lsize_limit) {
 		int k = 0;
 
 		/* Walk the list until a function matches or it runs out. */
 		while (candidates[k] != 0 &&
 		    !try_decompress_block(pabd, lsize, psize, flags,
 		    candidates[k], lbuf, scratch))
 			k++;
 
 		selected = candidates[k];
 		if (selected != 0) {
 			matched = B_TRUE;
 			break;
 		}
 		lsize += SPA_MINBLOCKSIZE;
 	}
 
 	if (!matched && zle_allowed) {
 		/* The ZLE pass restarts from the caller's original lsize. */
 		lsize = first_lsize;
 		while (lsize <= lsize_limit) {
 			if (try_decompress_block(pabd, lsize, psize, flags,
 			    ZIO_COMPRESS_ZLE, lbuf, scratch)) {
 				selected = ZIO_COMPRESS_ZLE;
 				matched = B_TRUE;
 				break;
 			}
 			lsize += SPA_MINBLOCKSIZE;
 		}
 	}
 	umem_free(scratch, SPA_MAXBLOCKSIZE);
 
 	if (selected == ZIO_COMPRESS_ZLE) {
 		printf("\nZLE decompression was selected. If you "
 		    "suspect the results are wrong,\ntry avoiding ZLE "
 		    "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
 	}
 
 	/* Running past the limit means no size/function pair worked. */
 	if (lsize > lsize_limit)
 		return (-1);
 	return (lsize);
 }
 
 /*
  * Read a block from a pool and print it out.  The syntax of the
  * block descriptor is:
  *
  *	pool:vdev_specifier:offset:[lsize/]psize[:flags]
  *
  *	pool           - The name of the pool you wish to read from
  *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
  *	offset         - offset, in hex, in bytes
  *	size           - Amount of data to read, in hex, in bytes
  *	flags          - A string of characters specifying options
  *		 b: Decode a blkptr at given offset within block
  *		 c: Calculate and display checksums
  *		 d: Decompress data before dumping
  *		 e: Byteswap data before dumping
  *		 g: Display data as a gang block header
  *		 i: Display as an indirect block
  *		 r: Dump raw data to stdout
  *		 v: Verbose
  *
  */
 static void
 zdb_read_block(char *thing, spa_t *spa)
 {
 	blkptr_t blk, *bp = &blk;
 	dva_t *dva = bp->blk_dva;
 	int flags = 0;
 	uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
 	zio_t *zio;
 	vdev_t *vd;
 	abd_t *pabd;
 	void *lbuf, *buf;
 	char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL;
 	const char *vdev, *errmsg = NULL;
 	int i, len, error;
 	boolean_t borrowed = B_FALSE, found = B_FALSE;
 
 	dup = strdup(thing);
 	s = strtok_r(dup, ":", &tmp);
 	vdev = s ?: "";
 	s = strtok_r(NULL, ":", &tmp);
 	offset = strtoull(s ? s : "", NULL, 16);
 	sizes = strtok_r(NULL, ":", &tmp);
 	s = strtok_r(NULL, ":", &tmp);
 	flagstr = strdup(s ?: "");
 
 	if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
 		errmsg = "invalid size(s)";
 	if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
 		errmsg = "size must be a multiple of sector size";
 	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
 		errmsg = "offset must be a multiple of sector size";
 	if (errmsg) {
 		(void) printf("Invalid block specifier: %s  - %s\n",
 		    thing, errmsg);
 		goto done;
 	}
 
 	tmp = NULL;
 	for (s = strtok_r(flagstr, ":", &tmp);
 	    s != NULL;
 	    s = strtok_r(NULL, ":", &tmp)) {
 		len = strlen(flagstr);
 		for (i = 0; i < len; i++) {
 			int bit = flagbits[(uchar_t)flagstr[i]];
 
 			if (bit == 0) {
 				(void) printf("***Ignoring flag: %c\n",
 				    (uchar_t)flagstr[i]);
 				continue;
 			}
 			found = B_TRUE;
 			flags |= bit;
 
 			p = &flagstr[i + 1];
 			if (*p != ':' && *p != '\0') {
 				int j = 0, nextbit = flagbits[(uchar_t)*p];
 				char *end, offstr[8] = { 0 };
 				if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
 				    (nextbit == 0)) {
 					/* look ahead to isolate the offset */
 					while (nextbit == 0 &&
 					    strchr(flagbitstr, *p) == NULL) {
 						offstr[j] = *p;
 						j++;
 						if (i + j > strlen(flagstr))
 							break;
 						p++;
 						nextbit = flagbits[(uchar_t)*p];
 					}
 					blkptr_offset = strtoull(offstr, &end,
 					    16);
 					i += j;
 				} else if (nextbit == 0) {
 					(void) printf("***Ignoring flag arg:"
 					    " '%c'\n", (uchar_t)*p);
 				}
 			}
 		}
 	}
 	if (blkptr_offset % sizeof (blkptr_t)) {
 		printf("Block pointer offset 0x%llx "
 		    "must be divisible by 0x%x\n",
 		    (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
 		goto done;
 	}
 	if (found == B_FALSE && strlen(flagstr) > 0) {
 		printf("Invalid flag arg: '%s'\n", flagstr);
 		goto done;
 	}
 
 	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
 	if (vd == NULL) {
 		(void) printf("***Invalid vdev: %s\n", vdev);
 		goto done;
 	} else {
 		if (vd->vdev_path)
 			(void) fprintf(stderr, "Found vdev: %s\n",
 			    vd->vdev_path);
 		else
 			(void) fprintf(stderr, "Found vdev type: %s\n",
 			    vd->vdev_ops->vdev_op_type);
 	}
 
 	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
 	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 	BP_ZERO(bp);
 
 	DVA_SET_VDEV(&dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&dva[0], offset);
-	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+	DVA_SET_GANG(&dva[0], 0);
 	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
 
 	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
 
 	BP_SET_LSIZE(bp, lsize);
 	BP_SET_PSIZE(bp, psize);
 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
 	BP_SET_TYPE(bp, DMU_OT_NONE);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-	zio = zio_root(spa, NULL, NULL, 0);
+	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 
 	if (vd == vd->vdev_top) {
 		/*
 		 * Treat this as a normal block read.
 		 */
 		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
 	} else {
 		/*
 		 * Treat this as a vdev child I/O.
 		 */
 		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
 		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
 		    NULL, NULL));
 	}
 
 	error = zio_wait(zio);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (error) {
 		(void) printf("Read of %s failed, error: %d\n", thing, error);
 		goto out;
 	}
 
 	uint64_t orig_lsize = lsize;
 	buf = lbuf;
 	if (flags & ZDB_FLAG_DECOMPRESS) {
 		lsize = zdb_decompress_block(pabd, buf, lbuf,
 		    lsize, psize, flags);
 		if (lsize == -1) {
 			(void) printf("Decompress of %s failed\n", thing);
 			goto out;
 		}
 	} else {
 		buf = abd_borrow_buf_copy(pabd, lsize);
 		borrowed = B_TRUE;
 	}
 	/*
 	 * Try to detect invalid block pointer.  If invalid, try
 	 * decompressing.
 	 */
 	if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
 	    !(flags & ZDB_FLAG_DECOMPRESS)) {
 		const blkptr_t *b = (const blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
 		if (zfs_blkptr_verify(spa, b,
 		    BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) {
 			abd_return_buf_copy(pabd, buf, lsize);
 			borrowed = B_FALSE;
 			buf = lbuf;
 			lsize = zdb_decompress_block(pabd, buf,
 			    lbuf, lsize, psize, flags);
 			b = (const blkptr_t *)(void *)
 			    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
 			if (lsize == -1 || zfs_blkptr_verify(spa, b,
 			    BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 				printf("invalid block pointer at this DVA\n");
 				goto out;
 			}
 		}
 	}
 
 	if (flags & ZDB_FLAG_PRINT_BLKPTR)
 		zdb_print_blkptr((blkptr_t *)(void *)
 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
 	else if (flags & ZDB_FLAG_RAW)
 		zdb_dump_block_raw(buf, lsize, flags);
 	else if (flags & ZDB_FLAG_INDIRECT)
 		zdb_dump_indirect((blkptr_t *)buf,
 		    orig_lsize / sizeof (blkptr_t), flags);
 	else if (flags & ZDB_FLAG_GBH)
 		zdb_dump_gbh(buf, flags);
 	else
 		zdb_dump_block(thing, buf, lsize, flags);
 
 	/*
 	 * If :c was specified, iterate through the checksum table to
 	 * calculate and display each checksum for our specified
 	 * DVA and length.
 	 */
 	if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
 	    !(flags & ZDB_FLAG_GBH)) {
 		zio_t *czio;
 		(void) printf("\n");
 		for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
 		    ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {
 
 			if ((zio_checksum_table[ck].ci_flags &
 			    ZCHECKSUM_FLAG_EMBEDDED) ||
 			    ck == ZIO_CHECKSUM_NOPARITY) {
 				continue;
 			}
 			BP_SET_CHECKSUM(bp, ck);
 			spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 			czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			if (vd == vd->vdev_top) {
 				zio_nowait(zio_read(czio, spa, bp, pabd, psize,
 				    NULL, NULL,
 				    ZIO_PRIORITY_SYNC_READ,
 				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
 				    ZIO_FLAG_DONT_RETRY, NULL));
 			} else {
 				zio_nowait(zio_vdev_child_io(czio, bp, vd,
 				    offset, pabd, psize, ZIO_TYPE_READ,
 				    ZIO_PRIORITY_SYNC_READ,
 				    ZIO_FLAG_DONT_PROPAGATE |
 				    ZIO_FLAG_DONT_RETRY |
 				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
 				    ZIO_FLAG_SPECULATIVE |
 				    ZIO_FLAG_OPTIONAL, NULL, NULL));
 			}
 			error = zio_wait(czio);
 			if (error == 0 || error == ECKSUM) {
 				zio_t *ck_zio = zio_null(NULL, spa, NULL,
 				    NULL, NULL, 0);
 				ck_zio->io_offset =
 				    DVA_GET_OFFSET(&bp->blk_dva[0]);
 				ck_zio->io_bp = bp;
 				zio_checksum_compute(ck_zio, ck, pabd, lsize);
 				printf(
 				    "%12s\t"
 				    "cksum=%016llx:%016llx:%016llx:%016llx\n",
 				    zio_checksum_table[ck].ci_name,
 				    (u_longlong_t)bp->blk_cksum.zc_word[0],
 				    (u_longlong_t)bp->blk_cksum.zc_word[1],
 				    (u_longlong_t)bp->blk_cksum.zc_word[2],
 				    (u_longlong_t)bp->blk_cksum.zc_word[3]);
 				zio_wait(ck_zio);
 			} else {
 				printf("error %d reading block\n", error);
 			}
 			spa_config_exit(spa, SCL_STATE, FTAG);
 		}
 	}
 
 	if (borrowed)
 		abd_return_buf_copy(pabd, buf, lsize);
 
 out:
 	abd_free(pabd);
 	umem_free(lbuf, SPA_MAXBLOCKSIZE);
 done:
 	free(flagstr);
 	free(dup);
 }
 
 /*
  * Decode an embedded block pointer given on the command line and dump its
  * payload raw.
  *
  * 'thing' must consist of 16 colon-separated hex words, which are stored
  * in order over a zeroed blkptr_t.  Any parse, allocation or decode
  * failure is reported on stderr and ends the program via zdb_exit(1).
  */
 static void
 zdb_embedded_block(char *thing)
 {
 	blkptr_t bp = {{{{0}}}};
 	unsigned long long *w = (void *)&bp;
 	char *payload;
 	int nparsed, rc;
 
 	nparsed = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
 	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
 	    &w[0], &w[1], &w[2], &w[3],
 	    &w[4], &w[5], &w[6], &w[7],
 	    &w[8], &w[9], &w[10], &w[11],
 	    &w[12], &w[13], &w[14], &w[15]);
 	if (nparsed != 16) {
 		(void) fprintf(stderr, "invalid input format\n");
 		zdb_exit(1);
 	}
 
 	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
 
 	payload = malloc(SPA_MAXBLOCKSIZE);
 	if (payload == NULL) {
 		(void) fprintf(stderr, "out of memory\n");
 		zdb_exit(1);
 	}
 
 	rc = decode_embedded_bp(&bp, payload, BPE_GET_LSIZE(&bp));
 	if (rc != 0) {
 		(void) fprintf(stderr, "decode failed: %u\n", rc);
 		zdb_exit(1);
 	}
 
 	zdb_dump_block_raw(payload, BPE_GET_LSIZE(&bp), 0);
 	free(payload);
 }
 
 /*
  * Check for a valid hex or decimal numeric string.
  *
  * Accepts an optional "0x"/"0X" prefix followed by one or more hex
  * digits.  Without a prefix, hex digits are accepted as well.  Returns
  * B_FALSE for the empty string, for a bare prefix with no digits, and for
  * any string containing a character that is not a hex digit.
  */
 static boolean_t
 zdb_numeric(char *str)
 {
 	size_t i = 0, len;
 
 	len = strlen(str);
 	if (len == 0)
 		return (B_FALSE);
 	if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) {
 		/* A prefix on its own carries no digits at all. */
 		if (len == 2)
 			return (B_FALSE);
 		i = 2;
 	}
 	for (; i < len; i++) {
 		/*
 		 * isxdigit() is only defined for values representable as
 		 * unsigned char (or EOF); a plain char may be negative.
 		 */
 		if (!isxdigit((unsigned char)str[i]))
 			return (B_FALSE);
 	}
 	return (B_TRUE);
 }
 
 /*
  * Placeholder file-info callback; main() registers it for DMU_OST_ZFS.
  *
  * Returns ENOENT for any bonus type other than DMU_OT_ZNODE and DMU_OT_SA.
  * For those two types it prints a "not implemented" message to stderr and
  * aborts.  'data' and 'zoi' are unused.
  */
 static int
 dummy_get_file_info(dmu_object_type_t bonustype, const void *data,
     zfs_file_info_t *zoi)
 {
 	(void) data;
 	(void) zoi;
 
 	if (!(bonustype == DMU_OT_ZNODE || bonustype == DMU_OT_SA))
 		return (ENOENT);
 
 	(void) fprintf(stderr, "dummy_get_file_info: not implemented");
 	abort();
 }
 
 /*
  * zdb entry point.
  *
  * In order: installs SIGSEGV/SIGABRT handlers, parses the command line,
  * sets a number of tunables, tries to work out which cachefile to use,
  * initializes the userland kernel environment, optionally imports the
  * pool (-e or a forced import), opens the target pool or dataset, and
  * finally dispatches to the requested dump routine or to zdb_read_block()
  * for -R.  Returns the last error recorded in 'error' (0 on success).
  */
 int
 main(int argc, char **argv)
 {
 	int c;
 	int dump_all = 1;
 	int verbose = 0;
 	int error = 0;
 	char **searchdirs = NULL;
 	int nsearch = 0;
 	char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
 	nvlist_t *policy = NULL;
 	uint64_t max_txg = UINT64_MAX;
 	int64_t objset_id = -1;
 	uint64_t object;
 	int flags = ZFS_IMPORT_MISSING_LOG;
 	int rewind = ZPOOL_NEVER_REWIND;
 	char *spa_config_path_env, *objset_str;
 	boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
 	nvlist_t *cfg = NULL;
 	struct sigaction action;
 	boolean_t force_import = B_FALSE;
 	boolean_t config_path_console = B_FALSE;
 	char pbuf[MAXPATHLEN];
 
 	dprintf_setup(&argc, argv);
 
 	/*
 	 * Set up signal handlers, so if we crash due to bad on-disk data we
 	 * can get more info. Unlike ztest, we don't bail out if we can't set
 	 * up signal handlers, because zdb is very useful without them.
 	 */
 	action.sa_handler = sig_handler;
 	sigemptyset(&action.sa_mask);
 	action.sa_flags = 0;
 	if (sigaction(SIGSEGV, &action, NULL) < 0) {
 		(void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n",
 		    strerror(errno));
 	}
 	if (sigaction(SIGABRT, &action, NULL) < 0) {
 		(void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n",
 		    strerror(errno));
 	}
 
 	/*
 	 * If there is an environment variable SPA_CONFIG_PATH it overrides
 	 * default spa_config_path setting. If -U flag is specified it will
 	 * override this environment variable settings once again.
 	 */
 	spa_config_path_env = getenv("SPA_CONFIG_PATH");
 	if (spa_config_path_env != NULL)
 		spa_config_path = spa_config_path_env;
 
 	/*
 	 * For performance reasons, we set this tunable down. We do so before
 	 * the arg parsing section so that the user can override this value if
 	 * they choose.
 	 */
 	zfs_btree_verify_intensity = 3;
 
 	struct option long_options[] = {
 		{"ignore-assertions",	no_argument,		NULL, 'A'},
 		{"block-stats",		no_argument,		NULL, 'b'},
 		{"backup",		no_argument,		NULL, 'B'},
 		{"checksum",		no_argument,		NULL, 'c'},
 		{"config",		no_argument,		NULL, 'C'},
 		{"datasets",		no_argument,		NULL, 'd'},
 		{"dedup-stats",		no_argument,		NULL, 'D'},
 		{"exported",		no_argument,		NULL, 'e'},
 		{"embedded-block-pointer",	no_argument,	NULL, 'E'},
 		{"automatic-rewind",	no_argument,		NULL, 'F'},
 		{"dump-debug-msg",	no_argument,		NULL, 'G'},
 		{"history",		no_argument,		NULL, 'h'},
 		{"intent-logs",		no_argument,		NULL, 'i'},
 		{"inflight",		required_argument,	NULL, 'I'},
 		{"checkpointed-state",	no_argument,		NULL, 'k'},
 		{"key",			required_argument,	NULL, 'K'},
 		{"label",		no_argument,		NULL, 'l'},
 		{"disable-leak-tracking",	no_argument,	NULL, 'L'},
 		{"metaslabs",		no_argument,		NULL, 'm'},
 		{"metaslab-groups",	no_argument,		NULL, 'M'},
 		{"numeric",		no_argument,		NULL, 'N'},
 		{"option",		required_argument,	NULL, 'o'},
 		{"object-lookups",	no_argument,		NULL, 'O'},
 		{"path",		required_argument,	NULL, 'p'},
 		{"parseable",		no_argument,		NULL, 'P'},
 		{"skip-label",		no_argument,		NULL, 'q'},
 		{"copy-object",		no_argument,		NULL, 'r'},
 		{"read-block",		no_argument,		NULL, 'R'},
 		{"io-stats",		no_argument,		NULL, 's'},
 		{"simulate-dedup",	no_argument,		NULL, 'S'},
 		{"txg",			required_argument,	NULL, 't'},
 		{"brt-stats",		no_argument,		NULL, 'T'},
 		{"uberblock",		no_argument,		NULL, 'u'},
 		{"cachefile",		required_argument,	NULL, 'U'},
 		{"verbose",		no_argument,		NULL, 'v'},
 		{"verbatim",		no_argument,		NULL, 'V'},
 		{"dump-blocks",		required_argument,	NULL, 'x'},
 		{"extreme-rewind",	no_argument,		NULL, 'X'},
 		{"all-reconstruction",	no_argument,		NULL, 'Y'},
 		{"livelist",		no_argument,		NULL, 'y'},
 		{"zstd-headers",	no_argument,		NULL, 'Z'},
 		{0, 0, 0, 0}
 	};
 
 	while ((c = getopt_long(argc, argv,
 	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
 	    long_options, NULL)) != -1) {
 		switch (c) {
 		/* Options that select what to dump; any of them clears dump_all. */
 		case 'b':
 		case 'B':
 		case 'c':
 		case 'C':
 		case 'd':
 		case 'D':
 		case 'E':
 		case 'G':
 		case 'h':
 		case 'i':
 		case 'l':
 		case 'm':
 		case 'M':
 		case 'N':
 		case 'O':
 		case 'r':
 		case 'R':
 		case 's':
 		case 'S':
 		case 'T':
 		case 'u':
 		case 'y':
 		case 'Z':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
 		/* Modifier options: counted, but dump_all is left alone. */
 		case 'A':
 		case 'e':
 		case 'F':
 		case 'k':
 		case 'L':
 		case 'P':
 		case 'q':
 		case 'X':
 			dump_opt[c]++;
 			break;
 		case 'Y':
 			zfs_reconstruct_indirect_combinations_max = INT_MAX;
 			zfs_deadman_enabled = 0;
 			break;
 		/* NB: Sort single match options below. */
 		case 'I':
 			max_inflight_bytes = strtoull(optarg, NULL, 0);
 			if (max_inflight_bytes == 0) {
 				(void) fprintf(stderr, "maximum number "
 				    "of inflight bytes must be greater "
 				    "than 0\n");
 				usage();
 			}
 			break;
 		case 'K':
 			dump_opt[c]++;
 			key_material = strdup(optarg);
 			/* redact key material in process table */
 			while (*optarg != '\0') { *optarg++ = '*'; }
 			break;
 		case 'o':
 			error = set_global_var(optarg);
 			if (error != 0)
 				usage();
 			break;
 		case 'p':
 			/* Grow the search-directory array by one slot. */
 			if (searchdirs == NULL) {
 				searchdirs = umem_alloc(sizeof (char *),
 				    UMEM_NOFAIL);
 			} else {
 				char **tmp = umem_alloc((nsearch + 1) *
 				    sizeof (char *), UMEM_NOFAIL);
 				memcpy(tmp, searchdirs, nsearch *
 				    sizeof (char *));
 				umem_free(searchdirs,
 				    nsearch * sizeof (char *));
 				searchdirs = tmp;
 			}
 			searchdirs[nsearch++] = optarg;
 			break;
 		case 't':
 			max_txg = strtoull(optarg, NULL, 0);
 			if (max_txg < TXG_INITIAL) {
 				(void) fprintf(stderr, "incorrect txg "
 				    "specified: %s\n", optarg);
 				usage();
 			}
 			break;
 		case 'U':
 			config_path_console = B_TRUE;
 			spa_config_path = optarg;
 			if (spa_config_path[0] != '/') {
 				(void) fprintf(stderr,
 				    "cachefile must be an absolute path "
 				    "(i.e. start with a slash)\n");
 				usage();
 			}
 			break;
 		case 'v':
 			verbose++;
 			break;
 		case 'V':
 			flags = ZFS_IMPORT_VERBATIM;
 			break;
 		case 'x':
 			vn_dumpdir = optarg;
 			break;
 		default:
 			usage();
 			break;
 		}
 	}
 
 	if (!dump_opt['e'] && searchdirs != NULL) {
 		(void) fprintf(stderr, "-p option requires use of -e\n");
 		usage();
 	}
 #if defined(_LP64)
 	/*
 	 * ZDB does not typically re-read blocks; therefore limit the ARC
 	 * to 256 MB, which can be used entirely for metadata.
 	 */
 	zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;
 	zfs_arc_max = 256 * 1024 * 1024;
 #endif
 
 	/*
 	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
 	 * "zdb -b" uses traversal prefetch which uses async reads.
 	 * For good performance, let several of them be active at once.
 	 */
 	zfs_vdev_async_read_max_active = 10;
 
 	/*
 	 * Disable reference tracking for better performance.
 	 */
 	reference_tracking_enable = B_FALSE;
 
 	/*
 	 * Do not fail spa_load when spa_load_verify fails. This is needed
 	 * to load non-idle pools.
 	 */
 	spa_load_verify_dryrun = B_TRUE;
 
 	/*
 	 * ZDB should have ability to read spacemaps.
 	 */
 	spa_mode_readable_spacemaps = B_TRUE;
 
 	if (dump_all)
 		verbose = MAX(verbose, 1);
 
 	/*
 	 * With no explicit selection, enable every option except the ones
 	 * listed here; then fold the verbosity into each enabled option.
 	 */
 	for (c = 0; c < 256; c++) {
 		if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
 	}
 
 	libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2));
 	zfs_recover = (dump_opt['A'] > 1);
 
 	argc -= optind;
 	argv += optind;
 	if (argc < 2 && dump_opt['R'])
 		usage();
 
 	target = argv[0];
 
 	/*
 	 * Automate cachefile
 	 */
 	if (!spa_config_path_env && !config_path_console && target &&
 	    libzfs_core_init() == 0) {
 		char *pname = strdup(target);
 		const char *value;
 		nvlist_t *pnvl = NULL;
 		nvlist_t *vnvl = NULL;
 
 		/*
 		 * Reduce the target to its pool name.  strdup() can fail,
 		 * so pname must be checked before it is searched.
 		 */
 		if (pname != NULL) {
 			char *sep = strpbrk(pname, "/@");
 			if (sep != NULL)
 				*sep = '\0';
 		}
 
 		if (pname && lzc_get_props(pname, &pnvl) == 0) {
 			if (nvlist_lookup_nvlist(pnvl, "cachefile",
 			    &vnvl) == 0) {
 				value = fnvlist_lookup_string(vnvl,
 				    ZPROP_VALUE);
 			} else {
 				value = "-";
 			}
 			strlcpy(pbuf, value, sizeof (pbuf));
 			if (pbuf[0] != '\0') {
 				if (pbuf[0] == '/') {
 					if (access(pbuf, F_OK) == 0)
 						spa_config_path = pbuf;
 					else
 						force_import = B_TRUE;
 				} else if ((strcmp(pbuf, "-") == 0 &&
 				    access(ZPOOL_CACHE, F_OK) != 0) ||
 				    strcmp(pbuf, "none") == 0) {
 					force_import = B_TRUE;
 				}
 			}
 			nvlist_free(vnvl);
 		}
 
 		free(pname);
 		nvlist_free(pnvl);
 		libzfs_core_fini();
 	}
 
 	dmu_objset_register_type(DMU_OST_ZFS, dummy_get_file_info);
 	kernel_init(SPA_MODE_READ);
 	kernel_init_done = B_TRUE;
 
 	if (dump_opt['E']) {
 		if (argc != 1)
 			usage();
 		zdb_embedded_block(argv[0]);
 		error = 0;
 		goto fini;
 	}
 
 	if (argc < 1) {
 		if (!dump_opt['e'] && dump_opt['C']) {
 			dump_cachefile(spa_config_path);
 			error = 0;
 			goto fini;
 		}
 		usage();
 	}
 
 	if (dump_opt['l']) {
 		error = dump_label(argv[0]);
 		goto fini;
 	}
 
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
 
 	/* -N implies -d */
 	if (dump_opt['N'] && dump_opt['d'] == 0)
 		dump_opt['d'] = dump_opt['N'];
 
 	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
 	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
 	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
 		fatal("internal error: %s", strerror(ENOMEM));
 
 	error = 0;
 
 	if (strpbrk(target, "/@") != NULL) {
 		size_t targetlen;
 
 		target_pool = strdup(target);
 		*strpbrk(target_pool, "/@") = '\0';
 
 		target_is_spa = B_FALSE;
 		targetlen = strlen(target);
 		if (targetlen && target[targetlen - 1] == '/')
 			target[targetlen - 1] = '\0';
 
 		/*
 		 * See if an objset ID was supplied (-d <pool>/<objset ID>).
 		 * To disambiguate tank/100, consider the 100 as objsetID
 		 * if -N was given, otherwise 100 is an objsetID iff
 		 * tank/100 as a named dataset fails on lookup.
 		 */
 		objset_str = strchr(target, '/');
 		if (objset_str && strlen(objset_str) > 1 &&
 		    zdb_numeric(objset_str + 1)) {
 			char *endptr;
 			errno = 0;
 			objset_str++;
 			objset_id = strtoull(objset_str, &endptr, 0);
 			/* dataset 0 is the same as opening the pool */
 			if (errno == 0 && endptr != objset_str &&
 			    objset_id != 0) {
 				if (dump_opt['N'])
 					dataset_lookup = B_TRUE;
 			}
 			/* normal dataset name not an objset ID */
 			if (endptr == objset_str) {
 				objset_id = -1;
 			}
 		} else if (objset_str && !zdb_numeric(objset_str + 1) &&
 		    dump_opt['N']) {
 			printf("Supply a numeric objset ID with -N\n");
 			error = 1;
 			goto fini;
 		}
 	} else {
 		target_pool = target;
 	}
 
 	if (dump_opt['e'] || force_import) {
 		importargs_t args = { 0 };
 
 		/*
 		 * If path is not provided, search in /dev
 		 */
 		if (searchdirs == NULL) {
 			searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL);
 			searchdirs[nsearch++] = (char *)ZFS_DEVDIR;
 		}
 
 		args.paths = nsearch;
 		args.path = searchdirs;
 		args.can_be_active = B_TRUE;
 
 		libpc_handle_t lpch = {
 			.lpc_lib_handle = NULL,
 			.lpc_ops = &libzpool_config_ops,
 			.lpc_printerr = B_TRUE
 		};
 		error = zpool_find_config(&lpch, target_pool, &cfg, &args);
 
 		if (error == 0) {
 
 			if (nvlist_add_nvlist(cfg,
 			    ZPOOL_LOAD_POLICY, policy) != 0) {
 				fatal("can't open '%s': %s",
 				    target, strerror(ENOMEM));
 			}
 
 			if (dump_opt['C'] > 1) {
 				(void) printf("\nConfiguration for import:\n");
 				dump_nvlist(cfg, 8);
 			}
 
 			/*
 			 * Disable the activity check to allow examination of
 			 * active pools.
 			 */
 			error = spa_import(target_pool, cfg, NULL,
 			    flags | ZFS_IMPORT_SKIP_MMP);
 		}
 	}
 
 	if (searchdirs != NULL) {
 		umem_free(searchdirs, nsearch * sizeof (char *));
 		searchdirs = NULL;
 	}
 
 	/*
 	 * We need to make sure to process -O option or call
 	 * dump_path after the -e option has been processed,
 	 * which imports the pool to the namespace if it's
 	 * not in the cachefile.
 	 */
 	if (dump_opt['O']) {
 		if (argc != 2)
 			usage();
 		dump_opt['v'] = verbose + 3;
 		error = dump_path(argv[0], argv[1], NULL);
 		goto fini;
 	}
 
 	if (dump_opt['r']) {
 		target_is_spa = B_FALSE;
 		if (argc != 3)
 			usage();
 		dump_opt['v'] = verbose;
 		error = dump_path(argv[0], argv[1], &object);
 		if (error != 0)
 			fatal("internal error: %s", strerror(error));
 	}
 
 	/*
 	 * import_checkpointed_state makes the assumption that the
 	 * target pool that we pass it is already part of the spa
 	 * namespace. Because of that we need to make sure to call
 	 * it always after the -e option has been processed, which
 	 * imports the pool to the namespace if it's not in the
 	 * cachefile.
 	 */
 	char *checkpoint_pool = NULL;
 	char *checkpoint_target = NULL;
 	if (dump_opt['k']) {
 		checkpoint_pool = import_checkpointed_state(target, cfg,
 		    &checkpoint_target);
 
 		if (checkpoint_target != NULL)
 			target = checkpoint_target;
 	}
 
 	if (cfg != NULL) {
 		nvlist_free(cfg);
 		cfg = NULL;
 	}
 
 	/* target_pool is a strdup() copy only when it differs from target. */
 	if (target_pool != target)
 		free(target_pool);
 
 	if (error == 0) {
 		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
 			ASSERT(checkpoint_pool != NULL);
 			ASSERT(checkpoint_target == NULL);
 
 			error = spa_open(checkpoint_pool, &spa, FTAG);
 			if (error != 0) {
 				fatal("Tried to open pool \"%s\" but "
 				    "spa_open() failed with error %d\n",
 				    checkpoint_pool, error);
 			}
 
 		} else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||
 		    objset_id == 0) {
 			zdb_set_skip_mmp(target);
 			error = spa_open_rewind(target, &spa, FTAG, policy,
 			    NULL);
 			if (error) {
 				/*
 				 * If we're missing the log device then
 				 * try opening the pool after clearing the
 				 * log state.
 				 */
 				mutex_enter(&spa_namespace_lock);
 				if ((spa = spa_lookup(target)) != NULL &&
 				    spa->spa_log_state == SPA_LOG_MISSING) {
 					spa->spa_log_state = SPA_LOG_CLEAR;
 					error = 0;
 				}
 				mutex_exit(&spa_namespace_lock);
 
 				if (!error) {
 					error = spa_open_rewind(target, &spa,
 					    FTAG, policy, NULL);
 				}
 			}
 		} else if (strpbrk(target, "#") != NULL) {
 			dsl_pool_t *dp;
 			error = dsl_pool_hold(target, FTAG, &dp);
 			if (error != 0) {
 				fatal("can't dump '%s': %s", target,
 				    strerror(error));
 			}
 			error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
 			dsl_pool_rele(dp, FTAG);
 			if (error != 0) {
 				fatal("can't dump '%s': %s", target,
 				    strerror(error));
 			}
 			goto fini;
 		} else {
 			target_pool = strdup(target);
 			if (strpbrk(target, "/@") != NULL)
 				*strpbrk(target_pool, "/@") = '\0';
 
 			zdb_set_skip_mmp(target);
 			/*
 			 * If -N was supplied, the user has indicated that
 			 * zdb -d <pool>/<objsetID> is in effect.  Otherwise
 			 * we first assume that the dataset string is the
 			 * dataset name.  If dmu_objset_hold fails with the
 			 * dataset string, and we have an objset_id, retry the
 			 * lookup with the objsetID.
 			 */
 			boolean_t retry = B_TRUE;
 retry_lookup:
 			if (dataset_lookup == B_TRUE) {
 				/*
 				 * Use the supplied id to get the name
 				 * for open_objset.
 				 */
 				error = spa_open(target_pool, &spa, FTAG);
 				if (error == 0) {
 					error = name_from_objset_id(spa,
 					    objset_id, dsname);
 					spa_close(spa, FTAG);
 					if (error == 0)
 						target = dsname;
 				}
 			}
 			if (error == 0) {
 				if (objset_id > 0 && retry) {
 					int err = dmu_objset_hold(target, FTAG,
 					    &os);
 					if (err) {
 						dataset_lookup = B_TRUE;
 						retry = B_FALSE;
 						goto retry_lookup;
 					} else {
 						dmu_objset_rele(os, FTAG);
 					}
 				}
 				error = open_objset(target, FTAG, &os);
 			}
 			if (error == 0)
 				spa = dmu_objset_spa(os);
 			free(target_pool);
 		}
 	}
 	nvlist_free(policy);
 
 	if (error)
 		fatal("can't open '%s': %s", target, strerror(error));
 
 	/*
 	 * Set the pool failure mode to panic in order to prevent the pool
 	 * from suspending.  A suspended I/O will have no way to resume and
 	 * can prevent the zdb(8) command from terminating as expected.
 	 */
 	if (spa != NULL)
 		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
 
 	/* Step past the target; the remaining arguments are per-mode operands. */
 	argv++;
 	argc--;
 	if (dump_opt['r']) {
 		error = zdb_copy_object(os, object, argv[1]);
 	} else if (!dump_opt['R']) {
 		/* Outside -R, flagbits[] maps object-range type letters. */
 		flagbits['d'] = ZOR_FLAG_DIRECTORY;
 		flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
 		flagbits['m'] = ZOR_FLAG_SPACE_MAP;
 		flagbits['z'] = ZOR_FLAG_ZAP;
 		flagbits['A'] = ZOR_FLAG_ALL_TYPES;
 
 		if (argc > 0 && dump_opt['d']) {
 			zopt_object_args = argc;
 			zopt_object_ranges = calloc(zopt_object_args,
 			    sizeof (zopt_object_range_t));
 			for (unsigned i = 0; i < zopt_object_args; i++) {
 				int err;
 				const char *msg = NULL;
 
 				err = parse_object_range(argv[i],
 				    &zopt_object_ranges[i], &msg);
 				if (err != 0)
 					fatal("Bad object or range: '%s': %s\n",
 					    argv[i], msg ?: "");
 			}
 		} else if (argc > 0 && dump_opt['m']) {
 			zopt_metaslab_args = argc;
 			zopt_metaslab = calloc(zopt_metaslab_args,
 			    sizeof (uint64_t));
 			for (unsigned i = 0; i < zopt_metaslab_args; i++) {
 				errno = 0;
 				zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
 				if (zopt_metaslab[i] == 0 && errno != 0)
 					fatal("bad number %s: %s", argv[i],
 					    strerror(errno));
 			}
 		}
 		if (dump_opt['B']) {
 			dump_backup(target, objset_id,
 			    argc > 0 ? argv[0] : NULL);
 		} else if (os != NULL) {
 			dump_objset(os);
 		} else if (zopt_object_args > 0 && !dump_opt['m']) {
 			dump_objset(spa->spa_meta_objset);
 		} else {
 			dump_zpool(spa);
 		}
 	} else {
 		/* For -R, flagbits[] maps the block-descriptor flag letters. */
 		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
 		flagbits['c'] = ZDB_FLAG_CHECKSUM;
 		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
 		flagbits['e'] = ZDB_FLAG_BSWAP;
 		flagbits['g'] = ZDB_FLAG_GBH;
 		flagbits['i'] = ZDB_FLAG_INDIRECT;
 		flagbits['r'] = ZDB_FLAG_RAW;
 		flagbits['v'] = ZDB_FLAG_VERBOSE;
 
 		for (int i = 0; i < argc; i++)
 			zdb_read_block(argv[i], spa);
 	}
 
 	if (dump_opt['k']) {
 		free(checkpoint_pool);
 		if (!target_is_spa)
 			free(checkpoint_target);
 	}
 
 fini:
 	if (spa != NULL)
 		zdb_ddt_cleanup(spa);
 
 	if (os != NULL) {
 		close_objset(os, FTAG);
 	} else if (spa != NULL) {
 		spa_close(spa, FTAG);
 	}
 
 	fuid_table_destroy();
 
 	dump_debug_buffer();
 
 	if (kernel_init_done)
 		kernel_fini();
 
 	return (error);
 }
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 2e9b7edf8691..285e02484c57 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -1,532 +1,533 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_DBUF_H
 #define	_SYS_DBUF_H
 
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_refcount.h>
 #include <sys/zrlock.h>
 #include <sys/multilist.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 #define	IN_DMU_SYNC 2
 
 /*
  * define flags for dbuf_read
  */
 
 #define	DB_RF_MUST_SUCCEED	(1 << 0)
 #define	DB_RF_CANFAIL		(1 << 1)
 #define	DB_RF_HAVESTRUCT	(1 << 2)
 #define	DB_RF_NOPREFETCH	(1 << 3)
 #define	DB_RF_NEVERWAIT		(1 << 4)
 #define	DB_RF_CACHED		(1 << 5)
 #define	DB_RF_NO_DECRYPT	(1 << 6)
 #define	DB_RF_PARTIAL_FIRST	(1 << 7)
 #define	DB_RF_PARTIAL_MORE	(1 << 8)
 
 /*
  * The simplified state transition diagram for dbufs looks like:
  *
  *                  +-------> READ ------+
  *                  |                    |
  *                  |                    V
  *  (alloc)-->UNCACHED                  CACHED-->EVICTING-->(free)
  *             ^    |                    ^          ^
  *             |    |                    |          |
  *             |    +-------> FILL ------+          |
  *             |    |                    |          |
  *             |    |                    |          |
  *             |    +------> NOFILL -----+-----> UNCACHED
  *             |               |               (Direct I/O)
  *             +---------------+
  *
  * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
  * to find all dbufs in a range of a dnode and must be less than any other
  * dbuf_states_t (see comment on dn_dbufs in dnode.h).
  */
 typedef enum dbuf_states {
 	/* NOTE(review): numerically below DB_SEARCH; its use is not shown here. */
 	DB_MARKER = -2,
 	/* Invalid for a real dbuf; used by dbuf_free_range() lookups. */
 	DB_SEARCH = -1,
 	/* The remaining states are numbered from 0 in declaration order. */
 	DB_UNCACHED,
 	DB_FILL,
 	DB_NOFILL,
 	DB_READ,
 	DB_CACHED,
 	DB_EVICTING
 } dbuf_states_t;
 
 /*
  * Identifies which dbuf cache, if any, a dbuf is in (the type of
  * db_caching_status in dmu_buf_impl_t).  DB_NO_CACHE is -1, the two caches
  * are numbered 0 and 1, and DB_CACHE_MAX (2) is one past the last cache.
  */
 typedef enum dbuf_cached_state {
 	DB_NO_CACHE = -1,
 	DB_DBUF_CACHE,
 	DB_DBUF_METADATA_CACHE,
 	DB_CACHE_MAX
 } dbuf_cached_state_t;
 
 struct dnode;
 struct dmu_tx;
 
 /*
  * level = 0 means the user data
  * level = 1 means the single indirect block
  * etc.
  */
 
 struct dmu_buf_impl;
 
 /*
  * Values of dr_override_state in a leaf dirty record.
  * NOTE(review): the names suggest this tracks whether dr_overridden_by has
  * been supplied (not yet / dmu_sync in progress / done) -- confirm against
  * the code that sets it.
  */
 typedef enum override_states {
 	DR_NOT_OVERRIDDEN,
 	DR_IN_DMU_SYNC,
 	DR_OVERRIDDEN
 } override_states_t;
 
 /*
  * Returned by dmu_buf_lock_parent() and handed back to
  * dmu_buf_unlock_parent().
  * NOTE(review): the names suggest it records which lock was taken (none,
  * the parent dbuf's, or the objset's) -- confirm in dbuf.c.
  */
 typedef enum db_lock_type {
 	DLT_NONE,
 	DLT_PARENT,
 	DLT_OBJSET
 } db_lock_type_t;
 
 /*
  * A dirty record: one entry describing dirty data that will sync in the
  * transaction group dr_txg.  The leading members are common to all records;
  * the dt union carries the members specific to an indirect block (di), a
  * leaf block (dl), or a lightweight leaf block that has no dbuf (dll).
  */
 typedef struct dbuf_dirty_record {
 	/* link on our parents dirty list */
 	list_node_t dr_dirty_node;
 
 	/* transaction group this data will sync in */
 	uint64_t dr_txg;
 
 	/* zio of outstanding write IO */
 	zio_t *dr_zio;
 
 	/* pointer back to our dbuf */
 	struct dmu_buf_impl *dr_dbuf;
 
 	/* list link for dbuf dirty records */
 	list_node_t dr_dbuf_node;
 
 	/*
 	 * The dnode we are part of.  Note that the dnode can not be moved or
 	 * evicted due to the hold that's added by dnode_setdirty() or
 	 * dmu_objset_sync_dnodes(), and released by dnode_rele_task() or
 	 * userquota_updates_task().  This hold is necessary for
 	 * dirty_lightweight_leaf-type dirty records, which don't have a hold
 	 * on a dbuf.
 	 */
 	dnode_t *dr_dnode;
 
 	/* pointer to parent dirty record */
 	struct dbuf_dirty_record *dr_parent;
 
 	/* How much space was charged to dsl_pool_dirty_space() for this? */
 	unsigned int dr_accounted;
 
 	/* A copy of the bp that points to us */
 	blkptr_t dr_bp_copy;
 
 	union dirty_types {
 		/* Members used when the record is for an indirect block. */
 		struct dirty_indirect {
 
 			/* protect access to list */
 			kmutex_t dr_mtx;
 
 			/* Our list of dirty children */
 			list_t dr_children;
 		} di;
 		/* Members used when the record is for a leaf block. */
 		struct dirty_leaf {
 
 			/*
 			 * dr_data is set when we dirty the buffer
 			 * so that we can retain the pointer even if it
 			 * gets COW'd in a subsequent transaction group.
 			 */
 			arc_buf_t *dr_data;
 			override_states_t dr_override_state;
 			/*
 			 * NOTE(review): dr_copies/dr_gang_copies look like
 			 * the counterparts of zp_copies/zp_gang_copies in
 			 * zio_prop_t -- confirm against the users.
 			 */
 			uint8_t dr_copies;
+			uint8_t dr_gang_copies;
 			boolean_t dr_nopwrite;
 			boolean_t dr_brtwrite;
 			boolean_t dr_diowrite;
 			boolean_t dr_has_raw_params;
 
 			/* Override and raw params are mutually exclusive. */
 			union {
 				blkptr_t dr_overridden_by;
 				struct {
 					/*
 					 * If dr_has_raw_params is set, the
 					 * following crypt params will be set
 					 * on the BP that's written.
 					 */
 					boolean_t dr_byteorder;
 					uint8_t	dr_salt[ZIO_DATA_SALT_LEN];
 					uint8_t	dr_iv[ZIO_DATA_IV_LEN];
 					uint8_t	dr_mac[ZIO_DATA_MAC_LEN];
 				};
 			};
 		} dl;
 		struct dirty_lightweight_leaf {
 			/*
 			 * This dirty record refers to a leaf (level=0)
 			 * block, whose dbuf has not been instantiated for
 			 * performance reasons.
 			 */
 			uint64_t dr_blkid;
 			abd_t *dr_abd;
 			zio_prop_t dr_props;
 			zio_flag_t dr_flags;
 		} dll;
 	} dt;
 } dbuf_dirty_record_t;
 
 /*
  * In-core state of one dbuf.  The publicly visible dmu_buf_t is embedded as
  * the first member (db).  Members declared after db_mtx are protected by
  * that mutex; the members before it follow the locking rules stated on each.
  */
 typedef struct dmu_buf_impl {
 	/*
 	 * The following members are immutable, with the exception of
 	 * db.db_data, which is protected by db_mtx.
 	 */
 
 	/* the publicly visible structure */
 	dmu_buf_t db;
 
 	/* the objset we belong to */
 	struct objset *db_objset;
 
 	/*
 	 * Handle to safely access the dnode we belong to (NULL when evicted)
 	 * if dnode_move() is used on the platform, or just dnode otherwise.
 	 */
 #if !defined(__linux__) && !defined(__FreeBSD__)
 #define	USE_DNODE_HANDLE	1
 	struct dnode_handle *db_dnode_handle;
 #else
 	struct dnode *db_dnode;
 #endif
 
 	/*
 	 * our parent buffer; if the dnode points to us directly,
 	 * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
 	 * only accessed by sync thread ???
 	 * (NULL when evicted)
 	 * May change from NULL to non-NULL under the protection of db_mtx
 	 * (see dbuf_check_blkptr())
 	 */
 	struct dmu_buf_impl *db_parent;
 
 	/*
 	 * link for hash table of all dmu_buf_impl_t's
 	 */
 	struct dmu_buf_impl *db_hash_next;
 
 	/*
 	 * Our link on the owner dnodes's dn_dbufs list.
 	 * Protected by its dn_dbufs_mtx.  Should be on the same cache line
 	 * as db_level and db_blkid for the best avl_add() performance.
 	 */
 	avl_node_t db_link;
 
 	/* our block number */
 	uint64_t db_blkid;
 
 	/*
 	 * Pointer to the blkptr_t which points to us. May be NULL if we
 	 * don't have one yet. (NULL when evicted)
 	 */
 	blkptr_t *db_blkptr;
 
 	/*
 	 * Our indirection level.  Data buffers have db_level==0.
 	 * Indirect buffers which point to data buffers have
 	 * db_level==1. etc.  Buffers which contain dnodes have
 	 * db_level==0, since the dnodes are stored in a file.
 	 */
 	uint8_t db_level;
 
 	/* This block was freed while a read or write was active. */
 	uint8_t db_freed_in_flight;
 
 	/*
 	 * Evict user data as soon as the dirty and reference counts are equal.
 	 */
 	uint8_t db_user_immediate_evict;
 
 	/*
 	 * dnode_evict_dbufs() or dnode_evict_bonus() tried to evict this dbuf,
 	 * but couldn't due to outstanding references.  Evict once the refcount
 	 * drops to 0.
 	 */
 	uint8_t db_pending_evict;
 
 	/* Number of TXGs in which this buffer is dirty. */
 	uint8_t db_dirtycnt;
 
 	/* The buffer was partially read.  More reads may follow. */
 	uint8_t db_partial_read;
 
 	/*
 	 * Protects db_buf's contents if they contain an indirect block or data
 	 * block of the meta-dnode. We use this lock to protect the structure of
 	 * the block tree. This means that when modifying this dbuf's data, we
 	 * grab its rwlock. When modifying its parent's data (including the
 	 * blkptr to this dbuf), we grab the parent's rwlock. The lock ordering
 	 * for this lock is:
 	 * 1) dn_struct_rwlock
 	 * 2) db_rwlock
 	 * We don't currently grab multiple dbufs' db_rwlocks at once.
 	 */
 	krwlock_t db_rwlock;
 
 	/* buffer holding our data */
 	arc_buf_t *db_buf;
 
 	/* db_mtx protects the members below */
 	kmutex_t db_mtx;
 
 	/*
 	 * Current state of the buffer
 	 */
 	dbuf_states_t db_state;
 
 	/* In which dbuf cache this dbuf is, if any. */
 	dbuf_cached_state_t db_caching_status;
 
 	/*
 	 * Refcount accessed by dmu_buf_{hold,rele}.
 	 * If nonzero, the buffer can't be destroyed.
 	 * Protected by db_mtx.
 	 */
 	zfs_refcount_t db_holds;
 
 	/*
 	 * NOTE(review): condition variable, presumably signalled when the
 	 * dbuf's state changes -- confirm against the waiters in dbuf.c.
 	 */
 	kcondvar_t db_changed;
 	/*
 	 * NOTE(review): presumably the dirty record currently being written
 	 * out, if any -- confirm.
 	 */
 	dbuf_dirty_record_t *db_data_pending;
 
 	/* List of dirty records for the buffer sorted newest to oldest. */
 	list_t db_dirty_records;
 
 	/* Link in dbuf_cache or dbuf_metadata_cache */
 	multilist_node_t db_cache_link;
 
 	/*
 	 * NOTE(review): presumably this dbuf's hash-table hash value (compare
 	 * the hash_out argument of dbuf_find()) -- confirm.
 	 */
 	uint64_t db_hash;
 
 	/* User callback information. */
 	dmu_buf_user_t *db_user;
 } dmu_buf_impl_t;
 
 /*
  * Yield a pointer to the entry of (h)->hash_mutexes selected by idx after
  * masking it with (h)->hash_mutex_mask.  Note that h is evaluated twice.
  */
 #define	DBUF_HASH_MUTEX(h, idx) \
 	(&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])
 
 /*
  * The dbuf hash table: an array of dmu_buf_impl_t pointers plus an array of
  * mutexes that DBUF_HASH_MUTEX() indexes with hash_mutex_mask.
  * NOTE(review): hash_table_mask is presumably the matching index mask for
  * hash_table, with buckets chained through db_hash_next -- confirm in dbuf.c.
  */
 typedef struct dbuf_hash_table {
 	uint64_t hash_table_mask;
 	uint64_t hash_mutex_mask;
 	dmu_buf_impl_t **hash_table;
 	kmutex_t *hash_mutexes;
 } dbuf_hash_table_t;
 
 typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t);
 
 extern kmem_cache_t *dbuf_dirty_kmem_cache;
 
 uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level,
     const uint64_t offset);
 
 void dbuf_create_bonus(struct dnode *dn);
 int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
 
 void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
 
 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, const void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
     const void *tag);
 int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
     boolean_t fail_sparse, boolean_t fail_uncached,
     const void *tag, dmu_buf_impl_t **dbp);
 
 int dbuf_prefetch_impl(struct dnode *dn, int64_t level, uint64_t blkid,
     zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
     void *arg);
 int dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
     zio_priority_t prio, arc_flags_t aflags);
 
 void dbuf_add_ref(dmu_buf_impl_t *db, const void *tag);
 boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
     uint64_t blkid, const void *tag);
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, const void *tag);
 void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag,
     boolean_t evicting);
 
 dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
     uint64_t blkid, uint64_t *hash_out);
 
 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
 void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
 boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
     dmu_tx_t *tx);
 boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 int dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp);
 int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 
 int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
     const struct zio_prop *zp, zio_flag_t flags, dmu_tx_t *tx);
 
 void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx);
 void dbuf_destroy(dmu_buf_impl_t *db);
 
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
 void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw,
     const void *tag);
 void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type,
     const void *tag);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);
 
 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
 
 void dbuf_stats_init(dbuf_hash_table_t *hash);
 void dbuf_stats_destroy(void);
 
 int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
     blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift);
 
 /*
  * Accessors for the dnode a dbuf belongs to.
  *
  * With USE_DNODE_HANDLE the dnode is reached through db_dnode_handle, and
  * DB_DNODE_ENTER/EXIT call zrl_add()/zrl_remove() on the handle's
  * dnh_zrlock; DB_DNODE_HELD tests that lock with zrl_is_zero().
  *
  * Without it the dbuf points at its dnode directly: DB_DNODE_LOCK, ENTER and
  * EXIT expand to nothing and DB_DNODE_HELD is always B_TRUE.
  */
 #ifdef USE_DNODE_HANDLE
 #define	DB_DNODE(_db)		((_db)->db_dnode_handle->dnh_dnode)
 #define	DB_DNODE_LOCK(_db)	((_db)->db_dnode_handle->dnh_zrlock)
 #define	DB_DNODE_ENTER(_db)	(zrl_add(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_EXIT(_db)	(zrl_remove(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_HELD(_db)	(!zrl_is_zero(&DB_DNODE_LOCK(_db)))
 #else
 #define	DB_DNODE(_db)		((_db)->db_dnode)
 #define	DB_DNODE_LOCK(_db)
 #define	DB_DNODE_ENTER(_db)
 #define	DB_DNODE_EXIT(_db)
 #define	DB_DNODE_HELD(_db)	(B_TRUE)
 #endif
 
 void dbuf_init(void);
 void dbuf_fini(void);
 
 boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
 
 /*
  * Walk db's dirty-record list from its head and return the first record
  * whose dr_txg is less than or equal to txg.  Returns NULL if the list is
  * empty or every record on it has a dr_txg greater than txg.
  */
 static inline dbuf_dirty_record_t *
 dbuf_find_dirty_lte(dmu_buf_impl_t *db, uint64_t txg)
 {
 	list_t *records = &db->db_dirty_records;
 	dbuf_dirty_record_t *cur = list_head(records);
 
 	/* Skip every record that belongs to a txg later than the target. */
 	while (cur != NULL && cur->dr_txg > txg)
 		cur = list_next(records, cur);
 
 	return (cur);
 }
 
 /*
  * Return db's dirty record whose dr_txg is exactly txg, or NULL if there is
  * none.  Uses dbuf_find_dirty_lte() to locate the candidate record and then
  * rejects it unless its txg matches.
  */
 static inline dbuf_dirty_record_t *
 dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
 {
 	dbuf_dirty_record_t *match = dbuf_find_dirty_lte(db, txg);
 
 	if (match == NULL || match->dr_txg != txg)
 		return (NULL);
 
 	return (match);
 }
 
 /*
  * Evaluates to ARC_BUFC_METADATA when dbuf_is_metadata(_db) is true and to
  * ARC_BUFC_DATA otherwise.
  */
 #define	DBUF_GET_BUFC_TYPE(_db)	\
 	(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 /*
  * True when the dbuf's objset has os_primary_cache set to ZFS_CACHE_ALL, or
  * when it is set to ZFS_CACHE_METADATA and dbuf_is_metadata(_db) is true.
  * _db may be evaluated more than once, so pass an expression without side
  * effects.
  */
 #define	DBUF_IS_CACHEABLE(_db)						\
 	((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
 	(dbuf_is_metadata(_db) &&					\
 	((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
 
 boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp);
 
 #ifdef ZFS_DEBUG
 
 /*
  * There should be a ## between the string literal and fmt, to make it
  * clear that we're joining two strings together, but gcc does not
  * support that preprocessor token.
  */
 #define	dprintf_dbuf(dbuf, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 	char __db_buf[32]; \
 	uint64_t __db_obj = (dbuf)->db.db_object; \
 	if (__db_obj == DMU_META_DNODE_OBJECT) \
 		(void) strlcpy(__db_buf, "mdn", sizeof (__db_buf));	\
 	else \
 		(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
 		    (u_longlong_t)__db_obj); \
 	dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
 	    "obj=%s lvl=%u blkid=%lld " fmt, \
 	    __db_buf, (dbuf)->db_level, \
 	    (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
 	} \
 } while (0)
 
 #define	dprintf_dbuf_bp(db, bp, fmt, ...) do {			\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
 	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);		\
 	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	}							\
 } while (0)
 
 #define	DBUF_VERIFY(db)	dbuf_verify(db)
 
 #else
 
 #define	dprintf_dbuf(db, fmt, ...)
 #define	dprintf_dbuf_bp(db, bp, fmt, ...)
 #define	DBUF_VERIFY(db)
 
 #endif
 
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DBUF_H */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index af47d6f87a41..78adca4d7d00 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -1,736 +1,737 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019, 2023, 2024, Klara Inc.
  * Copyright (c) 2019-2020, Michael Niewöhner
  * Copyright (c) 2024 by George Melikov. All rights reserved.
  */
 
 #ifndef _ZIO_H
 #define	_ZIO_H
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_impl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Embedded checksum
  */
 #define	ZEC_MAGIC	0x210da7ab10c7a11ULL
 
 typedef struct zio_eck {
 	uint64_t	zec_magic;	/* for validation, endianness	*/
 	zio_cksum_t	zec_cksum;	/* 256-bit checksum		*/
 } zio_eck_t;
 
 /*
  * Gang block headers are self-checksumming and contain an array
  * of block pointers.
  */
 #define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
 #define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t)) / sizeof (blkptr_t))
 #define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t) - \
 	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
 	sizeof (uint64_t))
 
 typedef struct zio_gbh {
 	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
 	uint64_t		zg_filler[SPA_GBH_FILLER];
 	zio_eck_t		zg_tail;
 } zio_gbh_phys_t;
 
 enum zio_checksum {
 	ZIO_CHECKSUM_INHERIT = 0,
 	ZIO_CHECKSUM_ON,
 	ZIO_CHECKSUM_OFF,
 	ZIO_CHECKSUM_LABEL,
 	ZIO_CHECKSUM_GANG_HEADER,
 	ZIO_CHECKSUM_ZILOG,
 	ZIO_CHECKSUM_FLETCHER_2,
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
 	ZIO_CHECKSUM_ZILOG2,
 	ZIO_CHECKSUM_NOPARITY,
 	ZIO_CHECKSUM_SHA512,
 	ZIO_CHECKSUM_SKEIN,
 	ZIO_CHECKSUM_EDONR,
 	ZIO_CHECKSUM_BLAKE3,
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
 /*
  * The number of "legacy" checksum functions which can be set on individual
  * objects.
  */
 #define	ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
 
 #define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
 #define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
 
 #define	ZIO_CHECKSUM_MASK	0xffULL
 #define	ZIO_CHECKSUM_VERIFY	(1U << 8)
 
 #define	ZIO_DEDUPCHECKSUM	ZIO_CHECKSUM_SHA256
 
 /* macros defining encryption lengths */
 #define	ZIO_OBJSET_MAC_LEN		32
 #define	ZIO_DATA_IV_LEN			12
 #define	ZIO_DATA_SALT_LEN		8
 #define	ZIO_DATA_MAC_LEN		16
 
 /*
  * The number of "legacy" compression functions which can be set on individual
  * objects.
  */
 #define	ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
 
 /*
  * The meaning of "compress = on" selected by the compression features enabled
  * on a given pool.
  */
 #define	ZIO_COMPRESS_LEGACY_ON_VALUE	ZIO_COMPRESS_LZJB
 #define	ZIO_COMPRESS_LZ4_ON_VALUE	ZIO_COMPRESS_LZ4
 
 #define	ZIO_COMPRESS_DEFAULT		ZIO_COMPRESS_ON
 
 #define	BOOTFS_COMPRESS_VALID(compress)			\
 	((compress) == ZIO_COMPRESS_LZJB ||		\
 	(compress) == ZIO_COMPRESS_LZ4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_1 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_2 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_3 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_5 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_6 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_7 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_8 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_9 ||		\
 	(compress) == ZIO_COMPRESS_ZLE ||		\
 	(compress) == ZIO_COMPRESS_ZSTD ||		\
 	(compress) == ZIO_COMPRESS_ON ||		\
 	(compress) == ZIO_COMPRESS_OFF)
 
 
 /*
  * Pack and unpack a combined compression value.  The algorithm occupies the
  * bits covered by SPA_COMPRESSMASK and the level is stored above it, shifted
  * left by SPA_COMPRESSBITS.
  *
  * ZIO_COMPRESS_ALGO(x)          - the algorithm bits of x
  * ZIO_COMPRESS_LEVEL(x)         - the level stored in x
  * ZIO_COMPRESS_RAW(type, level) - combine an algorithm and a level
  *
  * Every macro argument is parenthesized so that an argument containing a
  * lower-precedence operator (|, ?:, ...) is evaluated as a unit.
  */
 #define	ZIO_COMPRESS_ALGO(x)	((x) & SPA_COMPRESSMASK)
 #define	ZIO_COMPRESS_LEVEL(x)	(((x) & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS)
 #define	ZIO_COMPRESS_RAW(type, level)	((type) | ((level) << SPA_COMPRESSBITS))
 
 #define	ZIO_COMPLEVEL_ZSTD(level)	\
 	ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level)
 
 #define	ZIO_FAILURE_MODE_WAIT		0
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
 
 typedef enum zio_suspend_reason {
 	ZIO_SUSPEND_NONE = 0,
 	ZIO_SUSPEND_IOERR,
 	ZIO_SUSPEND_MMP,
 } zio_suspend_reason_t;
 
 /*
  * This was originally an enum type. However, those are 32-bit and there is no
  * way to make a 64-bit enum type. Since we ran out of bits for flags, we were
  * forced to upgrade it to a uint64_t.
  *
  * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
  * FLAG.
  */
 typedef uint64_t zio_flag_t;
 	/*
 	 * Flags inherited by gang, ddt, and vdev children,
 	 * and that must be equal for two zios to aggregate
 	 */
 #define	ZIO_FLAG_DONT_AGGREGATE	(1ULL << 0)
 #define	ZIO_FLAG_IO_REPAIR	(1ULL << 1)
 #define	ZIO_FLAG_SELF_HEAL	(1ULL << 2)
 #define	ZIO_FLAG_RESILVER	(1ULL << 3)
 #define	ZIO_FLAG_SCRUB		(1ULL << 4)
 #define	ZIO_FLAG_SCAN_THREAD	(1ULL << 5)
 #define	ZIO_FLAG_PHYSICAL	(1ULL << 6)
 
 #define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
 
 	/*
 	 * Flags inherited by ddt, gang, and vdev children.
 	 */
 #define	ZIO_FLAG_CANFAIL	(1ULL << 7)	/* must be first for INHERIT */
 #define	ZIO_FLAG_SPECULATIVE	(1ULL << 8)
 #define	ZIO_FLAG_CONFIG_WRITER	(1ULL << 9)
 #define	ZIO_FLAG_DONT_RETRY	(1ULL << 10)
 #define	ZIO_FLAG_NODATA		(1ULL << 12)
 #define	ZIO_FLAG_INDUCE_DAMAGE	(1ULL << 13)
 #define	ZIO_FLAG_IO_ALLOCATING	(1ULL << 14)
 
 #define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 #define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 
 	/*
 	 * Flags inherited by vdev children.
 	 */
 #define	ZIO_FLAG_IO_RETRY	(1ULL << 15)	/* must be first for INHERIT */
 #define	ZIO_FLAG_PROBE		(1ULL << 16)
 #define	ZIO_FLAG_TRYHARD	(1ULL << 17)
 #define	ZIO_FLAG_OPTIONAL	(1ULL << 18)
 #define	ZIO_FLAG_DIO_READ	(1ULL << 19)
 #define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
 
 	/*
 	 * Flags not inherited by any children.
 	 */
 #define	ZIO_FLAG_DONT_QUEUE	(1ULL << 20)	/* must be first for INHERIT */
 #define	ZIO_FLAG_DONT_PROPAGATE	(1ULL << 21)
 #define	ZIO_FLAG_IO_BYPASS	(1ULL << 22)
 #define	ZIO_FLAG_IO_REWRITE	(1ULL << 23)
 #define	ZIO_FLAG_RAW_COMPRESS	(1ULL << 24)
 #define	ZIO_FLAG_RAW_ENCRYPT	(1ULL << 25)
 #define	ZIO_FLAG_GANG_CHILD	(1ULL << 26)
 #define	ZIO_FLAG_DDT_CHILD	(1ULL << 27)
 #define	ZIO_FLAG_GODFATHER	(1ULL << 28)
 #define	ZIO_FLAG_NOPWRITE	(1ULL << 29)
 #define	ZIO_FLAG_REEXECUTED	(1ULL << 30)
 #define	ZIO_FLAG_DELEGATED	(1ULL << 31)
 #define	ZIO_FLAG_DIO_CHKSUM_ERR	(1ULL << 32)
 
 #define	ZIO_ALLOCATOR_NONE	(-1)
 #define	ZIO_HAS_ALLOCATOR(zio)	((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
 #define	ZIO_FLAG_RAW	(ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
 
 #define	ZIO_DDT_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) |		\
 	ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_GANG_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) |		\
 	ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_VDEV_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |		\
 	ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_CHILD_BIT(x)		(1U << (x))
 #define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1U << (x)))
 
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
 	ZIO_CHILD_DDT,
 	ZIO_CHILD_LOGICAL,
 	ZIO_CHILD_TYPES
 };
 
 #define	ZIO_CHILD_VDEV_BIT		ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
 #define	ZIO_CHILD_GANG_BIT		ZIO_CHILD_BIT(ZIO_CHILD_GANG)
 #define	ZIO_CHILD_DDT_BIT		ZIO_CHILD_BIT(ZIO_CHILD_DDT)
 #define	ZIO_CHILD_LOGICAL_BIT		ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
 #define	ZIO_CHILD_ALL_BITS					\
 	(ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT |		\
 	ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
 
 enum zio_wait_type {
 	ZIO_WAIT_READY = 0,
 	ZIO_WAIT_DONE,
 	ZIO_WAIT_TYPES
 };
 
 typedef void zio_done_func_t(zio_t *zio);
 
 extern int zio_exclude_metadata;
 extern int zio_dva_throttle_enabled;
 extern const char *const zio_type_name[ZIO_TYPES];
 
 /*
  * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
  * identifies any block in the pool.  By convention, the meta-objset (MOS)
  * is objset 0, and the meta-dnode is object 0.  This covers all blocks
  * except root blocks and ZIL blocks, which are defined as follows:
  *
  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
  * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
  *
  * Note: this structure is called a bookmark because its original purpose
  * was to remember where to resume a pool-wide traverse.
  *
  * Note: this structure is passed between userland and the kernel, and is
  * stored on disk (by virtue of being incorporated into other on-disk
  * structures, e.g. dsl_scan_phys_t).
  *
  * If the head_errlog feature is enabled a different on-disk format for error
  * logs is used. This introduces the use of an error bookmark, a four-tuple
  * <object, level, blkid, birth> that uniquely identifies any error block
  * in the pool. The birth transaction group is used to track whether the block
  * has been overwritten by newer data or added to a snapshot since its marking
  * as an error.
  */
 struct zbookmark_phys {
 	uint64_t	zb_objset;
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
 };
 
 struct zbookmark_err_phys {
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
 	uint64_t	zb_birth;
 };
 
 /*
  * Store objset, object, level and blkid into the four fields of *zb.
  *
  * The expansion is a bare brace block rather than do { } while (0), so the
  * semicolon written after an invocation becomes a separate empty statement;
  * take care when using it as the body of an unbraced if that has an else.
  * zb is evaluated four times.
  */
 #define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
 {                                                       \
 	(zb)->zb_objset = objset;                       \
 	(zb)->zb_object = object;                       \
 	(zb)->zb_level = level;                         \
 	(zb)->zb_blkid = blkid;                         \
 }
 
 #define	ZB_DESTROYED_OBJSET	(-1ULL)
 
 #define	ZB_ROOT_OBJECT		(0ULL)
 #define	ZB_ROOT_LEVEL		(-1LL)
 #define	ZB_ROOT_BLKID		(0ULL)
 
 #define	ZB_ZIL_OBJECT		(0ULL)
 #define	ZB_ZIL_LEVEL		(-2LL)
 
 #define	ZB_DNODE_LEVEL		(-3LL)
 #define	ZB_DNODE_BLKID		(0ULL)
 
 /* True when all four fields of *zb are zero.  zb is evaluated four times. */
 #define	ZB_IS_ZERO(zb)						\
 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
 /*
  * True when *zb has the root-block object, level and blkid values
  * (ZB_ROOT_*).  zb_objset is not examined.  zb is evaluated three times.
  */
 #define	ZB_IS_ROOT(zb)				\
 	((zb)->zb_object == ZB_ROOT_OBJECT &&	\
 	(zb)->zb_level == ZB_ROOT_LEVEL &&	\
 	(zb)->zb_blkid == ZB_ROOT_BLKID)
 
 /*
  * Per-I/O properties.  A copy is embedded in struct zio as io_prop, and a
  * pointer to one is passed to zio_write().
  */
 typedef struct zio_prop {
 	enum zio_checksum	zp_checksum;
 	enum zio_compress	zp_compress;
 	uint8_t			zp_complevel;
 	uint8_t			zp_level;
 	/*
 	 * NOTE(review): zp_copies is presumably the number of copies of the
 	 * block to write and zp_gang_copies the number used for gang
 	 * headers -- confirm against zio.c / dmu.c.
 	 */
 	uint8_t			zp_copies;
+	uint8_t			zp_gang_copies;
 	dmu_object_type_t	zp_type;
 	boolean_t		zp_dedup;
 	boolean_t		zp_dedup_verify;
 	boolean_t		zp_nopwrite;
 	boolean_t		zp_brtwrite;
 	boolean_t		zp_encrypt;
 	boolean_t		zp_byteorder;
 	boolean_t		zp_direct_write;
 	/* Sized by the ZIO_DATA_*_LEN encryption-length macros. */
 	uint8_t			zp_salt[ZIO_DATA_SALT_LEN];
 	uint8_t			zp_iv[ZIO_DATA_IV_LEN];
 	uint8_t			zp_mac[ZIO_DATA_MAC_LEN];
 	uint32_t		zp_zpl_smallblk;
 	dmu_object_type_t	zp_storage_type;
 } zio_prop_t;
 
 typedef struct zio_cksum_report zio_cksum_report_t;
 
 typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
     const abd_t *good_data);
 typedef void zio_cksum_free_f(void *cbdata, size_t size);
 
 struct zio_bad_cksum;				/* defined in zio_checksum.h */
 struct dnode_phys;
 struct abd;
 
 struct zio_cksum_report {
 	struct zio_cksum_report *zcr_next;
 	nvlist_t		*zcr_ereport;
 	nvlist_t		*zcr_detector;
 	void			*zcr_cbdata;
 	size_t			zcr_cbinfo;	/* passed to zcr_free() */
 	uint64_t		zcr_sector;
 	uint64_t		zcr_align;
 	uint64_t		zcr_length;
 	zio_cksum_finish_f	*zcr_finish;
 	zio_cksum_free_f	*zcr_free;
 
 	/* internal use only */
 	struct zio_bad_cksum	*zcr_ckinfo;	/* information from failure */
 };
 
 typedef struct zio_vsd_ops {
 	zio_done_func_t		*vsd_free;
 } zio_vsd_ops_t;
 
 /*
  * One node of an in-core gang tree (see io_gang_tree in struct zio): a
  * pointer to a gang block header plus one child-node pointer for each of the
  * SPA_GBH_NBLKPTRS block pointers a header holds.
  */
 typedef struct zio_gang_node {
 	zio_gbh_phys_t		*gn_gbh;
 	struct zio_gang_node	*gn_child[SPA_GBH_NBLKPTRS];
 } zio_gang_node_t;
 
 typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
     zio_gang_node_t *gn, struct abd *data, uint64_t offset);
 
 typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
 
 typedef struct zio_transform {
 	struct abd		*zt_orig_abd;
 	uint64_t		zt_orig_size;
 	uint64_t		zt_bufsize;
 	zio_transform_func_t	*zt_transform;
 	struct zio_transform	*zt_next;
 } zio_transform_t;
 
 typedef zio_t *zio_pipe_stage_t(zio_t *zio);
 
 /*
  * The io_reexecute flags are distinct from io_flags because the child must
  * be able to propagate them to the parent.  The normal io_flags are local
  * to the zio, not protected by any lock, and not modifiable by children;
  * the reexecute flags are protected by io_lock, modifiable by children,
  * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
  */
 #define	ZIO_REEXECUTE_NOW	0x01
 #define	ZIO_REEXECUTE_SUSPEND	0x02
 
 /*
  * The io_trim flags are used to specify the type of TRIM to perform.  They
  * only apply to ZIO_TYPE_TRIM zios and are distinct from io_flags.
  */
 enum trim_flag {
 	ZIO_TRIM_SECURE		= 1U << 0,
 };
 
 typedef struct zio_alloc_list {
 	list_t  zal_list;
 	uint64_t zal_size;
 } zio_alloc_list_t;
 
 typedef struct zio_link {
 	zio_t		*zl_parent;
 	zio_t		*zl_child;
 	list_node_t	zl_parent_node;
 	list_node_t	zl_child_node;
 } zio_link_t;
 
 enum zio_qstate {
 	ZIO_QS_NONE = 0,
 	ZIO_QS_QUEUED,
 	ZIO_QS_ACTIVE,
 };
 
 /*
  * State carried by a single I/O.  The fields are grouped by the section
  * comments below: core information, callbacks, the data buffers, vdev
  * stack state, internal pipeline state, FMA state and taskq dispatching
  * state.
  */
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_phys_t	io_bookmark;
 	zio_prop_t	io_prop;
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	enum trim_flag	io_trim_flags;
 	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
 	spa_t		*io_spa;
 	blkptr_t	*io_bp;
 	blkptr_t	*io_bp_override;
 	blkptr_t	io_bp_copy;
 	list_t		io_parent_list;
 	list_t		io_child_list;
 	zio_t		*io_logical;
 	zio_transform_t *io_transform_stack;
 
 	/* Callback info */
 	zio_done_func_t	*io_ready;
 	zio_done_func_t	*io_children_ready;
 	zio_done_func_t	*io_done;
 	void		*io_private;
 	int64_t		io_prev_space_delta;	/* DMU private */
 	blkptr_t	io_bp_orig;
 	/* io_lsize != io_orig_size iff this is a raw write */
 	uint64_t	io_lsize;
 
 	/* Data represented by this I/O */
 	struct abd	*io_abd;
 	struct abd	*io_orig_abd;
 	uint64_t	io_size;
 	uint64_t	io_orig_size;
 
 	/* Stuff for the vdev stack */
 	vdev_t		*io_vd;
 	void		*io_vsd;
 	const zio_vsd_ops_t *io_vsd_ops;
 	metaslab_class_t *io_metaslab_class;	/* dva throttle class */
 
 	enum zio_qstate	io_queue_state;	/* vdev queue state */
 	/* One storage slot usable either as a list node or as an AVL node. */
 	union {
 		list_node_t l;
 		avl_node_t a;
 	} io_queue_node ____cacheline_aligned;	/* allocator and vdev queues */
 	avl_node_t	io_offset_node;	/* vdev offset queues */
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;	/* submitted at */
 	hrtime_t	io_queued_timestamp;
 	hrtime_t	io_target_timestamp;
 	hrtime_t	io_delta;	/* vdev queue service delta */
 	hrtime_t	io_delay;	/* Device access time (disk or */
 					/* file). */
 	zio_alloc_list_t 	io_alloc_list;
 
 	/* Internal pipeline state */
 	zio_flag_t	io_flags;
 	enum zio_stage	io_stage;
 	enum zio_stage	io_pipeline;
 	zio_flag_t	io_orig_flags;
 	enum zio_stage	io_orig_stage;
 	enum zio_stage	io_orig_pipeline;
 	enum zio_stage	io_pipeline_trace;
 	int		io_error;
 	/* Indexed by child type, and by child type x wait type. */
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
 	void		*io_executor;
 	void		*io_waiter;
 	void		*io_bio;
 	kmutex_t	io_lock;
 	kcondvar_t	io_cv;
 	int		io_allocator;
 
 	/* FMA state */
 	zio_cksum_report_t *io_cksum_report;
 	uint64_t	io_ena;
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
 };
 
 /*
  * Verification mode; this is the type of the blk_verify argument of
  * zfs_blkptr_verify().
  * NOTE(review): the names suggest check-only, log-on-failure and
  * halt-on-failure behaviour -- confirm in zfs_blkptr_verify().
  */
 enum blk_verify_flag {
 	BLK_VERIFY_ONLY,
 	BLK_VERIFY_LOG,
 	BLK_VERIFY_HALT
 };
 
 /*
  * Type of the blk_config argument of zfs_blkptr_verify(); each value
  * states how SCL_VDEV is to be handled, as described per enumerator.
  */
 enum blk_config_flag {
 	BLK_CONFIG_HELD,   // SCL_VDEV held for writer
 	BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader
 	BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader
 	BLK_CONFIG_SKIP,   // skip checks which require SCL_VDEV
 };
 
 extern int zio_bookmark_compare(const void *, const void *);
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
 extern zio_t *zio_root(spa_t *spa,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
 extern void zio_destroy(zio_t *zio);
 
 extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
     zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
     zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);
 
 extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
-    boolean_t nopwrite, boolean_t brtwrite);
+    int gang_copies, boolean_t nopwrite, boolean_t brtwrite);
 
 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 
 extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
 extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, enum trim_flag trim_flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, boolean_t labels);
 
 extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, zio_flag_t flags);
 
 extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
     blkptr_t *new_bp, uint64_t size, boolean_t *slog);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern size_t zio_get_compression_max_size(enum zio_compress compress,
     uint64_t gcd_alloc, uint64_t min_alloc, size_t s_len);
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
 extern void zio_execute(void *zio);
 extern void zio_interrupt(void *zio);
 extern void zio_delay_init(zio_t *zio);
 extern void zio_delay_interrupt(zio_t *zio);
 extern void zio_deadman(zio_t *zio, const char *tag);
 
 extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
 extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
 extern zio_t *zio_unique_parent(zio_t *cio);
 extern void zio_add_child(zio_t *pio, zio_t *cio);
 extern void zio_add_child_first(zio_t *pio, zio_t *cio);
 
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
 extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
     uint64_t bufsize, zio_transform_func_t *transform);
 extern void zio_pop_transforms(zio_t *zio);
 
 extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
     uint64_t offset, struct abd *data, uint64_t size, int type,
     zio_priority_t priority, zio_flag_t flags,
     zio_done_func_t *done, void *priv);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
     struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
     zio_flag_t flags, zio_done_func_t *done, void *priv);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
 extern void zio_vdev_io_redone(zio_t *zio);
 
 extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
 
 extern void zio_checksum_verified(zio_t *zio);
 extern void zio_dio_chksum_verify_error_report(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
 extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
     enum zio_checksum parent);
 extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
     enum zio_checksum child, enum zio_checksum parent);
 extern enum zio_compress zio_compress_select(spa_t *spa,
     enum zio_compress child, enum zio_compress parent);
 extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress,
     uint8_t child, uint8_t parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t);
 extern int zio_resume(spa_t *spa);
 extern void zio_resume_wait(spa_t *spa);
 
 extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
     enum blk_config_flag blk_config, enum blk_verify_flag blk_verify);
 
 /*
  * Initial setup and teardown.
  */
 extern void zio_init(void);
 extern void zio_fini(void);
 
 /*
  * Fault injection
  */
 struct zinject_record;
 extern uint32_t zio_injection_enabled;
 extern int zio_inject_fault(char *name, int flags, int *id,
     struct zinject_record *record);
 extern int zio_inject_list_next(int *id, char *name, size_t buflen,
     struct zinject_record *record);
 extern int zio_clear_fault(int id);
 extern void zio_handle_panic_injection(spa_t *spa, const char *tag,
     uint64_t type);
 extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
     uint64_t type, int error);
 extern int zio_handle_fault_injection(zio_t *zio, int error);
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1,
     int err2);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
 extern hrtime_t zio_handle_io_delay(zio_t *zio);
 extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed);
 extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed);
 
 /*
  * Checksum ereport functions
  */
 extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
     uint64_t length, struct zio_bad_cksum *info);
 extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
     const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical);
 
 extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
 
 /* If we have the good data in hand, this function can be used */
 extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
     uint64_t length, const abd_t *good_data, const abd_t *bad_data,
     struct zio_bad_cksum *info);
 
 void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr);
 extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa,
     const char *name);
 
 /* Called from spa_sync(), but primarily an injection handler */
 extern void spa_handle_ignored_writes(spa_t *spa);
 
 /* zbookmark_phys functions */
 boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
 boolean_t zbookmark_subtree_tbd(const struct dnode_phys *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
 int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
     uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _ZIO_H */
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index e97d588b4de7..d07a5f076a25 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1,11095 +1,11097 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2018, Joyent, Inc.
  * Copyright (c) 2011, 2020, Delphix. All rights reserved.
  * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2020, George Amanakis. All rights reserved.
  * Copyright (c) 2019, 2024, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  * Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
  *
  * [1] Portions of this software were developed by Allan Jude
  *     under sponsorship from the FreeBSD Foundation.
  */
 
 /*
  * DVA-based Adjustable Replacement Cache
  *
  * While much of the theory of operation used here is
  * based on the self-tuning, low overhead replacement cache
  * presented by Megiddo and Modha at FAST 2003, there are some
  * significant differences:
  *
  * 1. The Megiddo and Modha model assumes any page is evictable.
  * Pages in its cache cannot be "locked" into memory.  This makes
  * the eviction algorithm simple: evict the last page in the list.
  * This also makes the performance characteristics easy to reason
  * about.  Our cache is not so simple.  At any given moment, some
  * subset of the blocks in the cache are un-evictable because we
  * have handed out a reference to them.  Blocks are only evictable
  * when there are no external references active.  This makes
  * eviction far more problematic:  we choose to evict the evictable
  * blocks that are the "lowest" in the list.
  *
  * There are times when it is not possible to evict the requested
  * space.  In these circumstances we are unable to adjust the cache
  * size.  To prevent the cache growing unbounded at these times we
  * implement a "cache throttle" that slows the flow of new data
  * into the cache until we can make space available.
  *
  * 2. The Megiddo and Modha model assumes a fixed cache size.
  * Pages are evicted when the cache is full and there is a cache
  * miss.  Our model has a variable sized cache.  It grows with
  * high use, but also tries to react to memory pressure from the
  * operating system: decreasing its size when system memory is
  * tight.
  *
  * 3. The Megiddo and Modha model assumes a fixed page size. All
  * elements of the cache are therefore exactly the same size.  So
  * when adjusting the cache size following a cache miss, it's simply
  * a matter of choosing a single page to evict.  In our model, we
  * have variable sized cache blocks (ranging from 512 bytes to
  * 128K bytes).  We therefore choose a set of blocks to evict to make
  * space for a cache miss that approximates as closely as possible
  * the space used by the new block.
  *
  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  * by N. Megiddo & D. Modha, FAST 2003
  */
 
 /*
  * The locking model:
  *
  * A new reference to a cache buffer can be obtained in two
  * ways: 1) via a hash table lookup using the DVA as a key,
  * or 2) via one of the ARC lists.  The arc_read() interface
  * uses method 1, while the internal ARC algorithms for
  * adjusting the cache use method 2.  We therefore provide two
  * types of locks: 1) the hash table lock array, and 2) the
  * ARC list locks.
  *
  * Buffers do not have their own mutexes, rather they rely on the
  * hash table mutexes for the bulk of their protection (i.e. most
  * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
  * NULL for the mutex if the buffer was not in the table.
  *
  * buf_hash_remove() expects the appropriate hash mutex to be
  * already held before it is invoked.
  *
  * Each ARC state also has a mutex which is used to protect the
  * buffer list associated with the state.  When attempting to
  * obtain a hash table lock while holding an ARC list lock you
  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  * the active state mutex must be held before the ghost state mutex.
  *
  * It is also possible to register a callback which is run when the
  * metadata limit is reached and no buffers can be safely evicted.  In
  * this case the arc user should drop a reference on some arc buffers so
  * they can be reclaimed.  For example, when using the ZPL each dentry
  * holds a reference on a znode.  These dentries must be pruned before
  * the arc buffer holding the znode can be safely evicted.
  *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
  * The L2ARC uses the l2ad_mtx on each vdev for the following:
  *
  *	- L2ARC buflist creation
  *	- L2ARC buflist eviction
  *	- L2ARC write completion, which walks L2ARC buflists
  *	- ARC header destruction, as it removes from L2ARC buflists
  *	- ARC header release, as it removes from L2ARC buflists
  */
 
 /*
  * ARC operation:
  *
  * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
  * This structure can point either to a block that is still in the cache or to
  * one that is only accessible in an L2 ARC device, or it can provide
  * information about a block that was recently evicted. If a block is
  * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
  * information to retrieve it from the L2ARC device. This information is
  * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
  * that is in this state cannot access the data directly.
  *
  * Blocks that are actively being referenced or have not been evicted
  * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
  * the arc_buf_hdr_t that will point to the data block in memory. A block can
  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
  * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
  *
  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
  * ability to store the physical data (b_pabd) associated with the DVA of the
  * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
  * it will match its on-disk compression characteristics. This behavior can be
  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
  * compressed ARC functionality is disabled, the b_pabd will point to an
  * uncompressed version of the on-disk data.
  *
  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
  * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
  * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
  * consumer. The ARC will provide references to this data and will keep it
  * cached until it is no longer in use. The ARC caches only the L1ARC's physical
  * data block and will evict any arc_buf_t that is no longer referenced. The
  * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
  * "overhead_size" kstat.
  *
  * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
  * compressed form. The typical case is that consumers will want uncompressed
  * data, and when that happens a new data buffer is allocated where the data is
  * decompressed for them to use. Currently the only consumer who wants
  * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
  * exists on disk. When this happens, the arc_buf_t's data buffer is shared
  * with the arc_buf_hdr_t.
  *
  * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
  * first one is owned by a compressed send consumer (and therefore references
  * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
  * used by any other consumer (and has its own uncompressed copy of the data
  * buffer).
  *
  *   arc_buf_hdr_t
  *   +-----------+
  *   | fields    |
  *   | common to |
  *   | L1- and   |
  *   | L2ARC     |
  *   +-----------+
  *   | l2arc_buf_hdr_t
  *   |           |
  *   +-----------+
  *   | l1arc_buf_hdr_t
  *   |           |              arc_buf_t
  *   | b_buf     +------------>+-----------+      arc_buf_t
  *   | b_pabd    +-+           |b_next     +---->+-----------+
  *   +-----------+ |           |-----------|     |b_next     +-->NULL
  *                 |           |b_comp = T |     +-----------+
  *                 |           |b_data     +-+   |b_comp = F |
  *                 |           +-----------+ |   |b_data     +-+
  *                 +->+------+               |   +-----------+ |
  *        compressed  |      |               |                 |
  *           data     |      |<--------------+                 | uncompressed
  *                    +------+          compressed,            |     data
  *                                        shared               +-->+------+
  *                                         data                    |      |
  *                                                                 |      |
  *                                                                 +------+
  *
  * When a consumer reads a block, the ARC must first look to see if the
  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
  * arc_buf_t and either copies uncompressed data into a new data buffer from an
  * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
  * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
  * hdr is compressed and the desired compression characteristics of the
  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
  * the last buffer in the hdr's b_buf list, however a shared compressed buf can
  * be anywhere in the hdr's list.
  *
  * The diagram below shows an example of an uncompressed ARC hdr that is
  * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
  * the last element in the buf list):
  *
  *                arc_buf_hdr_t
  *                +-----------+
  *                |           |
  *                |           |
  *                |           |
  *                +-----------+
  * l2arc_buf_hdr_t|           |
  *                |           |
  *                +-----------+
  * l1arc_buf_hdr_t|           |
  *                |           |                 arc_buf_t    (shared)
  *                |    b_buf  +------------>+---------+      arc_buf_t
  *                |           |             |b_next   +---->+---------+
  *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
  *                +-----------+ |           |         |     +---------+
  *                              |           |b_data   +-+   |         |
  *                              |           +---------+ |   |b_data   +-+
  *                              +->+------+             |   +---------+ |
  *                                 |      |             |               |
  *                   uncompressed  |      |             |               |
  *                        data     +------+             |               |
  *                                    ^                 +->+------+     |
  *                                    |       uncompressed |      |     |
  *                                    |           data     |      |     |
  *                                    |                    +------+     |
  *                                    +---------------------------------+
  *
  * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
  * since the physical block is about to be rewritten. The new data contents
  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
  * it may compress the data before writing it to disk. The ARC will be called
  * with the transformed data and will memcpy the transformed on-disk block into
  * a newly allocated b_pabd. Writes are always done into buffers which have
  * either been loaned (and hence are new and don't have other readers) or
  * buffers which have been released (and hence have their own hdr, if there
  * were originally other readers of the buf's original hdr). This ensures that
  * the ARC only needs to update a single buf and its hdr after a write occurs.
  *
  * When the L2ARC is in use, it will also take advantage of the b_pabd. The
  * L2ARC will always write the contents of b_pabd to the L2ARC. This means
  * that when compressed ARC is enabled, the L2ARC blocks are identical
  * to the on-disk block in the main data pool. This provides a significant
  * advantage since the ARC can leverage the bp's checksum when reading from the
  * L2ARC to determine if the contents are valid. However, if the compressed
  * ARC is disabled, then the L2ARC's block must be transformed to look
  * like the physical block in the main data pool before comparing the
  * checksum and determining its validity.
  *
  * The L1ARC has a slightly different system for storing encrypted data.
  * Raw (encrypted + possibly compressed) data has a few subtle differences from
  * data that is just compressed. The biggest difference is that it is not
  * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded.
  * The other difference is that encryption cannot be treated as a suggestion.
  * If a caller would prefer compressed data, but they actually wind up with
  * uncompressed data the worst thing that could happen is there might be a
  * performance hit. If the caller requests encrypted data, however, we must be
  * sure they actually get it or else secret information could be leaked. Raw
  * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
  * may have both an encrypted version and a decrypted version of its data at
  * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
  * copied out of this header. To avoid complications with b_pabd, raw buffers
  * cannot be shared.
  */
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/zfs_refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
 #include <sys/multilist.h>
 #include <sys/abd.h>
 #include <sys/zil.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/callb.h>
 #include <sys/kstat.h>
 #include <sys/zthr.h>
 #include <zfs_fletcher.h>
 #include <sys/arc_impl.h>
 #include <sys/trace_zfs.h>
 #include <sys/aggsum.h>
 #include <sys/wmsum.h>
 #include <cityhash.h>
 #include <sys/vdev_trim.h>
 #include <sys/zfs_racct.h>
 #include <sys/zstd/zstd.h>
 
 #ifndef _KERNEL
 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 boolean_t arc_watch = B_FALSE;
 #endif
 
 /*
  * This thread's job is to keep enough free memory in the system, by
  * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
  * arc_available_memory().
  */
 static zthr_t *arc_reap_zthr;
 
 /*
  * This thread's job is to keep arc_size under arc_c, by calling
  * arc_evict(), which improves arc_is_overflowing().
  */
 static zthr_t *arc_evict_zthr;
 static arc_buf_hdr_t **arc_state_evict_markers;
 static int arc_state_evict_marker_count;
 
 static kmutex_t arc_evict_lock;
 static boolean_t arc_evict_needed = B_FALSE;
 static clock_t arc_last_uncached_flush;
 
 /*
  * Count of bytes evicted since boot.
  */
 static uint64_t arc_evict_count;
 
 /*
  * List of arc_evict_waiter_t's, representing threads waiting for the
  * arc_evict_count to reach specific values.
  */
 static list_t arc_evict_waiters;
 
 /*
  * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
  * the requested amount of data to be evicted.  For example, by default for
  * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
  * Since this is above 100%, it ensures that progress is made towards getting
  * arc_size under arc_c.  Since this is finite, it ensures that allocations
  * can still happen, even during the potentially long time that arc_size is
  * more than arc_c.
  */
 static uint_t zfs_arc_eviction_pct = 200;
 
 /*
  * The number of headers to evict in arc_evict_state_impl() before
  * dropping the sublist lock and evicting from another sublist. A lower
  * value means we're more likely to evict the "correct" header (i.e. the
  * oldest header in the arc state), but comes with higher overhead
  * (i.e. more invocations of arc_evict_state_impl()).
  */
 static uint_t zfs_arc_evict_batch_limit = 10;
 
 /* number of seconds before growing cache again */
 uint_t arc_grow_retry = 5;
 
 /*
  * Minimum time between calls to arc_kmem_reap_soon().
  */
 static const int arc_kmem_cache_reap_retry_ms = 1000;
 
 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 static int zfs_arc_overflow_shift = 8;
 
 /* log2(fraction of arc to reclaim) */
 uint_t arc_shrink_shift = 7;
 
 /* percent of pagecache to reclaim arc to */
 #ifdef _KERNEL
 uint_t zfs_arc_pc_percent = 0;
 #endif
 
 /*
  * log2(fraction of ARC which must be free to allow growing).
  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
  * when reading a new block into the ARC, we will evict an equal-sized block
  * from the ARC.
  *
  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
  * we will still not allow it to grow.
  */
 uint_t		arc_no_grow_shift = 5;
 
 
 /*
  * minimum lifespan of a prefetch block in milliseconds
  * (initialized in arc_init())
  */
 static uint_t		arc_min_prefetch_ms;
 static uint_t		arc_min_prescient_prefetch_ms;
 
 /*
  * If this percent of memory is free, don't throttle.
  */
 uint_t arc_lotsfree_percent = 10;
 
 /*
  * The arc has filled available memory and has now warmed up.
  */
 boolean_t arc_warm;
 
 /*
  * These tunables are for performance analysis.
  */
 uint64_t zfs_arc_max = 0;
 uint64_t zfs_arc_min = 0;
 static uint64_t zfs_arc_dnode_limit = 0;
 static uint_t zfs_arc_dnode_reduce_percent = 10;
 static uint_t zfs_arc_grow_retry = 0;
 static uint_t zfs_arc_shrink_shift = 0;
 uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 
 /*
  * ARC dirty data constraints for arc_tempreserve_space() throttle:
  * * total dirty data limit
  * * anon block dirty limit
  * * each pool's anon allowance
  */
 static const unsigned long zfs_arc_dirty_limit_percent = 50;
 static const unsigned long zfs_arc_anon_limit_percent = 25;
 static const unsigned long zfs_arc_pool_dirty_percent = 20;
 
 /*
  * Enable or disable compressed arc buffers.
  */
 int zfs_compressed_arc_enabled = B_TRUE;
 
 /*
  * Balance between metadata and data on ghost hits.  Values above 100
  * increase metadata caching by proportionally reducing effect of ghost
  * data hits on target data/metadata rate.
  */
 static uint_t zfs_arc_meta_balance = 500;
 
 /*
  * Percentage that can be consumed by dnodes of ARC meta buffers.
  */
 static uint_t zfs_arc_dnode_limit_percent = 10;
 
 /*
  * These tunables are Linux-specific
  */
 static uint64_t zfs_arc_sys_free = 0;
 static uint_t zfs_arc_min_prefetch_ms = 0;
 static uint_t zfs_arc_min_prescient_prefetch_ms = 0;
 static uint_t zfs_arc_lotsfree_percent = 10;
 
 /*
  * Number of arc_prune threads
  */
 static int zfs_arc_prune_task_threads = 1;
 
 /* Used by spa_export/spa_destroy to flush the arc asynchronously */
 static taskq_t *arc_flush_taskq;
 
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
 arc_state_t ARC_mru_ghost;
 arc_state_t ARC_mfu;
 arc_state_t ARC_mfu_ghost;
 arc_state_t ARC_l2c_only;
 arc_state_t ARC_uncached;
 
 arc_stats_t arc_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "iohits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
 	{ "demand_data_iohits",		KSTAT_DATA_UINT64 },
 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "demand_metadata_iohits",	KSTAT_DATA_UINT64 },
 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
 	{ "prefetch_data_iohits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_iohits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "mru_hits",			KSTAT_DATA_UINT64 },
 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "uncached_hits",		KSTAT_DATA_UINT64 },
 	{ "deleted",			KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "access_skip",		KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
 	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible_mfu",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible_mru",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
 	{ "hash_chains",		KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
 	{ "meta",			KSTAT_DATA_UINT64 },
 	{ "pd",				KSTAT_DATA_UINT64 },
 	{ "pm",				KSTAT_DATA_UINT64 },
 	{ "c",				KSTAT_DATA_UINT64 },
 	{ "c_min",			KSTAT_DATA_UINT64 },
 	{ "c_max",			KSTAT_DATA_UINT64 },
 	{ "size",			KSTAT_DATA_UINT64 },
 	{ "compressed_size",		KSTAT_DATA_UINT64 },
 	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
 	{ "overhead_size",		KSTAT_DATA_UINT64 },
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
 	{ "metadata_size",		KSTAT_DATA_UINT64 },
 	{ "dbuf_size",			KSTAT_DATA_UINT64 },
 	{ "dnode_size",			KSTAT_DATA_UINT64 },
 	{ "bonus_size",			KSTAT_DATA_UINT64 },
 #if defined(COMPAT_FREEBSD11)
 	{ "other_size",			KSTAT_DATA_UINT64 },
 #endif
 	{ "anon_size",			KSTAT_DATA_UINT64 },
 	{ "anon_data",			KSTAT_DATA_UINT64 },
 	{ "anon_metadata",		KSTAT_DATA_UINT64 },
 	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mru_size",			KSTAT_DATA_UINT64 },
 	{ "mru_data",			KSTAT_DATA_UINT64 },
 	{ "mru_metadata",		KSTAT_DATA_UINT64 },
 	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
 	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_data",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_metadata",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "mfu_size",			KSTAT_DATA_UINT64 },
 	{ "mfu_data",			KSTAT_DATA_UINT64 },
 	{ "mfu_metadata",		KSTAT_DATA_UINT64 },
 	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
 	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_data",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_metadata",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "uncached_size",		KSTAT_DATA_UINT64 },
 	{ "uncached_data",		KSTAT_DATA_UINT64 },
 	{ "uncached_metadata",		KSTAT_DATA_UINT64 },
 	{ "uncached_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "uncached_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "l2_hits",			KSTAT_DATA_UINT64 },
 	{ "l2_misses",			KSTAT_DATA_UINT64 },
 	{ "l2_prefetch_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_mru_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_mfu_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_bufc_data_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_bufc_metadata_asize",	KSTAT_DATA_UINT64 },
 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
 	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_writes",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_avg_asize",	KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_count",		KSTAT_DATA_UINT64 },
 	{ "l2_data_to_meta_ratio",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_success",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_unsupported",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_io_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_dh_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_cksum_lb_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_size",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_bufs",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_bufs_precached",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_log_blks",	KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
 	{ "memory_direct_count",	KSTAT_DATA_UINT64 },
 	{ "memory_indirect_count",	KSTAT_DATA_UINT64 },
 	{ "memory_all_bytes",		KSTAT_DATA_UINT64 },
 	{ "memory_free_bytes",		KSTAT_DATA_UINT64 },
 	{ "memory_available_bytes",	KSTAT_DATA_INT64 },
 	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
 	{ "arc_tempreserve",		KSTAT_DATA_UINT64 },
 	{ "arc_loaned_bytes",		KSTAT_DATA_UINT64 },
 	{ "arc_prune",			KSTAT_DATA_UINT64 },
 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
 	{ "arc_dnode_limit",		KSTAT_DATA_UINT64 },
 	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
 	{ "predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "arc_need_free",		KSTAT_DATA_UINT64 },
 	{ "arc_sys_free",		KSTAT_DATA_UINT64 },
 	{ "arc_raw_size",		KSTAT_DATA_UINT64 },
 	{ "cached_only_in_progress",	KSTAT_DATA_UINT64 },
 	{ "abd_chunk_waste_size",	KSTAT_DATA_UINT64 },
 };
 
 /*
  * Global instance of the ARC sum counters.  The layout of arc_sums_t is
  * declared outside this part of the file.
  */
 arc_sums_t arc_sums;
 
 /*
  * Raise arc_stats.<stat> to val when val is larger than the stored value.
  * The loop re-reads the stat and retries atomic_cas_64() until either the
  * swap is observed to have happened (the call returns the value we read)
  * or the stored value is no longer smaller than val.
  *
  * Note: val is evaluated more than once, and the expansion is a bare
  * { } block rather than do { } while (0), so use it only as a complete
  * statement with side-effect-free arguments.
  */
 #define	ARCSTAT_MAX(stat, val) {					\
 	uint64_t m;							\
 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
 		continue;						\
 }
 
 /*
  * We define a macro to allow ARC hits/misses to be easily broken down by
  * two separate conditions, giving a total of four different subtypes for
  * each of hits and misses (so eight statistics total).
  *
  * The statistic that gets bumped is named
  *	arcstat_<stat1 or notstat1>_<stat2 or notstat2>_<stat>
  * where the first component is chosen by cond1 and the second by cond2.
  *
  * Note: the expansion is a bare if/else, not wrapped in do { } while (0),
  * so it must not be used as the unbraced body of another if/else.
  */
 #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 	if (cond1) {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 		}							\
 	} else {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 		}							\
 	}
 
 /*
  * This macro allows us to use kstats as floating averages. Each time we
  * update this kstat, we first factor it and the update value by
  * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
  * average:
  *
  *	new = old - old / ARCSTAT_F_AVG_FACTOR + value / ARCSTAT_F_AVG_FACTOR
  *
  * This macro assumes that integer loads and stores are atomic, but
  * is not safe for multiple writers updating the kstat in parallel (only the
  * last writer's update will remain).
  */
 #define	ARCSTAT_F_AVG_FACTOR	3
 #define	ARCSTAT_F_AVG(stat, value) \
 	do { \
 		uint64_t x = ARCSTAT(stat); \
 		x = x - x / ARCSTAT_F_AVG_FACTOR + \
 		    (value) / ARCSTAT_F_AVG_FACTOR; \
 		ARCSTAT(stat) = x; \
 	} while (0)
 
 /*
  * Handle for the ARC kstat.
  * NOTE(review): presumably assigned where the kstat is created -- confirm.
  */
 static kstat_t			*arc_ksp;
 
 /*
  * There are several ARC variables that are critical to export as kstats --
  * but we don't want to have to grovel around in the kstat whenever we wish to
  * manipulate them.  For these variables, we therefore define them to be in
  * terms of the statistic variable.  This assures that we are not introducing
  * the possibility of inconsistency by having shadow copies of the variables,
  * while still allowing the code to be readable.
  *
  * Each name below expands to ARCSTAT() of the matching arcstat_* field, so
  * it is read and assigned like an ordinary variable.
  */
 #define	arc_tempreserve	ARCSTAT(arcstat_tempreserve)
 #define	arc_loaned_bytes	ARCSTAT(arcstat_loaned_bytes)
 #define	arc_dnode_limit	ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
 #define	arc_need_free	ARCSTAT(arcstat_need_free) /* waiting to be evicted */
 
 /*
  * Growth and prune bookkeeping shared with other ARC code (external linkage).
  * NOTE(review): usage is not visible here; presumably arc_prune_mtx protects
  * arc_prune_list and arc_prune_taskq runs the prune work -- confirm.
  */
 hrtime_t arc_growtime;
 list_t arc_prune_list;
 kmutex_t arc_prune_mtx;
 taskq_t *arc_prune_taskq;
 
 /*
  * True when state is one of arc_mru_ghost, arc_mfu_ghost or arc_l2c_only.
  * The argument is evaluated up to three times.
  */
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
 #define	HDR_PRESCIENT_PREFETCH(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 #define	HDR_COMPRESSION_ENABLED(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
 #define	HDR_UNCACHED(hdr)	((hdr)->b_flags & ARC_FLAG_UNCACHED)
 #define	HDR_L2_READING(hdr)	\
 	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
 	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 #define	HDR_PROTECTED(hdr)	((hdr)->b_flags & ARC_FLAG_PROTECTED)
 #define	HDR_NOAUTH(hdr)		((hdr)->b_flags & ARC_FLAG_NOAUTH)
 #define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
 
 #define	HDR_ISTYPE_METADATA(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 #define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
 
 #define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 #define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 #define	HDR_HAS_RABD(hdr)	\
 	(HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) &&	\
 	(hdr)->b_crypt_hdr.b_rabd != NULL)
 #define	HDR_ENCRYPTED(hdr)	\
 	(HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 #define	HDR_AUTHENTICATED(hdr)	\
 	(HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 
 /*
  * For storing compression mode in b_flags.
  *
  * The compression value occupies SPA_COMPRESSBITS bits of b_flags starting
  * at HDR_COMPRESS_OFFSET, which is derived from ARC_FLAG_COMPRESS_0.
  */
 #define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)
 
 #define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
 /*
  * NOTE(review): this expansion already ends in ';', so a call written as
  * "HDR_SET_COMPRESS(hdr, cmp);" yields an extra empty statement.  Avoid
  * using it as the unbraced body of an if that has an else.
  */
 #define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
 
 /*
  * Predicates over an arc_buf_t.  ARC_BUF_LAST() is true for the final buf in
  * the b_next chain; the flag tests return the masked bit (nonzero when set).
  */
 #define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
 #define	ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
 #define	ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
 #define	ARC_BUF_ENCRYPTED(buf)	((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
 
 /*
  * Other sizes
  *
  * HDR_FULL_SIZE is the size of a complete arc_buf_hdr_t.  HDR_L2ONLY_SIZE is
  * the offset of the b_l1hdr member, i.e. the size of the part of the header
  * that precedes it.  Both are cast to int64_t.
  */
 
 #define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 #define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 
 /*
  * Hash table routines
  */
 
 /* Number of mutexes shared by all hash buckets; must stay a power of two. */
 #define	BUF_LOCKS 2048
 typedef struct buf_hash_table {
 	/* number of buckets minus one; ANDed with the hash to pick a bucket */
 	uint64_t ht_mask;
 	/* array of bucket heads; entries are chained through b_hash_next */
 	arc_buf_hdr_t **ht_table;
 	/* lock array; a bucket's lock is chosen by its index modulo BUF_LOCKS */
 	kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned;
 } buf_hash_table_t;
 
 /* The single ARC buffer hash table for this module. */
 static buf_hash_table_t buf_hash_table;
 
 /*
  * BUF_HASH_INDEX: bucket index for the identity (spa, dva, birth), i.e.
  *	buf_hash() of the identity masked by buf_hash_table.ht_mask.
  * BUF_HASH_LOCK: address of the mutex that covers bucket idx.  The lock is
  *	selected with idx & (BUF_LOCKS - 1), so several buckets share a lock.
  * HDR_LOCK: the mutex covering the bucket that hdr's current identity
  *	hashes to.  hdr is evaluated three times.
  *
  * Every macro argument is parenthesized so that an arbitrary expression
  * (for example "a | b") can be passed without changing the result.
  */
 #define	BUF_HASH_INDEX(spa, dva, birth) \
 	(buf_hash((spa), (dva), (birth)) & buf_hash_table.ht_mask)
 #define	BUF_HASH_LOCK(idx)	\
 	(&buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
 #define	HDR_LOCK(hdr) \
 	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
 	(hdr)->b_birth)))
 
 /*
  * 256-entry CRC-64 lookup table.  buf_init() fills it in from
  * ZFS_CRC64_POLY.
  */
 uint64_t zfs_crc64_table[256];
 
 /*
  * Asynchronous ARC flush
  *
  * We track these in a list for arc_async_flush_guid_inuse().
  * Used for both L1 and L2 async teardown.
  */
 static list_t arc_async_flush_list;
 /* NOTE(review): presumably protects arc_async_flush_list -- confirm. */
 static kmutex_t	arc_async_flush_lock;
 
 /*
  * One tracked asynchronous flush.
  * NOTE(review): field meanings other than af_cache_level are inferred from
  * their names and types; confirm against the code that fills them in.
  */
 typedef struct arc_async_flush {
 	uint64_t	af_spa_guid;	/* presumably guid of the flushed spa */
 	taskq_ent_t	af_tqent;	/* taskq entry for the flush work */
 	uint_t		af_cache_level;	/* 1 or 2 to differentiate node */
 	list_node_t	af_node;	/* list linkage */
 } arc_async_flush_t;
 
 
 /*
  * Level 2 ARC
  *
  * Compile-time defaults.  They seed the l2arc_* tunables declared right
  * after these definitions.
  */
 
 #define	L2ARC_WRITE_SIZE	(32 * 1024 * 1024)	/* initial write max */
 #define	L2ARC_HEADROOM		8			/* num of writes */
 
 /*
  * If we discover during ARC scan any buffers to be compressed, we boost
  * our headroom for the next scanning cycle by this percentage multiple.
  */
 #define	L2ARC_HEADROOM_BOOST	200
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
 /*
  * We can feed L2ARC from two states of ARC buffers, mru and mfu,
  * and each of the state has two types: data and metadata.
  * Two states times two types gives the four feed types.
  */
 #define	L2ARC_FEED_TYPES	4
 
 /*
  * L2ARC Performance Tunables
  *
  * Each is initialized from the L2ARC_* default of the same name where one
  * exists.  All but l2arc_meta_percent have external linkage.
  */
 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* # of dev writes */
 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;	/* boost, percent */
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
 int l2arc_noprefetch = B_TRUE;			/* don't cache prefetch bufs */
 int l2arc_feed_again = B_TRUE;			/* turbo warmup */
 int l2arc_norw = B_FALSE;			/* no reads during writes */
 static uint_t l2arc_meta_percent = 33;	/* limit on headers size */
 
 /*
  * L2ARC Internals
  *
  * NOTE(review): the upper-case list_t objects look like the backing storage
  * for the lower-case list pointers of the same name; the assignment is not
  * visible here -- confirm in the L2ARC init code.
  */
 static list_t L2ARC_dev_list;			/* device list */
 static list_t *l2arc_dev_list;			/* device list pointer */
 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
 static list_t L2ARC_free_on_write;		/* free after write buf list */
 static list_t *l2arc_free_on_write;		/* free after write list ptr */
 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
 static uint64_t l2arc_ndev;			/* number of devices */
 
 /*
  * Context saved for an L2ARC read: the header being read plus the original
  * block pointer, bookmark and flags of the request.
  * NOTE(review): presumably consumed by l2arc_read_done() -- confirm.
  */
 typedef struct l2arc_read_callback {
 	arc_buf_hdr_t		*l2rcb_hdr;		/* read header */
 	blkptr_t		l2rcb_bp;		/* original blkptr */
 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
 	int			l2rcb_flags;		/* original flags */
 	abd_t			*l2rcb_abd;		/* temporary buffer */
 } l2arc_read_callback_t;
 
 /*
  * Record of a data buffer whose release is deferred.
  * NOTE(review): presumably an entry on the l2arc_free_on_write list that is
  * processed by l2arc_do_free_on_write() -- confirm.
  */
 typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
 	abd_t		*l2df_abd;		/* buffer to release */
 	size_t		l2df_size;		/* its size */
 	arc_buf_contents_t l2df_type;		/* its contents type */
 	list_node_t	l2df_list_node;		/* list linkage */
 } l2arc_data_free_t;
 
 /*
  * Options for filling a buf.  Each value is a distinct single bit, so the
  * flags can be ORed together.
  */
 typedef enum arc_fill_flags {
 	ARC_FILL_LOCKED		= 1 << 0, /* hdr lock is held */
 	ARC_FILL_COMPRESSED	= 1 << 1, /* fill with compressed data */
 	ARC_FILL_ENCRYPTED	= 1 << 2, /* fill with encrypted data */
 	ARC_FILL_NOAUTH		= 1 << 3, /* don't attempt to authenticate */
 	ARC_FILL_IN_PLACE	= 1 << 4  /* fill in place (special case) */
 } arc_fill_flags_t;
 
 /*
  * How far the ARC is over its target size.  The enumerators are declared in
  * increasing order of severity (ARC_OVF_NONE is 0).
  */
 typedef enum arc_ovf_level {
 	ARC_OVF_NONE,			/* ARC within target size. */
 	ARC_OVF_SOME,			/* ARC is slightly overflowed. */
 	ARC_OVF_SEVERE			/* ARC is severely overflowed. */
 } arc_ovf_level_t;
 
 /*
  * Lock/condvar pair for the L2ARC feed thread, plus its exit flag.
  * NOTE(review): the handshake that uses l2arc_thread_exit is not visible
  * here -- confirm how it is set and waited on.
  */
 static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
 /* Lock/condvar pair for the L2ARC rebuild thread. */
 static kmutex_t l2arc_rebuild_thr_lock;
 static kcondvar_t l2arc_rebuild_thr_cv;
 
 /*
  * Bit flags for header ABD allocation; passed as the int argument of
  * arc_hdr_alloc_abd() (presumably -- confirm).  Each value is a distinct
  * single bit; 0x2 is not assigned here.
  */
 enum arc_hdr_alloc_flags {
 	ARC_HDR_ALLOC_RDATA = 0x1,
 	ARC_HDR_USE_RESERVE = 0x4,
 	ARC_HDR_ALLOC_LINEAR = 0x8,
 };
 
 
 /*
  * Forward declarations of file-local helpers that are used before they are
  * defined.
  */
 
 /* Data buffer / ABD acquisition and release. */
 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int);
 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *);
 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int);
 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *);
 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *);
 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size,
     const void *tag);
 /* Header lifecycle and state handling. */
 static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
 static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
 static void arc_hdr_destroy(arc_buf_hdr_t *);
 static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t);
 static void arc_buf_watch(arc_buf_t *);
 static void arc_change_state(arc_state_t *, arc_buf_hdr_t *);
 
 /* Header type and flag accessors. */
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 
 /* L2ARC helpers. */
 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);
 static void l2arc_do_free_on_write(void);
 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only);
 
 static void arc_prune_async(uint64_t adjust);
 
 /*
  * Convenience wrappers around l2arc_hdr_arcstats_update().  The name selects
  * the incr argument (increment -> B_TRUE, decrement -> B_FALSE) and the
  * "_state" suffix selects state_only = B_TRUE.
  */
 #define	l2arc_hdr_arcstats_increment(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
 #define	l2arc_hdr_arcstats_decrement(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
 #define	l2arc_hdr_arcstats_increment_state(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
 #define	l2arc_hdr_arcstats_decrement_state(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
 
 /*
  * l2arc_exclude_special : A zfs module parameter that controls whether buffers
  * 		present on special vdevs are eligible for caching in L2ARC. If
  * 		set to 1, exclude dbufs on special vdevs from being cached to
  * 		L2ARC.
  */
 int l2arc_exclude_special = 0;
 
 /*
  * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
  * 		metadata and data are cached from ARC into L2ARC.
  */
 static int l2arc_mfuonly = 0;
 
 /*
  * L2ARC TRIM
  * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
  * 		the current write size (l2arc_write_max) we should TRIM if we
  * 		have filled the device. It is defined as a percentage of the
  * 		write size. If set to 100 we trim twice the space required to
  * 		accommodate upcoming writes. A minimum of 64MB will be trimmed.
  * 		It also enables TRIM of the whole L2ARC device upon creation or
  * 		addition to an existing pool or if the header of the device is
  * 		invalid upon importing a pool or onlining a cache device. The
  * 		default is 0, which disables TRIM on L2ARC altogether as it can
  * 		put significant stress on the underlying storage devices. This
  * 		will vary depending on how well the specific device handles
  * 		these commands.
  */
 static uint64_t l2arc_trim_ahead = 0;
 
 /*
  * Performance tuning of L2ARC persistence:
  *
  * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
  * 		an L2ARC device (either at pool import or later) will attempt
  * 		to rebuild L2ARC buffer contents.
  * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
  * 		whether log blocks are written to the L2ARC device. If the L2ARC
  * 		device is less than 1GB, the amount of data l2arc_evict()
  * 		evicts is significant compared to the amount of restored L2ARC
  * 		data. In this case do not write log blocks in L2ARC in order
  * 		not to waste space. The default below is 1GB expressed in
  * 		bytes.
  */
 static int l2arc_rebuild_enabled = B_TRUE;
 static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
 
 /*
  * Forward declarations for the L2ARC persistence code.  The prototypes
  * without "static" (l2arc_rebuild_vdev, l2arc_log_blkptr_valid and
  * l2arc_range_check_overlap) have external linkage.
  */
 
 /* L2ARC persistence rebuild control routines. */
 void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
 static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg);
 static int l2arc_rebuild(l2arc_dev_t *dev);
 
 /* L2ARC persistence read I/O routines. */
 static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
 static int l2arc_log_blk_read(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
     zio_t *this_io, zio_t **next_io);
 static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
     const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
 static void l2arc_log_blk_fetch_abort(zio_t *zio);
 
 /* L2ARC persistence block restoration routines. */
 static void l2arc_log_blk_restore(l2arc_dev_t *dev,
     const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
     l2arc_dev_t *dev);
 
 /* L2ARC persistence write I/O routines. */
 static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
     l2arc_write_callback_t *cb);
 
 /* L2ARC persistence auxiliary routines. */
 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *lbp);
 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
     const arc_buf_hdr_t *ab);
 boolean_t l2arc_range_check_overlap(uint64_t bottom,
     uint64_t top, uint64_t check);
 static void l2arc_blk_fetch_done(zio_t *zio);
 static inline uint64_t
     l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
 
 /*
  * Hash a buffer identity (spa, DVA, birth) to a 64-bit value.
  *
  * Cityhash is used: it is fast and hashes well without needing any large
  * static tables.  The two DVA words are fed in as separate inputs.
  */
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
 	uint64_t word0 = dva->dva_word[0];
 	uint64_t word1 = dva->dva_word[1];
 
 	return (cityhash4(spa, word0, word1, birth));
 }
 
 /*
  * HDR_EMPTY: true when both words of the header's DVA are zero, which is
  *	the state buf_discard_identity() leaves a header in.
  */
 #define	HDR_EMPTY(hdr)						\
 	((hdr)->b_dva.dva_word[0] == 0 &&			\
 	(hdr)->b_dva.dva_word[1] == 0)
 
 /*
  * HDR_EMPTY_OR_LOCKED: true when the header has no identity, or when the
  *	hash lock covering its identity is held by the caller.
  */
 #define	HDR_EMPTY_OR_LOCKED(hdr)				\
 	(HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
 
 /*
  * HDR_EQUAL: true when hdr's identity matches (spa, dva, birth).
  *
  * The whole expansion and every argument are parenthesized so that the
  * macro behaves as a single expression, e.g. under "!" or inside a larger
  * condition.  hdr and dva are each evaluated more than once.
  */
 #define	HDR_EQUAL(spa, dva, birth, hdr)				\
 	(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
 	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
 	((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))
 
 /*
  * Strip a header of its identity by zeroing both DVA words and the birth
  * txg.  Afterwards HDR_EMPTY(hdr) is true.  b_spa is left untouched.
  */
 static void
 buf_discard_identity(arc_buf_hdr_t *hdr)
 {
 	hdr->b_birth = 0;
 	hdr->b_dva.dva_word[1] = 0;
 	hdr->b_dva.dva_word[0] = 0;
 }
 
 /*
  * Look up the header whose identity matches (spa, bp).
  *
  * On a hit the header is returned with its hash lock still held, and
  * *lockp is set to that lock so the caller can drop it.  On a miss the
  * lock is released, *lockp is set to NULL and NULL is returned.
  */
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
 	uint64_t birth = BP_GET_BIRTH(bp);
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 
 	mutex_enter(hash_lock);
 
 	/* Walk the bucket's chain looking for a matching identity. */
 	arc_buf_hdr_t *cur = buf_hash_table.ht_table[idx];
 	while (cur != NULL) {
 		if (HDR_EQUAL(spa, dva, birth, cur)) {
 			/* Found: hand the held lock to the caller. */
 			*lockp = hash_lock;
 			return (cur);
 		}
 		cur = cur->b_hash_next;
 	}
 
 	mutex_exit(hash_lock);
 	*lockp = NULL;
 	return (NULL);
 }
 
 /*
  * Add hdr to the hash table.
  *
  * If the table already holds a header with the same identity, that header
  * is returned and hdr is not inserted.  Otherwise hdr is pushed onto the
  * front of its bucket's chain and NULL is returned.
  *
  * When lockp is non-NULL the bucket's hash lock is taken here and stored in
  * *lockp; it is still held on return in both outcomes.  When lockp is NULL
  * the caller must already hold that lock.
  */
 static arc_buf_hdr_t *
 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 {
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	uint32_t chain_len = 0;
 
 	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
 	ASSERT(hdr->b_birth != 0);
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (lockp == NULL) {
 		ASSERT(MUTEX_HELD(hash_lock));
 	} else {
 		*lockp = hash_lock;
 		mutex_enter(hash_lock);
 	}
 
 	/* Look for a duplicate, counting the chain length as we go. */
 	arc_buf_hdr_t *cur = buf_hash_table.ht_table[idx];
 	while (cur != NULL) {
 		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, cur))
 			return (cur);
 		cur = cur->b_hash_next;
 		chain_len++;
 	}
 
 	/* No duplicate: link hdr in at the head of the chain. */
 	hdr->b_hash_next = buf_hash_table.ht_table[idx];
 	buf_hash_table.ht_table[idx] = hdr;
 	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/*
 	 * Hash table statistics: a non-empty bucket is a collision, and a
 	 * bucket that held exactly one entry has just become a chain.
 	 */
 	if (chain_len > 0) {
 		ARCSTAT_BUMP(arcstat_hash_collisions);
 		if (chain_len == 1)
 			ARCSTAT_BUMP(arcstat_hash_chains);
 		ARCSTAT_MAX(arcstat_hash_chain_max, chain_len);
 	}
 	ARCSTAT_BUMP(arcstat_hash_elements);
 
 	return (NULL);
 }
 
 /*
  * Unlink hdr from its hash bucket.  The caller must hold the bucket's hash
  * lock and hdr must currently be in the table.
  */
 static void
 buf_hash_remove(arc_buf_hdr_t *hdr)
 {
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	arc_buf_hdr_t **linkp;
 
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 	ASSERT(HDR_IN_HASH_TABLE(hdr));
 
 	/* Advance to the link (bucket head or b_hash_next) that points at hdr. */
 	for (linkp = &buf_hash_table.ht_table[idx]; *linkp != hdr;
 	    linkp = &(*linkp)->b_hash_next) {
 		ASSERT3P(*linkp, !=, NULL);
 	}
 
 	*linkp = hdr->b_hash_next;
 	hdr->b_hash_next = NULL;
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/*
 	 * Hash table statistics: if exactly one entry is left in the bucket
 	 * it no longer counts as a chain.
 	 */
 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 	arc_buf_hdr_t *head = buf_hash_table.ht_table[idx];
 	if (head && head->b_hash_next == NULL)
 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 }
 
 /*
  * Global data structures and functions for the buf kmem cache.
  *
  * The three caches are created in buf_init() and destroyed in buf_fini():
  * hdr_full_cache holds HDR_FULL_SIZE headers, hdr_l2only_cache holds
  * HDR_L2ONLY_SIZE headers, and buf_cache holds arc_buf_t objects.
  */
 
 static kmem_cache_t *hdr_full_cache;
 static kmem_cache_t *hdr_l2only_cache;
 static kmem_cache_t *buf_cache;
 
 /*
  * Tear down what buf_init() set up: the hash table array, the hash locks
  * and the three kmem caches.
  */
 static void
 buf_fini(void)
 {
 	uint64_t nbuckets = buf_hash_table.ht_mask + 1;
 
 #if defined(_KERNEL)
 	/*
 	 * Large allocations which do not require contiguous pages
 	 * should be released with vmem_free() in the linux kernel.
 	 */
 	vmem_free(buf_hash_table.ht_table, nbuckets * sizeof (void *));
 #else
 	kmem_free(buf_hash_table.ht_table, nbuckets * sizeof (void *));
 #endif
 
 	for (int lock = 0; lock < BUF_LOCKS; lock++)
 		mutex_destroy(BUF_HASH_LOCK(lock));
 
 	kmem_cache_destroy(hdr_full_cache);
 	kmem_cache_destroy(hdr_l2only_cache);
 	kmem_cache_destroy(buf_cache);
 }
 
 /*
  * Constructor callback - called when the cache is empty
  * and a new buf is requested.
  *
  * Constructor for hdr_full_cache: zeroes the whole header, initializes the
  * L1 fields that need more than zeroing, and charges HDR_FULL_SIZE to
  * ARC_SPACE_HDRS.  Always returns 0.
  */
 static int
 hdr_full_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	(void) unused;
 	(void) kmflag;
 
 	memset(hdr, 0, HDR_FULL_SIZE);
 
 	/* L1 header state */
 	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 	zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
 #ifdef ZFS_DEBUG
 	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 #endif
 	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
 
 	/* L2 header list linkage */
 	list_link_init(&hdr->b_l2hdr.b_l2node);
 
 	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 	return (0);
 }
 
 /*
  * Constructor for hdr_l2only_cache: zeroes the L2-only part of the header
  * and charges HDR_L2ONLY_SIZE to ARC_SPACE_L2HDRS.  Always returns 0.
  */
 static int
 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	(void) unused;
 	(void) kmflag;
 
 	memset(hdr, 0, HDR_L2ONLY_SIZE);
 	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 	return (0);
 }
 
 /*
  * Constructor for buf_cache: zeroes the arc_buf_t and charges its size to
  * ARC_SPACE_HDRS.  Always returns 0.
  */
 static int
 buf_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_t *buf = vbuf;
 
 	(void) unused;
 	(void) kmflag;
 
 	memset(buf, 0, sizeof (arc_buf_t));
 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 	return (0);
 }
 
 /*
  * Destructor callback - called when a cached buf is
  * no longer required.
  *
  * Destructor for hdr_full_cache: undoes hdr_full_cons().  The header must
  * have no identity and must not be linked on an ARC multilist.
  */
 static void
 hdr_full_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	(void) unused;
 
 	ASSERT(HDR_EMPTY(hdr));
 
 	zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
 #ifdef ZFS_DEBUG
 	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 
 	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
 	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 }
 
 /*
  * Destructor for hdr_l2only_cache: checks that the header has no identity
  * and returns HDR_L2ONLY_SIZE to ARC_SPACE_L2HDRS.
  */
 static void
 hdr_l2only_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	(void) unused;
 
 	ASSERT(HDR_EMPTY(hdr));
 	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 }
 
 /*
  * Destructor for buf_cache: the object needs no teardown, so this only
  * returns the arc_buf_t's size to ARC_SPACE_HDRS.
  */
 static void
 buf_dest(void *vbuf, void *unused)
 {
 	(void) vbuf, (void) unused;
 
 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
 /*
  * Set up the ARC buffer hash table, the header/buf kmem caches, the CRC-64
  * lookup table and the hash locks.
  */
 static void
 buf_init(void)
 {
 	uint64_t nbuckets = 1ULL << 12;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
 	 * By default, the table will take up
 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
 	while (nbuckets * zfs_arc_average_blocksize < arc_all_memory())
 		nbuckets <<= 1;
 
 	/* Allocate the table, halving the bucket count after each failure. */
 	for (;;) {
 		buf_hash_table.ht_mask = nbuckets - 1;
 #if defined(_KERNEL)
 		/*
 		 * Large allocations which do not require contiguous pages
 		 * should be using vmem_alloc() in the linux kernel
 		 */
 		buf_hash_table.ht_table =
 		    vmem_zalloc(nbuckets * sizeof (void*), KM_SLEEP);
 #else
 		buf_hash_table.ht_table =
 		    kmem_zalloc(nbuckets * sizeof (void*), KM_NOSLEEP);
 #endif
 		if (buf_hash_table.ht_table != NULL)
 			break;
 
 		ASSERT(nbuckets > (1ULL << 8));
 		nbuckets >>= 1;
 	}
 
 	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
 	    0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
 	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
 	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
 	    NULL, NULL, 0);
 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
 	/*
 	 * Build the CRC-64 table: start each entry at its own index, then
 	 * apply eight shift-and-conditionally-xor steps with ZFS_CRC64_POLY.
 	 */
 	for (int byte = 0; byte < 256; byte++) {
 		uint64_t *entry = &zfs_crc64_table[byte];
 
 		*entry = byte;
 		for (int bit = 8; bit > 0; bit--) {
 			*entry = (*entry >> 1) ^
 			    (-(*entry & 1) & ZFS_CRC64_POLY);
 		}
 	}
 
 	for (int lock = 0; lock < BUF_LOCKS; lock++)
 		mutex_init(BUF_HASH_LOCK(lock), NULL, MUTEX_DEFAULT, NULL);
 }
 
 /* hz / 16 clock ticks, i.e. one sixteenth of a second. */
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
 /*
  * Return the number of bytes this buf occupies in memory: the header's
  * physical size when the buf is compressed, otherwise its logical size.
  * Prefer this over arc_buf_lsize() unless the logical size is explicitly
  * what is needed.
  */
 uint64_t
 arc_buf_size(arc_buf_t *buf)
 {
 	if (ARC_BUF_COMPRESSED(buf))
 		return (HDR_GET_PSIZE(buf->b_hdr));
 
 	return (HDR_GET_LSIZE(buf->b_hdr));
 }
 
 /*
  * Return the logical size recorded in the buf's header, regardless of
  * whether the buf itself is compressed.
  */
 uint64_t
 arc_buf_lsize(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	return (HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Report whether the buffer is held encrypted in memory (B_TRUE) or not
  * (B_FALSE).  An encrypted buffer can be decrypted with arc_untransform().
  */
 boolean_t
 arc_is_encrypted(arc_buf_t *buf)
 {
 	boolean_t encrypted = (ARC_BUF_ENCRYPTED(buf) != 0);
 
 	return (encrypted);
 }
 
 /*
  * Report whether the buffer's data has not had its MAC verified yet, as
  * recorded by the NOAUTH flag on its header.
  */
 boolean_t
 arc_is_unauthenticated(arc_buf_t *buf)
 {
 	boolean_t noauth = (HDR_NOAUTH(buf->b_hdr) != 0);
 
 	return (noauth);
 }
 
 /*
  * Copy the raw encryption parameters of a protected buffer out of its
  * header.
  *
  * salt, iv and mac receive ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN and
  * ZIO_DATA_MAC_LEN bytes respectively.  *byteorder is set to
  * ZFS_HOST_BYTEORDER when the header records no byteswap function
  * (b_byteswap == DMU_BSWAP_NUMFUNCS) and to the opposite order otherwise.
  */
 void
 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
     uint8_t *iv, uint8_t *mac)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_PROTECTED(hdr));
 
 	memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
 	memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
 	memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
 
 	if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS)
 		*byteorder = ZFS_HOST_BYTEORDER;
 	else
 		*byteorder = !ZFS_HOST_BYTEORDER;
 }
 
 /*
  * Report how this buffer is compressed in memory.  An uncompressed buffer
  * yields ZIO_COMPRESS_OFF; otherwise the algorithm stored in the header's
  * flags is returned.  The buffer can be made normally readable with
  * arc_untransform() as long as it is also unencrypted.
  */
 enum zio_compress
 arc_get_compression(arc_buf_t *buf)
 {
 	if (!ARC_BUF_COMPRESSED(buf))
 		return (ZIO_COMPRESS_OFF);
 
 	return (HDR_GET_COMPRESS(buf->b_hdr));
 }
 
 /*
  * Return the compression algorithm used to store this header's data in the
  * ARC.  If ARC compression is enabled or this is an encrypted block, this
  * is the same as what is used on disk; otherwise it is ZIO_COMPRESS_OFF.
  */
 static inline enum zio_compress
 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
 {
 	if (!HDR_COMPRESSION_ENABLED(hdr))
 		return (ZIO_COMPRESS_OFF);
 
 	return (HDR_GET_COMPRESS(hdr));
 }
 
 /*
  * Return the compression level recorded in the buf's header.
  */
 uint8_t
 arc_get_complevel(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	return (hdr->b_complevel);
 }
 
 /*
  * Report whether buf's data pointer is the header's own linear b_pabd
  * buffer, i.e. the buf shares its data with the header.
  *
  * The result is cross-checked against the SHARED flags on both the header
  * and the buf, and a shared buf must be compressed or last in the chain.
  */
 static inline boolean_t
 arc_buf_is_shared(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	boolean_t shared;
 
 	shared = (buf->b_data != NULL &&
 	    hdr->b_l1hdr.b_pabd != NULL &&
 	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
 	    buf->b_data == abd_to_buf(hdr->b_l1hdr.b_pabd));
 
 	IMPLY(shared, HDR_SHARED_DATA(hdr));
 	EQUIV(shared, ARC_BUF_SHARED(buf));
 	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
 
 	/*
 	 * It would be nice to assert arc_can_share() too, but the "hdr isn't
 	 * already being shared" requirement prevents us from doing that.
 	 */
 
 	return (shared);
 }
 
 /*
  * Release the freeze checksum attached to this header, if there is one,
  * and clear the pointer.  Done under b_freeze_lock.  Without ZFS_DEBUG
  * this function does nothing.
  */
 static inline void
 arc_cksum_free(arc_buf_hdr_t *hdr)
 {
 #ifdef ZFS_DEBUG
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 
 	zio_cksum_t *cksum = hdr->b_l1hdr.b_freeze_cksum;
 	if (cksum != NULL) {
 		kmem_free(cksum, sizeof (zio_cksum_t));
 		hdr->b_l1hdr.b_freeze_cksum = NULL;
 	}
 
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 }
 
 /*
  * Return B_TRUE iff at least one buf on hdr's buf list is not compressed.
  * Encrypted buffers count as compressed.
  *
  * The header must be anonymous, have no identity, or have its hash lock
  * held by the caller.
  */
 static boolean_t
 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
 {
 	ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
 
 	arc_buf_t *cur = hdr->b_l1hdr.b_buf;
 	while (cur != NULL) {
 		if (!ARC_BUF_COMPRESSED(cur))
 			return (B_TRUE);
 		cur = cur->b_next;
 	}
 
 	return (B_FALSE);
 }
 
 
 /*
  * With ZFS_DEBUG_MODIFY set, check that the buf's data still matches the
  * freeze checksum stored on its header, and panic if it does not.
  *
  * Nothing is checked when the flag is clear, the buf is compressed, the
  * header has no freeze checksum, or the header carries an I/O error.
  * Without ZFS_DEBUG this function does nothing.
  */
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
 #ifdef ZFS_DEBUG
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY) || ARC_BUF_COMPRESSED(buf))
 		return;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 
 	if (hdr->b_l1hdr.b_freeze_cksum != NULL && !HDR_IO_ERROR(hdr)) {
 		zio_cksum_t actual;
 
 		fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
 		    &actual);
 		if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, actual))
 			panic("buffer modified while frozen!");
 	}
 
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 }
 
 /*
  * Check the data just read by zio against the checksum in zio's block
  * pointer; returns B_TRUE when it matches.
  *
  * This relies on data stored in the L2ARC being transformed exactly as it
  * is in the main pool, which is what lets us verify against the reading
  * process's bp.  The bp must not be embedded and its physical size must
  * equal the header's.
  */
 static boolean_t
 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 {
 	int error;
 
 	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
 	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
 
 	/*
 	 * Block pointers always store the checksum for the logical data.
 	 * If the block pointer has the gang bit set, then the checksum
 	 * it represents is for the reconstituted data and not for an
 	 * individual gang member. The zio pipeline, however, must be able to
 	 * determine the checksum of each of the gang constituents so it
 	 * treats the checksum comparison differently than what we need
 	 * for l2arc blocks. This prevents us from using the
 	 * zio_checksum_error() interface directly. Instead we must call the
 	 * zio_checksum_error_impl() so that we can ensure the checksum is
 	 * generated using the correct checksum algorithm and accounts for the
 	 * logical I/O size and not just a gang fragment.
 	 */
 	error = zio_checksum_error_impl(zio->io_spa, zio->io_bp,
 	    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
 	    zio->io_offset, NULL);
 
 	return (error == 0);
 }
 
 /*
  * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
  * checksum and attaches it to the buf's hdr so that we can ensure that the buf
  * isn't modified later on. If buf is compressed or there is already a checksum
  * on the hdr, this is a no-op (we only checksum uncompressed bufs).
  *
  * When a checksum is computed here (or when ZFS_DEBUG is not compiled in),
  * the function finishes by calling arc_buf_watch() on the buf. The early
  * return taken for compressed bufs and for hdrs that already have a checksum
  * skips that call.
  */
 static void
 arc_cksum_compute(arc_buf_t *buf)
 {
 	/* Nothing to do unless modification checking was requested. */
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 #ifdef ZFS_DEBUG
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	/* b_freeze_lock is held while b_freeze_cksum is tested and set. */
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 	if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
 
 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
 	ASSERT(!ARC_BUF_COMPRESSED(buf));
 	/* Allocate the checksum storage and fill it from the buf's data. */
 	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
 	    KM_SLEEP);
 	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
 	    hdr->b_l1hdr.b_freeze_cksum);
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 	arc_buf_watch(buf);
 }
 
 #ifndef _KERNEL
 /*
  * Userland-only signal handler: panics, reporting the faulting address
  * taken from si->si_addr. The sig and unused arguments are ignored.
  */
 void
 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
 {
 	(void) sig, (void) unused;
 	/* %lx consumes an unsigned long, so convert the address to that. */
 	panic("Got SIGSEGV at address: 0x%lx\n",
 	    (unsigned long)si->si_addr);
 }
 #endif
 
 /*
  * Make buf's data readable and writable again when arc_watch is set.
  * Userland only; in the kernel this does nothing.
  *
  * mprotect() is called outside of the assertion macro so that the
  * protection change still happens in builds where assertions are
  * compiled out; only the check of its result is debug-only.
  */
 static void
 arc_buf_unwatch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch) {
 		int err = mprotect(buf->b_data, arc_buf_size(buf),
 		    PROT_READ | PROT_WRITE);
 		ASSERT0(err);
 	}
 #else
 	(void) buf;
 #endif
 }
 
 /*
  * Make buf's data read-only when arc_watch is set, so that a later write
  * to it faults. Userland only; in the kernel this does nothing.
  *
  * As in arc_buf_unwatch(), mprotect() is kept out of the assertion macro
  * so that both functions change page protections in the same builds.
  */
 static void
 arc_buf_watch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch) {
 		int err = mprotect(buf->b_data, arc_buf_size(buf),
 		    PROT_READ);
 		ASSERT0(err);
 	}
 #else
 	(void) buf;
 #endif
 }
 
 /*
  * Derive the buffer contents type from hdr's metadata flag and verify
  * that it agrees with the type recorded in hdr->b_type.
  */
 static arc_buf_contents_t
 arc_buf_type(arc_buf_hdr_t *hdr)
 {
 	arc_buf_contents_t type = HDR_ISTYPE_METADATA(hdr) ?
 	    ARC_BUFC_METADATA : ARC_BUFC_DATA;
 
 	VERIFY3U(hdr->b_type, ==, type);
 	return (type);
 }
 
 /*
  * Return true when the hdr backing buf has its metadata flag set.
  */
 boolean_t
 arc_is_metadata(arc_buf_t *buf)
 {
 	return (!!HDR_ISTYPE_METADATA(buf->b_hdr));
 }
 
 /*
  * Translate a buffer contents type into the hdr flag bits that encode it.
  * Panics for any type other than ARC_BUFC_DATA or ARC_BUFC_METADATA.
  */
 static uint32_t
 arc_bufc_to_flags(arc_buf_contents_t type)
 {
 	/* Normal data is encoded as the absence of the metadata flag. */
 	if (type == ARC_BUFC_DATA)
 		return (0);
 
 	if (type == ARC_BUFC_METADATA)
 		return (ARC_FLAG_BUFC_METADATA);
 
 	panic("undefined ARC buffer type!");
 	return ((uint32_t)-1);
 }
 
 /*
  * Thaw an anonymous buf: verify its freeze checksum, then (for
  * uncompressed bufs only) drop the hdr's checksum and stop watching the
  * buf's data.
  */
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 
 	arc_cksum_verify(buf);
 
 	/* Compressed buffers do not manipulate the b_freeze_cksum. */
 	if (!ARC_BUF_COMPRESSED(buf)) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
 	}
 }
 
 /*
  * Freeze buf by computing its checksum. This only happens when
  * ZFS_DEBUG_MODIFY is set in zfs_flags and the buf is not compressed;
  * otherwise the call does nothing.
  */
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
 	if ((zfs_flags & ZFS_DEBUG_MODIFY) && !ARC_BUF_COMPRESSED(buf)) {
 		ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
 		arc_cksum_compute(buf);
 	}
 }
 
 /*
  * An arc_buf_hdr_t's b_flags must never be written directly. Go through
  * the flag helper functions so that updates are made in a thread-safe way:
  * the caller must either hold the hash_lock or the hdr must be
  * undiscoverable, which guarantees no other thread is racing on the flags.
  *
  * arc_hdr_set_flags() turns on every bit given in flags.
  */
 static inline void
 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	hdr->b_flags = hdr->b_flags | flags;
 }
 
 /*
  * Turn off every bit given in flags in hdr->b_flags. The hdr must be
  * empty or locked, as asserted below.
  */
 static inline void
 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	hdr->b_flags = hdr->b_flags & ~flags;
 }
 
 /*
  * Record the compression algorithm in the arc_buf_hdr_t's b_flags. This
  * needs its own function because bits have to be both cleared and set in
  * one go; consumers that want to change the compression bits must call it
  * so that the flags are updated in a thread-safe manner.
  */
 static void
 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * The hdr is flagged as held compressed in the ARC only when
 	 * compressed ARC is enabled and the block has a non-zero psize.
 	 * Holes and embedded blocks always have psize = 0, so for them the
 	 * blkptr's compression is ignored and they are marked uncompressed.
 	 */
 	if (zfs_compressed_arc_enabled && HDR_GET_PSIZE(hdr) != 0) {
 		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
 		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
 	} else {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
 		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
 	}
 
 	/* The algorithm itself is always recorded, whichever branch ran. */
 	HDR_SET_COMPRESS(hdr, cmp);
 	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
 }
 
 /*
  * Search the hdr's buf list for a buf other than this one that already
  * holds decompressed data. If one is found, copy its data into buf and
  * return true; otherwise leave buf untouched and return false.
  */
 static boolean_t
 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_buf_t *src;
 	boolean_t copied;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(!ARC_BUF_COMPRESSED(buf));
 
 	/* Skip our own buf (can't copy from ourselves) and compressed bufs. */
 	src = hdr->b_l1hdr.b_buf;
 	while (src != NULL && (src == buf || ARC_BUF_COMPRESSED(src)))
 		src = src->b_next;
 
 	copied = (src != NULL) ? B_TRUE : B_FALSE;
 	if (copied)
 		memcpy(buf->b_data, src->b_data, arc_buf_size(buf));
 
 #ifdef ZFS_DEBUG
 	/*
 	 * If no decompressed buf was found, the hdr should not have a
 	 * checksum either (and vice versa).
 	 */
 	if (zfs_flags & ZFS_DEBUG_MODIFY)
 		EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
 #endif
 
 	return (copied);
 }
 
 /*
  * Allocates an ARC buf header that's in an evicted & L2-cached state.
  * This is used during l2arc reconstruction to make empty ARC buffers
  * which circumvent the regular disk->arc->l2arc path and instead come
  * into being in the reverse order, i.e. l2arc->arc.
  *
  * The header is taken from hdr_l2only_cache and populated from the
  * arguments: logical size (size), physical size (psize), L2 size (asize),
  * birth, contents type, compression algorithm and level, DVA, and the L2
  * device/address/state fields. The protected and prefetch arguments select
  * whether ARC_FLAG_PROTECTED and ARC_FLAG_PREFETCH are set. size must be
  * non-zero and dev must have a vdev. Returns the new header.
  */
 static arc_buf_hdr_t *
 arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
     dva_t dva, uint64_t daddr, int32_t psize, uint64_t asize, uint64_t birth,
     enum zio_compress compress, uint8_t complevel, boolean_t protected,
     boolean_t prefetch, arc_state_type_t arcs_state)
 {
 	arc_buf_hdr_t	*hdr;
 
 	ASSERT(size != 0);
 	ASSERT(dev->l2ad_vdev != NULL);
 
 	hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
 	hdr->b_birth = birth;
 	hdr->b_type = type;
 	/* Start from a clean flag word, then add the bits that apply. */
 	hdr->b_flags = 0;
 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
 	HDR_SET_LSIZE(hdr, size);
 	HDR_SET_PSIZE(hdr, psize);
 	HDR_SET_L2SIZE(hdr, asize);
 	/* Must follow HDR_SET_PSIZE(): arc_hdr_set_compress() reads psize. */
 	arc_hdr_set_compress(hdr, compress);
 	hdr->b_complevel = complevel;
 	if (protected)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 	if (prefetch)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 	hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
 
 	/*
 	 * NOTE(review): the DVA is assigned only after all of the flag
 	 * helpers above have run; those helpers assert
 	 * HDR_EMPTY_OR_LOCKED(hdr), so this ordering looks deliberate --
 	 * confirm before reordering.
 	 */
 	hdr->b_dva = dva;
 
 	hdr->b_l2hdr.b_dev = dev;
 	hdr->b_l2hdr.b_daddr = daddr;
 	hdr->b_l2hdr.b_arcs_state = arcs_state;
 
 	return (hdr);
 }
 
 /*
  * Return the size of the block, b_pabd, that is stored in the
  * arc_buf_hdr_t: the physical size when the hdr is compressed and has a
  * non-zero psize, the logical size otherwise.
  */
 static uint64_t
 arc_hdr_size(arc_buf_hdr_t *hdr)
 {
 	if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
 	    HDR_GET_PSIZE(hdr) > 0)
 		return (HDR_GET_PSIZE(hdr));
 
 	ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
 	return (HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Verify the MAC of the data held in hdr->b_l1hdr.b_pabd.
  *
  * Returns 0 when the MAC verified, in which case ARC_FLAG_NOAUTH is
  * cleared. Also returns 0 when the MAC routine reported ENOENT; the hdr is
  * then left marked NOAUTH (presumably the key is not available -- see the
  * "best effort" note below). Returns EIO if the data had to be recompressed
  * for the check and the result did not fit in psize. Any other error from
  * the MAC routine is returned unchanged.
  */
 static int
 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
 {
 	int ret;
 	uint64_t csize;
 	uint64_t lsize = HDR_GET_LSIZE(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	abd_t *abd = hdr->b_l1hdr.b_pabd;
 	boolean_t free_abd = B_FALSE;
 
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT(HDR_AUTHENTICATED(hdr));
 	ASSERT3P(abd, !=, NULL);
 
 	/*
 	 * The MAC is calculated on the compressed data that is stored on disk.
 	 * However, if compressed arc is disabled we will only have the
 	 * decompressed data available to us now. Compress it into a temporary
 	 * abd so we can verify the MAC. The performance overhead of this will
 	 * be relatively low, since most objects in an encrypted objset will
 	 * be encrypted (instead of authenticated) anyway.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		abd = NULL;
 		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, &abd, lsize, MIN(lsize, psize),
 		    hdr->b_complevel);
 		if (csize >= lsize || csize > psize) {
 			/*
 			 * The temporary abd may already have been handed
 			 * back through the out-parameter; release it so
 			 * this failure path does not leak it.
 			 */
 			if (abd != NULL)
 				abd_free(abd);
 			ret = SET_ERROR(EIO);
 			return (ret);
 		}
 		ASSERT3P(abd, !=, NULL);
 		/* Zero the tail so the MAC covers exactly psize bytes. */
 		abd_zero_off(abd, csize, psize - csize);
 		free_abd = B_TRUE;
 	}
 
 	/*
 	 * Authentication is best effort. We authenticate whenever the key is
 	 * available. If we succeed we clear ARC_FLAG_NOAUTH.
 	 */
 	if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
 		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
 		ASSERT3U(lsize, ==, psize);
 		ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
 		    psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 	} else {
 		ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
 		    hdr->b_crypt_hdr.b_mac);
 	}
 
 	if (ret == 0)
 		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
 	else if (ret == ENOENT)
 		ret = 0;
 
 	/* Only the temporary recompressed abd is ours to free. */
 	if (free_abd)
 		abd_free(abd);
 
 	return (ret);
 }
 
 /*
  * This function will take a header that only has raw encrypted data in
  * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
  * b_l1hdr.b_pabd. If designated in the header flags, this function will
  * also decompress the data.
  *
  * Returns 0 on success. On failure the error from the decryption or
  * decompression step is returned and the newly allocated b_pabd (and any
  * temporary decompression abd) is released again.
  */
 static int
 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
 {
 	int ret;
 	abd_t *cabd = NULL;
 	boolean_t no_crypt = B_FALSE;
 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT(HDR_ENCRYPTED(hdr));
 
 	/* Allocate the destination b_pabd for the decrypted data. */
 	arc_hdr_alloc_abd(hdr, 0);
 
 	ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
 	    B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
 	    hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
 	    hdr->b_crypt_hdr.b_rabd, &no_crypt);
 	if (ret != 0)
 		goto error;
 
 	/*
 	 * When the crypto layer reports no_crypt, copy the raw data into
 	 * b_pabd ourselves.
 	 */
 	if (no_crypt) {
 		abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
 		    HDR_GET_PSIZE(hdr));
 	}
 
 	/*
 	 * If this header has disabled arc compression but the b_pabd is
 	 * compressed after decrypting it, we need to decompress the newly
 	 * decrypted data.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		/*
 		 * We want to make sure that we are correctly honoring the
 		 * zfs_abd_scatter_enabled setting, so we allocate an abd here
 		 * and decompress directly into it, rather than allocating a
 		 * linear buffer and wrapping it in an abd later.
 		 */
 		cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
 
 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
 		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 		if (ret != 0) {
 			goto error;
 		}
 
 		/* Swap the compressed b_pabd for the decompressed copy. */
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 		hdr->b_l1hdr.b_pabd = cabd;
 	}
 
 	return (0);
 
 error:
 	/* Undo the allocations made above before reporting the failure. */
 	arc_hdr_free_abd(hdr, B_FALSE);
 	if (cabd != NULL)
 		arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
 
 	return (ret);
 }
 
 /*
  * Called during arc_buf_fill() to get the header's plaintext abd ready for
  * use. This involves authenticating protected data and decrypting
  * encrypted data into the plaintext abd. When hash_lock is non-NULL it is
  * taken for the duration of the work. Returns 0 on success or the error
  * from the authentication/decryption step.
  */
 static int
 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
     const zbookmark_phys_t *zb, boolean_t noauth)
 {
 	int ret = 0;
 
 	ASSERT(HDR_PROTECTED(hdr));
 
 	if (hash_lock != NULL)
 		mutex_enter(hash_lock);
 
 	if (HDR_NOAUTH(hdr) && !noauth) {
 		/*
 		 * Authenticated data was requested, but this data has not
 		 * been authenticated yet. Verify the MAC now if we can.
 		 */
 		ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
 	} else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
 		/*
 		 * Only the encrypted version of the data is present but the
 		 * unencrypted version was requested. Take the opportunity to
 		 * store the decrypted version in the header for future use.
 		 */
 		ret = arc_hdr_decrypt(hdr, spa, zb);
 	}
 
 	/* On success the plaintext abd must exist. */
 	if (ret == 0) {
 		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	}
 
 	if (hash_lock != NULL)
 		mutex_exit(hash_lock);
 
 	return (ret);
 }
 
 /*
  * Used by the dbuf code to decrypt bonus buffers in place. The dbuf code
  * has no locking of its own for decrypting a shared dnode block, so the
  * hash lock is relied on here to protect against concurrent calls to
  * arc_buf_fill(). Copies the dnode bonus data from the hdr's b_pabd into
  * the buf and marks the buf as neither encrypted nor compressed.
  */
 static void
 arc_buf_untransform_in_place(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_ENCRYPTED(hdr));
 	ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf);
 
 	zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
 	    arc_buf_size(buf));
 
 	/* Drop both transform flags in a single update. */
 	buf->b_flags &= ~(ARC_BUF_FLAG_ENCRYPTED | ARC_BUF_FLAG_COMPRESSED);
 }
 
 /*
  * Given a buf that has a data buffer attached to it, this function will
  * efficiently fill the buf with data of the specified compression setting from
  * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
  * are already sharing a data buf, no copy is performed.
  *
  * If the buf is marked as compressed but uncompressed data was requested, this
  * will allocate a new data buffer for the buf, remove that flag, and fill the
  * buf with uncompressed data. You can't request a compressed buf on a hdr with
  * uncompressed data, and (since we haven't added support for it yet) if you
  * want compressed data your buf must already be marked as compressed and have
  * the correct-sized data buffer.
  *
  * The flags argument selects the requested form (ARC_FILL_COMPRESSED,
  * ARC_FILL_ENCRYPTED), in-place dnode handling (ARC_FILL_IN_PLACE), whether
  * unauthenticated data is acceptable (ARC_FILL_NOAUTH), and whether the hash
  * lock should be left alone (ARC_FILL_LOCKED; otherwise HDR_LOCK(hdr) is
  * taken where needed).
  *
  * Returns 0 on success, the error from arc_fill_hdr_crypt() if preparing a
  * protected hdr fails, or EIO if decompression fails. Except for EACCES on
  * an in-place request, a failure also sets ARC_FLAG_IO_ERROR on the hdr.
  */
 static int
 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
     arc_fill_flags_t flags)
 {
 	int error = 0;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	boolean_t hdr_compressed =
 	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
 	boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
 	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
 	/* A NULL hash_lock means no locking is done by this function. */
 	kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
 	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
 	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
 	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
 	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
 	IMPLY(encrypted, !arc_buf_is_shared(buf));
 
 	/*
 	 * If the caller wanted encrypted data we just need to copy it from
 	 * b_rabd and potentially byteswap it. We won't be able to do any
 	 * further transforms on it.
 	 */
 	if (encrypted) {
 		ASSERT(HDR_HAS_RABD(hdr));
 		abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
 		    HDR_GET_PSIZE(hdr));
 		goto byteswap;
 	}
 
 	/*
 	 * Adjust encrypted and authenticated headers to accommodate
 	 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
 	 * allowed to fail decryption due to keys not being loaded
 	 * without being marked as an IO error.
 	 */
 	if (HDR_PROTECTED(hdr)) {
 		error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
 		    zb, !!(flags & ARC_FILL_NOAUTH));
 		if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
 			return (error);
 		} else if (error != 0) {
 			/* Flag updates need the hash lock (if we use one). */
 			if (hash_lock != NULL)
 				mutex_enter(hash_lock);
 			arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 			return (error);
 		}
 	}
 
 	/*
 	 * There is a special case here for dnode blocks which are
 	 * decrypting their bonus buffers. These blocks may request to
 	 * be decrypted in-place. This is necessary because there may
 	 * be many dnodes pointing into this buffer and there is
 	 * currently no method to synchronize replacing the backing
 	 * b_data buffer and updating all of the pointers. Here we use
 	 * the hash lock to ensure there are no races. If the need
 	 * arises for other types to be decrypted in-place, they must
 	 * add handling here as well.
 	 */
 	if ((flags & ARC_FILL_IN_PLACE) != 0) {
 		ASSERT(!hdr_compressed);
 		ASSERT(!compressed);
 		ASSERT(!encrypted);
 
 		if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
 			ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 
 			if (hash_lock != NULL)
 				mutex_enter(hash_lock);
 			arc_buf_untransform_in_place(buf);
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 
 			/* Compute the hdr's checksum if necessary */
 			arc_cksum_compute(buf);
 		}
 
 		return (0);
 	}
 
 	if (hdr_compressed == compressed) {
 		/*
 		 * The hdr already holds the data in the requested form: a
 		 * shared buf needs nothing, any other buf gets a plain copy.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			ASSERT(arc_buf_is_shared(buf));
 		} else {
 			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
 			    arc_buf_size(buf));
 		}
 	} else {
 		/* The hdr is compressed but uncompressed data is wanted. */
 		ASSERT(hdr_compressed);
 		ASSERT(!compressed);
 
 		/*
 		 * If the buf is sharing its data with the hdr, unlink it and
 		 * allocate a new data buffer for the buf.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			ASSERTF(ARC_BUF_COMPRESSED(buf),
 			"buf %p was uncompressed", buf);
 
 			/* We need to give the buf its own b_data */
 			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
 			buf->b_data =
 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 
 			/* Previously overhead was 0; just add new overhead */
 			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
 		} else if (ARC_BUF_COMPRESSED(buf)) {
 			ASSERT(!arc_buf_is_shared(buf));
 
 			/* We need to reallocate the buf's b_data */
 			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
 			    buf);
 			buf->b_data =
 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
 
 			/* We increased the size of b_data; update overhead */
 			ARCSTAT_INCR(arcstat_overhead_size,
 			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
 		}
 
 		/*
 		 * Regardless of the buf's previous compression settings, it
 		 * should not be compressed at the end of this function.
 		 */
 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 
 		/*
 		 * Try copying the data from another buf which already has a
 		 * decompressed version. If that's not possible, it's time to
 		 * bite the bullet and decompress the data from the hdr.
 		 */
 		if (arc_buf_try_copy_decompressed_data(buf)) {
 			/* Skip byteswapping and checksumming (already done) */
 			return (0);
 		} else {
 			/*
 			 * Wrap the buf's b_data in a stack abd so it can be
 			 * used as the decompression destination.
 			 */
 			abd_t dabd;
 			abd_get_from_buf_struct(&dabd, buf->b_data,
 			    HDR_GET_LSIZE(hdr));
 			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 			    hdr->b_l1hdr.b_pabd, &dabd,
 			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
 			    &hdr->b_complevel);
 			abd_free(&dabd);
 
 			/*
 			 * Absent hardware errors or software bugs, this should
 			 * be impossible, but log it anyway so we can debug it.
 			 */
 			if (error != 0) {
 				zfs_dbgmsg(
 				    "hdr %px, compress %d, psize %d, lsize %d",
 				    hdr, arc_hdr_get_compress(hdr),
 				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
 				if (hash_lock != NULL)
 					mutex_enter(hash_lock);
 				arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 				if (hash_lock != NULL)
 					mutex_exit(hash_lock);
 				return (SET_ERROR(EIO));
 			}
 		}
 	}
 
 byteswap:
 	/* Byteswap the buf's data if necessary */
 	if (bswap != DMU_BSWAP_NUMFUNCS) {
 		ASSERT(!HDR_SHARED_DATA(hdr));
 		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
 		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
 	}
 
 	/* Compute the hdr's checksum if necessary */
 	arc_cksum_compute(buf);
 
 	return (0);
 }
 
 /*
  * Fill buf with untransformed data via arc_buf_fill(), optionally in
  * place. When this is called to decrypt an encrypted buffer or to verify
  * an authenticated one, the key must be loaded and a mapping must have been
  * made available in the keystore via spa_keystore_create_mapping() or one
  * of its callers. An ECKSUM result is logged, reported and turned into EIO;
  * every other result is returned as is.
  */
 int
 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
     boolean_t in_place)
 {
 	arc_fill_flags_t flags = in_place ? ARC_FILL_IN_PLACE : 0;
 	int ret = arc_buf_fill(buf, spa, zb, flags);
 
 	if (ret != ECKSUM)
 		return (ret);
 
 	/*
 	 * Authentication and decryption errors are converted to EIO (and an
 	 * ereport is generated) before they leave the ARC.
 	 */
 	ret = SET_ERROR(EIO);
 	spa_log_error(spa, zb, buf->b_hdr->b_birth);
 	(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 	    spa, NULL, zb, NULL, 0);
 
 	return (ret);
 }
 
 /*
  * Add hdr's space to the evictable-size refcount of the given arc_state_t.
  * The hdr's own data and each arc buf are accounted separately so that
  * they can later be added to and removed from the refcount individually.
  */
 static void
 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (!GHOST_STATE(state)) {
 		/* Data held directly by the hdr, tagged with the hdr. */
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			(void) zfs_refcount_add_many(
 			    &state->arcs_esize[type],
 			    arc_hdr_size(hdr), hdr);
 		}
 		if (HDR_HAS_RABD(hdr)) {
 			(void) zfs_refcount_add_many(
 			    &state->arcs_esize[type],
 			    HDR_GET_PSIZE(hdr), hdr);
 		}
 
 		/* Each non-shared buf is counted under its own tag. */
 		for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL;
 		    b = b->b_next) {
 			if (!ARC_BUF_SHARED(b)) {
 				(void) zfs_refcount_add_many(
 				    &state->arcs_esize[type],
 				    arc_buf_size(b), b);
 			}
 		}
 		return;
 	}
 
 	/* Ghost state: no bufs or data, only the logical size is counted. */
 	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!HDR_HAS_RABD(hdr));
 	(void) zfs_refcount_add_many(&state->arcs_esize[type],
 	    HDR_GET_LSIZE(hdr), hdr);
 }
 
 /*
  * Remove hdr's space from the evictable-size refcount of the given
  * arc_state_t. The hdr's own data and each arc buf are accounted
  * separately so that they can be added to and removed from the refcount
  * individually.
  */
 static void
 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (!GHOST_STATE(state)) {
 		/* Data held directly by the hdr, tagged with the hdr. */
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			(void) zfs_refcount_remove_many(
 			    &state->arcs_esize[type],
 			    arc_hdr_size(hdr), hdr);
 		}
 		if (HDR_HAS_RABD(hdr)) {
 			(void) zfs_refcount_remove_many(
 			    &state->arcs_esize[type],
 			    HDR_GET_PSIZE(hdr), hdr);
 		}
 
 		/* Each non-shared buf was counted under its own tag. */
 		for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL;
 		    b = b->b_next) {
 			if (!ARC_BUF_SHARED(b)) {
 				(void) zfs_refcount_remove_many(
 				    &state->arcs_esize[type],
 				    arc_buf_size(b), b);
 			}
 		}
 		return;
 	}
 
 	/* Ghost state: no bufs or data, only the logical size is counted. */
 	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!HDR_HAS_RABD(hdr));
 	(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 	    HDR_GET_LSIZE(hdr), hdr);
 }
 
 /*
  * Take a reference on this hdr on behalf of tag, indicating that someone
  * is actively referencing that memory. On the 0 to 1 transition the hdr is
  * removed from its arc_state_t list to indicate that it is not evictable.
  */
 static void
 add_reference(arc_buf_hdr_t *hdr, const void *tag)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
 		ASSERT(state == arc_anon);
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 	}
 
 	/* Only the first reference changes evictability. */
 	if (zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) != 1)
 		return;
 
 	/* Anonymous hdrs are skipped, and the L2-only list is not used. */
 	if (state == arc_anon || state == arc_l2c_only)
 		return;
 
 	multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr);
 	arc_evictable_space_decrement(hdr, state);
 }
 
 /*
  * Drop tag's reference on this hdr and return the remaining count. When
  * the count goes from 1 to 0, an anonymous hdr is destroyed, an uncached
  * hdr without the prefetch flag is moved to arc_anon and destroyed, and
  * any other hdr is put on its arc_state_t's list, making it eligible for
  * eviction.
  */
 static int
 remove_reference(arc_buf_hdr_t *hdr, const void *tag)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	int cnt;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT(!GHOST_STATE(state));	/* arc_l2c_only counts as a ghost. */
 
 	cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
 	if (cnt != 0)
 		return (cnt);
 
 	if (state == arc_anon) {
 		arc_hdr_destroy(hdr);
 	} else if (state == arc_uncached && !HDR_PREFETCH(hdr)) {
 		arc_change_state(arc_anon, hdr);
 		arc_hdr_destroy(hdr);
 	} else {
 		multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
 		arc_evictable_space_increment(hdr, state);
 	}
 
 	return (0);
 }
 
 /*
  * Fill *abi with detailed information about a specific arc buffer. The
  * structure is zeroed first and stays zeroed if the buf has no hdr.
  *
  * The state_index argument is meant to request calculation of the arc
  * header's list position within its arc state. Since that requires a
  * linear traversal, callers are strongly encouraged not to ask for it. The
  * argument is currently ignored by this implementation.
  */
 void
 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
 {
 	(void) state_index;
 	arc_buf_hdr_t *hdr = ab->b_hdr;
 	arc_state_t *state = NULL;
 
 	memset(abi, 0, sizeof (*abi));
 
 	if (hdr == NULL)
 		return;
 
 	abi->abi_flags = hdr->b_flags;
 
 	/* L1 portion: buf count, access stats and hold count. */
 	if (HDR_HAS_L1HDR(hdr)) {
 		l1arc_buf_hdr_t *l1 = &hdr->b_l1hdr;
 		arc_buf_t *cur = l1->b_buf;
 
 		state = l1->b_state;
 
 		abi->abi_bufcnt = 0;
 		while (cur != NULL) {
 			abi->abi_bufcnt++;
 			cur = cur->b_next;
 		}
 		abi->abi_access = l1->b_arc_access;
 		abi->abi_mru_hits = l1->b_mru_hits;
 		abi->abi_mru_ghost_hits = l1->b_mru_ghost_hits;
 		abi->abi_mfu_hits = l1->b_mfu_hits;
 		abi->abi_mfu_ghost_hits = l1->b_mfu_ghost_hits;
 		abi->abi_holds = zfs_refcount_count(&l1->b_refcnt);
 	}
 
 	/* L2 portion: device address and hit count. */
 	if (HDR_HAS_L2HDR(hdr)) {
 		l2arc_buf_hdr_t *l2 = &hdr->b_l2hdr;
 
 		abi->abi_l2arc_dattr = l2->b_daddr;
 		abi->abi_l2arc_hits = l2->b_hits;
 	}
 
 	/* Without a known state the buf is reported as anonymous. */
 	if (state != NULL)
 		abi->abi_state_type = state->arcs_state;
 	else
 		abi->abi_state_type = ARC_STATE_ANON;
 	abi->abi_state_contents = arc_buf_type(hdr);
 	abi->abi_size = arc_hdr_size(hdr);
 }
 
 /*
  * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  *
  * This does three things: if the hdr has no references, it is moved between
  * the old and new state's evictable lists (with the matching evictable-size
  * accounting); the per-state size refcounts are adjusted; and finally the
  * hdr's b_state (and, when present, its L2 state bookkeeping) is updated.
  * A hdr moving to arc_anon is also removed from the hash table.
  */
 static void
 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
 {
 	arc_state_t *old_state;
 	int64_t refcnt;
 	boolean_t update_old, update_new;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	/*
 	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
 	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
 	 * L1 hdr doesn't always exist when we change state to arc_anon before
 	 * destroying a header, in which case reallocating to add the L1 hdr is
 	 * pointless.
 	 */
 	if (HDR_HAS_L1HDR(hdr)) {
 		old_state = hdr->b_l1hdr.b_state;
 		refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
 		/* Size accounting is only needed if the hdr holds any data. */
 		update_old = (hdr->b_l1hdr.b_buf != NULL ||
 		    hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 
 		IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL);
 		IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL);
 		IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL ||
 		    ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
 	} else {
 		/* Without an L1 hdr, treat the hdr as being L2-only. */
 		old_state = arc_l2c_only;
 		refcnt = 0;
 		update_old = B_FALSE;
 	}
 	/* Ghost states are always accounted (by logical size, see below). */
 	update_new = update_old;
 	if (GHOST_STATE(old_state))
 		update_old = B_TRUE;
 	if (GHOST_STATE(new_state))
 		update_new = B_TRUE;
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT3P(new_state, !=, old_state);
 
 	/*
 	 * If this buffer is evictable, transfer it from the
 	 * old state list to the new state list.
 	 */
 	if (refcnt == 0) {
 		if (old_state != arc_anon && old_state != arc_l2c_only) {
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			/* remove_reference() saves on insert. */
 			if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 				multilist_remove(&old_state->arcs_list[type],
 				    hdr);
 				arc_evictable_space_decrement(hdr, old_state);
 			}
 		}
 		if (new_state != arc_anon && new_state != arc_l2c_only) {
 			/*
 			 * An L1 header always exists here, since if we're
 			 * moving to some L1-cached state (i.e. not l2c_only or
 			 * anonymous), we realloc the header to add an L1hdr
 			 * beforehand.
 			 */
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			multilist_insert(&new_state->arcs_list[type], hdr);
 			arc_evictable_space_increment(hdr, new_state);
 		}
 	}
 
 	ASSERT(!HDR_EMPTY(hdr));
 	/* Anonymous hdrs are not kept in the hash table. */
 	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
 		buf_hash_remove(hdr);
 
 	/* adjust state sizes (ignore arc_l2c_only) */
 
 	if (update_new && new_state != arc_l2c_only) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		if (GHOST_STATE(new_state)) {
 
 			/*
 			 * When moving a header to a ghost state, we first
 			 * remove all arc buffers. Thus, we'll have no arc
 			 * buffer to use for the reference. As a result, we
 			 * use the arc header pointer for the reference.
 			 */
 			(void) zfs_refcount_add_many(
 			    &new_state->arcs_size[type],
 			    HDR_GET_LSIZE(hdr), hdr);
 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 		} else {
 
 			/*
 			 * Each individual buffer holds a unique reference,
 			 * thus we must add each of these references one
 			 * at a time.
 			 */
 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 			    buf = buf->b_next) {
 
 				/*
 				 * When the arc_buf_t is sharing the data
 				 * block with the hdr, the owner of the
 				 * reference belongs to the hdr. Only
 				 * add to the refcount if the arc_buf_t is
 				 * not shared.
 				 */
 				if (ARC_BUF_SHARED(buf))
 					continue;
 
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size[type],
 				    arc_buf_size(buf), buf);
 			}
 
 			/* The hdr's own data is tagged with the hdr. */
 			if (hdr->b_l1hdr.b_pabd != NULL) {
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size[type],
 				    arc_hdr_size(hdr), hdr);
 			}
 
 			if (HDR_HAS_RABD(hdr)) {
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size[type],
 				    HDR_GET_PSIZE(hdr), hdr);
 			}
 		}
 	}
 
 	if (update_old && old_state != arc_l2c_only) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		if (GHOST_STATE(old_state)) {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 
 			/*
 			 * When moving a header off of a ghost state,
 			 * the header will not contain any arc buffers.
 			 * We use the arc header pointer for the reference
 			 * which is exactly what we did when we put the
 			 * header on the ghost state.
 			 */
 
 			(void) zfs_refcount_remove_many(
 			    &old_state->arcs_size[type],
 			    HDR_GET_LSIZE(hdr), hdr);
 		} else {
 
 			/*
 			 * Each individual buffer holds a unique reference,
 			 * thus we must remove each of these references one
 			 * at a time.
 			 */
 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 			    buf = buf->b_next) {
 
 				/*
 				 * When the arc_buf_t is sharing the data
 				 * block with the hdr, the owner of the
 				 * reference belongs to the hdr. Only
 				 * remove from the refcount if the arc_buf_t
 				 * is not shared.
 				 */
 				if (ARC_BUF_SHARED(buf))
 					continue;
 
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size[type],
 				    arc_buf_size(buf), buf);
 			}
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
 			    HDR_HAS_RABD(hdr));
 
 			/* The hdr's own data was tagged with the hdr. */
 			if (hdr->b_l1hdr.b_pabd != NULL) {
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size[type],
 				    arc_hdr_size(hdr), hdr);
 			}
 
 			if (HDR_HAS_RABD(hdr)) {
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size[type],
 				    HDR_GET_PSIZE(hdr), hdr);
 			}
 		}
 	}
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		hdr->b_l1hdr.b_state = new_state;
 
 		/*
 		 * Keep the L2 per-state statistics in step: drop the hdr
 		 * from the old state's counts, record the new state, then
 		 * add it to the new state's counts.
 		 */
 		if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
 			l2arc_hdr_arcstats_decrement_state(hdr);
 			hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
 			l2arc_hdr_arcstats_increment_state(hdr);
 		}
 	}
 }
 
 /*
  * Charge "space" bytes to the ARC accounting for the given space type:
  * the matching per-type statistic, arcstat_meta_used for every type other
  * than data and ABD chunk waste, and the overall arcstat_size.
  */
 void
 arc_space_consume(uint64_t space, arc_space_type_t type)
 {
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	switch (type) {
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_metadata_size, space);
 		break;
 	case ARC_SPACE_BONUS:
 		ARCSTAT_INCR(arcstat_bonus_size, space);
 		break;
 	case ARC_SPACE_DNODE:
 		ARCSTAT_INCR(arcstat_dnode_size, space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		aggsum_add(&arc_sums.arcstat_l2_hdr_size, space);
 		break;
 	case ARC_SPACE_ABD_CHUNK_WASTE:
 		/*
 		 * This counter covers the waste of every scatter ABD, not
 		 * only the ones the ARC allocated.  The ARC nevertheless
 		 * owns the vast majority of them, since other users of
 		 * scatter ABDs are very short-lived.
 		 */
 		ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space);
 		break;
 	default:
 		/* No per-type statistic for anything else. */
 		break;
 	}
 
 	/* Data and chunk waste are the only types kept out of meta_used. */
 	if (!(type == ARC_SPACE_DATA || type == ARC_SPACE_ABD_CHUNK_WASTE))
 		ARCSTAT_INCR(arcstat_meta_used, space);
 
 	aggsum_add(&arc_sums.arcstat_size, space);
 }
 
 /*
  * Inverse of arc_space_consume(): take "space" bytes back out of the
  * per-type statistic, out of arcstat_meta_used (for every type other than
  * data and ABD chunk waste) and out of the overall arcstat_size.
  */
 void
 arc_space_return(uint64_t space, arc_space_type_t type)
 {
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	switch (type) {
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, -space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_metadata_size, -space);
 		break;
 	case ARC_SPACE_BONUS:
 		ARCSTAT_INCR(arcstat_bonus_size, -space);
 		break;
 	case ARC_SPACE_DNODE:
 		ARCSTAT_INCR(arcstat_dnode_size, -space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, -space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space);
 		break;
 	case ARC_SPACE_ABD_CHUNK_WASTE:
 		ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space);
 		break;
 	default:
 		/* No per-type statistic for anything else. */
 		break;
 	}
 
 	/* Data and chunk waste are the only types kept out of meta_used. */
 	if (!(type == ARC_SPACE_DATA || type == ARC_SPACE_ABD_CHUNK_WASTE))
 		ARCSTAT_INCR(arcstat_meta_used, -space);
 
 	/* The total must be at least as large as what is being returned. */
 	ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0);
 	aggsum_add(&arc_sums.arcstat_size, -space);
 }
 
 /*
  * Decide whether buf (which must belong to hdr) is allowed to use the hdr's
  * b_pabd as its own b_data rather than holding a private copy.
  */
 static boolean_t
 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	/*
 	 * A buf may share the hdr's data only when all of these hold:
 	 * 1. the buffer is not encrypted
 	 * 2. the hdr's compression matches the buf's compression
 	 * 3. the hdr doesn't need to be byteswapped
 	 * 4. the hdr isn't already being shared
 	 * 5. the buf is either compressed or it is the last buf in the hdr list
 	 *
 	 * Criterion #5 exists to keep the invariant that a shared
 	 * uncompressed buf is always the final buf in the hdr's b_buf list.
 	 * One might object, "if a compressed buf is allocated first, won't
 	 * that be the last thing in the list?" -- but then no shared
 	 * uncompressed buf can be created at all, because the hdr has to be
 	 * compressed in order to have a compressed buf. One might also
 	 * expect the "not already shared" criterion (#4) to be enough for
 	 * this guarantee. It is not: in the rare L2ARC write race mentioned
 	 * in arc_buf_alloc_impl() there can be an existing uncompressed buf
 	 * that is shareable now but was not when it was allocated. Instead
 	 * of letting a new shared uncompressed buf be created and then
 	 * shuffling the list to make it the last element, sharing is simply
 	 * refused when the new buf isn't the first to be added.
 	 */
 	ASSERT3P(buf->b_hdr, ==, hdr);
 
 	boolean_t hdr_is_compressed =
 	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	boolean_t buf_is_compressed = (ARC_BUF_COMPRESSED(buf) != 0);
 
 	/* Criteria 1-4, checked in order; any failure rules sharing out. */
 	if (ARC_BUF_ENCRYPTED(buf))
 		return (B_FALSE);
 	if (buf_is_compressed != hdr_is_compressed)
 		return (B_FALSE);
 	if (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS)
 		return (B_FALSE);
 	if (HDR_SHARED_DATA(hdr))
 		return (B_FALSE);
 
 	/* Criterion 5. */
 	return (ARC_BUF_LAST(buf) || buf_is_compressed);
 }
 
 /*
  * Allocate a buf for this hdr. If you care about the data that's in the hdr,
  * or if you want a compressed buffer, pass those flags in. Returns 0 if the
  * copy was made successfully, or an error code otherwise.
  *
  *	hdr		header the new buf is attached to; it must have an L1
  *			header and a non-zero lsize
  *	spa, zb		handed to arc_buf_fill() when "fill" is set; zb must
  *			not be NULL in that case
  *	tag		reference holder passed to add_reference()
  *	encrypted	request an encrypted buf; implies "compressed"
  *	compressed	request a compressed buf; only honored when the hdr
  *			itself is compressed (or the buf is encrypted)
  *	noauth		adds ARC_FILL_NOAUTH to the fill flags; must not be
  *			combined with "encrypted"
  *	fill		when set, arc_buf_fill() is called and its result is
  *			returned
  *	ret		out parameter; must point at a NULL pointer on entry
  *			and receives the new buf
  *
  * Note that *ret is assigned and the buf is linked at the head of the hdr's
  * b_buf list before arc_buf_fill() runs, so the buf stays attached even
  * when a non-zero value is returned.
  */
 static int
 arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
     const void *tag, boolean_t encrypted, boolean_t compressed,
     boolean_t noauth, boolean_t fill, arc_buf_t **ret)
 {
 	arc_buf_t *buf;
 	arc_fill_flags_t flags = ARC_FILL_LOCKED;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
 	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
 	    hdr->b_type == ARC_BUFC_METADATA);
 	ASSERT3P(ret, !=, NULL);
 	ASSERT3P(*ret, ==, NULL);
 	IMPLY(encrypted, compressed);
 
 	/*
 	 * b_next already points at the current list head, but the hdr's
 	 * b_buf is only updated once b_data has been set up below.
 	 */
 	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
 	buf->b_next = hdr->b_l1hdr.b_buf;
 	buf->b_flags = 0;
 
 	add_reference(hdr, tag);
 
 	/*
 	 * We're about to change the hdr's b_flags. We must either
 	 * hold the hash_lock or be undiscoverable.
 	 */
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Only honor requests for compressed bufs if the hdr is actually
 	 * compressed. This must be overridden if the buffer is encrypted since
 	 * encrypted buffers cannot be decompressed.
 	 */
 	if (encrypted) {
 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
 		buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
 		flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
 	} else if (compressed &&
 	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
 		flags |= ARC_FILL_COMPRESSED;
 	}
 
 	if (noauth) {
 		ASSERT0(encrypted);
 		flags |= ARC_FILL_NOAUTH;
 	}
 
 	/*
 	 * If the hdr's data can be shared then we share the data buffer and
 	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
 	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
 	 * buffer to store the buf's data.
 	 *
 	 * There are two additional restrictions here because we're sharing
 	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
 	 * actively involved in an L2ARC write, because if this buf is used by
 	 * an arc_write() then the hdr's data buffer will be released when the
 	 * write completes, even though the L2ARC write might still be using it.
 	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
 	 * need to be ABD-aware.  It must be allocated via
 	 * zio_[data_]buf_alloc(), not as a page, because we need to be able
 	 * to abd_release_ownership_of_buf(), which isn't allowed on "linear
 	 * page" buffers because the ABD code needs to handle freeing them
 	 * specially.
 	 */
 	boolean_t can_share = arc_can_share(hdr, buf) &&
 	    !HDR_L2_WRITING(hdr) &&
 	    hdr->b_l1hdr.b_pabd != NULL &&
 	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
 	    !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
 
 	/* Set up b_data and sharing */
 	if (can_share) {
 		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
 		buf->b_flags |= ARC_BUF_FLAG_SHARED;
 		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	} else {
 		/* A private copy is accounted as ARC overhead. */
 		buf->b_data =
 		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
 		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
 	}
 	VERIFY3P(buf->b_data, !=, NULL);
 
 	/* Publish the buf as the new head of the hdr's buf list. */
 	hdr->b_l1hdr.b_buf = buf;
 
 	/*
 	 * If the user wants the data from the hdr, we need to either copy or
 	 * decompress the data.
 	 */
 	if (fill) {
 		ASSERT3P(zb, !=, NULL);
 		return (arc_buf_fill(buf, spa, zb, flags));
 	}
 
 	return (0);
 }
 
 /* Reference tag used on a hdr's refcount while its buffer is out on loan. */
 static const char *arc_onloan_tag = "onloan";
 
 /*
  * Adjust the global arc_loaned_bytes counter by delta (positive when a
  * buffer is loaned out, negative when it is returned).
  *
  * The update itself is a plain atomic add; the counter is then re-read
  * (by atomically adding zero) only to assert that it has not gone negative.
  * NOTE(review): keep the add outside the assertion -- presumably ASSERT3S
  * compiles away in non-debug builds, which would otherwise drop the update.
  */
 static inline void
 arc_loaned_bytes_update(int64_t delta)
 {
 	atomic_add_64(&arc_loaned_bytes, delta);
 
 	/* assert that it did not wrap around */
 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
 }
 
 /*
  * Hand out an anonymous arc buffer on loan. Until it is "returned", a
  * loaned buffer is not counted as in-flight data by
  * arc_tempreserve_space(). A loaned buffer has to be returned to the arc
  * before the DMU may use it or before it may be freed.
  */
 arc_buf_t *
 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
 {
 	arc_buf_contents_t type;
 
 	if (is_metadata)
 		type = ARC_BUFC_METADATA;
 	else
 		type = ARC_BUFC_DATA;
 
 	/* The loan itself is the holder of the buffer's reference. */
 	arc_buf_t *loaned = arc_alloc_buf(spa, arc_onloan_tag, type, size);
 
 	arc_loaned_bytes_update(arc_buf_size(loaned));
 	return (loaned);
 }
 
 /*
  * Compressed flavor of arc_loan_buf(): allocate a compressed buffer with
  * the loan tag as its holder and add its size to the loaned byte count.
  */
 arc_buf_t *
 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *loaned;
 
 	loaned = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize,
 	    compression_type, complevel);
 	arc_loaned_bytes_update(arc_buf_size(loaned));
 
 	return (loaned);
 }
 
 /*
  * Raw (encrypted) flavor of arc_loan_buf(): allocate a raw buffer with the
  * loan tag as its holder and add its size to the loaned byte count.
  *
  * The accounting goes through arc_loaned_bytes_update() with
  * arc_buf_size(buf), exactly like the other loan functions, so that it is
  * symmetric with arc_return_buf() (which subtracts arc_buf_size(buf)) and
  * is covered by the same wrap-around assertion.
  */
 arc_buf_t *
 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
 	    byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
 	    complevel);
 
 	arc_loaned_bytes_update(arc_buf_size(buf));
 
 	return (buf);
 }
 
 
 /*
  * Return a loaned arc buffer to the arc.
  *
  * The hdr's reference held under arc_onloan_tag is replaced by one held
  * under "tag", and arc_buf_size(buf) is subtracted from the loaned byte
  * count.
  */
 void
 arc_return_buf(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	/* Take the new reference first, then drop the loan's reference. */
 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 
 	arc_loaned_bytes_update(-arc_buf_size(buf));
 }
 
 /*
  * Detach an arc_buf from a dbuf (tag)
  *
  * This is the reverse of arc_return_buf(): the hdr's reference held under
  * "tag" is replaced by one held under arc_onloan_tag, and arc_buf_size(buf)
  * is added to the loaned byte count.
  */
 void
 arc_loan_inuse_buf(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	/* Take the loan reference first, then drop the caller's reference. */
 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
 
 	arc_loaned_bytes_update(arc_buf_size(buf));
 }
 
 /*
  * Queue an ABD (with its size and contents type) on the
  * l2arc_free_on_write list instead of freeing it right away.
  */
 static void
 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
 {
 	l2arc_data_free_t *entry;
 
 	entry = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
 	entry->l2df_type = type;
 	entry->l2df_size = size;
 	entry->l2df_abd = abd;
 
 	/* The list is only touched under l2arc_free_on_write_mtx. */
 	mutex_enter(&l2arc_free_on_write_mtx);
 	list_insert_head(l2arc_free_on_write, entry);
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * Give up the accounting for one of the hdr's ABDs (b_rabd when free_rdata
  * is set, b_pabd otherwise) and queue that ABD via
  * l2arc_free_abd_on_write() rather than freeing it here.
  */
 static void
 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	uint64_t size;
 
 	if (free_rdata)
 		size = HDR_GET_PSIZE(hdr);
 	else
 		size = arc_hdr_size(hdr);
 
 	/* protected by hash lock, if in the hash table */
 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT(state != arc_anon && state != arc_l2c_only);
 
 		/* Still on a state list: drop the evictable size as well. */
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    size, hdr);
 	}
 	(void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr);
 
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		arc_space_return(size, ARC_SPACE_DATA);
 	} else {
 		arc_space_return(size, ARC_SPACE_META);
 	}
 
 	abd_t *abd = free_rdata ?
 	    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
 	l2arc_free_abd_on_write(abd, size, type);
 }
 
 /*
  * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
  * data buffer, we transfer the refcount ownership to the hdr and update
  * the appropriate kstats.
  *
  * On entry the hdr must have no b_pabd, the buf must satisfy
  * arc_can_share() and must not be encrypted. On return the hdr's b_pabd
  * is an ABD built over buf->b_data, and both the hdr (ARC_FLAG_SHARED_DATA)
  * and the buf (ARC_BUF_FLAG_SHARED) are marked as shared.
  */
 static void
 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(arc_can_share(hdr, buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Start sharing the data buffer. We transfer the
 	 * refcount ownership to the hdr since it always owns
 	 * the refcount whenever an arc_buf_t is shared.
 	 */
 	zfs_refcount_transfer_ownership_many(
 	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
 	    arc_hdr_size(hdr), buf, hdr);
 	/* Wrap the buf's existing data in an ABD owned by the hdr. */
 	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
 	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
 	    HDR_ISTYPE_METADATA(hdr));
 	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	buf->b_flags |= ARC_BUF_FLAG_SHARED;
 
 	/*
 	 * Since we've transferred ownership to the hdr we need
 	 * to increment its compressed and uncompressed kstats and
 	 * decrement the overhead size.
 	 */
 	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
 	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
 }
 
 /*
  * Stop sharing the data buffer between the hdr and the arc_buf_t. This is
  * the inverse of arc_share_buf(): the refcount ownership moves from the hdr
  * back to the buf, the hdr's b_pabd is released and freed (leaving b_pabd
  * NULL), the shared flags are cleared on both the hdr and the buf, and the
  * kstats adjusted by arc_share_buf() are reverted.
  */
 static void
 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(arc_buf_is_shared(buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * We are no longer sharing this buffer so we need
 	 * to transfer its ownership to the rightful owner.
 	 */
 	zfs_refcount_transfer_ownership_many(
 	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
 	    arc_hdr_size(hdr), hdr, buf);
 	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 	/*
 	 * NOTE(review): ownership of the underlying buffer is released
 	 * before the ABD is freed, presumably so that abd_free() leaves
 	 * buf->b_data intact -- confirm against the ABD implementation.
 	 */
 	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
 	abd_free(hdr->b_l1hdr.b_pabd);
 	hdr->b_l1hdr.b_pabd = NULL;
 	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
 
 	/*
 	 * Since the buffer is no longer shared between
 	 * the arc buf and the hdr, count it as overhead.
 	 */
 	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
 	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
 }
 
 /*
  * Unlink buf from the hdr's buf list and report which arc_buf_t is now
  * the last one on that list. Returns NULL when the list ends up empty.
  * buf->b_next is cleared before returning.
  */
 static arc_buf_t *
 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	arc_buf_t **linkp = &hdr->b_l1hdr.b_buf;
 	arc_buf_t *lastbuf = NULL;
 
 	/*
 	 * Walk the whole list through the link pointers: splice buf out
 	 * when we meet it, and remember the last surviving element.
 	 */
 	for (;;) {
 		arc_buf_t *cur = *linkp;
 
 		if (cur == NULL)
 			break;
 
 		if (cur == buf) {
 			/* Bypass buf; its successor takes its place. */
 			cur = buf->b_next;
 			*linkp = cur;
 		}
 
 		/*
 		 * Whatever now sits in this slot (if anything) is a
 		 * remaining buffer: record it and step to its link.
 		 */
 		if (cur != NULL) {
 			lastbuf = cur;
 			linkp = &cur->b_next;
 		}
 	}
 
 	buf->b_next = NULL;
 	ASSERT3P(lastbuf, !=, buf);
 	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
 
 	return (lastbuf);
 }
 
 /*
  * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
  * list and free it.
  *
  * Besides releasing the buf itself this may also: free the hdr's b_rabd
  * once no encrypted buf remains, hand the shared data over to the new last
  * buf on the list, and free the hdr's checksum when no uncompressed buf is
  * left. The buf is returned to buf_cache and must not be used afterwards.
  */
 static void
 arc_buf_destroy_impl(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * Free up the data associated with the buf but only if we're not
 	 * sharing this with the hdr. If we are sharing it with the hdr, the
 	 * hdr is responsible for doing the free.
 	 */
 	if (buf->b_data != NULL) {
 		/*
 		 * We're about to change the hdr's b_flags. We must either
 		 * hold the hash_lock or be undiscoverable.
 		 */
 		ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
 		if (ARC_BUF_SHARED(buf)) {
 			/*
 			 * Only the hdr flag is cleared here; the buf keeps
 			 * ARC_BUF_FLAG_SHARED, which is tested again below.
 			 */
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 		} else {
 			ASSERT(!arc_buf_is_shared(buf));
 			uint64_t size = arc_buf_size(buf);
 			arc_free_data_buf(hdr, buf->b_data, size, buf);
 			ARCSTAT_INCR(arcstat_overhead_size, -size);
 		}
 		buf->b_data = NULL;
 
 		/*
 		 * If we have no more encrypted buffers and we've already
 		 * gotten a copy of the decrypted data we can free b_rabd
 		 * to save some space.
 		 */
 		if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) &&
 		    hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
 			arc_buf_t *b;
 			/* Look for any other encrypted buf on the list. */
 			for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) {
 				if (b != buf && ARC_BUF_ENCRYPTED(b))
 					break;
 			}
 			/* b == NULL: the walk finished without finding one. */
 			if (b == NULL)
 				arc_hdr_free_abd(hdr, B_TRUE);
 		}
 	}
 
 	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
 
 	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 		/*
 		 * If the current arc_buf_t is sharing its data buffer with the
 		 * hdr, then reassign the hdr's b_pabd to share it with the new
 		 * buffer at the end of the list. The shared buffer is always
 		 * the last one on the hdr's buffer list.
 		 *
 		 * There is an equivalent case for compressed bufs, but since
 		 * they aren't guaranteed to be the last buf in the list and
 		 * that is an exceedingly rare case, we just allow that space be
 		 * wasted temporarily. We must also be careful not to share
 		 * encrypted buffers, since they cannot be shared.
 		 */
 		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
 			/* Only one buf can be shared at once */
 			ASSERT(!arc_buf_is_shared(lastbuf));
 			/* hdr is uncompressed so can't have compressed buf */
 			ASSERT(!ARC_BUF_COMPRESSED(lastbuf));
 
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			arc_hdr_free_abd(hdr, B_FALSE);
 
 			/*
 			 * We must setup a new shared block between the
 			 * last buffer and the hdr. The data would have
 			 * been allocated by the arc buf so we need to transfer
 			 * ownership to the hdr since it's now being shared.
 			 */
 			arc_share_buf(hdr, lastbuf);
 		}
 	} else if (HDR_SHARED_DATA(hdr)) {
 		/*
 		 * Uncompressed shared buffers are always at the end
 		 * of the list. Compressed buffers don't have the
 		 * same requirements. This makes it hard to
 		 * simply assert that the lastbuf is shared so
 		 * we rely on the hdr's compression flags to determine
 		 * if we have a compressed, shared buffer.
 		 */
 		ASSERT3P(lastbuf, !=, NULL);
 		ASSERT(arc_buf_is_shared(lastbuf) ||
 		    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	}
 
 	/*
 	 * Free the checksum if we're removing the last uncompressed buf from
 	 * this hdr.
 	 */
 	if (!arc_hdr_has_uncompressed_buf(hdr)) {
 		arc_cksum_free(hdr);
 	}
 
 	/* clean up the buf */
 	buf->b_hdr = NULL;
 	kmem_cache_free(buf_cache, buf);
 }
 
 /*
  * Allocate one of the hdr's ABDs through arc_get_data_abd().
  *
  * When ARC_HDR_ALLOC_RDATA is set in alloc_flags the raw ABD (b_rabd) is
  * allocated with the hdr's psize, which requires a protected hdr; otherwise
  * the physical ABD (b_pabd) is allocated with arc_hdr_size(hdr). The
  * target pointer must be NULL on entry. The compressed/uncompressed size
  * kstats (and arcstat_raw_size for the raw case) are bumped accordingly.
  */
 static void
 arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
 {
 	uint64_t size;
 	boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
 
 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	/* A hdr that is sharing its data may only gain a raw ABD. */
 	ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
 	IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
 
 	if (alloc_rdata) {
 		size = HDR_GET_PSIZE(hdr);
 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
 		hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
 		    alloc_flags);
 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
 		ARCSTAT_INCR(arcstat_raw_size, size);
 	} else {
 		size = arc_hdr_size(hdr);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
 		    alloc_flags);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	}
 
 	/* Both kinds of ABD count toward these two statistics. */
 	ARCSTAT_INCR(arcstat_compressed_size, size);
 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Release one of the hdr's ABDs: the raw ABD (b_rabd) when free_rdata is
  * set, the physical ABD (b_pabd) otherwise. This is the counterpart of
  * arc_hdr_alloc_abd() and reverts the same kstats.
  *
  * The corresponding pointer is always cleared, even when the actual free
  * is deferred because of an in-progress L2ARC write.
  */
 static void
 arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
 {
 	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 	IMPLY(free_rdata, HDR_HAS_RABD(hdr));
 
 	/*
 	 * If the hdr is currently being written to the l2arc then
 	 * we defer freeing the data by adding it to the l2arc_free_on_write
 	 * list. The l2arc will free the data once it's finished
 	 * writing it to the l2arc device.
 	 */
 	if (HDR_L2_WRITING(hdr)) {
 		arc_hdr_free_on_write(hdr, free_rdata);
 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
 	} else if (free_rdata) {
 		arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
 	} else {
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
 	}
 
 	if (free_rdata) {
 		hdr->b_crypt_hdr.b_rabd = NULL;
 		ARCSTAT_INCR(arcstat_raw_size, -size);
 	} else {
 		hdr->b_l1hdr.b_pabd = NULL;
 	}
 
 	/* With no ABD left, reset the byteswap setting to "none". */
 	if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 
 	ARCSTAT_INCR(arcstat_compressed_size, -size);
 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Allocate empty anonymous ARC header.  The header will get its identity
  * assigned and buffers attached later as part of read or write operations.
  *
  * In case of read arc_read() assigns header its identity (b_dva + b_birth),
  * inserts it into ARC hash to become globally visible and allocates physical
  * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk.  On disk read
  * completion arc_read_done() allocates ARC buffer(s) as needed, potentially
  * sharing one of them with the physical ABD buffer.
  *
  * In case of write arc_alloc_buf() allocates ARC buffer to be filled with
  * data.  Then after compression and/or encryption arc_write_ready() allocates
  * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD
  * buffer.  On disk write completion arc_write_done() assigns the header its
  * new identity (b_dva + b_birth) and inserts into ARC hash.
  *
  * In case of partial overwrite the old data is read first as described. Then
  * arc_release() either allocates new anonymous ARC header and moves the ARC
  * buffer to it, or reuses the old ARC header by discarding its identity and
  * removing it from ARC hash.  After buffer modification normal write process
  * follows as described.
  *
  *	spa			value stored in the header's b_spa
  *	psize, lsize		physical and logical sizes recorded in the hdr
  *	protected		when set, the hdr gets ARC_FLAG_PROTECTED
  *	compression_type,
  *	complevel		compression settings recorded in the hdr
  *	type			ARC_BUFC_DATA or ARC_BUFC_METADATA (verified)
  *
  * The returned header is a full (L1) header in the arc_anon state with an
  * empty buf list and zeroed access/hit counters.
  */
 static arc_buf_hdr_t *
 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
     boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
     arc_buf_contents_t type)
 {
 	arc_buf_hdr_t *hdr;
 
 	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
 	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
 
 	/* A header coming out of the cache is expected to be empty. */
 	ASSERT(HDR_EMPTY(hdr));
 #ifdef ZFS_DEBUG
 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 	HDR_SET_PSIZE(hdr, psize);
 	HDR_SET_LSIZE(hdr, lsize);
 	hdr->b_spa = spa;
 	hdr->b_type = type;
 	/* Start from a clean flag word before setting individual flags. */
 	hdr->b_flags = 0;
 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
 	arc_hdr_set_compress(hdr, compression_type);
 	hdr->b_complevel = complevel;
 	if (protected)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 
 	hdr->b_l1hdr.b_state = arc_anon;
 	hdr->b_l1hdr.b_arc_access = 0;
 	hdr->b_l1hdr.b_mru_hits = 0;
 	hdr->b_l1hdr.b_mru_ghost_hits = 0;
 	hdr->b_l1hdr.b_mfu_hits = 0;
 	hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 	hdr->b_l1hdr.b_buf = NULL;
 
 	ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 
 	return (hdr);
 }
 
 /*
  * Transition between the two allocation states for the arc_buf_hdr struct.
  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
  * version is used when a cache buffer is only in the L2ARC in order to reduce
  * memory usage.
  *
  *	hdr	header to convert; it must have an L2 header and the caller
  *		must hold its hash lock
  *	old	cache that hdr was allocated from
  *	new	cache to allocate the replacement from
  *
  * Exactly one of old/new must be hdr_full_cache and the other
  * hdr_l2only_cache. Returns the replacement header, which takes hdr's place
  * in the hash table, on the L2ARC device's buflist and in the device's
  * l2ad_alloc refcount. hdr itself is freed back to "old", so the caller
  * must not use it afterwards.
  */
 static arc_buf_hdr_t *
 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
 {
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	arc_buf_hdr_t *nhdr;
 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 
 	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
 	    (old == hdr_l2only_cache && new == hdr_full_cache));
 
 	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	buf_hash_remove(hdr);
 
 	/* Only the first HDR_L2ONLY_SIZE bytes are carried over. */
 	memcpy(nhdr, hdr, HDR_L2ONLY_SIZE);
 
 	if (new == hdr_full_cache) {
 		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 		/*
 		 * arc_access and arc_change_state need to be aware that a
 		 * header has just come out of L2ARC, so we set its state to
 		 * l2c_only even though it's about to change.
 		 */
 		nhdr->b_l1hdr.b_state = arc_l2c_only;
 
 		/* Verify previous threads set to NULL before freeing */
 		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 	} else {
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 #ifdef ZFS_DEBUG
 		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 
 		/*
 		 * If we've reached here, We must have been called from
 		 * arc_evict_hdr(), as such we should have already been
 		 * removed from any ghost list we were previously on
 		 * (which protects us from racing with arc_evict_state),
 		 * thus no locking is needed during this check.
 		 */
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
 		/*
 		 * A buffer must not be moved into the arc_l2c_only
 		 * state if it's not finished being written out to the
 		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
 		 * might try to be accessed, even though it was removed.
 		 */
 		VERIFY(!HDR_L2_WRITING(hdr));
 		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 
 		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 	}
 	/*
 	 * The header has been reallocated so we need to re-insert it into any
 	 * lists it was on.
 	 */
 	(void) buf_hash_insert(nhdr, NULL);
 
 	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
 
 	mutex_enter(&dev->l2ad_mtx);
 
 	/*
 	 * We must place the realloc'ed header back into the list at
 	 * the same spot. Otherwise, if it's placed earlier in the list,
 	 * l2arc_write_buffers() could find it during the function's
 	 * write phase, and try to write it out to the l2arc.
 	 */
 	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
 	list_remove(&dev->l2ad_buflist, hdr);
 
 	mutex_exit(&dev->l2ad_mtx);
 
 	/*
 	 * Since we're using the pointer address as the tag when
 	 * incrementing and decrementing the l2ad_alloc refcount, we
 	 * must remove the old pointer (that we're about to destroy) and
 	 * add the new pointer to the refcount. Otherwise we'd remove
 	 * the wrong pointer address when calling arc_hdr_destroy() later.
 	 */
 
 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
 	    arc_hdr_size(hdr), hdr);
 	(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 	    arc_hdr_size(nhdr), nhdr);
 
 	/* The old header is done: wipe its identity and free it. */
 	buf_discard_identity(hdr);
 	kmem_cache_free(old, hdr);
 
 	return (nhdr);
 }
 
 /*
  * Turn a newly allocated arc_buf_t into one suitable for a raw encrypted
  * write. The send / receive code uses this, and it is also what allows the
  * root objset block to be updated without altering its embedded MACs. Both
  * kinds of block are always uncompressed, so compression type and psize
  * need no attention here.
  *
  * salt, iv and mac are each optional; a NULL pointer leaves the
  * corresponding field of the hdr untouched.
  */
 void
 arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
     dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
     const uint8_t *mac)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 
 	/* Mark the buf as raw and the hdr as protected. */
 	buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
 	arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 
 	hdr->b_crypt_hdr.b_dsobj = dsobj;
 	hdr->b_crypt_hdr.b_ot = ot;
 
 	/* Host byte order means no byteswap function is needed. */
 	if (byteorder == ZFS_HOST_BYTEORDER)
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 	else
 		hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(ot);
 
 	if (!arc_hdr_has_uncompressed_buf(hdr))
 		arc_cksum_free(hdr);
 
 	/* Copy whichever encryption parameters the caller supplied. */
 	if (salt != NULL)
 		memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
 	if (iv != NULL)
 		memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
 	if (mac != NULL)
 		memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
 }
 
 /*
  * Create a fresh arc_buf_hdr_t together with one arc_buf_t and hand the
  * buf back to the caller. Since the consumer is expected to modify the
  * buf, it is returned thawed.
  */
 arc_buf_t *
 arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type,
     int32_t size)
 {
 	arc_buf_t *buf = NULL;
 	arc_buf_hdr_t *hdr;
 
 	/* Uncompressed and unprotected: psize and lsize are both "size". */
 	hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
 	    B_FALSE, ZIO_COMPRESS_OFF, 0, type);
 
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
 	    B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	return (buf);
 }
 
 /*
  * Compressed counterpart of arc_alloc_buf(). Not to be used for bufs that
  * contain metadata.
  */
 arc_buf_t *
 arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize,
     uint64_t lsize, enum zio_compress compression_type, uint8_t complevel)
 {
 	ASSERT3U(lsize, >, 0);
 	ASSERT3U(lsize, >=, psize);
 	ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
 
 	arc_buf_t *buf = NULL;
 	arc_buf_hdr_t *hdr;
 
 	hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
 	    B_FALSE, compression_type, complevel, ARC_BUFC_DATA);
 
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
 	    B_TRUE, B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	/*
 	 * Sharing the buf with the hdr right away is the simplest way to
 	 * make sure the hdr holds the correct data should
 	 * arc_untransform() be called on this buf before it has been
 	 * written to disk.
 	 */
 	arc_share_buf(hdr, buf);
 
 	return (buf);
 }
 
 /*
  * Raw (encrypted) counterpart of arc_alloc_buf(): allocate a protected
  * hdr carrying the given encryption parameters plus one encrypted buf,
  * and return the buf thawed. salt, iv and mac are copied unconditionally.
  */
 arc_buf_t *
 arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj,
     boolean_t byteorder, const uint8_t *salt, const uint8_t *iv,
     const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_contents_t type;
 
 	if (DMU_OT_IS_METADATA(ot))
 		type = ARC_BUFC_METADATA;
 	else
 		type = ARC_BUFC_DATA;
 
 	ASSERT3U(lsize, >, 0);
 	ASSERT3U(lsize, >=, psize);
 	ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
 
 	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
 	    B_TRUE, compression_type, complevel, type);
 
 	hdr->b_crypt_hdr.b_dsobj = dsobj;
 	hdr->b_crypt_hdr.b_ot = ot;
 
 	/* Host byte order means no byteswap function is needed. */
 	if (byteorder == ZFS_HOST_BYTEORDER)
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 	else
 		hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(ot);
 
 	memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
 	memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
 	memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
 
 	/*
 	 * The buffer is treated as encrypted even when ot is not an
 	 * encrypted type; in that case it becomes authenticated instead
 	 * in arc_write_ready().
 	 */
 	arc_buf_t *buf = NULL;
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
 	    B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	return (buf);
 }
 
 /*
  * Add (incr set) or subtract (incr clear) the hdr's sizes to/from the L2ARC
  * statistics. With state_only set, only the prefetch/MRU/MFU asize
  * statistic is touched; otherwise the psize, lsize and per-contents-type
  * asize statistics are updated as well.
  */
 static void
 l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only)
 {
 	uint64_t lsize = HDR_GET_LSIZE(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	uint64_t asize = HDR_GET_L2SIZE(hdr);
 	arc_buf_contents_t type = hdr->b_type;
 
 	/* For L2 we expect the header's b_l2size to be valid */
 	ASSERT3U(asize, >=, psize);
 
 	/* Signed deltas: positive when incrementing, negative otherwise. */
 	int64_t lsize_s = incr ? lsize : -lsize;
 	int64_t psize_s = incr ? psize : -psize;
 	int64_t asize_s = incr ? asize : -asize;
 
 	if (!HDR_PREFETCH(hdr)) {
 		/*
 		 * The state consulted here is the one kept in the L2
 		 * header, recorded upon initial caching in L2ARC. It is
 		 * updated when an MRU/MRU_ghost buffer transitions to MFU,
 		 * but the L2ARC metadata (log entry) cannot currently be
 		 * updated. Keeping the ARC state in the L2 header solves
 		 * the problem of a possibly absent L1 header (apparent in
 		 * buffers restored from persistent L2ARC).
 		 */
 		switch (hdr->b_l2hdr.b_arcs_state) {
 		case ARC_STATE_MRU_GHOST:
 		case ARC_STATE_MRU:
 			ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
 			break;
 		case ARC_STATE_MFU_GHOST:
 		case ARC_STATE_MFU:
 			ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
 			break;
 		default:
 			break;
 		}
 	} else {
 		/* If the buffer is a prefetch, count it as such. */
 		ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
 	}
 
 	if (state_only)
 		return;
 
 	ARCSTAT_INCR(arcstat_l2_psize, psize_s);
 	ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
 
 	/* Any other contents type has no dedicated statistic. */
 	if (type == ARC_BUFC_DATA) {
 		ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
 	} else if (type == ARC_BUFC_METADATA) {
 		ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
 	}
 }
 
 
 /*
  * Detach the L2ARC portion of 'hdr' from its device: unlink it from the
  * device's buffer list, undo its statistics and space accounting, drop
  * its l2ad_alloc reference and clear ARC_FLAG_HAS_L2HDR.
  *
  * The caller must hold the device's l2ad_mtx and the header must
  * currently have an L2 header (both asserted).
  */
 static void
 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	l2arc_dev_t *l2dev = hdr->b_l2hdr.b_dev;
 
 	ASSERT(MUTEX_HELD(&l2dev->l2ad_mtx));
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	list_remove(&l2dev->l2ad_buflist, hdr);
 	l2arc_hdr_arcstats_decrement(hdr);
 
 	/* Give the space back to the vdev, if the device still has one. */
 	if (l2dev->l2ad_vdev != NULL) {
 		const uint64_t l2size = HDR_GET_L2SIZE(hdr);
 
 		vdev_space_update(l2dev->l2ad_vdev, -l2size, 0, 0);
 	}
 
 	(void) zfs_refcount_remove_many(&l2dev->l2ad_alloc,
 	    arc_hdr_size(hdr), hdr);
 	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
 }
 
 /*
  * Destroy a buffer header and return it to its kmem cache.
  *
  * The header must have no I/O in progress and must already be out of the
  * hash table.  If it has an L1 portion, its reference count must be zero
  * and it must be in the arc_anon state (all asserted).
  *
  * Any L2 portion is torn down under the owning device's l2ad_mtx, the
  * header's identity is discarded, and all L1 resources (checksum, bufs,
  * b_pabd, raw abd) are released before the header itself is freed.  The
  * header must not be touched by the caller after this returns.
  */
 static void
 arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	if (HDR_HAS_L1HDR(hdr)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	}
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 		/*
 		 * Remember whether this thread already owned l2ad_mtx on
 		 * entry so it is only taken and dropped here when needed.
 		 */
 		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
 
 		if (!buflist_held)
 			mutex_enter(&dev->l2ad_mtx);
 
 		/*
 		 * Even though we checked this conditional above, we
 		 * need to check this again now that we have the
 		 * l2ad_mtx. This is because we could be racing with
 		 * another thread calling l2arc_evict() which might have
 		 * destroyed this header's L2 portion as we were waiting
 		 * to acquire the l2ad_mtx. If that happens, we don't
 		 * want to re-destroy the header's L2 portion.
 		 */
 		if (HDR_HAS_L2HDR(hdr)) {
 
 			if (!HDR_EMPTY(hdr))
 				buf_discard_identity(hdr);
 
 			arc_hdr_l2hdr_destroy(hdr);
 		}
 
 		if (!buflist_held)
 			mutex_exit(&dev->l2ad_mtx);
 	}
 
 	/*
 	 * The header's identity can only be safely discarded once it is no
 	 * longer discoverable.  This requires removing it from the hash table
 	 * and the l2arc header list.  After this point the hash lock can not
 	 * be used to protect the header.
 	 */
 	if (!HDR_EMPTY(hdr))
 		buf_discard_identity(hdr);
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		arc_cksum_free(hdr);
 
 		/* Destroy every buf still chained off the header. */
 		while (hdr->b_l1hdr.b_buf != NULL)
 			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
 
 		if (hdr->b_l1hdr.b_pabd != NULL)
 			arc_hdr_free_abd(hdr, B_FALSE);
 
 		if (HDR_HAS_RABD(hdr))
 			arc_hdr_free_abd(hdr, B_TRUE);
 	}
 
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
 	/* Free to whichever cache matches the header's current layout. */
 	if (HDR_HAS_L1HDR(hdr)) {
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 #ifdef ZFS_DEBUG
 		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 		kmem_cache_free(hdr_full_cache, hdr);
 	} else {
 		kmem_cache_free(hdr_l2only_cache, hdr);
 	}
 }
 
 /*
  * Release the caller's hold (identified by 'tag') on 'buf'.
  *
  * For a header in any state other than arc_anon, the buf is destroyed
  * and the reference dropped while holding the header's hash lock.  For an
  * anonymous header no hash lock is taken; 'buf' must be the header's
  * only buf and remove_reference() is required to return zero.
  */
 void
 arc_buf_destroy(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	if (hdr->b_l1hdr.b_state != arc_anon) {
 		kmutex_t *lock = HDR_LOCK(hdr);
 
 		mutex_enter(lock);
 
 		/* Re-validate everything now that the lock is held. */
 		ASSERT3P(hdr, ==, buf->b_hdr);
 		ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 		ASSERT3P(lock, ==, HDR_LOCK(hdr));
 		ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
 		ASSERT3P(buf->b_data, !=, NULL);
 
 		arc_buf_destroy_impl(buf);
 		(void) remove_reference(hdr, tag);
 
 		mutex_exit(lock);
 		return;
 	}
 
 	/* Anonymous header: 'buf' is its first and last buf. */
 	ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
 	ASSERT(ARC_BUF_LAST(buf));
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	VERIFY0(remove_reference(hdr, tag));
 }
 
 /*
  * Evict the arc_buf_hdr that is provided as a parameter. The resultant
  * state of the header is dependent on its state prior to entering this
  * function. The following transitions are possible:
  *
  *    - arc_mru -> arc_mru_ghost
  *    - arc_mfu -> arc_mfu_ghost
  *    - arc_mru_ghost -> arc_l2c_only
  *    - arc_mru_ghost -> deleted
  *    - arc_mfu_ghost -> arc_l2c_only
  *    - arc_mfu_ghost -> deleted
  *    - arc_uncached -> deleted
  *
  * Return total size of evicted data buffers for eviction progress tracking.
  * When evicting from ghost states return logical buffer size to make eviction
  * progress at the same (or at least comparable) rate as from non-ghost states.
  *
  * Return *real_evicted for actual ARC size reduction to wake up threads
  * waiting for it.  For non-ghost states it includes size of evicted data
  * buffers (the headers are not freed there).  For ghost states it includes
  * only the evicted headers size.
  *
  * The caller must hold the header's hash lock, and the header must have an
  * L1 portion, no I/O in progress, no bufs and a zero reference count (all
  * asserted).  *real_evicted is reset to zero on entry.
  *
  * Zero is returned, and the header is left untouched, when eviction is
  * skipped: a ghost header that is still being written to the L2ARC, or a
  * prefetch/indirect header that has not yet reached its minimum lifetime.
  *
  * When the header is deleted it is freed by arc_hdr_destroy() and must not
  * be referenced by the caller afterwards.
  */
 static int64_t
 arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
 {
 	arc_state_t *evicted_state, *state;
 	int64_t bytes_evicted = 0;
 	/* Prescient prefetches get their own minimum lifetime tunable. */
 	uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
 	    arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 	ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
 
 	*real_evicted = 0;
 	state = hdr->b_l1hdr.b_state;
 	if (GHOST_STATE(state)) {
 
 		/*
 		 * l2arc_write_buffers() relies on a header's L1 portion
 		 * (i.e. its b_pabd field) during its write phase.
 		 * Thus, we cannot push a header onto the arc_l2c_only
 		 * state (removing its L1 piece) until the header is
 		 * done being written to the l2arc.
 		 */
 		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
 			ARCSTAT_BUMP(arcstat_evict_l2_skip);
 			return (bytes_evicted);
 		}
 
 		ARCSTAT_BUMP(arcstat_deleted);
 		/* Ghost headers hold no data; report the logical size. */
 		bytes_evicted += HDR_GET_LSIZE(hdr);
 
 		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
 
 		if (HDR_HAS_L2HDR(hdr)) {
 			ASSERT(hdr->b_l1hdr.b_pabd == NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 			/*
 			 * This buffer is cached on the 2nd Level ARC;
 			 * don't destroy the header.
 			 */
 			arc_change_state(arc_l2c_only, hdr);
 			/*
 			 * dropping from L1+L2 cached to L2-only,
 			 * realloc to remove the L1 header.
 			 */
 			(void) arc_hdr_realloc(hdr, hdr_full_cache,
 			    hdr_l2only_cache);
 			*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
 		} else {
 			arc_change_state(arc_anon, hdr);
 			arc_hdr_destroy(hdr);
 			*real_evicted += HDR_FULL_SIZE;
 		}
 		return (bytes_evicted);
 	}
 
 	ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached);
 	/* Uncached headers are deleted; MRU/MFU move to their ghost list. */
 	evicted_state = (state == arc_uncached) ? arc_anon :
 	    ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost);
 
 	/* prefetch buffers have a minimum lifespan */
 	if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
 	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
 	    MSEC_TO_TICK(min_lifetime)) {
 		ARCSTAT_BUMP(arcstat_evict_skip);
 		return (bytes_evicted);
 	}
 
 	/* Record how this eviction relates to the L2ARC. */
 	if (HDR_HAS_L2HDR(hdr)) {
 		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
 	} else {
 		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
 			ARCSTAT_INCR(arcstat_evict_l2_eligible,
 			    HDR_GET_LSIZE(hdr));
 
 			switch (state->arcs_state) {
 				case ARC_STATE_MRU:
 					ARCSTAT_INCR(
 					    arcstat_evict_l2_eligible_mru,
 					    HDR_GET_LSIZE(hdr));
 					break;
 				case ARC_STATE_MFU:
 					ARCSTAT_INCR(
 					    arcstat_evict_l2_eligible_mfu,
 					    HDR_GET_LSIZE(hdr));
 					break;
 				default:
 					break;
 			}
 		} else {
 			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
 			    HDR_GET_LSIZE(hdr));
 		}
 	}
 
 	bytes_evicted += arc_hdr_size(hdr);
 	*real_evicted += arc_hdr_size(hdr);
 
 	/*
 	 * If this hdr is being evicted and has a compressed buffer then we
 	 * discard it here before we change states.  This ensures that the
 	 * accounting is updated correctly in arc_free_data_impl().
 	 */
 	if (hdr->b_l1hdr.b_pabd != NULL)
 		arc_hdr_free_abd(hdr, B_FALSE);
 
 	if (HDR_HAS_RABD(hdr))
 		arc_hdr_free_abd(hdr, B_TRUE);
 
 	arc_change_state(evicted_state, hdr);
 	DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
 	if (evicted_state == arc_anon) {
 		/* Uncached header: nothing keeps it alive, free it too. */
 		arc_hdr_destroy(hdr);
 		*real_evicted += HDR_FULL_SIZE;
 	} else {
 		ASSERT(HDR_IN_HASH_TABLE(hdr));
 	}
 
 	return (bytes_evicted);
 }
 
 /*
  * Recompute arc_need_free.  It becomes the larger of the free-memory
  * shortfall relative to arc_sys_free / 2 and the amount the last queued
  * eviction waiter is still waiting for; with no waiters the second term
  * is zero.  The caller must hold arc_evict_lock (asserted).
  */
 static void
 arc_set_need_free(void)
 {
 	ASSERT(MUTEX_HELD(&arc_evict_lock));
 	int64_t remaining = arc_free_memory() - arc_sys_free / 2;
 	arc_evict_waiter_t *last = list_tail(&arc_evict_waiters);
 	int64_t floor = 0;
 
 	/* The tail waiter has the highest target count on the list. */
 	if (last != NULL)
 		floor = (int64_t)(last->aew_count - arc_evict_count);
 
 	arc_need_free = MAX(-remaining, floor);
 }
 
 /*
  * Evict headers from a single sublist of 'ml', walking backwards from
  * 'marker' (which the caller has already inserted into sublist 'idx').
  *
  * The walk stops when the sublist is exhausted, when 'bytes' bytes have
  * been evicted, or when zfs_arc_evict_batch_limit headers have been
  * evicted.  Headers belonging to another spa (when 'spa' is non-zero),
  * marker headers (b_spa == 0), and headers whose hash lock cannot be taken
  * without blocking are skipped.
  *
  * After the walk, arc_evict_count is advanced by the real amount freed and
  * eviction waiters whose target has been reached are woken, provided free
  * memory is above arc_sys_free / 2.
  *
  * Returns the sum of the values returned by arc_evict_hdr().
  */
 static uint64_t
 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
     uint64_t spa, uint64_t bytes)
 {
 	multilist_sublist_t *mls;
 	uint64_t bytes_evicted = 0, real_evicted = 0;
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	/* Remaining number of headers this call may evict. */
 	uint_t evict_count = zfs_arc_evict_batch_limit;
 
 	ASSERT3P(marker, !=, NULL);
 
 	mls = multilist_sublist_lock_idx(ml, idx);
 
 	for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
 	    hdr = multilist_sublist_prev(mls, marker)) {
 		if ((evict_count == 0) || (bytes_evicted >= bytes))
 			break;
 
 		/*
 		 * To keep our iteration location, move the marker
 		 * forward. Since we're not holding hdr's hash lock, we
 		 * must be very careful and not remove 'hdr' from the
 		 * sublist. Otherwise, other consumers might mistake the
 		 * 'hdr' as not being on a sublist when they call the
 		 * multilist_link_active() function (they all rely on
 		 * the hash lock protecting concurrent insertions and
 		 * removals). multilist_sublist_move_forward() was
 		 * specifically implemented to ensure this is the case
 		 * (only 'marker' will be removed and re-inserted).
 		 */
 		multilist_sublist_move_forward(mls, marker);
 
 		/*
 		 * The only case where the b_spa field should ever be
 		 * zero, is the marker headers inserted by
 		 * arc_evict_state(). It's possible for multiple threads
 		 * to be calling arc_evict_state() concurrently (e.g.
 		 * dsl_pool_close() and zio_inject_fault()), so we must
 		 * skip any markers we see from these other threads.
 		 */
 		if (hdr->b_spa == 0)
 			continue;
 
 		/* we're only interested in evicting buffers of a certain spa */
 		if (spa != 0 && hdr->b_spa != spa) {
 			ARCSTAT_BUMP(arcstat_evict_skip);
 			continue;
 		}
 
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We aren't calling this function from any code path
 		 * that would already be holding a hash lock, so we're
 		 * asserting on this assumption to be defensive in case
 		 * this ever changes. Without this check, it would be
 		 * possible to incorrectly increment arcstat_mutex_miss
 		 * below (e.g. if the code changed such that we called
 		 * this function with a hash lock held).
 		 */
 		ASSERT(!MUTEX_HELD(hash_lock));
 
 		/* Never block on a hash lock while holding the sublist lock. */
 		if (mutex_tryenter(hash_lock)) {
 			uint64_t revicted;
 			uint64_t evicted = arc_evict_hdr(hdr, &revicted);
 			mutex_exit(hash_lock);
 
 			bytes_evicted += evicted;
 			real_evicted += revicted;
 
 			/*
 			 * If evicted is zero, arc_evict_hdr() must have
 			 * decided to skip this header, don't increment
 			 * evict_count in this case.
 			 */
 			if (evicted != 0)
 				evict_count--;
 
 		} else {
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 		}
 	}
 
 	multilist_sublist_unlock(mls);
 
 	/*
 	 * Increment the count of evicted bytes, and wake up any threads that
 	 * are waiting for the count to reach this value.  Since the list is
 	 * ordered by ascending aew_count, we pop off the beginning of the
 	 * list until we reach the end, or a waiter that's past the current
 	 * "count".  Doing this outside the loop reduces the number of times
 	 * we need to acquire the global arc_evict_lock.
 	 *
 	 * Only wake when there's sufficient free memory in the system
 	 * (specifically, arc_sys_free/2, which by default is a bit more than
 	 * 1/64th of RAM).  See the comments in arc_wait_for_eviction().
 	 */
 	mutex_enter(&arc_evict_lock);
 	arc_evict_count += real_evicted;
 
 	if (arc_free_memory() > arc_sys_free / 2) {
 		arc_evict_waiter_t *aw;
 		while ((aw = list_head(&arc_evict_waiters)) != NULL &&
 		    aw->aew_count <= arc_evict_count) {
 			list_remove(&arc_evict_waiters, aw);
 			cv_broadcast(&aw->aew_cv);
 		}
 	}
 	arc_set_need_free();
 	mutex_exit(&arc_evict_lock);
 
 	/*
 	 * If the ARC size is reduced from arc_c_max to arc_c_min (especially
 	 * if the average cached block is small), eviction can be on-CPU for
 	 * many seconds.  To ensure that other threads that may be bound to
 	 * this CPU are able to make progress, make a voluntary preemption
 	 * call here.
 	 */
 	kpreempt(KPREEMPT_SYNC);
 
 	return (bytes_evicted);
 }
 
 /*
  * Allocate a header from hdr_full_cache to serve as a list marker.
  * Only b_spa is initialized here; it is set to 0, which is the value
  * arc_evict_state_impl() tests for to recognize and skip markers.
  */
 static arc_buf_hdr_t *
 arc_state_alloc_marker(void)
 {
 	arc_buf_hdr_t *placeholder;
 
 	placeholder = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
 	placeholder->b_spa = 0;
 
 	return (placeholder);
 }
 
 /*
  * Return a marker header to hdr_full_cache, the cache that
  * arc_state_alloc_marker() allocates markers from.
  */
 static void
 arc_state_free_marker(arc_buf_hdr_t *marker)
 {
 	kmem_cache_free(hdr_full_cache, marker);
 }
 
 /*
  * Build an array of 'count' freshly allocated marker headers, used as
  * placeholders during arc state eviction.  Release the result with
  * arc_state_free_markers() using the same count.
  */
 static arc_buf_hdr_t **
 arc_state_alloc_markers(int count)
 {
 	arc_buf_hdr_t **array;
 	int slot = 0;
 
 	array = kmem_zalloc(sizeof (*array) * count, KM_SLEEP);
 	while (slot < count) {
 		array[slot] = arc_state_alloc_marker();
 		slot++;
 	}
 
 	return (array);
 }
 
 /*
  * Free each of the 'count' marker headers in 'markers', then the array
  * itself.  'count' must match the value the array was allocated with,
  * since it also determines the size passed to kmem_free().
  */
 static void
 arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 {
 	int slot = 0;
 
 	while (slot < count) {
 		arc_state_free_marker(markers[slot]);
 		slot++;
 	}
 
 	kmem_free(markers, sizeof (*markers) * count);
 }
 
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
  * appropriate evict state.
  *
  * This function makes a "best effort". It skips over any buffers
  * it can't get a hash_lock on, and so, may not catch all candidates.
  * It may also return without evicting as much space as requested.
  *
  * If bytes is specified using the special value ARC_EVICT_ALL, this
  * will evict all available (i.e. unlocked and evictable) buffers from
  * the given arc state; which is used by arc_flush().
  *
  * 'type' selects which of the state's multilists is scanned, and 'spa'
  * is passed through to arc_evict_state_impl() (0 means any spa).
  *
  * Returns the total reported by arc_evict_state_impl() across all scans.
  */
 static uint64_t
 arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
     uint64_t bytes)
 {
 	uint64_t total_evicted = 0;
 	multilist_t *ml = &state->arcs_list[type];
 	int num_sublists;
 	arc_buf_hdr_t **markers;
 
 	num_sublists = multilist_get_num_sublists(ml);
 
 	/*
 	 * If we've tried to evict from each sublist, made some
 	 * progress, but still have not hit the target number of bytes
 	 * to evict, we want to keep trying. The markers allow us to
 	 * pick up where we left off for each individual sublist, rather
 	 * than starting from the tail each time.
 	 */
 	/*
 	 * The eviction thread uses the preallocated
 	 * arc_state_evict_markers array; any other caller allocates a
 	 * private set that is freed before returning.
 	 */
 	if (zthr_iscurthread(arc_evict_zthr)) {
 		markers = arc_state_evict_markers;
 		ASSERT3S(num_sublists, <=, arc_state_evict_marker_count);
 	} else {
 		markers = arc_state_alloc_markers(num_sublists);
 	}
 	/* Park one marker at the tail of every sublist. */
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls;
 
 		mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_insert_tail(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
 
 	/*
 	 * While we haven't hit our target number of bytes to evict, or
 	 * we're evicting all available buffers.
 	 */
 	while (total_evicted < bytes) {
 		int sublist_idx = multilist_get_random_index(ml);
 		uint64_t scan_evicted = 0;
 
 		/*
 		 * Start eviction using a randomly selected sublist,
 		 * this is to try and evenly balance eviction across all
 		 * sublists. Always starting at the same sublist
 		 * (e.g. index 0) would cause evictions to favor certain
 		 * sublists over others.
 		 */
 		for (int i = 0; i < num_sublists; i++) {
 			uint64_t bytes_remaining;
 			uint64_t bytes_evicted;
 
 			if (total_evicted < bytes)
 				bytes_remaining = bytes - total_evicted;
 			else
 				break;
 
 			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
 			    markers[sublist_idx], spa, bytes_remaining);
 
 			scan_evicted += bytes_evicted;
 			total_evicted += bytes_evicted;
 
 			/* we've reached the end, wrap to the beginning */
 			if (++sublist_idx >= num_sublists)
 				sublist_idx = 0;
 		}
 
 		/*
 		 * If we didn't evict anything during this scan, we have
 		 * no reason to believe we'll evict more during another
 		 * scan, so break the loop.
 		 */
 		if (scan_evicted == 0) {
 			/* This isn't possible, let's make that obvious */
 			ASSERT3S(bytes, !=, 0);
 
 			/*
 			 * When bytes is ARC_EVICT_ALL, the only way to
 			 * break the loop is when scan_evicted is zero.
 			 * In that case, we actually have evicted enough,
 			 * so we don't want to increment the kstat.
 			 */
 			if (bytes != ARC_EVICT_ALL) {
 				ASSERT3S(total_evicted, <, bytes);
 				ARCSTAT_BUMP(arcstat_evict_not_enough);
 			}
 
 			break;
 		}
 	}
 
 	/* Pull the markers back out of every sublist. */
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
 	if (markers != arc_state_evict_markers)
 		arc_state_free_markers(markers, num_sublists);
 
 	return (total_evicted);
 }
 
 /*
  * Evict every evictable (i.e. unreferenced) buffer of 'type' from
  * 'state', optionally restricted to one spa, and return the number of
  * bytes evicted.
  *
  * With retry == B_FALSE a single arc_evict_state() pass is made, so
  * buffers skipped because of lock misses may be left behind.
  *
  * With retry == B_TRUE passes are repeated until the state's evictable
  * size for 'type' reads zero.  If other threads can still insert into
  * the state (e.g. the ARC is not shutting down) this may never
  * terminate.
  */
 static uint64_t
 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
     boolean_t retry)
 {
 	uint64_t flushed = 0;
 
 	for (;;) {
 		/* Nothing evictable (left): done. */
 		if (zfs_refcount_count(&state->arcs_esize[type]) == 0)
 			break;
 
 		flushed += arc_evict_state(state, type, spa, ARC_EVICT_ALL);
 
 		/* Single-pass mode stops after the first attempt. */
 		if (!retry)
 			break;
 	}
 
 	return (flushed);
 }
 
 /*
  * Bounded front end to arc_evict_state().  A request that is zero or
  * negative, or that targets a state with nothing evictable of 'type',
  * evicts nothing and returns 0.  Otherwise the request is clamped to
  * the state's evictable size, so we never ask for more than the list
  * can give.  (arc_evict_state() itself takes an unsigned byte count,
  * so a negative value passed there would mean "evict everything".)
  *
  * Returns the number of bytes arc_evict_state() reports as evicted.
  */
 static uint64_t
 arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes)
 {
 	uint64_t delta;
 
 	if (bytes <= 0 || zfs_refcount_count(&state->arcs_esize[type]) <= 0)
 		return (0);
 
 	delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes);
 
 	return (arc_evict_state(state, type, 0, delta));
 }
 
 /*
  * Compute a new value for the fraction 'frac', which is expressed on a
  * scale where (1ULL << 32) is the whole.
  *
  *   total   - ghost state(s) size the hit counts are measured against
  *   up      - ghost hit bytes arguing for a larger fraction
  *   down    - ghost hit bytes arguing for a smaller fraction
  *   balance - scales the decrease (applied as down * 100 / balance);
  *             used to balance metadata vs data
  *
  * 'frac' is returned unchanged when 'total' is below 8 or there were no
  * hits at all.
  */
 static uint64_t
 arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down,
     uint_t balance)
 {
 	const uint64_t hits = up + down;
 
 	if (total < 8 || hits == 0)
 		return (frac);
 
 	/*
 	 * Ghost hits should not exceed the ghost size, but they can get
 	 * close; scale both directions down to cap the adjustment.
 	 */
 	if (hits >= total / 4) {
 		const uint64_t scale = hits / (total / 8);
 
 		up = up / scale;
 		down = down / scale;
 	}
 
 	/* Pick the largest shift (at most 32) that keeps 'total' in range. */
 	int shift = 64 - highbit64(total);
 	if (shift > 32)
 		shift = 32;
 	const uint64_t denom = total >> (32 - shift);
 
 	/* Complement of 'frac' on the same 1ULL << 32 scale. */
 	const uint64_t rest = (1ULL << 32) - frac;
 
 	/* Damp whichever direction pushes an already lopsided fraction. */
 	if (frac >= 4 * rest)
 		up /= frac / (2 * rest + 1);
 	if (rest >= 4 * frac)
 		down /= rest / (2 * frac + 1);
 
 	up = (up << shift) / denom;
 	down = (down << shift) / denom;
 	down = down * 100 / balance;
 
 	return (frac + up - down);
 }
 
 /*
  * Calculate (x * multiplier / divisor) without unnecessary overflows,
  * by scaling the whole quotient and the remainder of x / divisor
  * separately.  'divisor' must be non-zero.
  */
 static uint64_t
 arc_mf(uint64_t x, uint64_t multiplier, uint64_t divisor)
 {
 	const uint64_t whole = x / divisor;
 	const uint64_t rest = x % divisor;
 	const uint64_t scaled_whole = whole * multiplier;
 	const uint64_t scaled_rest = (rest * multiplier) / divisor;
 
 	return (scaled_whole + scaled_rest);
 }
 
 /*
  * Evict buffers from the cache, such that arcstat_size is capped by arc_c.
  *
  * The function first rebalances arc_meta, arc_pd and arc_pm from the ghost
  * hits seen since the previous call, optionally requests asynchronous
  * dnode pruning, then evicts in order: MRU metadata, MFU metadata, MRU
  * data, MFU data, and finally trims the four ghost lists.
  *
  * State is kept across calls in function-local statics, so this is not
  * written to be run by more than one thread at a time.
  *
  * Returns the number of bytes evicted from the non-ghost states.
  */
 static uint64_t
 arc_evict(void)
 {
 	uint64_t bytes, total_evicted = 0;
 	int64_t e, mrud, mrum, mfud, mfum, w;
 	/* Ghost hit counter readings taken by the previous call. */
 	static uint64_t ogrd, ogrm, ogfd, ogfm;
 	/* Ghost list size targets computed at the end of the previous call. */
 	static uint64_t gsrd, gsrm, gsfd, gsfm;
 	uint64_t ngrd, ngrm, ngfd, ngfm;
 
 	/* Get current size of ARC states we can evict from. */
 	mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]);
 	mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
 	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
 	mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
 	mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
 	uint64_t d = mrud + mfud;
 	uint64_t m = mrum + mfum;
 	uint64_t t = d + m;
 
 	/* Get ARC ghost hits since last eviction. */
 	ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
 	uint64_t grd = ngrd - ogrd;
 	ogrd = ngrd;
 	ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
 	uint64_t grm = ngrm - ogrm;
 	ogrm = ngrm;
 	ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
 	uint64_t gfd = ngfd - ogfd;
 	ogfd = ngfd;
 	ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
 	uint64_t gfm = ngfm - ogfm;
 	ogfm = ngfm;
 
 	/* Adjust ARC states balance based on ghost hits. */
 	arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm,
 	    grm + gfm, grd + gfd, zfs_arc_meta_balance);
 	arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100);
 	arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100);
 
 	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
 	uint64_t ac = arc_c;
 	/* Evictable size we want left once (asize - ac) has been removed. */
 	int64_t wt = t - (asize - ac);
 
 	/*
 	 * Try to reduce pinned dnodes if more than 3/4 of wanted metadata
 	 * target is not evictable or if they go over arc_dnode_limit.
 	 */
 	int64_t prune = 0;
 	int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
 	/* Non-evictable metadata: total size minus evictable size. */
 	int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
 	    + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
 	    - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
 	    - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 	/*
 	 * arc_meta is a fraction on a 1ULL << 32 scale (see
 	 * arc_evict_adj()); the split shifts apply it to wt in two steps.
 	 */
 	w = wt * (int64_t)(arc_meta >> 16) >> 16;
 	if (nem > w * 3 / 4) {
 		prune = dn / sizeof (dnode_t) *
 		    zfs_arc_dnode_reduce_percent / 100;
 		if (nem < w && w > 4)
 			prune = arc_mf(prune, nem - w * 3 / 4, w / 4);
 	}
 	if (dn > arc_dnode_limit) {
 		prune = MAX(prune, (dn - arc_dnode_limit) / sizeof (dnode_t) *
 		    zfs_arc_dnode_reduce_percent / 100);
 	}
 	if (prune > 0)
 		arc_prune_async(prune);
 
 	/* Evict MRU metadata. */
 	w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16;
 	e = MIN((int64_t)(asize - ac), (int64_t)(mrum - w));
 	bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e);
 	total_evicted += bytes;
 	mrum -= bytes;
 	asize -= bytes;
 
 	/* Evict MFU metadata. */
 	w = wt * (int64_t)(arc_meta >> 16) >> 16;
 	e = MIN((int64_t)(asize - ac), (int64_t)(m - bytes - w));
 	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e);
 	total_evicted += bytes;
 	mfum -= bytes;
 	asize -= bytes;
 
 	/* Evict MRU data. */
 	/* What is left of the target after metadata belongs to data. */
 	wt -= m - total_evicted;
 	w = wt * (int64_t)(arc_pd >> 16) >> 16;
 	e = MIN((int64_t)(asize - ac), (int64_t)(mrud - w));
 	bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e);
 	total_evicted += bytes;
 	mrud -= bytes;
 	asize -= bytes;
 
 	/* Evict MFU data. */
 	e = asize - ac;
 	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e);
 	mfud -= bytes;
 	total_evicted += bytes;
 
 	/*
 	 * Evict ghost lists
 	 *
 	 * Size of each state's ghost list represents how much that state
 	 * may grow by shrinking the other states.  Would it need to shrink
 	 * other states to zero (that is unlikely), its ghost size would be
 	 * equal to sum of other three state sizes.  But excessive ghost
 	 * size may result in false ghost hits (too far back), that may
 	 * never result in real cache hits if several states are competing.
 	 * So choose some arbitrary point of 1/2 of other state sizes.
 	 */
 	gsrd = (mrum + mfud + mfum) / 2;
 	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) -
 	    gsrd;
 	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e);
 
 	gsrm = (mrud + mfud + mfum) / 2;
 	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) -
 	    gsrm;
 	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e);
 
 	gsfd = (mrud + mrum + mfum) / 2;
 	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) -
 	    gsfd;
 	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e);
 
 	gsfm = (mrud + mrum + mfud) / 2;
 	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) -
 	    gsfm;
 	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e);
 
 	return (total_evicted);
 }
 
 /*
  * Flush data and then metadata from each of the MRU, MFU, MRU ghost,
  * MFU ghost and uncached states, in that order.  'guid' restricts the
  * flush to one spa (0 means all); 'retry' is handed to
  * arc_flush_state() and may only be set when 'guid' is 0 (asserted).
  */
 static void
 arc_flush_impl(uint64_t guid, boolean_t retry)
 {
 	arc_state_t *const states[] = {
 		arc_mru, arc_mfu, arc_mru_ghost, arc_mfu_ghost, arc_uncached
 	};
 	const arc_buf_contents_t types[] = {
 		ARC_BUFC_DATA, ARC_BUFC_METADATA
 	};
 	const size_t nstates = sizeof (states) / sizeof (states[0]);
 	const size_t ntypes = sizeof (types) / sizeof (types[0]);
 
 	ASSERT(!retry || guid == 0);
 
 	for (size_t s = 0; s < nstates; s++) {
 		for (size_t t = 0; t < ntypes; t++) {
 			(void) arc_flush_state(states[s], guid, types[t],
 			    retry);
 		}
 	}
 }
 
 /*
  * Flush evictable buffers from the ARC.  With a non-NULL 'spa' only
  * buffers tagged with that spa's load guid are flushed; with NULL,
  * buffers of every spa are.
  *
  * 'retry' may only be B_TRUE when 'spa' is NULL (asserted): there is no
  * good way to tell when all of a single spa's buffers have left a
  * state.
  */
 void
 arc_flush(spa_t *spa, boolean_t retry)
 {
 	uint64_t guid = 0;
 
 	ASSERT(!retry || spa == NULL);
 
 	if (spa != NULL)
 		guid = spa_load_guid(spa);
 
 	arc_flush_impl(guid, retry);
 }
 
 /*
  * Allocate a record describing an asynchronous flush of 'spa_guid' at
  * cache 'level', append it to arc_async_flush_list under
  * arc_async_flush_lock, and return it.
  */
 static arc_async_flush_t *
 arc_async_flush_add(uint64_t spa_guid, uint_t level)
 {
 	arc_async_flush_t *entry;
 
 	entry = kmem_alloc(sizeof (*entry), KM_SLEEP);
 
 	/* kmem_alloc() does not zero, so set up every field we use. */
 	taskq_init_ent(&entry->af_tqent);
 	list_link_init(&entry->af_node);
 	entry->af_spa_guid = spa_guid;
 	entry->af_cache_level = level;
 
 	mutex_enter(&arc_async_flush_lock);
 	list_insert_tail(&arc_async_flush_list, entry);
 	mutex_exit(&arc_async_flush_lock);
 
 	return (entry);
 }
 
 /*
  * Unlink and free the first record on arc_async_flush_list that matches
  * both 'spa_guid' and 'level'.  Does nothing if no record matches.
  */
 static void
 arc_async_flush_remove(uint64_t spa_guid, uint_t level)
 {
 	arc_async_flush_t *entry;
 
 	mutex_enter(&arc_async_flush_lock);
 
 	/* Advance until a matching record is found or the list ends. */
 	entry = list_head(&arc_async_flush_list);
 	while (entry != NULL && !(entry->af_spa_guid == spa_guid &&
 	    entry->af_cache_level == level)) {
 		entry = list_next(&arc_async_flush_list, entry);
 	}
 
 	if (entry != NULL) {
 		list_remove(&arc_async_flush_list, entry);
 		kmem_free(entry, sizeof (*entry));
 	}
 
 	mutex_exit(&arc_async_flush_lock);
 }
 
 static void
 arc_flush_task(void *arg)
 {
 	arc_async_flush_t *af = arg;
 	hrtime_t start_time = gethrtime();
 	uint64_t spa_guid = af->af_spa_guid;
 
 	arc_flush_impl(spa_guid, B_FALSE);
 	arc_async_flush_remove(spa_guid, af->af_cache_level);
 
 	uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time);
 	if (elaspsed > 0) {
 		zfs_dbgmsg("spa %llu arc flushed in %llu ms",
 		    (u_longlong_t)spa_guid, (u_longlong_t)elaspsed);
 	}
 }
 
 /*
  * Queue an asynchronous flush of all ARC buffers that belong to 'spa'.
  *
  * ARC buffers are keyed by the spa's load guid and may outlive the
  * spa_t itself (e.g. after export); such blocks are orphaned because
  * every import gets a fresh load guid.  For the same reason it is fine
  * for the pool to be re-imported while this flush is still running.
  *
  * arc_fini will wait for any outstanding arc_flush_task to finish.
  */
 void
 arc_flush_async(spa_t *spa)
 {
 	arc_async_flush_t *req;
 
 	/* Register the request (cache level 1) before dispatching it. */
 	req = arc_async_flush_add(spa_load_guid(spa), 1);
 
 	taskq_dispatch_ent(arc_flush_taskq, arc_flush_task, req, TQ_SLEEP,
 	    &req->af_tqent);
 }
 
 /*
  * Report whether 'spa_guid' still has a record on arc_async_flush_list,
  * i.e. whether an asynchronous teardown task for it is still
  * outstanding.  Returns B_TRUE if so, B_FALSE otherwise.
  */
 boolean_t
 arc_async_flush_guid_inuse(uint64_t spa_guid)
 {
 	boolean_t found = B_FALSE;
 	arc_async_flush_t *entry;
 
 	mutex_enter(&arc_async_flush_lock);
 
 	entry = list_head(&arc_async_flush_list);
 	while (entry != NULL) {
 		if (entry->af_spa_guid == spa_guid) {
 			found = B_TRUE;
 			break;
 		}
 		entry = list_next(&arc_async_flush_list, entry);
 	}
 
 	mutex_exit(&arc_async_flush_lock);
 
 	return (found);
 }
 
 /*
  * Lower the ARC target size (arc_c) by up to 'to_free' bytes, never
  * going below arc_c_min, and wake the eviction thread if the ARC is now
  * larger than its target.
  *
  * Returns the number of bytes the target was actually reduced by, which
  * is 0 when arc_c was already at or below arc_c_min.
  */
 uint64_t
 arc_reduce_target_size(uint64_t to_free)
 {
 	/*
 	 * Always fetch the real ARC size: besides being needed below, the
 	 * read refreshes the aggsum lower bound estimate that
 	 * arc_is_overflowing() relies on.
 	 */
 	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
 	uint64_t target = arc_c;
 
 	if (target <= arc_c_min) {
 		to_free = 0;
 	} else {
 		/*
 		 * Callers expect at least this much memory to really be
 		 * evicted, so reduce from whichever is lower: the current
 		 * size or the current target.  That way arc_c drops below
 		 * arc_size right away, even when arc_c had been far above
 		 * it (e.g. after many arc_freed() calls), and the
 		 * arc_evict_zthr has work to do.
 		 */
 		target = MIN(target, MAX(asize, arc_c_min));
 		to_free = MIN(to_free, target - arc_c_min);
 		arc_c = target - to_free;
 	}
 
 	/*
 	 * The caller wants memory back, so request eviction whenever the
 	 * size exceeds the target, whether or not the target moved.
 	 */
 	if (asize > arc_c) {
 		/* See comment in arc_evict_cb_check() on why lock+flag */
 		mutex_enter(&arc_evict_lock);
 		arc_evict_needed = B_TRUE;
 		mutex_exit(&arc_evict_lock);
 		zthr_wakeup(arc_evict_zthr);
 	}
 
 	return (to_free);
 }
 
 /*
  * Tell whether the system is under memory pressure and wants memory
  * back from the ARC.  Returns B_TRUE when arc_available_memory() is
  * negative, in which case the ARC should shrink accordingly.
  */
 boolean_t
 arc_reclaim_needed(void)
 {
 	const boolean_t under_pressure = (arc_available_memory() < 0);
 
 	return (under_pressure);
 }
 
 /*
  * Ask the kmem layer to reap the caches the ARC draws from: every
  * distinct zio buffer and zio data buffer cache, the ARC buf and header
  * caches, the btree leaf cache and the abd caches.
  */
 void
 arc_kmem_reap_soon(void)
 {
 	kmem_cache_t *last_buf = NULL;
 	kmem_cache_t *last_data = NULL;
 
 #if defined(_KERNEL) && defined(_ILP32)
 	/*
 	 * Reclaim unused memory from all kmem caches.
 	 */
 	kmem_reap();
 #endif
 
 	for (size_t slot = 0; slot < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
 	    slot++) {
 #if defined(_ILP32)
 		/* reach upper limit of cache size on 32-bit */
 		if (zio_buf_cache[slot] == NULL)
 			break;
 #endif
 		/*
 		 * Only reap when the entry differs from the one handled in
 		 * the previous iteration, so a cache occupying consecutive
 		 * slots is reaped once.
 		 */
 		if (last_buf != zio_buf_cache[slot]) {
 			last_buf = zio_buf_cache[slot];
 			kmem_cache_reap_now(last_buf);
 		}
 		if (last_data != zio_data_buf_cache[slot]) {
 			last_data = zio_data_buf_cache[slot];
 			kmem_cache_reap_now(last_data);
 		}
 	}
 
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_full_cache);
 	kmem_cache_reap_now(hdr_l2only_cache);
 	kmem_cache_reap_now(zfs_btree_leaf_cache);
 	abd_cache_reap_now();
 }
 
 /*
  * zthr check callback for the eviction thread: decide whether
  * arc_evict_cb() should run now.  Both arguments are unused.
  *
  * Returns B_TRUE when eviction has been requested via arc_evict_needed,
  * or when the uncached state holds evictable buffers and more than
  * arc_min_prefetch_ms / 2 has elapsed since the last uncached flush.
  */
 static boolean_t
 arc_evict_cb_check(void *arg, zthr_t *zthr)
 {
 	(void) arg, (void) zthr;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * Refresh the kstat snapshot.  Tools such as the mdb ::arc dcmd
 	 * and the Linux crash utility usually print whatever the most
 	 * recent update left behind instead of calling the kstat update
 	 * function themselves, so without this they could show stale
 	 * figures for the anon, mru, mru_ghost, mfu and mfu_ghost lists.
 	 * The data can still lag if this thread has not been woken
 	 * recently; query the arc_state_t structures directly when more
 	 * accurate numbers are needed.
 	 */
 	if (arc_ksp != NULL)
 		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
 #endif
 
 	/*
 	 * The decision to evict is driven by the arc_evict_needed flag
 	 * set on behalf of arc_wait_for_eviction(), not by testing for
 	 * overflow here; that guarantees a waiter is never left sleeping
 	 * on aew_cv.  Broadcasting the CV from here instead would need a
 	 * mutex (e.g. arc_evict_lock) to make sure the waiter is already
 	 * asleep, and such a lock would be ordered incorrectly against the
 	 * zthr_lock, which is held when this function is called and by
 	 * arc_wait_for_eviction() when it calls zthr_wakeup().
 	 */
 	if (arc_evict_needed)
 		return (B_TRUE);
 
 	/*
 	 * If we have buffers in uncached state, evict them periodically.
 	 */
 	if ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(
 	    &arc_uncached->arcs_esize[ARC_BUFC_METADATA])) == 0)
 		return (B_FALSE);
 
 	return (ddi_get_lbolt() - arc_last_uncached_flush >
 	    MSEC_TO_TICK(arc_min_prefetch_ms / 2));
 }
 
 /*
  * Keep arc_size under arc_c by running arc_evict which evicts data
  * from the ARC.
  *
  * 'arg' is unused.  'zthr' is the zthr handle passed to this callback;
  * it is only consulted to see whether it has been cancelled.
  */
 static void
 arc_evict_cb(void *arg, zthr_t *zthr)
 {
 	(void) arg;
 
 	uint64_t evicted = 0;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	/*
 	 * Always try to evict from uncached state.  The flush time is
 	 * recorded first; arc_evict_cb_check() compares against it to
 	 * pace these periodic flushes.
 	 */
 	arc_last_uncached_flush = ddi_get_lbolt();
 	evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE);
 	evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE);
 
 	/* Evict from other states only if told to. */
 	if (arc_evict_needed)
 		evicted += arc_evict();
 
 	/*
 	 * If evicted is zero, we couldn't evict anything
 	 * via arc_evict(). This could be due to hash lock
 	 * collisions, but more likely due to the majority of
 	 * arc buffers being unevictable. Therefore, even if
 	 * arc_size is above arc_c, another pass is unlikely to
 	 * be helpful and could potentially cause us to enter an
 	 * infinite loop.  Additionally, zthr_iscancelled() is
 	 * checked here so that if the arc is shutting down, the
 	 * broadcast will wake any remaining arc evict waiters.
 	 *
 	 * Note we cancel using zthr instead of arc_evict_zthr
 	 * because the latter may not yet be initialized when the
 	 * callback is first invoked.
 	 */
 	mutex_enter(&arc_evict_lock);
 	/*
 	 * Request another pass only if we are not cancelled, this pass
 	 * evicted something, and the ARC size still compares above arc_c.
 	 */
 	arc_evict_needed = !zthr_iscancelled(zthr) &&
 	    evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0;
 	if (!arc_evict_needed) {
 		/*
 		 * We're either no longer overflowing, or we
 		 * can't evict anything more, so we should wake
 		 * arc_get_data_impl() sooner.
 		 */
 		arc_evict_waiter_t *aw;
 		/* Unlink every waiter and wake it via its own condvar. */
 		while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
 			cv_broadcast(&aw->aew_cv);
 		}
 		arc_set_need_free();
 	}
 	mutex_exit(&arc_evict_lock);
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Decide whether arc_reap_cb() should run now.  Both arguments are
  * unused.  Returns B_TRUE only when available memory is negative and no
  * kmem reap is already active; otherwise it just maintains arc_no_grow
  * and returns B_FALSE.
  */
 static boolean_t
 arc_reap_cb_check(void *arg, zthr_t *zthr)
 {
 	static int reap_cb_check_counter = 0;
 	int64_t free_memory;
 
 	(void) arg;
 	(void) zthr;
 
 	free_memory = arc_available_memory();
 
 	/*
 	 * If a kmem reap is already active, don't schedule more.  We must
 	 * check for this because kmem_cache_reap_soon() won't actually
 	 * block on the cache being reaped (this is to prevent callers from
 	 * becoming implicitly blocked by a system-wide kmem reap -- which,
 	 * on a system with many, many full magazines, can take minutes).
 	 */
 	if (!kmem_cache_reap_active() && free_memory < 0) {
 		arc_no_grow = B_TRUE;
 		arc_warm = B_TRUE;
 		/*
 		 * Wait at least arc_grow_retry seconds before
 		 * considering growing.
 		 */
 		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
 		return (B_TRUE);
 	}
 
 	/*
 	 * No reap is being scheduled: forbid growth while free memory is
 	 * below the arc_c >> arc_no_grow_shift threshold, and allow it
 	 * again once the grow-retry deadline has passed.
 	 */
 	if (free_memory < arc_c >> arc_no_grow_shift)
 		arc_no_grow = B_TRUE;
 	else if (gethrtime() >= arc_growtime)
 		arc_no_grow = B_FALSE;
 
 	/*
 	 * On every 60th pass through this point, reclaim unused zstd
 	 * compression and decompression context.  This is done here to
 	 * avoid the need for an independent thread.
 	 * NOTE(review): presumably this check runs about once a second,
 	 * making that roughly once a minute -- confirm against the zthr
 	 * timer setup.
 	 */
 	if ((reap_cb_check_counter++ % 60) == 0)
 		zfs_zstd_cache_reap_now();
 
 	return (B_FALSE);
 }
 
 /*
  * Keep enough free memory in the system by reaping the ARC's kmem
  * caches.  To cause more slabs to be reapable, we may reduce the
  * target size of the cache (arc_c), causing the arc_evict_cb()
  * to free more buffers.  Both arguments are unused.
  */
 static void
 arc_reap_cb(void *arg, zthr_t *zthr)
 {
 	(void) arg;
 	(void) zthr;
 
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	/*
 	 * Kick off asynchronous kmem_reap()'s of all our caches.
 	 */
 	arc_kmem_reap_soon();
 
 	/*
 	 * Wait at least arc_kmem_cache_reap_retry_ms (rounded up to whole
 	 * ticks) between arc_kmem_reap_soon() calls.  Without this it is
 	 * possible to end up spending lots of time reaping caches while
 	 * we're near arc_c_min.  Waiting here also gives the free memory
 	 * check below a chance of finding that the asynchronous reap has
 	 * already freed enough memory, so that arc_reduce_target_size()
 	 * is not needed.
 	 */
 	delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
 
 	/*
 	 * Reduce the target size as needed to maintain the amount of free
 	 * memory in the system at a fraction of the arc_size (1/128th by
 	 * default).  If oversubscribed (free_memory < 0) then reduce the
 	 * target arc_size by the deficit amount plus the fractional
 	 * amount.  If free memory is positive but less than the fractional
 	 * amount, reduce by what is needed to hit the fractional amount.
 	 */
 	int64_t free_memory = arc_available_memory();
 	int64_t can_free = arc_c - arc_c_min;
 
 	/* Never treat a target already below arc_c_min as shrinkable. */
 	if (can_free < 0)
 		can_free = 0;
 
 	int64_t to_free = (can_free >> arc_shrink_shift) - free_memory;
 	if (to_free > 0)
 		arc_reduce_target_size(to_free);
 
 	spl_fstrans_unmark(cookie);
 }
 
 #ifdef _KERNEL
 /*
  * Determine the amount of memory eligible for eviction contained in the
  * ARC. All clean data reported by the ghost lists can always be safely
  * evicted. Due to arc_c_min, the same does not hold for all clean data
  * contained by the regular mru and mfu lists.
  *
  * In the case of the regular mru and mfu lists, we need to report as
  * much clean data as possible, such that evicting that same reported
  * data will not bring arc_size below arc_c_min. Thus, in certain
  * circumstances, the total amount of clean data in the mru and mfu
  * lists might not actually be evictable.
  *
  * The following two distinct cases are accounted for:
  *
  * 1. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is greater than or equal to arc_c_min.
  *    (i.e. amount of dirty data >= arc_c_min)
  *
  *    This is the easy case; all clean data contained by the mru and mfu
  *    lists is evictable. Evicting all clean data can only drop arc_size
  *    to the amount of dirty data, which is greater than arc_c_min.
  *
  * 2. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is less than arc_c_min.
  *    (i.e. arc_c_min > amount of dirty data)
  *
  *    2.1. arc_size is greater than or equal to arc_c_min.
  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
  *
  *         In this case, not all clean data from the regular mru and mfu
  *         lists is actually evictable; we must leave enough clean data
  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
  *         evictable data from the two lists combined, is exactly the
  *         difference between arc_size and arc_c_min.
  *
  *    2.2. arc_size is less than arc_c_min
  *         (i.e. arc_c_min > arc_size > amount of dirty data)
  *
  *         In this case, none of the data contained in the mru and mfu
  *         lists is evictable, even if it's clean. Since arc_size is
  *         already below arc_c_min, evicting any more would only
  *         increase this negative difference.
  */
 
 #endif /* _KERNEL */
 
 /*
  * Consider raising the ARC target size (arc_c) given that 'bytes' more
  * bytes are about to be added to the cache.  This function is only
  * called when we are adding new content to the cache.
  */
 static void
 arc_adapt(uint64_t bytes)
 {
 	/*
 	 * Wake reap thread if we do not have any available memory
 	 */
 	if (arc_reclaim_needed()) {
 		zthr_wakeup(arc_reap_zthr);
 		return;
 	}
 
 	/* Growth is currently disallowed, or the target is already maxed. */
 	if (arc_no_grow || arc_c >= arc_c_max)
 		return;
 
 	/*
 	 * Only grow once the (upper bound of the) cache size is within
 	 * 2 * SPA_MAXBLOCKSIZE bytes of the target cache size.
 	 */
 	uint64_t size_hi = aggsum_upper_bound(&arc_sums.arcstat_size);
 	if (size_hi + 2 * SPA_MAXBLOCKSIZE < arc_c)
 		return;
 
 	/* Grow by at least SPA_OLD_MAXBLOCKSIZE, clamping at arc_c_max. */
 	uint64_t grow = MAX(bytes, SPA_OLD_MAXBLOCKSIZE);
 	if (atomic_add_64_nv(&arc_c, grow) > arc_c_max)
 		arc_c = arc_c_max;
 }
 
 /*
  * Check if ARC current size has grown past our upper thresholds.
  *
  * Returns ARC_OVF_NONE while the size (lower bound) is less than one
  * zfs_max_recordsize above arc_c.  Beyond that, a non-lax caller always
  * gets ARC_OVF_SEVERE; a lax caller gets ARC_OVF_SOME until the excess
  * reaches a threshold derived from arc_c, which is three times larger
  * when use_reserve is set.
  */
 static arc_ovf_level_t
 arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
 {
 	/*
 	 * We just compare the lower bound here for performance reasons. Our
 	 * primary goals are to make sure that the arc never grows without
 	 * bound, and that it can reach its maximum size. This check
 	 * accomplishes both goals. The maximum amount we could run over by is
 	 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
 	 * in the ARC. In practice, that's in the tens of MB, which is low
 	 * enough to be safe.
 	 */
 	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
 	    zfs_max_recordsize;
 
 	/* Always allow at least one block of overflow. */
 	if (over < 0)
 		return (ARC_OVF_NONE);
 
 	/* If we are under memory pressure, report severe overflow. */
 	if (!lax)
 		return (ARC_OVF_SEVERE);
 
 	/* We are not under pressure, so be more or less relaxed. */
 	int64_t limit = (arc_c >> zfs_arc_overflow_shift) / 2;
 	if (use_reserve)
 		limit *= 3;
 
 	if (over < limit)
 		return (ARC_OVF_SOME);
 
 	return (ARC_OVF_SEVERE);
 }
 
 /*
  * Do the ARC accounting for 'size' bytes on behalf of 'hdr' (see
  * arc_get_data_impl()) and then allocate an abd of that size.  The abd
  * is linear when alloc_flags contains ARC_HDR_ALLOC_LINEAR; the
  * allocators are told whether the header holds metadata.
  */
 static abd_t *
 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
     int alloc_flags)
 {
 	const arc_buf_contents_t type = arc_buf_type(hdr);
 	abd_t *abd;
 
 	arc_get_data_impl(hdr, size, tag, alloc_flags);
 
 	if ((alloc_flags & ARC_HDR_ALLOC_LINEAR) == 0)
 		abd = abd_alloc(size, type == ARC_BUFC_METADATA);
 	else
 		abd = abd_alloc_linear(size, type == ARC_BUFC_METADATA);
 
 	return (abd);
 }
 
 /*
  * Do the ARC accounting for 'size' bytes on behalf of 'hdr' (see
  * arc_get_data_impl(), called with no allocation flags) and then
  * allocate a buffer of that size: zio_buf_alloc() for metadata headers,
  * zio_data_buf_alloc() for data headers.
  */
 static void *
 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
 {
 	const arc_buf_contents_t type = arc_buf_type(hdr);
 
 	arc_get_data_impl(hdr, size, tag, 0);
 
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		return (zio_data_buf_alloc(size));
 	}
 
 	return (zio_buf_alloc(size));
 }
 
 /*
  * Wait for the specified amount of data (in bytes) to be evicted from the
  * ARC, and for there to be sufficient free memory in the system.
  * The lax argument specifies that caller does not have a specific reason
  * to wait, not aware of any memory pressure.  Low memory handlers though
  * should set it to B_FALSE to wait for all required evictions to complete.
  * The use_reserve argument allows some callers to wait less than others
  * to not block critical code paths, possibly blocking other resources.
  *
  * What happens depends on the level reported by arc_is_overflowing():
  * no overflow returns immediately, some overflow only wakes the evict
  * thread, and severe overflow blocks the caller until it is taken off
  * the arc_evict_waiters list.
  */
 void
 arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve)
 {
 	switch (arc_is_overflowing(lax, use_reserve)) {
 	case ARC_OVF_NONE:
 		/* Not overflowing: nothing to wait for. */
 		return;
 	case ARC_OVF_SOME:
 		/*
 		 * This is a bit racy without taking arc_evict_lock, but the
 		 * worst that can happen is we either call zthr_wakeup() extra
 		 * time due to race with other thread here, or the set flag
 		 * get cleared by arc_evict_cb(), which is unlikely due to
 		 * big hysteresis, but also not important since at this level
 		 * of overflow the eviction is purely advisory.  Same time
 		 * taking the global lock here every time without waiting for
 		 * the actual eviction creates a significant lock contention.
 		 */
 		if (!arc_evict_needed) {
 			arc_evict_needed = B_TRUE;
 			zthr_wakeup(arc_evict_zthr);
 		}
 		return;
 	case ARC_OVF_SEVERE:
 	default:
 	{
 		/*
 		 * The waiter record is a local of this function; it is
 		 * linked into arc_evict_waiters below and this function
 		 * does not return until it has been unlinked again.
 		 */
 		arc_evict_waiter_t aw;
 		list_link_init(&aw.aew_node);
 		cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
 
 		uint64_t last_count = 0;
 		mutex_enter(&arc_evict_lock);
 		/*
 		 * If others are already waiting, queue behind the last of
 		 * them: our target builds on its target count.  Otherwise
 		 * make sure the evict thread has been asked to run.
 		 */
 		if (!list_is_empty(&arc_evict_waiters)) {
 			arc_evict_waiter_t *last =
 			    list_tail(&arc_evict_waiters);
 			last_count = last->aew_count;
 		} else if (!arc_evict_needed) {
 			arc_evict_needed = B_TRUE;
 			zthr_wakeup(arc_evict_zthr);
 		}
 		/*
 		 * Note, the last waiter's count may be less than
 		 * arc_evict_count if we are low on memory in which
 		 * case arc_evict_state_impl() may have deferred
 		 * wakeups (but still incremented arc_evict_count).
 		 */
 		aw.aew_count = MAX(last_count, arc_evict_count) + amount;
 
 		list_insert_tail(&arc_evict_waiters, &aw);
 
 		arc_set_need_free();
 
 		DTRACE_PROBE3(arc__wait__for__eviction,
 		    uint64_t, amount,
 		    uint64_t, arc_evict_count,
 		    uint64_t, aw.aew_count);
 
 		/*
 		 * We will be woken up either when arc_evict_count reaches
 		 * aew_count, or when the ARC is no longer overflowing and
 		 * eviction completes.
 		 * In case of "false" wakeup, we will still be on the list.
 		 */
 		do {
 			cv_wait(&aw.aew_cv, &arc_evict_lock);
 		} while (list_link_active(&aw.aew_node));
 		mutex_exit(&arc_evict_lock);
 
 		cv_destroy(&aw.aew_cv);
 	}
 	}
 }
 
 /*
  * Do the ARC-side bookkeeping for 'size' bytes that are about to be
  * allocated on behalf of 'hdr'; the allocation itself is done by the
  * callers.  If we are hitting the hard limit for the cache size, we
  * must sleep, waiting for the eviction thread to catch up.  If we're
  * past the target size but below the hard limit, we'll only signal the
  * eviction thread and continue on.
  *
  * Of alloc_flags only ARC_HDR_USE_RESERVE is examined here.  'tag' is
  * the holder passed to the state size refcounts.
  */
 static void
 arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
     int alloc_flags)
 {
 	arc_adapt(size);
 
 	/*
 	 * If arc_size is currently overflowing, we must be adding data
 	 * faster than we are evicting.  To ensure we don't compound the
 	 * problem by adding more data and forcing arc_size to grow even
 	 * further past its target size, we wait for the eviction thread to
 	 * make some progress.  We also wait for there to be sufficient free
 	 * memory in the system, as measured by arc_free_memory().
 	 *
 	 * Specifically, we wait for zfs_arc_eviction_pct percent of the
 	 * requested size to be evicted.  This should be more than 100%, to
 	 * ensure that progress is also made towards getting arc_size
 	 * under arc_c.  See the comment above zfs_arc_eviction_pct.
 	 */
 	arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100,
 	    B_TRUE, alloc_flags & ARC_HDR_USE_RESERVE);
 
 	/* Charge the space to the metadata or data bucket. */
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	arc_space_consume(size, type == ARC_BUFC_METADATA ?
 	    ARC_SPACE_META : ARC_SPACE_DATA);
 
 	/*
 	 * Update the state size.  Note that ghost states have a
 	 * "ghost size" and so don't need to be updated.
 	 */
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	if (GHOST_STATE(state))
 		return;
 
 	(void) zfs_refcount_add_many(&state->arcs_size[type], size, tag);
 
 	/*
 	 * If this is reached via arc_read, the link is
 	 * protected by the hash lock. If reached via
 	 * arc_buf_alloc, the header should not be accessed by
 	 * any other thread. And, if reached via arc_read_done,
 	 * the hash lock will protect it if it's found in the
 	 * hash table; otherwise no other thread should be
 	 * trying to [add|remove]_reference it.
 	 */
 	if (!multilist_link_active(&hdr->b_l1hdr.b_arc_node))
 		return;
 
 	/* The header is on a state list, so the bytes count as evictable. */
 	ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 	(void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag);
 }
 
 /*
  * Undo the ARC accounting for 'size' bytes that belonged to 'hdr' (via
  * arc_free_data_impl()) and then free the abd itself.
  */
 static void
 arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size,
     const void *tag)
 {
 	/* Accounting first, then release the memory. */
 	arc_free_data_impl(hdr, size, tag);
 	abd_free(abd);
 }
 
 /*
  * Undo the ARC accounting for 'size' bytes that belonged to 'hdr' (via
  * arc_free_data_impl()) and then free the buffer: zio_buf_free() for
  * metadata headers, zio_data_buf_free() for data headers.
  */
 static void
 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag)
 {
 	const arc_buf_contents_t type = arc_buf_type(hdr);
 
 	arc_free_data_impl(hdr, size, tag);
 
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		zio_data_buf_free(buf, size);
 		return;
 	}
 
 	zio_buf_free(buf, size);
 }
 
 /*
  * Undo the accounting for 'size' bytes of hdr's data: drop them from
  * the state's size refcount (and from its evictable size when the
  * header is linked on a state list), then return the space to the
  * metadata or data bucket.  The memory itself is freed by the callers.
  */
 static void
 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	/* protected by hash lock, if in the hash table */
 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT(state != arc_anon && state != arc_l2c_only);
 
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    size, tag);
 	}
 
 	(void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag);
 
 	VERIFY3U(hdr->b_type, ==, type);
 
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		arc_space_return(size, ARC_SPACE_DATA);
 	} else {
 		arc_space_return(size, ARC_SPACE_META);
 	}
 }
 
 /*
  * This routine is called whenever a buffer is accessed.
  *
  * It first reconciles the header's prefetch flags with the flags of this
  * access, then updates the access time and hit counters and, where
  * appropriate, moves the header to a new ARC state depending on the
  * state it is currently in.  The caller must hold the header's hash
  * lock and the header must have an L1 portion (both are asserted).
  *
  *	hdr		header being accessed
  *	arc_flags	flags of this access; only ARC_FLAG_PREFETCH,
  *			ARC_FLAG_PRESCIENT_PREFETCH and ARC_FLAG_L2CACHE
  *			are examined
  *	hit		only feeds the demand_hit/demand_iohit statistic
  *			bumped when a header that was marked prefetch is
  *			accessed by a non-prefetch access
  */
 static void
 arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit)
 {
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	/*
 	 * Update buffer prefetch status.
 	 */
 	boolean_t was_prefetch = HDR_PREFETCH(hdr);
 	boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH;
 	if (was_prefetch != now_prefetch) {
 		if (was_prefetch) {
 			ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit,
 			    HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive,
 			    prefetch);
 		}
 		/*
 		 * The L2ARC per-state stats are decremented before the flag
 		 * change and incremented again after it.
 		 */
 		if (HDR_HAS_L2HDR(hdr))
 			l2arc_hdr_arcstats_decrement_state(hdr);
 		if (was_prefetch) {
 			arc_hdr_clear_flags(hdr,
 			    ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH);
 		} else {
 			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 		}
 		if (HDR_HAS_L2HDR(hdr))
 			l2arc_hdr_arcstats_increment_state(hdr);
 	}
 	/* Count this prefetch access as prescient or predictive. */
 	if (now_prefetch) {
 		if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
 			ARCSTAT_BUMP(arcstat_prescient_prefetch);
 		} else {
 			ARCSTAT_BUMP(arcstat_predictive_prefetch);
 		}
 	}
 	/* The L2CACHE flag is only ever set here, never cleared. */
 	if (arc_flags & ARC_FLAG_L2CACHE)
 		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 
 	clock_t now = ddi_get_lbolt();
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer is not in the cache, and does not appear in
 		 * our "ghost" lists.  Add it to the MRU or uncached state.
 		 */
 		ASSERT0(hdr->b_l1hdr.b_arc_access);
 		hdr->b_l1hdr.b_arc_access = now;
 		if (HDR_UNCACHED(hdr)) {
 			new_state = arc_uncached;
 			DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *,
 			    hdr);
 		} else {
 			new_state = arc_mru;
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		}
 		arc_change_state(new_state, hdr);
 	} else if (hdr->b_l1hdr.b_state == arc_mru) {
 		/*
 		 * This buffer has been accessed once recently and either
 		 * its read is still in progress or it is in the cache.
 		 */
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 			/* Read still in flight: no hit is counted. */
 			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 		hdr->b_l1hdr.b_mru_hits++;
 		ARCSTAT_BUMP(arcstat_mru_hits);
 
 		/*
 		 * If the previous access was a prefetch, then it already
 		 * handled possible promotion, so nothing more to do for now.
 		 */
 		if (was_prefetch) {
 			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 
 		/*
 		 * If more than ARC_MINTIME have passed from the previous
 		 * hit, promote the buffer to the MFU state.
 		 * Otherwise the access time is left untouched.
 		 */
 		if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
 		    ARC_MINTIME)) {
 			hdr->b_l1hdr.b_arc_access = now;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 			arc_change_state(arc_mfu, hdr);
 		}
 	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer has been accessed once recently, but was
 		 * evicted from the cache.  Would we have bigger MRU, it
 		 * would be an MRU hit, so handle it the same way, except
 		 * we don't need to check the previous access time.
 		 */
 		hdr->b_l1hdr.b_mru_ghost_hits++;
 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
 		hdr->b_l1hdr.b_arc_access = now;
 		/* Credit the header's size to the ghost state's hit sum. */
 		wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)],
 		    arc_hdr_size(hdr));
 		if (was_prefetch) {
 			new_state = arc_mru;
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		} else {
 			new_state = arc_mfu;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		}
 		arc_change_state(new_state, hdr);
 	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
 		/*
 		 * This buffer has been accessed more than once and either
 		 * still in the cache or being restored from one of ghosts.
 		 */
 		if (!HDR_IO_IN_PROGRESS(hdr)) {
 			hdr->b_l1hdr.b_mfu_hits++;
 			ARCSTAT_BUMP(arcstat_mfu_hits);
 		}
 		hdr->b_l1hdr.b_arc_access = now;
 	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
 		/*
 		 * This buffer has been accessed more than once recently, but
 		 * has been evicted from the cache.  Would we have bigger MFU
 		 * it would stay in cache, so move it back to MFU state.
 		 */
 		hdr->b_l1hdr.b_mfu_ghost_hits++;
 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
 		hdr->b_l1hdr.b_arc_access = now;
 		/* Credit the header's size to the ghost state's hit sum. */
 		wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)],
 		    arc_hdr_size(hdr));
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mfu, hdr);
 	} else if (hdr->b_l1hdr.b_state == arc_uncached) {
 		/*
 		 * This buffer is uncacheable, but we got a hit.  Probably
 		 * a demand read after prefetch.  Nothing more to do here.
 		 */
 		if (!HDR_IO_IN_PROGRESS(hdr))
 			ARCSTAT_BUMP(arcstat_uncached_hits);
 		hdr->b_l1hdr.b_arc_access = now;
 	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
 		/*
 		 * This buffer is on the 2nd Level ARC and was not accessed
 		 * for a long time, so treat it as new and put into MRU.
 		 */
 		hdr->b_l1hdr.b_arc_access = now;
 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mru, hdr);
 	} else {
 		/* Any other state pointer is a bug. */
 		cmn_err(CE_PANIC, "invalid arc state 0x%p",
 		    hdr->b_l1hdr.b_state);
 	}
 }
 
 /*
  * This routine is called by dbuf_hold() to update the arc_access() state
  * which otherwise would be skipped for entries in the dbuf cache.
  *
  * Nothing is done for a header that is anonymous or empty.  Otherwise
  * the access is recorded via arc_access() under the header's hash lock
  * as a non-prefetch hit, and the hit statistics are bumped.
  */
 void
 arc_buf_access(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * Avoid taking the hash_lock when possible as an optimization.
 	 * The header must be checked again under the hash_lock in order
 	 * to handle the case where it is concurrently being released.
 	 */
 	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr))
 		return;
 
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	/* Same test again, now under the lock; a skip here is counted. */
 	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_access_skip);
 		return;
 	}
 
 	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 	    hdr->b_l1hdr.b_state == arc_mfu ||
 	    hdr->b_l1hdr.b_state == arc_uncached);
 
 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 	/* No access flags, and reported as a hit. */
 	arc_access(hdr, 0, B_TRUE);
 	mutex_exit(hash_lock);
 
 	/* The statistics are updated after the hash lock is dropped. */
 	ARCSTAT_BUMP(arcstat_hits);
 	ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch,
 	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 }
 
 /*
  * A generic arc_read_done_func_t which you can use.  It copies the
  * contents of 'buf' (arc_buf_size(buf) bytes) into the memory 'arg'
  * points to and then destroys the buf, passing 'arg' as the second
  * argument of arc_buf_destroy().  It does nothing when buf is NULL.
  * The zio, zb and bp arguments are unused.
  */
 void
 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *arg)
 {
 	(void) zio;
 	(void) zb;
 	(void) bp;
 
 	if (buf != NULL) {
 		memcpy(arg, buf->b_data, arc_buf_size(buf));
 		arc_buf_destroy(buf, arg);
 	}
 }
 
 /*
  * A generic arc_read_done_func_t.  'arg' points to an arc_buf_t pointer
  * which receives 'buf', or NULL when no buf was produced.  The
  * assertions tie the two outcomes to the zio's error state when a zio
  * is supplied.  The zb and bp arguments are unused.
  */
 void
 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *arg)
 {
 	arc_buf_t **bufp = arg;
 
 	(void) zb;
 	(void) bp;
 
 	if (buf != NULL) {
 		ASSERT(zio == NULL || zio->io_error == 0);
 		ASSERT(buf->b_data != NULL);
 		*bufp = buf;
 		return;
 	}
 
 	ASSERT(zio == NULL || zio->io_error != 0);
 	*bufp = NULL;
 }
 
 /*
  * Assert that what 'hdr' records agrees with the block pointer 'bp'.
  * For a hole or embedded bp the header must have zero physical size
  * and no compression.  Otherwise the logical size, physical size and
  * protected status must match, as must the compression function when
  * compression is enabled on the header.  Every check is an assertion.
  */
 static void
 arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
 {
 	if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 		if (HDR_COMPRESSION_ENABLED(hdr)) {
 			ASSERT3U(arc_hdr_get_compress(hdr), ==,
 			    BP_GET_COMPRESS(bp));
 		}
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
 		ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
 		ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
 		return;
 	}
 
 	/* Hole or embedded block pointer. */
 	ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
 	ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
 }
 
 /*
  * Finish an ARC read once its zio is done.  zio->io_private is the
  * header the read was issued for.  The function fills in the header
  * (encryption parameters, byteswap function, compression level), makes
  * a buf for every callback that wants one, marks the I/O as no longer
  * in progress, and then runs the registered callbacks and releases
  * their records.  On error the header is flagged, made anonymous and
  * removed from the hash table.
  */
 static void
 arc_read_done(zio_t *zio)
 {
 	blkptr_t 	*bp = zio->io_bp;
 	arc_buf_hdr_t	*hdr = zio->io_private;
 	kmutex_t	*hash_lock = NULL;
 	arc_callback_t	*callback_list;
 	arc_callback_t	*acb;
 
 	/*
 	 * The hdr was inserted into hash-table and removed from lists
 	 * prior to starting I/O.  We should find this header, since
 	 * it's in the hash table, and it should be legit since it's
 	 * not possible to evict it during the I/O.  The only possible
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
 	if (HDR_IN_HASH_TABLE(hdr)) {
 		arc_buf_hdr_t *found;
 
 		ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp));
 		ASSERT3U(hdr->b_dva.dva_word[0], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
 		ASSERT3U(hdr->b_dva.dva_word[1], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
 
 		/* The lookup also hands back the hash lock via hash_lock. */
 		found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);
 
 		ASSERT((found == hdr &&
 		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
 		    (found == hdr && HDR_L2_READING(hdr)));
 		ASSERT3P(hash_lock, !=, NULL);
 	}
 
 	/*
 	 * For protected blocks, record the object type, objset, salt and
 	 * IV in the header.  The MAC is only decoded when the read
 	 * succeeded; for intent log blocks it is taken from the start of
 	 * the data, otherwise from the bp.
 	 */
 	if (BP_IS_PROTECTED(bp)) {
 		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
 		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
 		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv);
 
 		if (zio->io_error == 0) {
 			if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
 				void *tmpbuf;
 
 				tmpbuf = abd_borrow_buf_copy(zio->io_abd,
 				    sizeof (zil_chain_t));
 				zio_crypt_decode_mac_zil(tmpbuf,
 				    hdr->b_crypt_hdr.b_mac);
 				abd_return_buf(zio->io_abd, tmpbuf,
 				    sizeof (zil_chain_t));
 			} else {
 				zio_crypt_decode_mac_bp(bp,
 				    hdr->b_crypt_hdr.b_mac);
 			}
 		}
 	}
 
 	if (zio->io_error == 0) {
 		/* byteswap if necessary */
 		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
 			if (BP_GET_LEVEL(zio->io_bp) > 0) {
 				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
 			} else {
 				hdr->b_l1hdr.b_byteswap =
 				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
 			}
 		} else {
 			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 		}
 		/* Not updated when the data came from an L2ARC read. */
 		if (!HDR_L2_READING(hdr)) {
 			hdr->b_complevel = zio->io_prop.zp_complevel;
 		}
 	}
 
 	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
 	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
 		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
 
 	/* Detach the callback list from the header. */
 	callback_list = hdr->b_l1hdr.b_acb;
 	ASSERT3P(callback_list, !=, NULL);
 	hdr->b_l1hdr.b_acb = NULL;
 
 	/*
 	 * If a read request has a callback (i.e. acb_done is not NULL), then we
 	 * make a buf containing the data according to the parameters which were
 	 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
 	 * aren't needlessly decompressing the data multiple times.
 	 */
 	int callback_cnt = 0;
 	for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
 
 		/* We need the last one to call below in original order. */
 		callback_list = acb;
 
 		if (!acb->acb_done || acb->acb_nobuf)
 			continue;
 
 		callback_cnt++;
 
 		/* Once the read has failed no more bufs are made. */
 		if (zio->io_error != 0)
 			continue;
 
 		int error = arc_buf_alloc_impl(hdr, zio->io_spa,
 		    &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
 		    acb->acb_compressed, acb->acb_noauth, B_TRUE,
 		    &acb->acb_buf);
 
 		/*
 		 * Assert non-speculative zios didn't fail because an
 		 * encryption key wasn't loaded
 		 */
 		ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
 		    error != EACCES);
 
 		/*
 		 * If we failed to decrypt, report an error now (as the zio
 		 * layer would have done if it had done the transforms).
 		 */
 		if (error == ECKSUM) {
 			ASSERT(BP_IS_PROTECTED(bp));
 			error = SET_ERROR(EIO);
 			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 				spa_log_error(zio->io_spa, &acb->acb_zb,
 				    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 				(void) zfs_ereport_post(
 				    FM_EREPORT_ZFS_AUTHENTICATION,
 				    zio->io_spa, NULL, &acb->acb_zb, zio, 0);
 			}
 		}
 
 		if (error != 0) {
 			/*
 			 * Decompression or decryption failed.  Set
 			 * io_error so that when we call acb_done
 			 * (below), we will indicate that the read
 			 * failed. Note that in the unusual case
 			 * where one callback is compressed and another
 			 * uncompressed, we will mark all of them
 			 * as failed, even though the uncompressed
 			 * one can't actually fail.  In this case,
 			 * the hdr will not be anonymous, because
 			 * if there are multiple callbacks, it's
 			 * because multiple threads found the same
 			 * arc buf in the hash table.
 			 */
 			zio->io_error = error;
 		}
 	}
 
 	/*
 	 * If there are multiple callbacks, we must have the hash lock,
 	 * because the only way for multiple threads to find this hdr is
 	 * in the hash table.  This ensures that if there are multiple
 	 * callbacks, the hdr is not anonymous.  If it were anonymous,
 	 * we couldn't use arc_buf_destroy() in the error case below.
 	 */
 	ASSERT(callback_cnt < 2 || hash_lock != NULL);
 
 	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 	} else {
 		/* Failed read: flag the hdr and take it out of the cache. */
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 		if (hdr->b_l1hdr.b_state != arc_anon)
 			arc_change_state(arc_anon, hdr);
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
 	}
 
 	/* The I/O is over; drop the reference tagged with the hdr itself. */
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 	(void) remove_reference(hdr, hdr);
 
 	if (hash_lock != NULL)
 		mutex_exit(hash_lock);
 
 	/*
 	 * execute each callback and free its structure
 	 *
 	 * callback_list now points at the last record of the list, and the
 	 * walk follows acb_prev.
 	 */
 	while ((acb = callback_list) != NULL) {
 		if (acb->acb_done != NULL) {
 			if (zio->io_error != 0 && acb->acb_buf != NULL) {
 				/*
 				 * If arc_buf_alloc_impl() fails during
 				 * decompression, the buf will still be
 				 * allocated, and needs to be freed here.
 				 */
 				arc_buf_destroy(acb->acb_buf,
 				    acb->acb_private);
 				acb->acb_buf = NULL;
 			}
 			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
 			    acb->acb_buf, acb->acb_private);
 		}
 
 		/* Pass the result on to the dummy zio, if there is one. */
 		if (acb->acb_zio_dummy != NULL) {
 			acb->acb_zio_dummy->io_error = zio->io_error;
 			zio_nowait(acb->acb_zio_dummy);
 		}
 
 		/* Advance before the record may be handed off or freed. */
 		callback_list = acb->acb_prev;
 		if (acb->acb_wait) {
 			mutex_enter(&acb->acb_wait_lock);
 			acb->acb_wait_error = zio->io_error;
 			acb->acb_wait = B_FALSE;
 			cv_signal(&acb->acb_wait_cv);
 			mutex_exit(&acb->acb_wait_lock);
 			/* acb will be freed by the waiting thread. */
 		} else {
 			kmem_free(acb, sizeof (arc_callback_t));
 		}
 	}
 }
 
 /*
  * Lookup the block at the specified DVA (in bp), and return the manner in
  * which the block is cached. A zero return indicates not cached.
  *
  * An embedded bp yields ARC_CACHED_EMBEDDED.  Otherwise the result is a
  * combination of ARC_CACHED_IN_L1 (optionally with ARC_CACHED_IN_MRU or
  * ARC_CACHED_IN_MFU) and ARC_CACHED_IN_L2.
  */
 int
 arc_cached(spa_t *spa, const blkptr_t *bp)
 {
 	uint64_t guid = spa_load_guid(spa);
 	kmutex_t *hash_lock = NULL;
 	arc_buf_hdr_t *hdr;
 	int flags;
 
 	if (BP_IS_EMBEDDED(bp))
 		return (ARC_CACHED_EMBEDDED);
 
 	/* On a successful lookup the hash lock is held until we are done. */
 	hdr = buf_hash_find(guid, bp, &hash_lock);
 	if (hdr == NULL)
 		return (0);
 
 	flags = HDR_HAS_L2HDR(hdr) ? ARC_CACHED_IN_L2 : 0;
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		/*
 		 * We switch to ensure that any future arc_state_type_t
 		 * changes are handled. This is just a shift to promote
 		 * more compile-time checking.
 		 */
 		switch (hdr->b_l1hdr.b_state->arcs_state) {
 		case ARC_STATE_MRU:
 			flags |= ARC_CACHED_IN_L1 | ARC_CACHED_IN_MRU;
 			break;
 		case ARC_STATE_MFU:
 			flags |= ARC_CACHED_IN_L1 | ARC_CACHED_IN_MFU;
 			break;
 		case ARC_STATE_UNCACHED:
 			/* The header is still in L1, probably not for long */
 			flags |= ARC_CACHED_IN_L1;
 			break;
 		case ARC_STATE_ANON:
 		default:
 			/* Every other state contributes no L1 flags. */
 			break;
 		}
 	}
 
 	mutex_exit(hash_lock);
 
 	return (flags);
 }
 
 /*
  * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
  * required.  If the block is not in the cache pass the read request
  * on to the spa with a substitute callback function, so that the
  * requested block will be added to the cache.
  *
  * If a read request arrives for a block that has a read in-progress,
  * either wait for the in-progress read to complete (and return the
  * results); or, if this is a read with a "done" func, add a record
  * to the read to invoke the "done" func when the read completes,
  * and return; or just return.
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
  *
  * *arc_flags is both read and written: the request flags (for example
  * ARC_FLAG_WAIT, ARC_FLAG_NOWAIT, ARC_FLAG_CACHED_ONLY, ARC_FLAG_NO_BUF,
  * ARC_FLAG_UNCACHED, ARC_FLAG_PREFETCH) are read from it, and
  * ARC_FLAG_CACHED is set in it when the request is satisfied from the
  * cache without waiting for I/O.
  *
  * Returns 0 or an errno value.  Errors generated directly here are
  * ENOENT (ARC_FLAG_CACHED_ONLY was requested but the data is not
  * immediately available), ECKSUM (the block pointer failed
  * zfs_blkptr_verify()) and EIO (an ECKSUM from arc_buf_alloc_impl() on a
  * cache hit is converted before leaving the ARC).  With ARC_FLAG_WAIT the
  * result of the I/O that was waited on is returned as well.
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_read_done_func_t *done, void *private, zio_priority_t priority,
     int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = NULL;
 	kmutex_t *hash_lock = NULL;
 	zio_t *rzio;
 	uint64_t guid = spa_load_guid(spa);
 	boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
 	boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
 	boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
 	arc_buf_t *buf = NULL;
 	int rc = 0;
 	/* Set once the bp has been verified, so a restart skips the check. */
 	boolean_t bp_validation = B_FALSE;
 
 	ASSERT(!embedded_bp ||
 	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	/*
 	 * Normally SPL_FSTRANS will already be set since kernel threads which
 	 * expect to call the DMU interfaces will set it when created.  System
 	 * calls are similarly handled by setting/cleaning the bit in the
 	 * registered callback (module/os/.../zfs/zpl_*).
 	 *
 	 * External consumers such as Lustre which call the exported DMU
 	 * interfaces may not have set SPL_FSTRANS.  To avoid a deadlock
 	 * on the hash_lock always set and clear the bit.
 	 */
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 top:
 	if (!embedded_bp) {
 		/*
 		 * Only non-embedded BPs are looked up in the hash table.
 		 * Embedded BP's have no DVA and require no I/O to "read";
 		 * for them hdr stays NULL here and the miss path below
 		 * creates an anonymous arc hdr to back the data.
 		 */
 		hdr = buf_hash_find(guid, bp, &hash_lock);
 	}
 
 	/*
 	 * Determine if we have an L1 cache hit or a cache miss. For simplicity
 	 * we maintain encrypted data separately from compressed / uncompressed
 	 * data. If the user is requesting raw encrypted data and we don't have
 	 * that in the header we will read from disk to guarantee that we can
 	 * get it even if the encryption keys aren't loaded.
 	 */
 	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
 	    (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
 		boolean_t is_data = !HDR_ISTYPE_METADATA(hdr);
 
 		/*
 		 * Verify the block pointer contents are reasonable.  This
 		 * should always be the case since the blkptr is protected by
 		 * a checksum.
 		 */
 		if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP,
 		    BLK_VERIFY_LOG)) {
 			mutex_exit(hash_lock);
 			rc = SET_ERROR(ECKSUM);
 			goto done;
 		}
 
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 			if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 				mutex_exit(hash_lock);
 				ARCSTAT_BUMP(arcstat_cached_only_in_progress);
 				rc = SET_ERROR(ENOENT);
 				goto done;
 			}
 
 			zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
 			ASSERT3P(head_zio, !=, NULL);
 			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
 			    priority == ZIO_PRIORITY_SYNC_READ) {
 				/*
 				 * This is a sync read that needs to wait for
 				 * an in-flight async read. Request that the
 				 * zio have its priority upgraded.
 				 */
 				zio_change_priority(head_zio, priority);
 				DTRACE_PROBE1(arc__async__upgrade__sync,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_async_upgrade_sync);
 			}
 
 			DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr);
 			arc_access(hdr, *arc_flags, B_FALSE);
 
 			/*
 			 * If there are multiple threads reading the same block
 			 * and that block is not yet in the ARC, then only one
 			 * thread will do the physical I/O and all other
 			 * threads will wait until that I/O completes.
 			 * Synchronous reads use the acb_wait_cv whereas nowait
 			 * reads register a callback. Both are signalled/called
 			 * in arc_read_done.
 			 *
 			 * Errors of the physical I/O may need to be propagated.
 			 * Synchronous read errors are returned here from
 			 * arc_read_done via acb_wait_error.  Nowait reads
 			 * attach the acb_zio_dummy zio to pio and
 			 * arc_read_done propagates the physical I/O's io_error
 			 * to acb_zio_dummy, and thereby to pio.
 			 */
 			arc_callback_t *acb = NULL;
 			if (done || pio || *arc_flags & ARC_FLAG_WAIT) {
 				acb = kmem_zalloc(sizeof (arc_callback_t),
 				    KM_SLEEP);
 				acb->acb_done = done;
 				acb->acb_private = private;
 				acb->acb_compressed = compressed_read;
 				acb->acb_encrypted = encrypted_read;
 				acb->acb_noauth = noauth_read;
 				acb->acb_nobuf = no_buf;
 				if (*arc_flags & ARC_FLAG_WAIT) {
 					acb->acb_wait = B_TRUE;
 					mutex_init(&acb->acb_wait_lock, NULL,
 					    MUTEX_DEFAULT, NULL);
 					cv_init(&acb->acb_wait_cv, NULL,
 					    CV_DEFAULT, NULL);
 				}
 				acb->acb_zb = *zb;
 				if (pio != NULL) {
 					acb->acb_zio_dummy = zio_null(pio,
 					    spa, NULL, NULL, NULL, zio_flags);
 				}
 				/* Link the new acb at the head of the list. */
 				acb->acb_zio_head = head_zio;
 				acb->acb_next = hdr->b_l1hdr.b_acb;
 				hdr->b_l1hdr.b_acb->acb_prev = acb;
 				hdr->b_l1hdr.b_acb = acb;
 			}
 			mutex_exit(hash_lock);
 
 			ARCSTAT_BUMP(arcstat_iohits);
 			ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 			    demand, prefetch, is_data, data, metadata, iohits);
 
 			if (*arc_flags & ARC_FLAG_WAIT) {
 				/*
 				 * arc_read_done() clears acb_wait and leaves
 				 * the acb for this thread to tear down.
 				 */
 				mutex_enter(&acb->acb_wait_lock);
 				while (acb->acb_wait) {
 					cv_wait(&acb->acb_wait_cv,
 					    &acb->acb_wait_lock);
 				}
 				rc = acb->acb_wait_error;
 				mutex_exit(&acb->acb_wait_lock);
 				mutex_destroy(&acb->acb_wait_lock);
 				cv_destroy(&acb->acb_wait_cv);
 				kmem_free(acb, sizeof (arc_callback_t));
 			}
 			goto out;
 		}
 
 		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 		    hdr->b_l1hdr.b_state == arc_mfu ||
 		    hdr->b_l1hdr.b_state == arc_uncached);
 
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, *arc_flags, B_TRUE);
 
 		if (done && !no_buf) {
 			ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
 
 			/* Get a buf with the desired data in it. */
 			rc = arc_buf_alloc_impl(hdr, spa, zb, private,
 			    encrypted_read, compressed_read, noauth_read,
 			    B_TRUE, &buf);
 			if (rc == ECKSUM) {
 				/*
 				 * Convert authentication and decryption errors
 				 * to EIO (and generate an ereport if needed)
 				 * before leaving the ARC.
 				 */
 				rc = SET_ERROR(EIO);
 				if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 					spa_log_error(spa, zb, hdr->b_birth);
 					(void) zfs_ereport_post(
 					    FM_EREPORT_ZFS_AUTHENTICATION,
 					    spa, NULL, zb, NULL, 0);
 				}
 			}
 			if (rc != 0) {
 				arc_buf_destroy_impl(buf);
 				buf = NULL;
 				(void) remove_reference(hdr, private);
 			}
 
 			/* assert any errors weren't due to unloaded keys */
 			ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
 			    rc != EACCES);
 		}
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
 		ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 		    demand, prefetch, is_data, data, metadata, hits);
 		*arc_flags |= ARC_FLAG_CACHED;
 		goto done;
 	} else {
 		uint64_t lsize = BP_GET_LSIZE(bp);
 		uint64_t psize = BP_GET_PSIZE(bp);
 		arc_callback_t *acb;
 		vdev_t *vd = NULL;
 		uint64_t addr = 0;
 		boolean_t devw = B_FALSE;
 		uint64_t size;
 		abd_t *hdr_abd;
 		int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
 		arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
 		int config_lock;
 		int error;
 
 		if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 			rc = SET_ERROR(ENOENT);
 			goto done;
 		}
 
 		if (zio_flags & ZIO_FLAG_CONFIG_WRITER) {
 			config_lock = BLK_CONFIG_HELD;
 		} else if (hash_lock != NULL) {
 			/*
 			 * Prevent lock order reversal
 			 */
 			config_lock = BLK_CONFIG_NEEDED_TRY;
 		} else {
 			config_lock = BLK_CONFIG_NEEDED;
 		}
 
 		/*
 		 * Verify the block pointer contents are reasonable.  This
 		 * should always be the case since the blkptr is protected by
 		 * a checksum.
 		 */
 		if (!bp_validation && (error = zfs_blkptr_verify(spa, bp,
 		    config_lock, BLK_VERIFY_LOG))) {
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 			/*
 			 * On EBUSY, verify again with BLK_CONFIG_NEEDED now
 			 * that the hash lock is dropped.  If that passes,
 			 * remember it and restart the lookup.
 			 */
 			if (error == EBUSY && !zfs_blkptr_verify(spa, bp,
 			    BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 				bp_validation = B_TRUE;
 				goto top;
 			}
 			rc = SET_ERROR(ECKSUM);
 			goto done;
 		}
 
 		if (hdr == NULL) {
 			/*
 			 * This block is not in the cache or it has
 			 * embedded data.
 			 */
 			arc_buf_hdr_t *exists = NULL;
 			hdr = arc_hdr_alloc(guid, psize, lsize,
 			    BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
 
 			if (!embedded_bp) {
 				hdr->b_dva = *BP_IDENTITY(bp);
 				hdr->b_birth = BP_GET_BIRTH(bp);
 				exists = buf_hash_insert(hdr, &hash_lock);
 			}
 			if (exists != NULL) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
 				buf_discard_identity(hdr);
 				arc_hdr_destroy(hdr);
 				goto top; /* restart the IO request */
 			}
 		} else {
 			/*
 			 * This block is in the ghost cache or encrypted data
 			 * was requested and we didn't have it. If it was
 			 * L2-only (and thus didn't have an L1 hdr),
 			 * we realloc the header to add an L1 hdr.
 			 */
 			if (!HDR_HAS_L1HDR(hdr)) {
 				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
 				    hdr_full_cache);
 			}
 
 			if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
 				ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 				ASSERT(!HDR_HAS_RABD(hdr));
 				ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 				ASSERT0(zfs_refcount_count(
 				    &hdr->b_l1hdr.b_refcnt));
 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 #ifdef ZFS_DEBUG
 				ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 			} else if (HDR_IO_IN_PROGRESS(hdr)) {
 				/*
 				 * If this header already had an IO in progress
 				 * and we are performing another IO to fetch
 				 * encrypted data we must wait until the first
 				 * IO completes so as not to confuse
 				 * arc_read_done(). This should be very rare
 				 * and so the performance impact shouldn't
 				 * matter.
 				 */
 				arc_callback_t *acb = kmem_zalloc(
 				    sizeof (arc_callback_t), KM_SLEEP);
 				acb->acb_wait = B_TRUE;
 				mutex_init(&acb->acb_wait_lock, NULL,
 				    MUTEX_DEFAULT, NULL);
 				cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT,
 				    NULL);
 				acb->acb_zio_head =
 				    hdr->b_l1hdr.b_acb->acb_zio_head;
 				acb->acb_next = hdr->b_l1hdr.b_acb;
 				hdr->b_l1hdr.b_acb->acb_prev = acb;
 				hdr->b_l1hdr.b_acb = acb;
 				mutex_exit(hash_lock);
 				mutex_enter(&acb->acb_wait_lock);
 				while (acb->acb_wait) {
 					cv_wait(&acb->acb_wait_cv,
 					    &acb->acb_wait_lock);
 				}
 				mutex_exit(&acb->acb_wait_lock);
 				mutex_destroy(&acb->acb_wait_lock);
 				cv_destroy(&acb->acb_wait_cv);
 				kmem_free(acb, sizeof (arc_callback_t));
 				goto top;
 			}
 		}
 		if (*arc_flags & ARC_FLAG_UNCACHED) {
 			arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
 			if (!encrypted_read)
 				alloc_flags |= ARC_HDR_ALLOC_LINEAR;
 		}
 
 		/*
 		 * Take additional reference for IO_IN_PROGRESS.  It stops
 		 * arc_access() from putting this header without any buffers
 		 * and so other references but obviously nonevictable onto
 		 * the evictable list of MRU or MFU state.
 		 */
 		add_reference(hdr, hdr);
 		if (!embedded_bp)
 			arc_access(hdr, *arc_flags, B_FALSE);
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		arc_hdr_alloc_abd(hdr, alloc_flags);
 		if (encrypted_read) {
 			ASSERT(HDR_HAS_RABD(hdr));
 			size = HDR_GET_PSIZE(hdr);
 			hdr_abd = hdr->b_crypt_hdr.b_rabd;
 			zio_flags |= ZIO_FLAG_RAW;
 		} else {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			size = arc_hdr_size(hdr);
 			hdr_abd = hdr->b_l1hdr.b_pabd;
 
 			if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
 				zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 			}
 
 			/*
 			 * For authenticated bp's, we do not ask the ZIO layer
 			 * to authenticate them since this will cause the entire
 			 * IO to fail if the key isn't loaded. Instead, we
 			 * defer authentication until arc_buf_fill(), which will
 			 * verify the data when the key is available.
 			 */
 			if (BP_IS_AUTHENTICATED(bp))
 				zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
 		}
 
 		if (BP_IS_AUTHENTICATED(bp))
 			arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
 		if (BP_GET_LEVEL(bp) > 0)
 			arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
 		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
 
 		/* Record this caller as the first (and only) acb on the hdr. */
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
 		acb->acb_done = done;
 		acb->acb_private = private;
 		acb->acb_compressed = compressed_read;
 		acb->acb_encrypted = encrypted_read;
 		acb->acb_noauth = noauth_read;
 		acb->acb_nobuf = no_buf;
 		acb->acb_zb = *zb;
 
 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 		hdr->b_l1hdr.b_acb = acb;
 
 		if (HDR_HAS_L2HDR(hdr) &&
 		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
 			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
 			addr = hdr->b_l2hdr.b_daddr;
 			/*
 			 * Lock out L2ARC device removal.
 			 */
 			if (vdev_is_dead(vd) ||
 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
 				vd = NULL;
 		}
 
 		/*
 		 * We count both async reads and scrub IOs as asynchronous so
 		 * that both can be upgraded in the event of a cache hit while
 		 * the read IO is still in-flight.
 		 */
 		if (priority == ZIO_PRIORITY_ASYNC_READ ||
 		    priority == ZIO_PRIORITY_SCRUB)
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 		else
 			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 
 		/*
 		 * At this point, we have a level 1 cache miss or a blkptr
 		 * with embedded data.  Try again in L2ARC if possible.
 		 */
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
 
 		/*
 		 * Skip ARC stat bump for block pointers with embedded
 		 * data. The data are read from the blkptr itself via
 		 * decode_embedded_bp_compressed().
 		 */
 		if (!embedded_bp) {
 			DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
 			    blkptr_t *, bp, uint64_t, lsize,
 			    zbookmark_phys_t *, zb);
 			ARCSTAT_BUMP(arcstat_misses);
 			ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 			    demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
 			    metadata, misses);
 			zfs_racct_read(spa, size, 1, 0);
 		}
 
 		/* Check if the spa even has l2 configured */
 		const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
 		    spa->spa_l2cache.sav_count > 0;
 
 		if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
 			/*
 			 * Read from the L2ARC if the following are true:
 			 * 1. The L2ARC vdev was previously cached.
 			 * 2. This buffer still has L2ARC metadata.
 			 * 3. This buffer isn't currently writing to the L2ARC.
 			 * 4. The L2ARC entry wasn't evicted, which may
 			 *    also have invalidated the vdev.
 			 */
 			if (HDR_HAS_L2HDR(hdr) &&
 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
 				l2arc_read_callback_t *cb;
 				abd_t *abd;
 				uint64_t asize;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
 				hdr->b_l2hdr.b_hits++;
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
 				    KM_SLEEP);
 				cb->l2rcb_hdr = hdr;
 				cb->l2rcb_bp = *bp;
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
 
 				/*
 				 * When Compressed ARC is disabled, but the
 				 * L2ARC block is compressed, arc_hdr_size()
 				 * will have returned LSIZE rather than PSIZE.
 				 */
 				if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 				    !HDR_COMPRESSION_ENABLED(hdr) &&
 				    HDR_GET_PSIZE(hdr) != 0) {
 					size = HDR_GET_PSIZE(hdr);
 				}
 
 				/*
 				 * If the device's allocated size differs from
 				 * size, read into a separate abd of asize
 				 * that is recorded in the callback.
 				 */
 				asize = vdev_psize_to_asize(vd, size);
 				if (asize != size) {
 					abd = abd_alloc_for_io(asize,
 					    HDR_ISTYPE_METADATA(hdr));
 					cb->l2rcb_abd = abd;
 				} else {
 					abd = hdr_abd;
 				}
 
 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
 				    addr + asize <= vd->vdev_psize -
 				    VDEV_LABEL_END_SIZE);
 
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
 				 * Issue a null zio if the underlying buffer
 				 * was squashed to zero size by compression.
 				 */
 				ASSERT3U(arc_hdr_get_compress(hdr), !=,
 				    ZIO_COMPRESS_EMPTY);
 				rzio = zio_read_phys(pio, vd, addr,
 				    asize, abd,
 				    ZIO_CHECKSUM_OFF,
 				    l2arc_read_done, cb, priority,
 				    zio_flags | ZIO_FLAG_CANFAIL |
 				    ZIO_FLAG_DONT_PROPAGATE |
 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
 				acb->acb_zio_head = rzio;
 
 				if (hash_lock != NULL)
 					mutex_exit(hash_lock);
 
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
 				ARCSTAT_INCR(arcstat_l2_read_bytes,
 				    HDR_GET_PSIZE(hdr));
 
 				if (*arc_flags & ARC_FLAG_NOWAIT) {
 					zio_nowait(rzio);
 					goto out;
 				}
 
 				ASSERT(*arc_flags & ARC_FLAG_WAIT);
 				if (zio_wait(rzio) == 0)
 					goto out;
 
 				/* l2arc read error; goto zio_read() */
 				if (hash_lock != NULL)
 					mutex_enter(hash_lock);
 			} else {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 				if (HDR_L2_WRITING(hdr))
 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
 				spa_config_exit(spa, SCL_L2ARC, vd);
 			}
 		} else {
 			if (vd != NULL)
 				spa_config_exit(spa, SCL_L2ARC, vd);
 
 			/*
 			 * Only a spa with l2 should contribute to l2
 			 * miss stats.  (Including the case of having a
 			 * faulted cache device - that's also a miss.)
 			 */
 			if (spa_has_l2) {
 				/*
 				 * Skip ARC stat bump for block pointers with
 				 * embedded data. The data are read from the
 				 * blkptr itself via
 				 * decode_embedded_bp_compressed().
 				 */
 				if (!embedded_bp) {
 					DTRACE_PROBE1(l2arc__miss,
 					    arc_buf_hdr_t *, hdr);
 					ARCSTAT_BUMP(arcstat_l2_misses);
 				}
 			}
 		}
 
 		/* Read from the pool; arc_read_done() completes the hdr. */
 		rzio = zio_read(pio, spa, bp, hdr_abd, size,
 		    arc_read_done, hdr, priority, zio_flags, zb);
 		acb->acb_zio_head = rzio;
 
 		if (hash_lock != NULL)
 			mutex_exit(hash_lock);
 
 		if (*arc_flags & ARC_FLAG_WAIT) {
 			rc = zio_wait(rzio);
 			goto out;
 		}
 
 		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 		zio_nowait(rzio);
 	}
 
 	/* Common exit: record read history and undo spl_fstrans_mark(). */
 out:
 	/* embedded bps don't actually go to disk */
 	if (!embedded_bp)
 		spa_read_history_add(spa, zb, *arc_flags);
 	spl_fstrans_unmark(cookie);
 	return (rc);
 
 	/*
 	 * Reached when the request finished without issuing any I/O (cache
 	 * hit or early failure).  The callback is invoked directly with a
 	 * NULL zio, and a failure is handed to the parent zio, if any,
 	 * through a null child zio carrying the error.
 	 */
 done:
 	if (done)
 		done(NULL, zb, bp, buf, private);
 	if (pio && rc != 0) {
 		zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags);
 		zio->io_error = rc;
 		zio_nowait(zio);
 	}
 	goto out;
 }
 
 /*
  * Register func (with its private argument) as a prune callback.
  *
  * A new arc_prune_t is allocated, given one reference tagged with
  * &arc_prune_list and placed at the head of arc_prune_list under
  * arc_prune_mtx.  The new entry is returned to the caller.
  */
 arc_prune_t *
 arc_add_prune_callback(arc_prune_func_t *func, void *private)
 {
 	arc_prune_t *entry = kmem_alloc(sizeof (arc_prune_t), KM_SLEEP);
 
 	entry->p_pfunc = func;
 	entry->p_private = private;
 	list_link_init(&entry->p_node);
 	zfs_refcount_create(&entry->p_refcnt);
 
 	/* The list's own reference is taken while holding the list lock. */
 	mutex_enter(&arc_prune_mtx);
 	zfs_refcount_add(&entry->p_refcnt, &arc_prune_list);
 	list_insert_head(&arc_prune_list, entry);
 	mutex_exit(&arc_prune_mtx);
 
 	return (entry);
 }
 
 /*
  * Unregister and free a prune callback entry.
  *
  * The entry is unlinked from arc_prune_list and the list's reference is
  * dropped under arc_prune_mtx.  If references remain after that, wait on
  * arc_prune_taskq before destroying the refcount and freeing the entry.
  */
 void
 arc_remove_prune_callback(arc_prune_t *p)
 {
 	boolean_t still_held;
 
 	mutex_enter(&arc_prune_mtx);
 	list_remove(&arc_prune_list, p);
 	still_held =
 	    (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0);
 	mutex_exit(&arc_prune_mtx);
 
 	/* wait for arc_prune_task to finish */
 	if (still_held)
 		taskq_wait_outstanding(arc_prune_taskq, 0);
 
 	ASSERT0(zfs_refcount_count(&p->p_refcnt));
 	zfs_refcount_destroy(&p->p_refcnt);
 	kmem_free(p, sizeof (*p));
 }
 
 /*
  * Helper function for arc_prune_async() it is responsible for safely
  * handling the execution of a registered arc_prune_func_t.
  */
 static void
 arc_prune_task(void *ptr)
 {
 	arc_prune_t *ap = (arc_prune_t *)ptr;
 	arc_prune_func_t *func = ap->p_pfunc;
 
 	if (func != NULL)
 		func(ap->p_adjust, ap->p_private);
 
 	(void) zfs_refcount_remove(&ap->p_refcnt, func);
 }
 
 /*
  * Ask every registered consumer to drop holds on a portion of the ARC
  * buffers it references, so the ARC can honor the metadata limit and
  * reclaim otherwise pinned buffers.
  *
  * The work is dispatched to arc_prune_taskq rather than run inline, so it
  * may be safely called in the context of the arc_reclaim_thread().  For
  * each arc_prune_t that is dispatched a reference is taken here, and
  * arc_prune_task() releases it once the registered arc_prune_func_t has
  * completed.  Entries whose refcount is already 2 or more are skipped.
  */
 static void
 arc_prune_async(uint64_t adjust)
 {
 	arc_prune_t *entry;
 
 	mutex_enter(&arc_prune_mtx);
 	for (entry = list_head(&arc_prune_list); entry != NULL;
 	    entry = list_next(&arc_prune_list, entry)) {
 
 		if (zfs_refcount_count(&entry->p_refcnt) < 2) {
 			zfs_refcount_add(&entry->p_refcnt, entry->p_pfunc);
 			entry->p_adjust = adjust;
 
 			if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
 			    entry, TQ_SLEEP) != TASKQID_INVALID) {
 				ARCSTAT_BUMP(arcstat_prune);
 			} else {
 				/* Dispatch failed: give the reference back. */
 				(void) zfs_refcount_remove(&entry->p_refcnt,
 				    entry->p_pfunc);
 			}
 		}
 	}
 	mutex_exit(&arc_prune_mtx);
 }
 
 /*
  * Notify the arc that a block was freed, and thus will never be used again.
  * The bp must not be an embedded block pointer (asserted).  If no header
  * is found for the block there is nothing to do.
  */
 void
 arc_freed(spa_t *spa, const blkptr_t *bp)
 {
 	kmutex_t *lock;
 	arc_buf_hdr_t *found;
 	uint64_t load_guid = spa_load_guid(spa);
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	found = buf_hash_find(load_guid, bp, &lock);
 	if (found == NULL)
 		return;
 
 	/*
 	 * We might be trying to free a block that is still doing I/O
 	 * (i.e. prefetch) or has some other reference (i.e. a dedup-ed,
 	 * dmu_sync-ed block). A block may also have a reference if it is
 	 * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
 	 * have written the new block to its final resting place on disk but
 	 * without the dedup flag set. This would have left the hdr in the MRU
 	 * state and discoverable. When the txg finally syncs it detects that
 	 * the block was overridden in open context and issues an override I/O.
 	 * Since this is a dedup block, the override I/O will determine if the
 	 * block is already in the DDT. If so, then it will replace the io_bp
 	 * with the bp from the DDT and allow the I/O to finish. When the I/O
 	 * reaches the done callback, dbuf_write_override_done, it will
 	 * check to see if the io_bp and io_bp_override are identical.
 	 * If they are not, then it indicates that the bp was replaced with
 	 * the bp in the DDT and the override bp is freed. This allows
 	 * us to arrive here with a reference on a block that is being
 	 * freed. So if we have an I/O in progress, or a reference to
 	 * this hdr, then we don't destroy the hdr.
 	 */
 	if (!HDR_HAS_L1HDR(found) ||
 	    zfs_refcount_is_zero(&found->b_l1hdr.b_refcnt)) {
 		arc_change_state(arc_anon, found);
 		arc_hdr_destroy(found);
 	}
 
 	/* The hash lock is dropped whether or not the hdr was destroyed. */
 	mutex_exit(lock);
 }
 
 /*
  * Release this buffer from the cache, making it an anonymous buffer.  This
  * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  *
  * buf is the buffer to release; its header must have an L1 header
  * (asserted).  tag is the reference tag: when buf is split off onto a new
  * header, the reference held with tag is removed from the old header and
  * added to the new one.
  *
  * Three cases are handled:
  *  - the header is already in arc_anon: only its identity and access
  *    time are reset and the buffer is thawed;
  *  - the header has other buffers: buf is detached and attached to a
  *    freshly allocated header;
  *  - buf is the header's only buffer: the header itself is moved to
  *    arc_anon.
  */
 void
 arc_release(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * It would be nice to assert that if its DMU metadata (level >
 	 * 0 || it's the dnode file), then it must be syncing context.
 	 * But we don't know that information at this level.
 	 */
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	/*
 	 * We don't grab the hash lock prior to this check, because if
 	 * the buffer's header is in the arc_anon state, it won't be
 	 * linked into the hash table.
 	 */
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
 		ASSERT(!HDR_HAS_L2HDR(hdr));
 
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
 		ASSERT(ARC_BUF_LAST(buf));
 		ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
 		hdr->b_l1hdr.b_arc_access = 0;
 
 		/*
 		 * If the buf is being overridden then it may already
 		 * have a hdr that is not empty.
 		 */
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 
 		return;
 	}
 
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	/*
 	 * This assignment is only valid as long as the hash_lock is
 	 * held, we must be careful not to reference state or the
 	 * b_state field after dropping the lock.
 	 */
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT3P(state, !=, arc_anon);
 	ASSERT3P(state, !=, arc_l2c_only);
 
 	/* this buffer is not on any list */
 	ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
 
 	/*
 	 * Do we have more than one buf?
 	 */
 	if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) {
 		arc_buf_hdr_t *nhdr;
 		/*
 		 * Capture what the replacement header needs while the hash
 		 * lock is still held; they are used after it is dropped.
 		 */
 		uint64_t spa = hdr->b_spa;
 		uint64_t psize = HDR_GET_PSIZE(hdr);
 		uint64_t lsize = HDR_GET_LSIZE(hdr);
 		boolean_t protected = HDR_PROTECTED(hdr);
 		enum zio_compress compress = arc_hdr_get_compress(hdr);
 		arc_buf_contents_t type = arc_buf_type(hdr);
 
 		if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			ASSERT(ARC_BUF_LAST(buf));
 		}
 
 		/*
 		 * Pull the buffer off of this hdr and find the last buffer
 		 * in the hdr's buffer list.
 		 */
 		VERIFY3S(remove_reference(hdr, tag), >, 0);
 		arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
 		ASSERT3P(lastbuf, !=, NULL);
 
 		/*
 		 * If the current arc_buf_t and the hdr are sharing their data
 		 * buffer, then we must stop sharing that block.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			ASSERT(!arc_buf_is_shared(lastbuf));
 
 			/*
 			 * First, sever the block sharing relationship between
 			 * buf and the arc_buf_hdr_t.
 			 */
 			arc_unshare_buf(hdr, buf);
 
 			/*
 			 * Now we need to recreate the hdr's b_pabd. Since we
 			 * have lastbuf handy, we try to share with it, but if
 			 * we can't then we allocate a new b_pabd and copy the
 			 * data from buf into it.
 			 */
 			if (arc_can_share(hdr, lastbuf)) {
 				arc_share_buf(hdr, lastbuf);
 			} else {
 				arc_hdr_alloc_abd(hdr, 0);
 				abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
 				    buf->b_data, psize);
 			}
 		} else if (HDR_SHARED_DATA(hdr)) {
 			/*
 			 * Uncompressed shared buffers are always at the end
 			 * of the list. Compressed buffers don't have the
 			 * same requirements. This makes it hard to
 			 * simply assert that the lastbuf is shared so
 			 * we rely on the hdr's compression flags to determine
 			 * if we have a compressed, shared buffer.
 			 */
 			ASSERT(arc_buf_is_shared(lastbuf) ||
 			    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 			ASSERT(!arc_buf_is_shared(buf));
 		}
 
 		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 
 		/* Stop accounting buf's size against the old state. */
 		(void) zfs_refcount_remove_many(&state->arcs_size[type],
 		    arc_buf_size(buf), buf);
 
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
 		/* if this is the last uncompressed buf free the checksum */
 		if (!arc_hdr_has_uncompressed_buf(hdr))
 			arc_cksum_free(hdr);
 
 		mutex_exit(hash_lock);
 
 		/*
 		 * The hash lock has been dropped; from here on only the
 		 * values captured above, the new header and buf are used
 		 * (apart from reading hdr->b_complevel for the allocation).
 		 */
 		nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
 		    compress, hdr->b_complevel, type);
 		ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
 		VERIFY3U(nhdr->b_type, ==, type);
 		ASSERT(!HDR_SHARED_DATA(nhdr));
 
 		/* Attach buf to the new header and re-take tag's reference. */
 		nhdr->b_l1hdr.b_buf = buf;
 		(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
 		buf->b_hdr = nhdr;
 
 		(void) zfs_refcount_add_many(&arc_anon->arcs_size[type],
 		    arc_buf_size(buf), buf);
 	} else {
 		ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
 		/* protected by hash lock, or hdr is on arc_anon */
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 
 		if (HDR_HAS_L2HDR(hdr)) {
 			mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 			/* Recheck to prevent race with l2arc_evict(). */
 			if (HDR_HAS_L2HDR(hdr))
 				arc_hdr_l2hdr_destroy(hdr);
 			mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 		}
 
 		/* Reset the hit counters before the hdr becomes anonymous. */
 		hdr->b_l1hdr.b_mru_hits = 0;
 		hdr->b_l1hdr.b_mru_ghost_hits = 0;
 		hdr->b_l1hdr.b_mfu_hits = 0;
 		hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 		arc_change_state(arc_anon, hdr);
 		hdr->b_l1hdr.b_arc_access = 0;
 
 		mutex_exit(hash_lock);
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
 }
 
 /*
  * Return nonzero when buf has data attached and its header is in the
  * arc_anon state, zero otherwise.
  */
 int
 arc_released(arc_buf_t *buf)
 {
 	if (buf->b_data == NULL)
 		return (0);
 
 	return (buf->b_hdr->b_l1hdr.b_state == arc_anon);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only helper: return the current reference count of the header
  * that buf belongs to.
  */
 int
 arc_referenced(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *owner = buf->b_hdr;
 
 	return (zfs_refcount_count(&owner->b_l1hdr.b_refcnt));
 }
 #endif
 
 static void
 arc_write_ready(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
 	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 
 	/*
 	 * If we're reexecuting this zio because the pool suspended, then
 	 * cleanup any state that was previously set the first time the
 	 * callback was invoked.
 	 */
 	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			if (ARC_BUF_SHARED(buf)) {
 				arc_unshare_buf(hdr, buf);
 			} else {
 				ASSERT(!arc_buf_is_shared(buf));
 				arc_hdr_free_abd(hdr, B_FALSE);
 			}
 		}
 
 		if (HDR_HAS_RABD(hdr))
 			arc_hdr_free_abd(hdr, B_TRUE);
 	}
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!HDR_HAS_RABD(hdr));
 	ASSERT(!HDR_SHARED_DATA(hdr));
 	ASSERT(!arc_buf_is_shared(buf));
 
 	callback->awcb_ready(zio, buf, callback->awcb_private);
 
 	if (HDR_IO_IN_PROGRESS(hdr)) {
 		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
 	} else {
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
 	}
 
 	if (BP_IS_PROTECTED(bp)) {
 		/* ZIL blocks are written through zio_rewrite */
 		ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
 
 		if (BP_SHOULD_BYTESWAP(bp)) {
 			if (BP_GET_LEVEL(bp) > 0) {
 				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
 			} else {
 				hdr->b_l1hdr.b_byteswap =
 				    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
 			}
 		} else {
 			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 		}
 
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
 		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
 		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv);
 		zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
 	} else {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED);
 	}
 
 	/*
 	 * If this block was written for raw encryption but the zio layer
 	 * ended up only authenticating it, adjust the buffer flags now.
 	 */
 	if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
 		arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
 		if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
 			buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 	} else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 	}
 
 	/* this must be done after the buffer flags are adjusted */
 	arc_cksum_compute(buf);
 
 	enum zio_compress compress;
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
 		compress = ZIO_COMPRESS_OFF;
 	} else {
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
 		compress = BP_GET_COMPRESS(bp);
 	}
 	HDR_SET_PSIZE(hdr, psize);
 	arc_hdr_set_compress(hdr, compress);
 	hdr->b_complevel = zio->io_prop.zp_complevel;
 
 	if (zio->io_error != 0 || psize == 0)
 		goto out;
 
 	/*
 	 * Fill the hdr with data. If the buffer is encrypted we have no choice
 	 * but to copy the data into b_rabd. If the hdr is compressed, the data
 	 * we want is available from the zio, otherwise we can take it from
 	 * the buf.
 	 *
 	 * We might be able to share the buf's data with the hdr here. However,
 	 * doing so would cause the ARC to be full of linear ABDs if we write a
 	 * lot of shareable data. As a compromise, we check whether scattered
 	 * ABDs are allowed, and assume that if they are then the user wants
 	 * the ARC to be primarily filled with them regardless of the data being
 	 * written. Therefore, if they're allowed then we allocate one and copy
 	 * the data into it; otherwise, we share the data directly if we can.
 	 */
 	if (ARC_BUF_ENCRYPTED(buf)) {
 		ASSERT3U(psize, >, 0);
 		ASSERT(ARC_BUF_COMPRESSED(buf));
 		arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
 		    ARC_HDR_USE_RESERVE);
 		abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
 	} else if (!(HDR_UNCACHED(hdr) ||
 	    abd_size_alloc_linear(arc_buf_size(buf))) ||
 	    !arc_can_share(hdr, buf)) {
 		/*
 		 * Ideally, we would always copy the io_abd into b_pabd, but the
 		 * user may have disabled compressed ARC, thus we must check the
 		 * hdr's compression setting rather than the io_bp's.
 		 */
 		if (BP_IS_ENCRYPTED(bp)) {
 			ASSERT3U(psize, >, 0);
 			arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
 			    ARC_HDR_USE_RESERVE);
 			abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
 		} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
 		    !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3U(psize, >, 0);
 			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
 			abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
 		} else {
 			ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
 			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
 			abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
 			    arc_buf_size(buf));
 		}
 	} else {
 		ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
 		ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
 		ASSERT(ARC_BUF_LAST(buf));
 
 		arc_share_buf(hdr, buf);
 	}
 
 out:
 	arc_hdr_verify(hdr, bp);
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * "Children ready" hook for a write zio.  The zio's io_private holds the
  * arc_write_callback_t; this simply forwards the event to the stored
  * awcb_children_ready callback, handing it the buffer being written and
  * the caller's private argument.
  */
 static void
 arc_write_children_ready(zio_t *zio)
 {
 	arc_write_callback_t *awcb = zio->io_private;
 
 	awcb->awcb_children_ready(zio, awcb->awcb_buf, awcb->awcb_private);
 }
 
 /*
  * Completion hook for a write zio whose io_private is an
  * arc_write_callback_t.
  *
  * On success the header takes the identity (DVA and birth) of the written
  * bp, or has its identity discarded when the bp is a hole or embedded.  A
  * header that ends up with an identity is inserted into the buf hash table.
  * In every case the IO_IN_PROGRESS flag is cleared and one reference tagged
  * with the header itself is dropped, the stored awcb_done callback is
  * invoked, and finally zio->io_abd and the callback record are freed.
  */
 static void
 arc_write_done(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 
 	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 
 		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
 			buf_discard_identity(hdr);
 		} else {
 			/* Adopt the identity of the block just written. */
 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
 			hdr->b_birth = BP_GET_BIRTH(zio->io_bp);
 		}
 	} else {
 		/* A failed write must leave the header without identity. */
 		ASSERT(HDR_EMPTY(hdr));
 	}
 
 	/*
 	 * If the block to be written was all-zero or compressed enough to be
 	 * embedded in the BP, no write was performed so there will be no
 	 * dva/birth/checksum.  The buffer must therefore remain anonymous
 	 * (and uncached).
 	 */
 	if (!HDR_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
 		ASSERT3U(zio->io_error, ==, 0);
 
 		arc_cksum_verify(buf);
 
 		/*
 		 * hash_lock is filled in by buf_hash_insert() and released
 		 * below with mutex_exit(); presumably it is returned held
 		 * whether or not a matching header already exists -- confirm
 		 * against buf_hash_insert().
 		 */
 		exists = buf_hash_insert(hdr, &hash_lock);
 		if (exists != NULL) {
 			/*
 			 * This can only happen if we overwrite for
 			 * sync-to-convergence, because we remove
 			 * buffers from the hash table when we arc_free().
 			 */
 			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad overwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 				ASSERT(zfs_refcount_is_zero(
 				    &exists->b_l1hdr.b_refcnt));
 				/*
 				 * Retire the old header, then retry the
 				 * insert, which must now succeed.
 				 */
 				arc_change_state(arc_anon, exists);
 				arc_hdr_destroy(exists);
 				mutex_exit(hash_lock);
 				exists = buf_hash_insert(hdr, &hash_lock);
 				ASSERT3P(exists, ==, NULL);
 			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
 				/* nopwrite */
 				ASSERT(zio->io_prop.zp_nopwrite);
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad nopwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
 				ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 				ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
 				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
 				ASSERT(BP_GET_DEDUP(zio->io_bp));
 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 			}
 		}
 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		VERIFY3S(remove_reference(hdr, hdr), >, 0);
 		/* if it's not anon, we are doing a scrub */
 		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
 			arc_access(hdr, 0, B_FALSE);
 		mutex_exit(hash_lock);
 	} else {
 		/* No identity: nothing to hash, just end the I/O state. */
 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		VERIFY3S(remove_reference(hdr, hdr), >, 0);
 	}
 
 	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	abd_free(zio->io_abd);
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
 /*
  * Build (but do not issue) a write zio for the contents of an ARC buffer.
  *
  * pio, spa, txg, bp, priority and zb are handed through to zio_write().
  * zp is copied into a local zio_prop_t, which is adjusted for encrypted or
  * compressed buffers before being passed on.  `uncached` and `l2arc` select
  * the ARC_FLAG_UNCACHED or ARC_FLAG_L2CACHE header flag (uncached wins).
  * `ready` and `done` must be non-NULL; `children_ready` may be NULL, in
  * which case no children-ready hook is installed on the zio.  `private` is
  * passed back to each of those callbacks.
  *
  * Any data the header currently holds (b_pabd, raw ABD) is released or
  * unshared here, so the header has no data until the write pipeline
  * repopulates it.
  *
  * Returns the zio created by zio_write().
  */
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
     const zio_prop_t *zp, arc_write_done_func_t *ready,
     arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
     void *private, zio_priority_t priority, int zio_flags,
     const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
 	zio_prop_t localprop = *zp;
 
 	ASSERT3P(ready, !=, NULL);
 	ASSERT3P(done, !=, NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 	if (uncached)
 		arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
 	else if (l2arc)
 		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 
 	if (ARC_BUF_ENCRYPTED(buf)) {
 		/*
 		 * Raw (already encrypted) buffer: carry the header's
 		 * compression and crypt parameters into the write
 		 * properties and request a raw write.
 		 */
 		ASSERT(ARC_BUF_COMPRESSED(buf));
 		localprop.zp_encrypt = B_TRUE;
 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
 		localprop.zp_complevel = hdr->b_complevel;
 		localprop.zp_byteorder =
 		    (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
 		    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
 		memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt,
 		    ZIO_DATA_SALT_LEN);
 		memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv,
 		    ZIO_DATA_IV_LEN);
 		memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac,
 		    ZIO_DATA_MAC_LEN);
 		if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
 			/*
 			 * For encrypted object types, disable nopwrite and
 			 * cap both copy counts at SPA_DVAS_PER_BP - 1.
 			 */
 			localprop.zp_nopwrite = B_FALSE;
 			localprop.zp_copies =
 			    MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
+			localprop.zp_gang_copies =
+			    MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
 		}
 		zio_flags |= ZIO_FLAG_RAW;
 	} else if (ARC_BUF_COMPRESSED(buf)) {
 		/* Already-compressed buffer: write it out as is. */
 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
 		localprop.zp_complevel = hdr->b_complevel;
 		zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 	}
 	/* Freed by arc_write_done(). */
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
 	callback->awcb_children_ready = children_ready;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
 	/*
 	 * The hdr's b_pabd is now stale, free it now. A new data block
 	 * will be allocated when the zio pipeline calls arc_write_ready().
 	 */
 	if (hdr->b_l1hdr.b_pabd != NULL) {
 		/*
 		 * If the buf is currently sharing the data block with
 		 * the hdr then we need to break that relationship here.
 		 * The hdr will remain with a NULL data pointer and the
 		 * buf will take sole ownership of the block.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			arc_unshare_buf(hdr, buf);
 		} else {
 			ASSERT(!arc_buf_is_shared(buf));
 			arc_hdr_free_abd(hdr, B_FALSE);
 		}
 		VERIFY3P(buf->b_data, !=, NULL);
 	}
 
 	/* Any raw (encrypted) copy held by the hdr is stale as well. */
 	if (HDR_HAS_RABD(hdr))
 		arc_hdr_free_abd(hdr, B_TRUE);
 
 	if (!(zio_flags & ZIO_FLAG_RAW))
 		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
 
 	ASSERT(!arc_buf_is_shared(buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 
 	/*
 	 * The ABD wrapping buf->b_data is created here and freed in
 	 * arc_write_done() via zio->io_abd.
 	 */
 	zio = zio_write(pio, spa, txg, bp,
 	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
 	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
 	    (children_ready != NULL) ? arc_write_children_ready : NULL,
 	    arc_write_done, callback, priority, zio_flags, zb);
 
 	return (zio);
 }
 
 /*
  * Give back `reserve` bytes previously added to arc_tempreserve (see
  * arc_tempreserve_space()).  The counter must never go negative.
  */
 void
 arc_tempreserve_clear(uint64_t reserve)
 {
 	/* Adding the negated value subtracts `reserve` atomically. */
 	atomic_add_64(&arc_tempreserve, -reserve);
 	ASSERT((int64_t)arc_tempreserve >= 0);
 }
 
 /*
  * Try to reserve `reserve` bytes of ARC space for data about to be dirtied
  * in `txg` on pool `spa`.
  *
  * Returns 0 after adding `reserve` to arc_tempreserve, ERESTART when the
  * request exceeds arc_c or the dirty-data throttle below trips, or whatever
  * non-zero error arc_memory_throttle() reports.  As a side effect the ARC
  * target size arc_c may be raised for large reservations.
  */
 int
 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
 {
 	int error;
 	uint64_t anon_size;
 
 	/*
 	 * A reservation larger than a quarter of arc_c (and large enough in
 	 * absolute terms) grows the target to four times the reservation,
 	 * capped at arc_c_max, unless growth is currently disallowed.
 	 */
 	if (!arc_no_grow &&
 	    reserve > arc_c/4 &&
 	    reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
 		arc_c = MIN(arc_c_max, reserve * 4);
 
 	/*
 	 * Throttle when the calculated memory footprint for the TXG
 	 * exceeds the target ARC size.
 	 */
 	if (reserve > arc_c) {
 		DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
 		return (SET_ERROR(ERESTART));
 	}
 
 	/*
 	 * Don't count loaned bufs as in flight dirty data to prevent long
 	 * network delays from blocking transactions that are ready to be
 	 * assigned to a txg.
 	 */
 
 	/* assert that it has not wrapped around */
 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
 
 	/* Anonymous (dirty) bytes minus loaned bytes, clamped at zero. */
 	anon_size = MAX((int64_t)
 	    (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) -
 	    arc_loaned_bytes), 0);
 
 	/*
 	 * Writes will, almost always, require additional memory allocations
 	 * in order to compress/encrypt/etc the data.  We therefore need to
 	 * make sure that there is sufficient available memory for this.
 	 */
 	error = arc_memory_throttle(spa, reserve, txg);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Throttle writes when the amount of dirty data in the cache
 	 * gets too large.  We try to keep the cache less than half full
 	 * of dirty blocks so that our sync times don't grow too large.
 	 *
 	 * In the case of one pool being built on another pool, we want
 	 * to make sure we don't end up throttling the lower (backing)
 	 * pool when the upper pool is the majority contributor to dirty
 	 * data. To ensure we make forward progress during throttling, we
 	 * also check the current pool's net dirty data and only throttle
 	 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
 	 * data in the cache.
 	 *
 	 * Note: if two requests come in concurrently, we might let them
 	 * both succeed, when one of them should fail.  Not a huge deal.
 	 */
 	uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
 	uint64_t spa_dirty_anon = spa_dirty_data(spa);
 	/* Until the ARC is warm, measure against arc_c_max, not arc_c. */
 	uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
 	if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
 	    anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
 	    spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
 #ifdef ZFS_DEBUG
 		/* Debug-only: report the evictable anon sizes, in KiB. */
 		uint64_t meta_esize = zfs_refcount_count(
 		    &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 		uint64_t data_esize =
 		    zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 		    "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
 		    (u_longlong_t)arc_tempreserve >> 10,
 		    (u_longlong_t)meta_esize >> 10,
 		    (u_longlong_t)data_esize >> 10,
 		    (u_longlong_t)reserve >> 10,
 		    (u_longlong_t)rarc_c >> 10);
 #endif
 		DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
 		return (SET_ERROR(ERESTART));
 	}
 	atomic_add_64(&arc_tempreserve, reserve);
 	return (0);
 }
 
 /*
  * Publish the size counters of one ARC state into five named kstats:
  * data and metadata bytes, their sum, and the evictable data and
  * evictable metadata bytes.
  */
 static void
 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
     kstat_named_t *data, kstat_named_t *metadata,
     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
 {
 	uint64_t data_bytes, meta_bytes;
 
 	/* Total size is derived from the two per-type counts just read. */
 	data_bytes = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]);
 	data->value.ui64 = data_bytes;
 	meta_bytes = zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]);
 	metadata->value.ui64 = meta_bytes;
 	size->value.ui64 = data_bytes + meta_bytes;
 
 	/* Evictable portions are tracked by separate refcounts. */
 	evict_data->value.ui64 =
 	    zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
 	evict_metadata->value.ui64 =
 	    zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
 }
 
 /*
  * kstat update callback for the ARC statistics.
  *
  * Writes are refused with EACCES.  For reads, every field of the
  * arc_stats_t in ksp->ks_data is refreshed from its backing counter in
  * arc_sums (wmsum or aggsum), from the per-state refcounts via
  * arc_kstat_update_state(), or from the arc_*_memory() helpers.
  * Returns 0 on success.
  */
 static int
 arc_kstat_update(kstat_t *ksp, int rw)
 {
 	arc_stats_t *as = ksp->ks_data;
 
 	if (rw == KSTAT_WRITE)
 		return (SET_ERROR(EACCES));
 
 	/* Hit/miss counters. */
 	as->arcstat_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_hits);
 	as->arcstat_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_iohits);
 	as->arcstat_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_misses);
 	as->arcstat_demand_data_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_data_hits);
 	as->arcstat_demand_data_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_data_iohits);
 	as->arcstat_demand_data_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_data_misses);
 	as->arcstat_demand_metadata_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_metadata_hits);
 	as->arcstat_demand_metadata_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_metadata_iohits);
 	as->arcstat_demand_metadata_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_metadata_misses);
 	as->arcstat_prefetch_data_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_data_hits);
 	as->arcstat_prefetch_data_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_data_iohits);
 	as->arcstat_prefetch_data_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_data_misses);
 	as->arcstat_prefetch_metadata_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits);
 	as->arcstat_prefetch_metadata_iohits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits);
 	as->arcstat_prefetch_metadata_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses);
 	as->arcstat_mru_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mru_hits);
 	as->arcstat_mru_ghost_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mru_ghost_hits);
 	as->arcstat_mfu_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mfu_hits);
 	as->arcstat_mfu_ghost_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mfu_ghost_hits);
 	as->arcstat_uncached_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_uncached_hits);
 	as->arcstat_deleted.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_deleted);
 	as->arcstat_mutex_miss.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_mutex_miss);
 	as->arcstat_access_skip.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_access_skip);
 	/* Eviction counters. */
 	as->arcstat_evict_skip.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_skip);
 	as->arcstat_evict_not_enough.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_not_enough);
 	as->arcstat_evict_l2_cached.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_cached);
 	as->arcstat_evict_l2_eligible.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_eligible);
 	as->arcstat_evict_l2_eligible_mfu.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu);
 	as->arcstat_evict_l2_eligible_mru.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru);
 	as->arcstat_evict_l2_ineligible.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_ineligible);
 	as->arcstat_evict_l2_skip.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_evict_l2_skip);
 	/*
 	 * NOTE(review): hash_elements_max is assigned the current element
 	 * count here, the same value as hash_elements -- confirm that a
 	 * running maximum is not expected by consumers of this kstat.
 	 */
 	as->arcstat_hash_elements.value.ui64 =
 	    as->arcstat_hash_elements_max.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_hash_elements);
 	as->arcstat_hash_collisions.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_hash_collisions);
 	as->arcstat_hash_chains.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_hash_chains);
 	/* Size accounting; arcstat_size is kept in an aggsum, not a wmsum. */
 	as->arcstat_size.value.ui64 =
 	    aggsum_value(&arc_sums.arcstat_size);
 	as->arcstat_compressed_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_compressed_size);
 	as->arcstat_uncompressed_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_uncompressed_size);
 	as->arcstat_overhead_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_overhead_size);
 	as->arcstat_hdr_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_hdr_size);
 	as->arcstat_data_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_data_size);
 	as->arcstat_metadata_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_metadata_size);
 	as->arcstat_dbuf_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
 #if defined(COMPAT_FREEBSD11)
 	/* Compat-only aggregate of the bonus, dnode and dbuf sizes. */
 	as->arcstat_other_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size) +
 	    wmsum_value(&arc_sums.arcstat_dnode_size) +
 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
 #endif
 
 	/* Per-state size and evictable-size kstats. */
 	arc_kstat_update_state(arc_anon,
 	    &as->arcstat_anon_size,
 	    &as->arcstat_anon_data,
 	    &as->arcstat_anon_metadata,
 	    &as->arcstat_anon_evictable_data,
 	    &as->arcstat_anon_evictable_metadata);
 	arc_kstat_update_state(arc_mru,
 	    &as->arcstat_mru_size,
 	    &as->arcstat_mru_data,
 	    &as->arcstat_mru_metadata,
 	    &as->arcstat_mru_evictable_data,
 	    &as->arcstat_mru_evictable_metadata);
 	arc_kstat_update_state(arc_mru_ghost,
 	    &as->arcstat_mru_ghost_size,
 	    &as->arcstat_mru_ghost_data,
 	    &as->arcstat_mru_ghost_metadata,
 	    &as->arcstat_mru_ghost_evictable_data,
 	    &as->arcstat_mru_ghost_evictable_metadata);
 	arc_kstat_update_state(arc_mfu,
 	    &as->arcstat_mfu_size,
 	    &as->arcstat_mfu_data,
 	    &as->arcstat_mfu_metadata,
 	    &as->arcstat_mfu_evictable_data,
 	    &as->arcstat_mfu_evictable_metadata);
 	arc_kstat_update_state(arc_mfu_ghost,
 	    &as->arcstat_mfu_ghost_size,
 	    &as->arcstat_mfu_ghost_data,
 	    &as->arcstat_mfu_ghost_metadata,
 	    &as->arcstat_mfu_ghost_evictable_data,
 	    &as->arcstat_mfu_ghost_evictable_metadata);
 	arc_kstat_update_state(arc_uncached,
 	    &as->arcstat_uncached_size,
 	    &as->arcstat_uncached_data,
 	    &as->arcstat_uncached_metadata,
 	    &as->arcstat_uncached_evictable_data,
 	    &as->arcstat_uncached_evictable_metadata);
 
 	as->arcstat_dnode_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_dnode_size);
 	as->arcstat_bonus_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size);
 	/* L2ARC counters. */
 	as->arcstat_l2_hits.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_hits);
 	as->arcstat_l2_misses.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_misses);
 	as->arcstat_l2_prefetch_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_prefetch_asize);
 	as->arcstat_l2_mru_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_mru_asize);
 	as->arcstat_l2_mfu_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_mfu_asize);
 	as->arcstat_l2_bufc_data_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize);
 	as->arcstat_l2_bufc_metadata_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize);
 	as->arcstat_l2_feeds.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_feeds);
 	as->arcstat_l2_rw_clash.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rw_clash);
 	as->arcstat_l2_read_bytes.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_read_bytes);
 	as->arcstat_l2_write_bytes.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_write_bytes);
 	as->arcstat_l2_writes_sent.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_writes_sent);
 	as->arcstat_l2_writes_done.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_writes_done);
 	as->arcstat_l2_writes_error.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_writes_error);
 	as->arcstat_l2_writes_lock_retry.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry);
 	as->arcstat_l2_evict_lock_retry.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry);
 	as->arcstat_l2_evict_reading.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_evict_reading);
 	as->arcstat_l2_evict_l1cached.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_evict_l1cached);
 	as->arcstat_l2_free_on_write.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_free_on_write);
 	as->arcstat_l2_abort_lowmem.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_abort_lowmem);
 	as->arcstat_l2_cksum_bad.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_cksum_bad);
 	as->arcstat_l2_io_error.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_io_error);
 	as->arcstat_l2_lsize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_lsize);
 	as->arcstat_l2_psize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_psize);
 	/* Like arcstat_size, the L2 header size is kept in an aggsum. */
 	as->arcstat_l2_hdr_size.value.ui64 =
 	    aggsum_value(&arc_sums.arcstat_l2_hdr_size);
 	as->arcstat_l2_log_blk_writes.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_log_blk_writes);
 	as->arcstat_l2_log_blk_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_log_blk_asize);
 	as->arcstat_l2_log_blk_count.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_log_blk_count);
 	as->arcstat_l2_rebuild_success.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_success);
 	as->arcstat_l2_rebuild_abort_unsupported.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
 	as->arcstat_l2_rebuild_abort_io_errors.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
 	as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
 	as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
 	as->arcstat_l2_rebuild_abort_lowmem.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
 	as->arcstat_l2_rebuild_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_size);
 	as->arcstat_l2_rebuild_asize.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_asize);
 	as->arcstat_l2_rebuild_bufs.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs);
 	as->arcstat_l2_rebuild_bufs_precached.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached);
 	as->arcstat_l2_rebuild_log_blks.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks);
 	as->arcstat_memory_throttle_count.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_memory_throttle_count);
 	as->arcstat_memory_direct_count.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_memory_direct_count);
 	as->arcstat_memory_indirect_count.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_memory_indirect_count);
 
 	/* Memory figures are sampled live rather than read from counters. */
 	as->arcstat_memory_all_bytes.value.ui64 =
 	    arc_all_memory();
 	as->arcstat_memory_free_bytes.value.ui64 =
 	    arc_free_memory();
 	/* Stored through the signed i64 member, unlike the other fields. */
 	as->arcstat_memory_available_bytes.value.i64 =
 	    arc_available_memory();
 
 	as->arcstat_prune.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prune);
 	as->arcstat_meta_used.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_meta_used);
 	as->arcstat_async_upgrade_sync.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_async_upgrade_sync);
 	as->arcstat_predictive_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_predictive_prefetch);
 	as->arcstat_demand_hit_predictive_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch);
 	as->arcstat_demand_iohit_predictive_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
 	as->arcstat_prescient_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_prescient_prefetch);
 	as->arcstat_demand_hit_prescient_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch);
 	as->arcstat_demand_iohit_prescient_prefetch.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
 	as->arcstat_raw_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_raw_size);
 	as->arcstat_cached_only_in_progress.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_cached_only_in_progress);
 	as->arcstat_abd_chunk_waste_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size);
 
 	return (0);
 }
 
 /*
  * Sublist selector for the ARC state multilists.
  *
  * The indices returned here *must* be spread evenly across all sublists
  * of the multilist: arc_evict_state() assumes an even distribution of ARC
  * buffers when it decides which sublist to evict from and how much to
  * take from it.
  */
 static unsigned int
 arc_state_multilist_index_func(multilist_t *ml, void *obj)
 {
 	arc_buf_hdr_t *hdr = obj;
 	unsigned int hash32;
 
 	/*
 	 * The index is derived from b_dva through buf_hash(), so a header
 	 * without an identity must never be placed on an ARC list.
 	 */
 	ASSERT(!HDR_EMPTY(hdr));
 
 	/*
 	 * A header's hash is assumed to stay constant for its whole
 	 * lifetime (b_spa, b_dva and b_birth do not change), so the
 	 * sublist index need not be stored at insertion; it is simply
 	 * recomputed on removal.
 	 *
 	 * The low-order bits of the hash are believed to be evenly
 	 * distributed; otherwise a multilist with a power-of-two number
 	 * of sublists would be filled unevenly. A full 64-bit division
 	 * would be wasted effort here, so the hash is truncated to 32
 	 * bits before taking the remainder.
 	 */
 	hash32 = (unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva,
 	    hdr->b_birth);
 
 	return (hash32 % multilist_get_num_sublists(ml));
 }
 
 /*
  * Index function for multilists that must never receive a header: any
  * call panics, reporting the offending object and multilist.  There is no
  * return statement; this relies on panic() not returning.
  */
 static unsigned int
 arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj)
 {
 	panic("Header %p insert into arc_l2c_only %p", obj, ml);
 }
 
 /*
  * Emit a CE_WARN message when `do_warn` is set and the tunable `tuning` is
  * non-zero yet differs from the effective `value`.  The tunable's name in
  * the message comes from stringifying the macro argument, so pass the
  * variable itself rather than an expression.  `tuning` and `value` are each
  * evaluated more than once.
  */
 #define	WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do {	\
 	if ((do_warn) && (tuning) && ((tuning) != (value))) {	\
 		cmn_err(CE_WARN,				\
 		    "ignoring tunable %s (using %llu instead)",	\
 		    (#tuning), (u_longlong_t)(value));	\
 	}							\
 } while (0)
 
 /*
  * Called during module initialization and periodically thereafter to
  * apply reasonable changes to the exposed performance tunings.  Can also be
  * called explicitly by param_set_arc_*() functions when ARC tunables are
  * updated manually.  Non-zero zfs_* values which differ from the currently set
  * values will be applied.
  *
  * When `verbose` is set, a warning is logged for each checked tunable that
  * is non-zero but was not adopted as the effective value.
  */
 void
 arc_tuning_update(boolean_t verbose)
 {
 	uint64_t allmem = arc_all_memory();
 
 	/* Valid range: 32M - <arc_c_max> */
 	if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
 	    (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
 	    (zfs_arc_min <= arc_c_max)) {
 		arc_c_min = zfs_arc_min;
 		/* Never leave the current target below the new minimum. */
 		arc_c = MAX(arc_c, arc_c_min);
 	}
 	WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
 
 	/* Valid range: 64M - <all physical memory> */
 	if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
 	    (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) &&
 	    (zfs_arc_max > arc_c_min)) {
 		arc_c_max = zfs_arc_max;
 		/* Never leave the current target above the new maximum. */
 		arc_c = MIN(arc_c, arc_c_max);
 		if (arc_dnode_limit > arc_c_max)
 			arc_dnode_limit = arc_c_max;
 	}
 	WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
 
 	/* Valid range: 0 - <all physical memory> */
 	/*
 	 * Recomputed on every call, replacing any clamp applied above: an
 	 * explicit zfs_arc_dnode_limit wins, otherwise the limit is
 	 * zfs_arc_dnode_limit_percent (capped at 100) of arc_c_max.
 	 */
 	arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
 	    MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100;
 	WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose);
 
 	/* Valid range: 1 - N */
 	if (zfs_arc_grow_retry)
 		arc_grow_retry = zfs_arc_grow_retry;
 
 	/* Valid range: 1 - N */
 	if (zfs_arc_shrink_shift) {
 		arc_shrink_shift = zfs_arc_shrink_shift;
 		/* Keep arc_no_grow_shift strictly below arc_shrink_shift. */
 		arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
 	}
 
 	/* Valid range: 1 - N ms */
 	if (zfs_arc_min_prefetch_ms)
 		arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
 
 	/* Valid range: 1 - N ms */
 	if (zfs_arc_min_prescient_prefetch_ms) {
 		arc_min_prescient_prefetch_ms =
 		    zfs_arc_min_prescient_prefetch_ms;
 	}
 
 	/* Valid range: 0 - 100 */
 	/* Unlike the tunables above, zero is an accepted value here. */
 	if (zfs_arc_lotsfree_percent <= 100)
 		arc_lotsfree_percent = zfs_arc_lotsfree_percent;
 	WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
 	    verbose);
 
 	/* Valid range: 0 - <all physical memory> */
 	/* A request above total memory is clamped, which triggers the warning. */
 	if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
 		arc_sys_free = MIN(zfs_arc_sys_free, allmem);
 	WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
 }
 
 /*
  * Create one ARC state multilist of arc_buf_hdr_t objects, linked through
  * their b_l1hdr.b_arc_node member and indexed by index_func.  On return
  * *maxcountp has been raised to the new list's sublist count if that
  * count is larger than the value it held on entry.
  */
 static void
 arc_state_multilist_init(multilist_t *ml,
     multilist_sublist_index_func_t *index_func, int *maxcountp)
 {
 	const size_t hdr_size = sizeof (arc_buf_hdr_t);
 	const size_t node_offset =
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node);
 
 	multilist_create(ml, hdr_size, node_offset, index_func);
 
 	/* Track the largest sublist count seen across all lists. */
 	*maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml));
 }
 
 static void
 arc_state_init(void)
 {
 	int num_sublists = 0;
 
 	arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 
 	/*
 	 * L2 headers should never be on the L2 state list since they don't
 	 * have L1 headers allocated.  Special index function asserts that.
 	 */
 	arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_l2c_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
 	    arc_state_l2c_multilist_index_func, &num_sublists);
 
 	/*
 	 * Keep track of the number of markers needed to reclaim buffers from
 	 * any ARC state.  The markers will be pre-allocated so as to minimize
 	 * the number of memory allocations performed by the eviction thread.
 	 */
 	arc_state_evict_marker_count = num_sublists;
 
 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
 
 	zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
 
 	wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0);
 	wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
 	wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0);
 	wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
 
 	wmsum_init(&arc_sums.arcstat_hits, 0);
 	wmsum_init(&arc_sums.arcstat_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_misses, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_hits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_misses, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0);
 	wmsum_init(&arc_sums.arcstat_mru_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mfu_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0);
 	wmsum_init(&arc_sums.arcstat_uncached_hits, 0);
 	wmsum_init(&arc_sums.arcstat_deleted, 0);
 	wmsum_init(&arc_sums.arcstat_mutex_miss, 0);
 	wmsum_init(&arc_sums.arcstat_access_skip, 0);
 	wmsum_init(&arc_sums.arcstat_evict_skip, 0);
 	wmsum_init(&arc_sums.arcstat_evict_not_enough, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0);
 	wmsum_init(&arc_sums.arcstat_hash_elements, 0);
 	wmsum_init(&arc_sums.arcstat_hash_collisions, 0);
 	wmsum_init(&arc_sums.arcstat_hash_chains, 0);
 	aggsum_init(&arc_sums.arcstat_size, 0);
 	wmsum_init(&arc_sums.arcstat_compressed_size, 0);
 	wmsum_init(&arc_sums.arcstat_uncompressed_size, 0);
 	wmsum_init(&arc_sums.arcstat_overhead_size, 0);
 	wmsum_init(&arc_sums.arcstat_hdr_size, 0);
 	wmsum_init(&arc_sums.arcstat_data_size, 0);
 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
 	wmsum_init(&arc_sums.arcstat_dnode_size, 0);
 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
 	wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_feeds, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0);
 	wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_done, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_error, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0);
 	wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0);
 	wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0);
 	wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0);
 	wmsum_init(&arc_sums.arcstat_l2_io_error, 0);
 	wmsum_init(&arc_sums.arcstat_l2_lsize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_psize, 0);
 	aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0);
 	wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0);
 	wmsum_init(&arc_sums.arcstat_memory_direct_count, 0);
 	wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0);
 	wmsum_init(&arc_sums.arcstat_prune, 0);
 	wmsum_init(&arc_sums.arcstat_meta_used, 0);
 	wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0);
 	wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_raw_size, 0);
 	wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0);
 	wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0);
 
 	arc_anon->arcs_state = ARC_STATE_ANON;
 	arc_mru->arcs_state = ARC_STATE_MRU;
 	arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
 	arc_mfu->arcs_state = ARC_STATE_MFU;
 	arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
 	arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
 	arc_uncached->arcs_state = ARC_STATE_UNCACHED;
 }
 
 /*
  * Release the per-state ARC bookkeeping: the arcs_esize and arcs_size
  * refcounts of every state, the per-type multilists, the ghost-state
  * hit counters and every counter kept in arc_sums.
  *
  * The refcount, wmsum and aggsum lists below mirror, entry for entry,
  * the zfs_refcount_create()/wmsum_init()/aggsum_init() calls of the
  * initializer directly above; a counter added there needs a matching
  * *_destroy()/*_fini() call here.
  */
 static void
 arc_state_fini(void)
 {
 	/* arcs_esize refcounts: one per state and buffer type. */
 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
 
 	/* arcs_size refcounts: one per state and buffer type. */
 	zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
 
 	/*
 	 * Per-type multilists of every state except arc_anon, for which
 	 * no multilist is destroyed here.
 	 */
 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]);
 
 	/* Hit counters kept only on the two ghost states. */
 	wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
 	wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
 	wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
 	wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
 
 	/*
 	 * Global statistics in arc_sums.  All are wmsum counters except
 	 * arcstat_size and arcstat_l2_hdr_size, which are aggsums and
 	 * therefore use aggsum_fini().
 	 */
 	wmsum_fini(&arc_sums.arcstat_hits);
 	wmsum_fini(&arc_sums.arcstat_iohits);
 	wmsum_fini(&arc_sums.arcstat_misses);
 	wmsum_fini(&arc_sums.arcstat_demand_data_hits);
 	wmsum_fini(&arc_sums.arcstat_demand_data_iohits);
 	wmsum_fini(&arc_sums.arcstat_demand_data_misses);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_hits);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_misses);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_hits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_misses);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses);
 	wmsum_fini(&arc_sums.arcstat_mru_hits);
 	wmsum_fini(&arc_sums.arcstat_mru_ghost_hits);
 	wmsum_fini(&arc_sums.arcstat_mfu_hits);
 	wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits);
 	wmsum_fini(&arc_sums.arcstat_uncached_hits);
 	wmsum_fini(&arc_sums.arcstat_deleted);
 	wmsum_fini(&arc_sums.arcstat_mutex_miss);
 	wmsum_fini(&arc_sums.arcstat_access_skip);
 	wmsum_fini(&arc_sums.arcstat_evict_skip);
 	wmsum_fini(&arc_sums.arcstat_evict_not_enough);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_cached);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_skip);
 	wmsum_fini(&arc_sums.arcstat_hash_elements);
 	wmsum_fini(&arc_sums.arcstat_hash_collisions);
 	wmsum_fini(&arc_sums.arcstat_hash_chains);
 	aggsum_fini(&arc_sums.arcstat_size);
 	wmsum_fini(&arc_sums.arcstat_compressed_size);
 	wmsum_fini(&arc_sums.arcstat_uncompressed_size);
 	wmsum_fini(&arc_sums.arcstat_overhead_size);
 	wmsum_fini(&arc_sums.arcstat_hdr_size);
 	wmsum_fini(&arc_sums.arcstat_data_size);
 	wmsum_fini(&arc_sums.arcstat_metadata_size);
 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
 	wmsum_fini(&arc_sums.arcstat_dnode_size);
 	wmsum_fini(&arc_sums.arcstat_bonus_size);
 	wmsum_fini(&arc_sums.arcstat_l2_hits);
 	wmsum_fini(&arc_sums.arcstat_l2_misses);
 	wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_mru_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_mfu_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_feeds);
 	wmsum_fini(&arc_sums.arcstat_l2_rw_clash);
 	wmsum_fini(&arc_sums.arcstat_l2_read_bytes);
 	wmsum_fini(&arc_sums.arcstat_l2_write_bytes);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_sent);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_done);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_error);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_reading);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached);
 	wmsum_fini(&arc_sums.arcstat_l2_free_on_write);
 	wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem);
 	wmsum_fini(&arc_sums.arcstat_l2_cksum_bad);
 	wmsum_fini(&arc_sums.arcstat_l2_io_error);
 	wmsum_fini(&arc_sums.arcstat_l2_lsize);
 	wmsum_fini(&arc_sums.arcstat_l2_psize);
 	aggsum_fini(&arc_sums.arcstat_l2_hdr_size);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_count);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_success);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_size);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks);
 	wmsum_fini(&arc_sums.arcstat_memory_throttle_count);
 	wmsum_fini(&arc_sums.arcstat_memory_direct_count);
 	wmsum_fini(&arc_sums.arcstat_memory_indirect_count);
 	wmsum_fini(&arc_sums.arcstat_prune);
 	wmsum_fini(&arc_sums.arcstat_meta_used);
 	wmsum_fini(&arc_sums.arcstat_async_upgrade_sync);
 	wmsum_fini(&arc_sums.arcstat_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_raw_size);
 	wmsum_fini(&arc_sums.arcstat_cached_only_in_progress);
 	wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size);
 }
 
 /*
  * Report the current ARC target size, i.e. the present value of arc_c.
  */
 uint64_t
 arc_target_bytes(void)
 {
 	uint64_t target = arc_c;
 
 	return (target);
 }
 
 /*
  * Establish the default ARC size bounds for a system with allmem bytes
  * of memory.  arc_c_min becomes the larger of allmem / 32 and
  * 2ULL << SPA_MAXBLOCKSHIFT (32MB); arc_c_max is then whatever the
  * platform's arc_default_max() derives from that minimum and allmem.
  */
 void
 arc_set_limits(uint64_t allmem)
 {
 	unsigned long long min_floor = 2ULL << SPA_MAXBLOCKSHIFT;
 	unsigned long long c_min = allmem / 32;
 
 	/* Never let the minimum drop below the fixed floor. */
 	if (c_min < min_floor)
 		c_min = min_floor;
 	arc_c_min = c_min;
 
 	/* The default maximum is platform specific. */
 	arc_c_max = arc_default_max(arc_c_min, allmem);
 }
 /*
  * Bring up the ARC.  In order, this:
  *  - creates arc_evict_lock and the arc_evict_waiters list,
  *  - derives arc_c_min/arc_c_max from arc_all_memory() and sets the
  *    initial target arc_c and the arc_meta/arc_pd/arc_pm fractions,
  *  - applies tunables via arc_tuning_update(),
  *  - initializes the ARC states (arc_state_init()) and buf_init(),
  *  - creates the prune and flush taskqs with their lists and locks,
  *  - installs the "arcstats" kstat when it can be created,
  *  - pre-allocates the eviction markers and starts the evict and reap
  *    zthrs,
  *  - fills in zfs_dirty_data_max_max, zfs_dirty_data_max and
  *    zfs_wrlog_data_max when they are still zero.
  */
 void
 arc_init(void)
 {
 	uint64_t percent, allmem = arc_all_memory();
 	mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
 	    offsetof(arc_evict_waiter_t, aew_node));
 
 	arc_min_prefetch_ms = 1000;
 	arc_min_prescient_prefetch_ms = 6000;
 
 #if defined(_KERNEL)
 	arc_lowmem_init();
 #endif
 
 	/* Default arc_c_min/arc_c_max; may be adjusted just below. */
 	arc_set_limits(allmem);
 
 #ifdef _KERNEL
 	/*
 	 * If zfs_arc_max is non-zero at init, meaning it was set in the kernel
 	 * environment before the module was loaded, don't block setting the
 	 * maximum because it is less than arc_c_min, instead, reset arc_c_min
 	 * to a lower value.
 	 * zfs_arc_min will be handled by arc_tuning_update().
 	 */
 	if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX &&
 	    zfs_arc_max < allmem) {
 		arc_c_max = zfs_arc_max;
 		if (arc_c_min >= arc_c_max) {
 			arc_c_min = MAX(zfs_arc_max / 2,
 			    2ULL << SPA_MAXBLOCKSHIFT);
 		}
 	}
 #else
 	/*
 	 * In userland, there's only the memory pressure that we artificially
 	 * create (see arc_available_memory()).  Don't let arc_c get too
 	 * small, because it can cause transactions to be larger than
 	 * arc_c, causing arc_tempreserve_space() to fail.
 	 */
 	arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
 #endif
 
 	/* The target size starts at the minimum. */
 	arc_c = arc_c_min;
 	/*
 	 * 32-bit fixed point fractions of metadata from total ARC size,
 	 * MRU data from all data and MRU metadata from all metadata.
 	 */
 	arc_meta = (1ULL << 32) / 4;	/* Metadata is 25% of arc_c. */
 	arc_pd = (1ULL << 32) / 2;	/* Data MRU is 50% of data. */
 	arc_pm = (1ULL << 32) / 2;	/* Metadata MRU is 50% of metadata. */
 
 	/* The dnode limit is a percentage of arc_c_max, clamped to 100%. */
 	percent = MIN(zfs_arc_dnode_limit_percent, 100);
 	arc_dnode_limit = arc_c_max * percent / 100;
 
 	/* Apply user specified tunings */
 	arc_tuning_update(B_TRUE);
 
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
 	/* Halving (or tuning) must not push the target below the minimum. */
 	if (arc_c < arc_c_min)
 		arc_c = arc_c_min;
 
 	arc_register_hotplug();
 
 	arc_state_init();
 
 	buf_init();
 
 	list_create(&arc_prune_list, sizeof (arc_prune_t),
 	    offsetof(arc_prune_t, p_node));
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
 	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
 	list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
 	    offsetof(arc_async_flush_t, af_node));
 	mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
 	arc_flush_taskq = taskq_create("arc_flush", MIN(boot_ncpus, 4),
 	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
 
 	/*
 	 * Publish arc_stats as the "arcstats" kstat.  A NULL return from
 	 * kstat_create() is tolerated: the kstat is simply not installed.
 	 */
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
 	if (arc_ksp != NULL) {
 		arc_ksp->ks_data = &arc_stats;
 		arc_ksp->ks_update = arc_kstat_update;
 		kstat_install(arc_ksp);
 	}
 
 	/*
 	 * Pre-allocate arc_state_evict_marker_count eviction markers
 	 * before the evict and reap zthrs are created.
 	 */
 	arc_state_evict_markers =
 	    arc_state_alloc_markers(arc_state_evict_marker_count);
 	arc_evict_zthr = zthr_create_timer("arc_evict",
 	    arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri);
 	arc_reap_zthr = zthr_create_timer("arc_reap",
 	    arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri);
 
 	arc_warm = B_FALSE;
 
 	/*
 	 * Calculate maximum amount of dirty data per pool.
 	 *
 	 * If it has been set by a module parameter, take that.
 	 * Otherwise, use a percentage of physical memory defined by
 	 * zfs_dirty_data_max_percent (default 10%) with a cap at
 	 * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
 	 */
 #ifdef __LP64__
 	if (zfs_dirty_data_max_max == 0)
 		zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
 		    allmem * zfs_dirty_data_max_max_percent / 100);
 #else
 	/* Without __LP64__ the fixed cap is 1G instead of 4G. */
 	if (zfs_dirty_data_max_max == 0)
 		zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
 		    allmem * zfs_dirty_data_max_max_percent / 100);
 #endif
 
 	if (zfs_dirty_data_max == 0) {
 		zfs_dirty_data_max = allmem *
 		    zfs_dirty_data_max_percent / 100;
 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
 		    zfs_dirty_data_max_max);
 	}
 
 	if (zfs_wrlog_data_max == 0) {
 
 		/*
 		 * dp_wrlog_total is reduced for each txg at the end of
 		 * spa_sync(). However, dp_dirty_total is reduced every time
 		 * a block is written out. Thus under normal operation,
 		 * dp_wrlog_total could grow 2 times as big as
 		 * zfs_dirty_data_max.
 		 */
 		zfs_wrlog_data_max = zfs_dirty_data_max * 2;
 	}
 }
 
 /*
  * Tear down the ARC.  The order of the steps below matters: the flush
  * taskq is drained and destroyed before the final arc_flush(), the zthrs
  * are cancelled before the eviction markers are released, pending
  * free-on-write buffers and buf_fini() are handled before
  * arc_state_fini(), and the zthrs are destroyed only at the very end.
  */
 void
 arc_fini(void)
 {
 	arc_prune_t *p;
 
 #ifdef _KERNEL
 	arc_lowmem_fini();
 #endif /* _KERNEL */
 
 	/* Wait for any background flushes */
 	taskq_wait(arc_flush_taskq);
 	taskq_destroy(arc_flush_taskq);
 
 	/* Use B_TRUE to ensure *all* buffers are evicted */
 	arc_flush(NULL, B_TRUE);
 
 	if (arc_ksp != NULL) {
 		kstat_delete(arc_ksp);
 		arc_ksp = NULL;
 	}
 
 	taskq_wait(arc_prune_taskq);
 	taskq_destroy(arc_prune_taskq);
 
 	list_destroy(&arc_async_flush_list);
 	mutex_destroy(&arc_async_flush_lock);
 
 	/*
 	 * Drain arc_prune_list: drop the reference each entry holds on
 	 * behalf of the list, then free the entry.
 	 */
 	mutex_enter(&arc_prune_mtx);
 	while ((p = list_remove_head(&arc_prune_list)) != NULL) {
 		(void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
 		zfs_refcount_destroy(&p->p_refcnt);
 		kmem_free(p, sizeof (*p));
 	}
 	mutex_exit(&arc_prune_mtx);
 
 	list_destroy(&arc_prune_list);
 	mutex_destroy(&arc_prune_mtx);
 
 	/* Cancel both zthrs, then release the pre-allocated markers. */
 	(void) zthr_cancel(arc_evict_zthr);
 	(void) zthr_cancel(arc_reap_zthr);
 	arc_state_free_markers(arc_state_evict_markers,
 	    arc_state_evict_marker_count);
 
 	mutex_destroy(&arc_evict_lock);
 	list_destroy(&arc_evict_waiters);
 
 	/*
 	 * Free any buffers that were tagged for destruction.  This needs
 	 * to occur before arc_state_fini() runs and destroys the aggsum
 	 * values which are updated when freeing scatter ABDs.
 	 */
 	l2arc_do_free_on_write();
 
 	/*
 	 * buf_fini() must precede arc_state_fini() because buf_fini() may
 	 * trigger the release of kmem magazines, which can call back to
 	 * arc_space_return() which accesses aggsums freed in arc_state_fini().
 	 */
 	buf_fini();
 	arc_state_fini();
 
 	arc_unregister_hotplug();
 
 	/*
 	 * We destroy the zthrs after all the ARC state has been
 	 * torn down to avoid the case of them receiving any
 	 * wakeup() signals after they are destroyed.
 	 */
 	zthr_destroy(arc_evict_zthr);
 	zthr_destroy(arc_reap_zthr);
 
 	/* arc_loaned_bytes is expected to be zero by this point. */
 	ASSERT0(arc_loaned_bytes);
 }
 
 /*
  * Level 2 ARC
  *
  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
  * It uses dedicated storage devices to hold cached data, which are populated
  * using large infrequent writes.  The main role of this cache is to boost
  * the performance of random read workloads.  The intended L2ARC devices
  * include short-stroked disks, solid state disks, and other media with
  * substantially faster read latency than disk.
  *
  *                 +-----------------------+
  *                 |         ARC           |
  *                 +-----------------------+
  *                    |         ^     ^
  *                    |         |     |
  *      l2arc_feed_thread()    arc_read()
  *                    |         |     |
  *                    |  l2arc read   |
  *                    V         |     |
  *               +---------------+    |
  *               |     L2ARC     |    |
  *               +---------------+    |
  *                   |    ^           |
  *          l2arc_write() |           |
  *                   |    |           |
  *                   V    |           |
  *                 +-------+      +-------+
  *                 | vdev  |      | vdev  |
  *                 | cache |      | cache |
  *                 +-------+      +-------+
  *                 +=========+     .-----.
  *                 :  L2ARC  :    |-_____-|
  *                 : devices :    | Disks |
  *                 +=========+    `-_____-'
  *
  * Read requests are satisfied from the following sources, in order:
  *
  *	1) ARC
  *	2) vdev cache of L2ARC devices
  *	3) L2ARC devices
  *	4) vdev cache of disks
  *	5) disks
  *
  * Some L2ARC device types exhibit extremely slow write performance.
  * To accommodate this, there are some significant differences between
  * the L2ARC and traditional cache design:
  *
  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
  * the ARC behave as usual, freeing buffers and placing headers on ghost
  * lists.  The ARC does not send buffers to the L2ARC during eviction as
  * this would add inflated write latencies for all ARC memory pressure.
  *
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
  * not already there. It scans until a headroom of buffers is satisfied,
  * which itself is a buffer for ARC eviction. If a compressible buffer is
  * found during scanning and selected for writing to an L2ARC device, we
  * temporarily boost scanning headroom during the next scan cycle to make
  * sure we adapt to compression effects (which might significantly reduce
  * the data volume we write to L2ARC). The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
  *	       head -->                        tail
  *	        +---------------------+----------+
  *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
  *	        +---------------------+----------+   |   o L2ARC eligible
  *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
  *	        +---------------------+----------+   |
  *	             15.9 Gbytes      ^ 32 Mbytes    |
  *	                           headroom          |
  *	                                      l2arc_feed_thread()
  *	                                             |
  *	                 l2arc write hand <--[oooo]--'
  *	                         |           8 Mbyte
  *	                         |          write max
  *	                         V
  *		  +==============================+
  *	L2ARC dev |####|#|###|###|    |####| ... |
  *	          +==============================+
  *	                     32 Gbytes
  *
  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
  * evicted, then the L2ARC has cached a buffer much sooner than it probably
  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
  * safe to say that this is an uncommon case, since buffers at the end of
  * the ARC lists have moved there due to inactivity.
  *
  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
  * then the L2ARC simply misses copying some buffers.  This serves as a
  * pressure valve to prevent heavy read workloads from both stalling the ARC
  * with waits and clogging the L2ARC with writes.  This also helps prevent
  * the potential for the L2ARC to churn if it attempts to cache content too
  * quickly, such as during backups of the entire pool.
  *
  * 5. After system boot and before the ARC has filled main memory, there are
  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
  * lists can remain mostly static.  Instead of searching from tail of these
  * lists as pictured, the l2arc_feed_thread() will search from the list heads
  * for eligible buffers, greatly increasing its chance of finding them.
  *
  * The L2ARC device write speed is also boosted during this time so that
  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
  * there are no L2ARC reads, and no fear of degrading read performance
  * through increased writes.
  *
  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
  * the vdev queue can aggregate them into larger and fewer writes.  Each
  * device is written to in a rotor fashion, sweeping writes through
  * available space then repeating.
  *
  * 7. The L2ARC does not store dirty content.  It never needs to flush
  * write buffers back to disk based storage.
  *
  * 8. If an ARC buffer is written (and dirtied) which also exists in the
  * L2ARC, the now stale L2ARC buffer is immediately dropped.
  *
  * The performance of the L2ARC can be tweaked by a number of tunables, which
  * may be necessary for different workloads:
  *
  *	l2arc_write_max		max write bytes per interval
  *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_headroom		number of max device writes to precache
  *	l2arc_headroom_boost	when we find compressed buffers during ARC
  *				scanning, we multiply headroom by this
  *				percentage factor for the next scan cycle,
  *				since more compressed buffers are likely to
  *				be present
  *	l2arc_feed_secs		seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
  * integrated, and also may become zpool properties.
  *
  * There are three key functions that control how the L2ARC warms up:
  *
  *	l2arc_write_eligible()	check if a buffer is eligible to cache
  *	l2arc_write_size()	calculate how much to write
  *	l2arc_write_interval()	calculate sleep delay between writes
  *
  * These three functions determine what to write, how much, and how quickly
  * to send writes.
  *
  * L2ARC persistence:
  *
  * When writing buffers to L2ARC, we periodically add some metadata to
  * make sure we can pick them up after reboot, thus dramatically reducing
  * the impact that any downtime has on the performance of storage systems
  * with large caches.
  *
  * The implementation works fairly simply by integrating the following two
  * modifications:
  *
  * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
  *    which is an additional piece of metadata which describes what's been
  *    written. This allows us to rebuild the arc_buf_hdr_t structures of the
  *    main ARC buffers. There are 2 linked-lists of log blocks headed by
  *    dh_start_lbps[2]. We alternate which chain we append to, so they are
  *    time-wise and offset-wise interleaved, but that is an optimization rather
  *    than for correctness. The log block also includes a pointer to the
  *    previous block in its chain.
  *
  * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
  *    for our header bookkeeping purposes. This contains a device header,
  *    which contains our top-level reference structures. We update it each
  *    time we write a new log block, so that we're able to locate it in the
  *    L2ARC device. If this write results in an inconsistent device header
  *    (e.g. due to power failure), we detect this by verifying the header's
  *    checksum and simply fail to reconstruct the L2ARC after reboot.
  *
  * Implementation diagram:
  *
  * +=== L2ARC device (not to scale) ======================================+
  * |       ___two newest log block pointers__.__________                  |
  * |      /                                   \dh_start_lbps[1]           |
  * |	 /				       \         \dh_start_lbps[0]|
  * |.___/__.                                    V         V               |
  * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
  * ||   hdr|      ^         /^       /^        /         /                |
  * |+------+  ...--\-------/  \-----/--\------/         /                 |
  * |                \--------------/    \--------------/                  |
  * +======================================================================+
  *
  * As can be seen on the diagram, rather than using a simple linked list,
  * we use a pair of linked lists with alternating elements. This is a
  * performance enhancement due to the fact that we only find out the
  * address of the next log block to access once the current block has been
  * completely read in. With a single list this would hurt performance,
  * because we'd be keeping the device's I/O queue only 1 operation deep,
  * thus incurring a large amount of I/O round-trip latency. Having two lists
  * allows us to fetch two log blocks ahead of where we are currently
  * rebuilding L2ARC buffers.
  *
  * On-device data structures:
  *
  * L2ARC device header:	l2arc_dev_hdr_phys_t
  * L2ARC log block:	l2arc_log_blk_phys_t
  *
  * L2ARC reconstruction:
  *
  * When writing data, we simply write in the standard rotary fashion,
  * evicting buffers as we go and simply writing new data over them (writing
  * a new log block every now and then). This obviously means that once we
  * loop around the end of the device, we will start cutting into an already
  * committed log block (and its referenced data buffers), like so:
  *
  *    current write head__       __old tail
  *                        \     /
  *                        V    V
  * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
  *                         ^    ^^^^^^^^^___________________________________
  *                         |                                                \
  *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
  *
  * When importing the pool, we detect this situation and use it to stop
  * our scanning process (see l2arc_rebuild).
  *
  * There is one significant caveat to consider when rebuilding ARC contents
  * from an L2ARC device: what about invalidated buffers? Given the above
  * construction, we cannot update blocks which we've already written to amend
  * them to remove buffers which were invalidated. Thus, during reconstruction,
  * we might be populating the cache with buffers for data that's not on the
  * main pool anymore, or may have been overwritten!
  *
  * As it turns out, this isn't a problem. Every arc_read request includes
  * both the DVA and, crucially, the birth TXG of the BP the caller is
  * looking for. So even if the cache were populated by completely rotten
  * blocks for data that had been long deleted and/or overwritten, we'll
  * never actually return bad data from the cache, since the DVA together
  * with the birth TXG uniquely identifies a block in space and time - once
  * created, a block is immutable on disk. The worst we have done is wasted
  * some time and memory at l2arc rebuild to reconstruct outdated ARC
  * entries that will get dropped from the l2arc as it is being updated
  * with new blocks.
  *
  * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
  * hand are not restored. This is done by saving the offset (in bytes)
  * l2arc_evict() has evicted to in the L2ARC device header and taking it
  * into account when restoring buffers.
  */
 
 /*
  * Report whether 'hdr' may be written to the L2ARC on behalf of the spa
  * identified by 'spa_guid'.
  *
  * A buffer is *not* eligible for the L2ARC if it:
  * 1. belongs to a different spa.
  * 2. is already cached on the L2ARC.
  * 3. has an I/O in progress (it may be an incomplete read).
  * 4. is flagged not eligible (zfs property).
  *
  * Returns B_TRUE only when none of the above applies.
  */
 static boolean_t
 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 {
 	/* 1. wrong pool */
 	if (hdr->b_spa != spa_guid)
 		return (B_FALSE);
 
 	/* 2. already has an L2ARC copy */
 	if (HDR_HAS_L2HDR(hdr))
 		return (B_FALSE);
 
 	/* 3. contents may still be in flux */
 	if (HDR_IO_IN_PROGRESS(hdr))
 		return (B_FALSE);
 
 	/* 4. caching on L2ARC was not requested for this buffer */
 	if (!HDR_L2CACHE(hdr))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Compute the number of bytes to target for the next write to 'dev'.
  *
  * Starts from l2arc_write_max (repairing it if it was set to zero), adds
  * l2arc_write_boost while the ARC is not yet warm, adds the worst-case log
  * block overhead and, on TRIM-capable devices with l2arc_trim_ahead set,
  * the trim-ahead distance.  The result is capped at a quarter of the
  * device's usable range and rounded up to the vdev's ashift.
  */
 static uint64_t
 l2arc_write_size(l2arc_dev_t *dev)
 {
 	uint64_t wsize = l2arc_write_max;
 
 	/*
 	 * The tunable may have been altered by the user; a value of zero
 	 * is meaningless, so put the default back and say so.
 	 */
 	if (wsize == 0) {
 		cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
 		    "resetting it to the default (%d)", L2ARC_WRITE_SIZE);
 		wsize = l2arc_write_max = L2ARC_WRITE_SIZE;
 	}
 
 	if (!arc_warm)
 		wsize += l2arc_write_boost;
 
 	/* Account for the worst case scenario of log block overhead. */
 	wsize += l2arc_log_blk_overhead(wsize, dev);
 
 	if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
 		/*
 		 * Trim ahead of the write size by 64MB or by
 		 * (l2arc_trim_ahead/100) times the write size, whichever
 		 * is greater.
 		 */
 		uint64_t trim_sz = (wsize * l2arc_trim_ahead) / 100;
 
 		wsize += MAX(64 * 1024 * 1024, trim_sz);
 	}
 
 	/*
 	 * Never let the write size exceed a quarter of the cache device.
 	 * l2arc_evict() relies on this, otherwise it could iterate
 	 * forever.
 	 */
 	wsize = MIN(wsize, (dev->l2ad_end - dev->l2ad_start) / 4);
 
 	return (P2ROUNDUP(wsize, 1ULL << dev->l2ad_vdev->vdev_ashift));
 }
 
 /*
  * Compute the lbolt time at which the next feed cycle should run.
  *
  * 'began' is the lbolt time at which the current cycle started, 'wanted'
  * is how many bytes it aimed to write and 'wrote' is how many it actually
  * wrote.  The returned time is never earlier than the current lbolt.
  */
 static clock_t
 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 {
 	/* Default cadence: the ARC lists look stale, so idle back. */
 	clock_t delay = hz * l2arc_feed_secs;
 
 	/*
 	 * If we managed to write more than half of what we wanted, the ARC
 	 * lists are busy; schedule the next write much sooner.
 	 */
 	if (l2arc_feed_again && wrote > (wanted / 2))
 		delay = (hz * l2arc_feed_min_ms) / 1000;
 
 	clock_t now = ddi_get_lbolt();
 	clock_t from_now = now + delay;
 	clock_t from_start = began + delay;
 	clock_t soonest = MIN(from_now, from_start);
 
 	return (MAX(now, soonest));
 }
 
 static boolean_t
 l2arc_dev_invalid(const l2arc_dev_t *dev)
 {
 	/*
 	 * We want to skip devices that are being rebuilt, trimmed,
 	 * removed, or belong to a spa that is being exported.
 	 */
 	return (dev->l2ad_vdev == NULL || vdev_is_dead(dev->l2ad_vdev) ||
 	    dev->l2ad_rebuild || dev->l2ad_trim_all ||
 	    dev->l2ad_spa == NULL || dev->l2ad_spa->spa_is_exporting);
 }
 
 /*
  * Pick the next usable L2ARC device in round-robin order, starting after
  * the device chosen last time.  This is how L2ARC load balances.
  *
  * Returns NULL when there are no devices or none of them is usable.
  * When a device is returned, the caller also holds that device's spa
  * config lock (SCL_L2ARC, as reader, tagged with the device).
  */
 static l2arc_dev_t *
 l2arc_dev_get_next(void)
 {
 	l2arc_dev_t *start = NULL, *cand = NULL;
 
 	/*
 	 * spa_namespace_lock keeps spas from going away and l2arc_dev_mtx
 	 * keeps cache devices from going away while we choose.  Both are
 	 * released before returning; the spa config lock taken below is
 	 * what protects the chosen device afterwards.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	mutex_enter(&l2arc_dev_mtx);
 
 	/* With no cache vdevs at all there is nothing to choose from. */
 	if (l2arc_ndev != 0) {
 		cand = l2arc_dev_last;
 		for (;;) {
 			/* Step to the next device, wrapping at the end. */
 			if (cand != NULL)
 				cand = list_next(l2arc_dev_list, cand);
 			if (cand == NULL)
 				cand = list_head(l2arc_dev_list);
 
 			/* Stop once we are back where we started. */
 			if (start == NULL)
 				start = cand;
 			else if (cand == start)
 				break;
 
 			ASSERT3P(cand, !=, NULL);
 
 			/* A usable device ends the search. */
 			if (!l2arc_dev_invalid(cand))
 				break;
 		}
 
 		/* A full lap without a usable vdev yields NULL. */
 		if (l2arc_dev_invalid(cand))
 			cand = NULL;
 
 		l2arc_dev_last = cand;
 	}
 
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Take the config lock so the chosen device cannot be removed
 	 * while it is being written to.
 	 */
 	if (cand != NULL)
 		spa_config_enter(cand->l2ad_spa, SCL_L2ARC, cand, RW_READER);
 	mutex_exit(&spa_namespace_lock);
 
 	return (cand);
 }
 
 /*
  * Drain the l2arc_free_on_write list: free the abd of every entry that
  * was tagged for destruction, then free the entry itself.  The whole
  * drain runs under l2arc_free_on_write_mtx.
  */
 static void
 l2arc_do_free_on_write(void)
 {
 	l2arc_data_free_t *entry;
 
 	mutex_enter(&l2arc_free_on_write_mtx);
 	for (entry = list_remove_head(l2arc_free_on_write); entry != NULL;
 	    entry = list_remove_head(l2arc_free_on_write)) {
 		ASSERT3P(entry->l2df_abd, !=, NULL);
 		abd_free(entry->l2df_abd);
 		kmem_free(entry, sizeof (l2arc_data_free_t));
 	}
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * A write to a cache device has completed.  Update all headers to allow
  * reads from these buffers to begin.
  *
  * zio->io_private carries the l2arc_write_callback_t describing the write.
  * The callback, the write-head marker header and the abd buffers used for
  * log blocks are all freed here.  If the zio failed, every header that
  * precedes the write head on the device's buffer list is dropped from the
  * L2ARC, the log block pointers added for this write are removed, the
  * device header's log block pointers are restored from what remains on the
  * list, and the dropped space is returned via vdev_space_update().
  */
 static void
 l2arc_write_done(zio_t *zio)
 {
 	l2arc_write_callback_t	*cb;
 	l2arc_lb_abd_buf_t	*abd_buf;
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 	l2arc_dev_t		*dev;
 	l2arc_dev_hdr_phys_t	*l2dhdr;
 	list_t			*buflist;
 	arc_buf_hdr_t		*head, *hdr, *hdr_prev;
 	kmutex_t		*hash_lock;
 	int64_t			bytes_dropped = 0;
 
 	cb = zio->io_private;
 	ASSERT3P(cb, !=, NULL);
 	dev = cb->l2wcb_dev;
 	/* Check the device pointer before it is dereferenced below. */
 	ASSERT3P(dev, !=, NULL);
 	l2dhdr = dev->l2ad_dev_hdr;
 	head = cb->l2wcb_head;
 	ASSERT3P(head, !=, NULL);
 	buflist = &dev->l2ad_buflist;
 	ASSERT3P(buflist, !=, NULL);
 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
 	    l2arc_write_callback_t *, cb);
 
 	/*
 	 * All writes completed, or an error was hit.
 	 */
 top:
 	mutex_enter(&dev->l2ad_mtx);
 	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We cannot use mutex_enter or else we can deadlock
 		 * with l2arc_write_buffers (due to swapping the order
 		 * the hash lock and l2ad_mtx are taken).
 		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock. We must retry so we
 			 * don't leave the ARC_FLAG_L2_WRITING bit set.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
 
 			/*
 			 * We don't want to rescan the headers we've
 			 * already marked as having been written out, so
 			 * we reinsert the head node so we can pick up
 			 * where we left off.
 			 */
 			list_remove(buflist, head);
 			list_insert_after(buflist, hdr, head);
 
 			mutex_exit(&dev->l2ad_mtx);
 
 			/*
 			 * We wait for the hash lock to become available
 			 * to try and prevent busy waiting, and increase
 			 * the chance we'll be able to acquire the lock
 			 * the next time around.
 			 */
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto top;
 		}
 
 		/*
 		 * We could not have been moved into the arc_l2c_only
 		 * state while in-flight due to our ARC_FLAG_L2_WRITING
 		 * bit being set. Let's just ensure that's being enforced.
 		 */
 		ASSERT(HDR_HAS_L1HDR(hdr));
 
 		if (zio->io_error != 0) {
 			/*
 			 * The write failed - drop the L2ARC entry for this
 			 * header and account for the space it occupied.
 			 */
 			list_remove(buflist, hdr);
 			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
 
 			uint64_t psize = HDR_GET_PSIZE(hdr);
 			l2arc_hdr_arcstats_decrement(hdr);
 
 			ASSERT(dev->l2ad_vdev != NULL);
 
 			bytes_dropped +=
 			    vdev_psize_to_asize(dev->l2ad_vdev, psize);
 			(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
 			    arc_hdr_size(hdr), hdr);
 		}
 
 		/*
 		 * Allow ARC to begin reads and ghost list evictions to
 		 * this L2ARC entry.
 		 */
 		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
 
 		mutex_exit(hash_lock);
 	}
 
 	/*
 	 * Free the allocated abd buffers for writing the log blocks.
 	 * If the zio failed reclaim the allocated space and remove the
 	 * pointers to these log blocks from the log block pointer list
 	 * of the L2ARC device.
 	 */
 	while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
 		abd_free(abd_buf->abd);
 		zio_buf_free(abd_buf, sizeof (*abd_buf));
 		if (zio->io_error != 0) {
 			lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
 			/*
 			 * L2BLK_GET_PSIZE returns aligned size for log
 			 * blocks.
 			 */
 			uint64_t asize =
 			    L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
 			bytes_dropped += asize;
 			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
 			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
 			(void) zfs_refcount_remove_many(&dev->l2ad_lb_asize,
 			    asize, lb_ptr_buf);
 			(void) zfs_refcount_remove(&dev->l2ad_lb_count,
 			    lb_ptr_buf);
 			kmem_free(lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
 		}
 	}
 	list_destroy(&cb->l2wcb_abd_list);
 
 	if (zio->io_error != 0) {
 		ARCSTAT_BUMP(arcstat_l2_writes_error);
 
 		/*
 		 * Restore the lbps array in the header to its previous state.
 		 * If the list of log block pointers is empty, zero out the
 		 * log block pointers in the device header.
 		 */
 		lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
 		for (int i = 0; i < 2; i++) {
 			if (lb_ptr_buf == NULL) {
 				/*
 				 * If the list is empty zero out the device
 				 * header. Otherwise zero out the second log
 				 * block pointer in the header.
 				 */
 				if (i == 0) {
 					memset(l2dhdr, 0,
 					    dev->l2ad_dev_hdr_asize);
 				} else {
 					memset(&l2dhdr->dh_start_lbps[i], 0,
 					    sizeof (l2arc_log_blkptr_t));
 				}
 				break;
 			}
 			memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
 			    lb_ptr_buf);
 		}
 	}
 
 	/* The write-head marker has served its purpose; retire it. */
 	ARCSTAT_BUMP(arcstat_l2_writes_done);
 	list_remove(buflist, head);
 	ASSERT(!HDR_HAS_L1HDR(head));
 	kmem_cache_free(hdr_l2only_cache, head);
 	mutex_exit(&dev->l2ad_mtx);
 
 	/* Give back the space of everything dropped above (0 on success). */
 	ASSERT(dev->l2ad_vdev != NULL);
 	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
 
 	l2arc_do_free_on_write();
 
 	kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
 
 /*
  * Undo the on-device transforms of a block just read from the L2ARC so
  * that hdr->b_l1hdr.b_pabd ends up in the form the ARC expects: decrypt
  * it if the bp is encrypted, and decompress it if the header is
  * compressed but compressed ARC is not enabled for it.
  *
  * Whenever b_pabd is replaced, zio->io_abd is pointed at the new buffer
  * (and zio->io_size is updated after decompression).  The caller must
  * hold the header's hash lock.
  *
  * Returns 0 on success, or the error reported by the decryption or
  * decompression routine; on error the scratch buffer is released and
  * b_pabd is left as it was before the failing step.
  */
 static int
 l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
 {
 	arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
 	blkptr_t *bp = zio->io_bp;
 	spa_t *spa = zio->io_spa;
 	int err;
 
 	/*
 	 * ZIL data is never written to the L2ARC, so we don't need
 	 * special handling for its unique MAC storage.
 	 */
 	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	/*
 	 * Decrypt first, if needed.  The bp is consulted rather than the
 	 * hdr because the hdr's encryption parameters are not updated
 	 * until arc_read_done().
 	 */
 	if (BP_IS_ENCRYPTED(bp)) {
 		uint8_t salt[ZIO_DATA_SALT_LEN];
 		uint8_t iv[ZIO_DATA_IV_LEN];
 		uint8_t mac[ZIO_DATA_MAC_LEN];
 		boolean_t no_crypt = B_FALSE;
 		abd_t *plain = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
 		    ARC_HDR_USE_RESERVE);
 
 		zio_crypt_decode_params_bp(bp, salt, iv);
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		err = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
 		    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
 		    salt, iv, mac, HDR_GET_PSIZE(hdr), plain,
 		    hdr->b_l1hdr.b_pabd, &no_crypt);
 		if (err != 0) {
 			arc_free_data_abd(hdr, plain, arc_hdr_size(hdr), hdr);
 			return (err);
 		}
 
 		if (no_crypt) {
 			/* Nothing was decrypted; discard the scratch abd. */
 			arc_free_data_abd(hdr, plain, arc_hdr_size(hdr), hdr);
 		} else {
 			/* Swap the decrypted data in for b_pabd. */
 			arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 			    arc_hdr_size(hdr), hdr);
 			hdr->b_l1hdr.b_pabd = plain;
 			zio->io_abd = plain;
 		}
 	}
 
 	/*
 	 * A block that is compressed on the L2ARC, while ARC compression
 	 * is disabled for the header, is decompressed into a fresh buffer
 	 * that then replaces the existing data.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		abd_t *inflated = arc_get_data_abd(hdr, arc_hdr_size(hdr),
 		    hdr, ARC_HDR_USE_RESERVE);
 
 		err = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, inflated, HDR_GET_PSIZE(hdr),
 		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 		if (err != 0) {
 			arc_free_data_abd(hdr, inflated, arc_hdr_size(hdr),
 			    hdr);
 			return (err);
 		}
 
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 		hdr->b_l1hdr.b_pabd = inflated;
 		zio->io_abd = inflated;
 		zio->io_size = HDR_GET_LSIZE(hdr);
 	}
 
 	return (0);
 }
 
 
 /*
  * A read to a cache device completed.  Validate buffer contents before
  * handing over to the regular ARC routines.
  *
  * zio->io_private carries the l2arc_read_callback_t for the read; it is
  * freed here.  The SCL_L2ARC config lock held on behalf of the read is
  * released.  If the data passes the checksum (and, unless the raw
  * encrypted buffer is in use, can be untransformed), the zio is passed to
  * arc_read_done().  Otherwise the failure is counted and, when nobody is
  * waiting on the zio, a read of the original block is issued to the
  * primary storage; a waiter is expected to reissue the read itself.
  */
 static void
 l2arc_read_done(zio_t *zio)
 {
 	int tfm_error = 0;
 	l2arc_read_callback_t *cb = zio->io_private;
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	boolean_t valid_cksum;
 	boolean_t using_rdata;
 
 	/* Validate the callback before anything below dereferences it. */
 	ASSERT3P(cb, !=, NULL);
 	ASSERT3P(zio->io_vd, !=, NULL);
 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
 
 	/* Raw reads of encrypted blocks land in b_rabd, not b_pabd. */
 	using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
 	    (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));
 
 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
 	hdr = cb->l2rcb_hdr;
 	ASSERT3P(hdr, !=, NULL);
 
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
 	 * If the data was read into a temporary buffer,
 	 * move it and free the buffer.
 	 */
 	if (cb->l2rcb_abd != NULL) {
 		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
 		if (zio->io_error == 0) {
 			if (using_rdata) {
 				abd_copy(hdr->b_crypt_hdr.b_rabd,
 				    cb->l2rcb_abd, arc_hdr_size(hdr));
 			} else {
 				abd_copy(hdr->b_l1hdr.b_pabd,
 				    cb->l2rcb_abd, arc_hdr_size(hdr));
 			}
 		}
 
 		/*
 		 * The following must be done regardless of whether
 		 * there was an error:
 		 * - free the temporary buffer
 		 * - point zio to the real ARC buffer
 		 * - set zio size accordingly
 		 * These are required because zio is either re-used for
 		 * an I/O of the block in the case of the error
 		 * or the zio is passed to arc_read_done() and it
 		 * needs real data.
 		 */
 		abd_free(cb->l2rcb_abd);
 		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
 
 		if (using_rdata) {
 			ASSERT(HDR_HAS_RABD(hdr));
 			zio->io_abd = zio->io_orig_abd =
 			    hdr->b_crypt_hdr.b_rabd;
 		} else {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
 		}
 	}
 
 	ASSERT3P(zio->io_abd, !=, NULL);
 
 	/*
 	 * Check this survived the L2ARC journey.
 	 */
 	ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
 	    (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
 	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_prop.zp_complevel = hdr->b_complevel;
 
 	valid_cksum = arc_cksum_is_equal(hdr, zio);
 
 	/*
 	 * b_rabd will always match the data as it exists on disk if it is
 	 * being used. Therefore if we are reading into b_rabd we do not
 	 * attempt to untransform the data.
 	 */
 	if (valid_cksum && !using_rdata)
 		tfm_error = l2arc_untransform(zio, cb);
 
 	if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
 	    !HDR_L2_EVICTED(hdr)) {
 		mutex_exit(hash_lock);
 		zio->io_private = hdr;
 		arc_read_done(zio);
 	} else {
 		/*
 		 * Buffer didn't survive caching.  Increment stats and
 		 * reissue to the original storage device.
 		 */
 		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
 		} else {
 			zio->io_error = SET_ERROR(EIO);
 		}
 		if (!valid_cksum || tfm_error != 0)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
 		/*
 		 * If there's no waiter, issue an async i/o to the primary
 		 * storage now.  If there *is* a waiter, the caller must
 		 * issue the i/o in a context where it's OK to block.
 		 */
 		if (zio->io_waiter == NULL) {
 			zio_t *pio = zio_unique_parent(zio);
 			abd_t *abd = (using_rdata) ?
 			    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
 
 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
 			zio = zio_read(pio, zio->io_spa, zio->io_bp,
 			    abd, zio->io_size, arc_read_done,
 			    hdr, zio->io_priority, cb->l2rcb_flags,
 			    &cb->l2rcb_zb);
 
 			/*
 			 * Original ZIO will be freed, so we need to update
 			 * ARC header with the new ZIO pointer to be used
 			 * by zio_change_priority() in arc_read().
 			 */
 			for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
 			    acb != NULL; acb = acb->acb_next)
 				acb->acb_zio_head = zio;
 
 			mutex_exit(hash_lock);
 			zio_nowait(zio);
 		} else {
 			mutex_exit(hash_lock);
 		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
 }
 
 /*
  * Map a feed pass number to the ARC list the L2ARC should scan for
  * buffers to cache, and return one of that list's sublists, locked.
  * Callers loop over list_num (0..3); the order can have a significant
  * effect on cache performance:
  *
  *	0: MFU metadata
  *	1: MRU metadata
  *	2: MFU data
  *	3: MRU data
  *
  * Returns NULL for any other list_num.
  */
 static multilist_sublist_t *
 l2arc_sublist_lock(int list_num)
 {
 	multilist_t *ml = NULL;
 	unsigned int idx;
 
 	ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
 
 	if (list_num == 0)
 		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
 	else if (list_num == 1)
 		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
 	else if (list_num == 2)
 		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
 	else if (list_num == 3)
 		ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
 	else
 		return (NULL);
 
 	/*
 	 * Any sublist will do, so pick one at random.  The caller only
 	 * feeds a little bit of data per call (8MB), and later calls will
 	 * land on different sublists.
 	 */
 	idx = multilist_get_random_index(ml);
 	return (multilist_sublist_lock_idx(ml, idx));
 }
 
 /*
  * Return the maximum number of bytes that L2ARC metadata log blocks can
  * add to a write of 'write_sz' bytes on 'dev'.  l2arc_evict and
  * l2arc_write_size include this overhead so that enough headroom is
  * available when writing buffers.
  *
  * The worst case assumes one log entry per SPA_MINBLOCKSHIFT-sized unit
  * of the write.  A device with no log entries has no overhead.
  */
 static inline uint64_t
 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
 {
 	if (dev->l2ad_log_entries == 0)
 		return (0);
 
 	ASSERT(dev->l2ad_vdev != NULL);
 
 	/* Most entries this write could generate. */
 	uint64_t nentries = write_sz >> SPA_MINBLOCKSHIFT;
 
 	/* Log blocks needed to hold them, rounded up. */
 	uint64_t nblocks = (nentries + dev->l2ad_log_entries - 1) /
 	    dev->l2ad_log_entries;
 
 	/* Allocated size of a single log block on this vdev. */
 	uint64_t blk_asize = vdev_psize_to_asize(dev->l2ad_vdev,
 	    sizeof (l2arc_log_blk_phys_t));
 
 	return (blk_asize * nblocks);
 }
 
 /*
  * Evict buffers from the device write hand to the distance specified in
  * bytes. This distance may span populated buffers, it may span nothing.
  * This is clearing a region on the L2ARC device ready for writing.
  * If the 'all' boolean is set, every buffer is evicted.
  *
  * dev:      the L2ARC device to evict from.
  * distance: number of bytes ahead of dev->l2ad_hand to clear.
  * all:      evict everything; in this mode the trim step and the
  *           hand/evict bookkeeping are skipped, and dev->l2ad_vdev may
  *           be NULL.
  *
  * When the requested range runs past dev->l2ad_end (and 'all' is not
  * set), the function evicts to the end of the device, resets both
  * l2ad_hand and l2ad_evict to l2ad_start, clears l2ad_first and repeats.
  *
  * Takes dev->l2ad_mtx while walking the lists.  When trimming, the
  * SCL_L2ARC config lock (tagged with dev) is dropped and re-taken as
  * reader around the trim call.
  */
 static void
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
 	list_t *buflist;
 	arc_buf_hdr_t *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 	l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
 	vdev_t *vd = dev->l2ad_vdev;
 	boolean_t rerun;
 
 	ASSERT(vd != NULL || all);
 	ASSERT(dev->l2ad_spa != NULL || all);
 
 	buflist = &dev->l2ad_buflist;
 
 top:
 	rerun = B_FALSE;
 	if (dev->l2ad_hand + distance > dev->l2ad_end) {
 		/*
 		 * When there is no space to accommodate upcoming writes,
 		 * evict to the end. Then bump the write and evict hands
 		 * to the start and iterate. This iteration does not
 		 * happen indefinitely as we make sure in
 		 * l2arc_write_size() that when the write hand is reset,
 		 * the write size does not exceed the end of the device.
 		 */
 		rerun = B_TRUE;
 		taddr = dev->l2ad_end;
 	} else {
 		taddr = dev->l2ad_hand + distance;
 	}
 	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
 	    uint64_t, taddr, boolean_t, all);
 
 	if (!all) {
 		/*
 		 * This check has to be placed after deciding whether to
 		 * iterate (rerun).
 		 */
 		if (dev->l2ad_first) {
 			/*
 			 * This is the first sweep through the device. There is
 			 * nothing to evict. We have already trimmed the
 			 * whole device.
 			 */
 			goto out;
 		} else {
 			/*
 			 * Trim the space to be evicted.
 			 */
 			if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
 			    l2arc_trim_ahead > 0) {
 				/*
 				 * We have to drop the spa_config lock because
 				 * vdev_trim_range() will acquire it.
 				 * l2ad_evict already accounts for the label
 				 * size. To prevent vdev_trim_ranges() from
 				 * adding it again, we subtract it from
 				 * l2ad_evict.
 				 */
 				spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
 				vdev_trim_simple(vd,
 				    dev->l2ad_evict - VDEV_LABEL_START_SIZE,
 				    taddr - dev->l2ad_evict);
 				spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
 				    RW_READER);
 			}
 
 			/*
 			 * When rebuilding L2ARC we retrieve the evict hand
 			 * from the header of the device. Of note, l2arc_evict()
 			 * does not actually delete buffers from the cache
 			 * device, but trimming may do so depending on the
 			 * hardware implementation. Thus keeping track of the
 			 * evict hand is useful.
 			 */
 			dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
 		}
 	}
 
 retry:
 	mutex_enter(&dev->l2ad_mtx);
 	/*
 	 * We have to account for evicted log blocks. Run vdev_space_update()
 	 * on log blocks whose offset (in bytes) is before the evicted offset
 	 * (in bytes) by searching in the list of pointers to log blocks
 	 * present in the L2ARC device.
 	 */
 	for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
 	    lb_ptr_buf = lb_ptr_buf_prev) {
 
 		/* Fetch the predecessor first; lb_ptr_buf may be freed. */
 		lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
 
 		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 		uint64_t asize = L2BLK_GET_PSIZE(
 		    (lb_ptr_buf->lb_ptr)->lbp_prop);
 
 		/*
 		 * We don't worry about log blocks left behind (ie
 		 * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
 		 * will never write more than l2arc_evict() evicts.
 		 */
 		if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
 			break;
 		} else {
 			if (vd != NULL)
 				vdev_space_update(vd, -asize, 0, 0);
 			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
 			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
 			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
 			    lb_ptr_buf);
 			(void) zfs_refcount_remove(&dev->l2ad_lb_count,
 			    lb_ptr_buf);
 			list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
 			kmem_free(lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
 		}
 	}
 
 	/* Now walk the buffer headers themselves, from the list tail. */
 	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		ASSERT(!HDR_EMPTY(hdr));
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We cannot use mutex_enter or else we can deadlock
 		 * with l2arc_write_buffers (due to swapping the order
 		 * the hash lock and l2ad_mtx are taken).
 		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock.  Retry.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
 			mutex_exit(&dev->l2ad_mtx);
 			/* Block until the holder lets go, then start over. */
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto retry;
 		}
 
 		/*
 		 * A header can't be on this list if it doesn't have L2 header.
 		 */
 		ASSERT(HDR_HAS_L2HDR(hdr));
 
 		/* Ensure this header has finished being written. */
 		ASSERT(!HDR_L2_WRITING(hdr));
 		ASSERT(!HDR_L2_WRITE_HEAD(hdr));
 
 		if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
 		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
 			/*
 			 * We've evicted to the target address,
 			 * or the end of the device.
 			 */
 			mutex_exit(hash_lock);
 			break;
 		}
 
 		if (!HDR_HAS_L1HDR(hdr)) {
 			ASSERT(!HDR_L2_READING(hdr));
 			/*
 			 * This doesn't exist in the ARC.  Destroy.
 			 * arc_hdr_destroy() will call list_remove()
 			 * and decrement arcstat_l2_lsize.
 			 */
 			arc_change_state(arc_anon, hdr);
 			arc_hdr_destroy(hdr);
 		} else {
 			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
 			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
 			/*
 			 * Invalidate issued or about to be issued
 			 * reads, since we may be about to write
 			 * over this location.
 			 */
 			if (HDR_L2_READING(hdr)) {
 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
 				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
 			}
 
 			arc_hdr_l2hdr_destroy(hdr);
 		}
 		mutex_exit(hash_lock);
 	}
 	mutex_exit(&dev->l2ad_mtx);
 
 out:
 	/*
 	 * We need to check if we evict all buffers, otherwise we may iterate
 	 * unnecessarily.
 	 */
 	if (!all && rerun) {
 		/*
 		 * Bump device hand to the device start if it is approaching the
 		 * end. l2arc_evict() has already evicted ahead for this case.
 		 */
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
 		goto top;
 	}
 
 	if (!all) {
 		/*
 		 * In case of cache device removal (all) the following
 		 * assertions may be violated without functional consequences
 		 * as the device is about to be removed.
 		 */
 		ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end);
 		if (!dev->l2ad_first)
 			ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
 	}
 }
 
 /*
  * Handle any abd transforms that might be required for writing to the L2ARC.
  * If successful, this function will always return an abd with the data
  * transformed as it is on disk in a new abd of asize bytes.
  *
  * spa:     pool whose keystore is consulted when the header is encrypted.
  * hdr:     header whose data is to be written.
  * asize:   allocated size of the block on the cache device; must be at
  *          least the header's psize.  Any bytes past the data are zeroed.
  * abd_out: on success, set to the newly allocated abd (never the header's
  *          own b_pabd).
  *
  * Returns 0 on success.  Returns EIO, without writing *abd_out, when the
  * data cannot be re-compressed into the original psize.  On a keystore
  * lookup or encryption failure returns that error and sets *abd_out to
  * NULL.  All intermediate buffers are freed on failure.
  */
 static int
 l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
     abd_t **abd_out)
 {
 	int ret;
 	abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
 	enum zio_compress compress = HDR_GET_COMPRESS(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	uint64_t size = arc_hdr_size(hdr);
 	boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 	dsl_crypto_key_t *dck = NULL;
 	uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
 	boolean_t no_crypt = B_FALSE;
 
 	/* The caller should only get here when some transform is needed. */
 	ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) ||
 	    HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
 	ASSERT3U(psize, <=, asize);
 
 	/*
 	 * If this data simply needs its own buffer, we simply allocate it
 	 * and copy the data. This may be done to eliminate a dependency on a
 	 * shared buffer or to reallocate the buffer to match asize.
 	 */
 	if (HDR_HAS_RABD(hdr)) {
 		/* Copy the raw buffer as-is and zero the tail up to asize. */
 		ASSERT3U(asize, >, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
 		abd_zero_off(to_write, psize, asize - psize);
 		goto out;
 	}
 
 	/*
 	 * Neither compression nor encryption has to be applied: b_pabd is
 	 * copied unchanged and padded with zeroes.
 	 */
 	if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
 	    !HDR_ENCRYPTED(hdr)) {
 		ASSERT3U(size, ==, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
 		if (asize > size)
 			abd_zero_off(to_write, size, asize - size);
 		goto out;
 	}
 
 	/*
 	 * The header is marked compressed but compressed ARC is not enabled
 	 * for it, so the data has to be compressed again before writing.
 	 */
 	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
 		cabd = abd_alloc_for_io(MAX(size, asize), ismd);
 		uint64_t csize = zio_compress_data(compress, to_write, &cabd,
 		    size, MIN(size, psize), hdr->b_complevel);
 		if (csize >= size || csize > psize) {
 			/*
 			 * We can't re-compress the block into the original
 			 * psize.  Even if it fits into asize, it does not
 			 * matter, since checksum will never match on read.
 			 */
 			abd_free(cabd);
 			return (SET_ERROR(EIO));
 		}
 		if (asize > csize)
 			abd_zero_off(cabd, csize, asize - csize);
 		to_write = cabd;
 	}
 
 	if (HDR_ENCRYPTED(hdr)) {
 		eabd = abd_alloc_for_io(asize, ismd);
 
 		/*
 		 * If the dataset was disowned before the buffer
 		 * made it to this point, the key to re-encrypt
 		 * it won't be available. In this case we simply
 		 * won't write the buffer to the L2ARC.
 		 */
 		ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
 		    FTAG, &dck);
 		if (ret != 0)
 			goto error;
 
 		ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
 		    hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
 		    &no_crypt);
 		if (ret != 0)
 			goto error;
 
 		/* Nothing was encrypted; carry the input over unchanged. */
 		if (no_crypt)
 			abd_copy(eabd, to_write, psize);
 
 		if (psize != asize)
 			abd_zero_off(eabd, psize, asize - psize);
 
 		/* assert that the MAC we got here matches the one we saved */
 		ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
 
 		/* The compressed intermediate is no longer needed. */
 		if (to_write == cabd)
 			abd_free(cabd);
 
 		to_write = eabd;
 	}
 
 out:
 	ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
 	*abd_out = to_write;
 	return (0);
 
 error:
 	/* Release the key hold and whichever buffers were allocated. */
 	if (dck != NULL)
 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
 	if (cabd != NULL)
 		abd_free(cabd);
 	if (eabd != NULL)
 		abd_free(eabd);
 
 	*abd_out = NULL;
 	return (ret);
 }
 
 /*
  * Completion callback for a block fetch: release the read callback stored
  * in zio->io_private, together with its temporary abd if one was
  * attached.
  */
 static void
 l2arc_blk_fetch_done(zio_t *zio)
 {
 	l2arc_read_callback_t *rcb = zio->io_private;
 	abd_t *tmp = rcb->l2rcb_abd;
 
 	if (tmp != NULL)
 		abd_free(tmp);
 	kmem_free(rcb, sizeof (l2arc_read_callback_t));
 }
 
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
  * The headroom_boost is an in-out parameter used to maintain headroom boost
  * state between calls to this function.
  *
  * Returns the number of bytes actually written (which may be smaller than
  * the delta by which the device hand has changed due to alignment and the
  * writing of log blocks).
  */
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
 	arc_buf_hdr_t 		*hdr, *head, *marker;
 	uint64_t 		write_asize, write_psize, headroom;
 	boolean_t		full, from_head = !arc_warm;
 	l2arc_write_callback_t	*cb = NULL;
 	zio_t 			*pio, *wzio;
 	uint64_t 		guid = spa_load_guid(spa);
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 
 	ASSERT3P(dev->l2ad_vdev, !=, NULL);
 
 	pio = NULL;
 	write_asize = write_psize = 0;
 	full = B_FALSE;
 	/*
 	 * head is a dummy header, flagged ARC_FLAG_L2_WRITE_HEAD, that is put
 	 * on dev->l2ad_buflist just before the first buffer of this run and
 	 * handed to the write callback through cb->l2wcb_head.  marker is a
 	 * placeholder used to remember our position in an ARC sublist while
 	 * the sublist lock is dropped.
 	 */
 	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
 	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
 	marker = arc_state_alloc_marker();
 
 	/*
 	 * Copy buffers for L2ARC writing.
 	 */
 	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
 		/*
 		 * pass == 0: MFU meta
 		 * pass == 1: MRU meta
 		 * pass == 2: MFU data
 		 * pass == 3: MRU data
 		 *
 		 * l2arc_mfuonly == 1 skips both MRU passes; any larger value
 		 * skips only the MRU data pass.
 		 */
 		if (l2arc_mfuonly == 1) {
 			if (pass == 1 || pass == 3)
 				continue;
 		} else if (l2arc_mfuonly > 1) {
 			if (pass == 3)
 				continue;
 		}
 
 		/*
 		 * headroom bounds how far (in logical bytes, accumulated in
 		 * passed_sz) this pass may scan into the sublist.
 		 */
 		uint64_t passed_sz = 0;
 		headroom = target_sz * l2arc_headroom;
 		if (zfs_compressed_arc_enabled)
 			headroom = (headroom * l2arc_headroom_boost) / 100;
 
 		/*
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
 		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
 		ASSERT3P(mls, !=, NULL);
 		if (from_head)
 			hdr = multilist_sublist_head(mls);
 		else
 			hdr = multilist_sublist_tail(mls);
 
 		while (hdr != NULL) {
 			kmutex_t *hash_lock;
 			abd_t *to_write = NULL;
 
 			hash_lock = HDR_LOCK(hdr);
 			if (!mutex_tryenter(hash_lock)) {
 skip:
 				/*
 				 * Skip this buffer rather than waiting.
 				 * Also reached via "goto skip" for headers
 				 * that are not eligible; in that case the
 				 * hash lock has already been released.
 				 */
 				if (from_head)
 					hdr = multilist_sublist_next(mls, hdr);
 				else
 					hdr = multilist_sublist_prev(mls, hdr);
 				continue;
 			}
 
 			/* A zero l2arc_headroom disables the scan limit. */
 			passed_sz += HDR_GET_LSIZE(hdr);
 			if (l2arc_headroom != 0 && passed_sz > headroom) {
 				/*
 				 * Searched too far.
 				 */
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			if (!l2arc_write_eligible(guid, hdr)) {
 				mutex_exit(hash_lock);
 				goto skip;
 			}
 
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
 			ASSERT3U(arc_hdr_size(hdr), >, 0);
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
 			    HDR_HAS_RABD(hdr));
 			uint64_t psize = HDR_GET_PSIZE(hdr);
 			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
 			    psize);
 
 			/*
 			 * If the allocated size of this buffer plus the max
 			 * size for the pending log block exceeds the evicted
 			 * target size, terminate writing buffers for this run.
 			 */
 			if (write_asize + asize +
 			    sizeof (l2arc_log_blk_phys_t) > target_sz) {
 				full = B_TRUE;
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			/*
 			 * We should not sleep with sublist lock held or it
 			 * may block ARC eviction.  Insert a marker to save
 			 * the position and drop the lock.
 			 */
 			if (from_head) {
 				multilist_sublist_insert_after(mls, hdr,
 				    marker);
 			} else {
 				multilist_sublist_insert_before(mls, hdr,
 				    marker);
 			}
 			multilist_sublist_unlock(mls);
 
 			/*
 			 * If this header has b_rabd, we can use this since it
 			 * must always match the data exactly as it exists on
 			 * disk. Otherwise, the L2ARC can normally use the
 			 * hdr's data, but if we're sharing data between the
 			 * hdr and one of its bufs, L2ARC needs its own copy of
 			 * the data so that the ZIO below can't race with the
 			 * buf consumer. To ensure that this copy will be
 			 * available for the lifetime of the ZIO and be cleaned
 			 * up afterwards, we add it to the l2arc_free_on_write
 			 * queue. If we need to apply any transforms to the
 			 * data (compression, encryption) we will also need the
 			 * extra buffer.
 			 */
 			if (HDR_HAS_RABD(hdr) && psize == asize) {
 				to_write = hdr->b_crypt_hdr.b_rabd;
 			} else if ((HDR_COMPRESSION_ENABLED(hdr) ||
 			    HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
 			    !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
 			    psize == asize) {
 				to_write = hdr->b_l1hdr.b_pabd;
 			} else {
 				int ret;
 				arc_buf_contents_t type = arc_buf_type(hdr);
 
 				ret = l2arc_apply_transforms(spa, hdr, asize,
 				    &to_write);
 				if (ret != 0) {
 					/*
 					 * The transform failed: clear
 					 * ARC_FLAG_L2CACHE on this header and
 					 * resume the scan from the marker.
 					 */
 					arc_hdr_clear_flags(hdr,
 					    ARC_FLAG_L2CACHE);
 					mutex_exit(hash_lock);
 					goto next;
 				}
 
 				l2arc_free_abd_on_write(to_write, asize, type);
 			}
 
 			/*
 			 * Record where on the device this buffer will live
 			 * (the current write hand) and mark it as being
 			 * written.
 			 */
 			hdr->b_l2hdr.b_dev = dev;
 			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
 			hdr->b_l2hdr.b_hits = 0;
 			hdr->b_l2hdr.b_arcs_state =
 			    hdr->b_l1hdr.b_state->arcs_state;
 			/* l2arc_hdr_arcstats_update() expects a valid asize */
 			HDR_SET_L2SIZE(hdr, asize);
 			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
 			    ARC_FLAG_L2_WRITING);
 
 			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 			    arc_hdr_size(hdr), hdr);
 			l2arc_hdr_arcstats_increment(hdr);
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 			mutex_enter(&dev->l2ad_mtx);
 			if (pio == NULL) {
 				/*
 				 * Insert a dummy header on the buflist so
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
 				list_insert_head(&dev->l2ad_buflist, head);
 			}
 			list_insert_head(&dev->l2ad_buflist, hdr);
 			mutex_exit(&dev->l2ad_mtx);
 
 			/*
 			 * Add this buffer to the pending log block; the
 			 * return value decides below whether the log block
 			 * is committed after this buffer's write is issued.
 			 */
 			boolean_t commit = l2arc_log_blk_insert(dev, hdr);
 			mutex_exit(hash_lock);
 
 			/*
 			 * First buffer of this run: create the callback
 			 * state and the root zio that parents every write
 			 * issued below.
 			 */
 			if (pio == NULL) {
 				cb = kmem_alloc(
 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
 				cb->l2wcb_dev = dev;
 				cb->l2wcb_head = head;
 				list_create(&cb->l2wcb_abd_list,
 				    sizeof (l2arc_lb_abd_buf_t),
 				    offsetof(l2arc_lb_abd_buf_t, node));
 				pio = zio_root(spa, l2arc_write_done, cb,
 				    ZIO_FLAG_CANFAIL);
 			}
 
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
 			    dev->l2ad_hand, asize, to_write,
 			    ZIO_CHECKSUM_OFF, NULL, hdr,
 			    ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_CANFAIL, B_FALSE);
 
 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
 			    zio_t *, wzio);
 			zio_nowait(wzio);
 
 			write_psize += psize;
 			write_asize += asize;
 			dev->l2ad_hand += asize;
 
 			if (commit) {
 				/* l2ad_hand will be adjusted inside. */
 				write_asize +=
 				    l2arc_log_blk_commit(dev, pio, cb);
 			}
 
 next:
 			/*
 			 * Re-take the sublist lock, continue from the
 			 * marker's neighbour and unlink the marker.
 			 */
 			multilist_sublist_lock(mls);
 			if (from_head)
 				hdr = multilist_sublist_next(mls, marker);
 			else
 				hdr = multilist_sublist_prev(mls, marker);
 			multilist_sublist_remove(mls, marker);
 		}
 
 		multilist_sublist_unlock(mls);
 
 		/* The target size is used up; skip the remaining passes. */
 		if (full == B_TRUE)
 			break;
 	}
 
 	arc_state_free_marker(marker);
 
 	/* No buffers selected for writing? */
 	if (pio == NULL) {
 		/*
 		 * head is only linked on the buflist right before pio is
 		 * created, so here it was never inserted and can be freed
 		 * directly.
 		 */
 		ASSERT0(write_psize);
 		ASSERT(!HDR_HAS_L1HDR(head));
 		kmem_cache_free(hdr_l2only_cache, head);
 
 		/*
 		 * Although we did not write any buffers l2ad_evict may
 		 * have advanced.
 		 */
 		if (dev->l2ad_evict != l2dhdr->dh_evict)
 			l2arc_dev_hdr_update(dev);
 
 		return (0);
 	}
 
 	if (!dev->l2ad_first)
 		ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
 
 	ASSERT3U(write_asize, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
 
 	/*
 	 * Flag the device as writing for the duration of the synchronous
 	 * wait on the root zio.
 	 */
 	dev->l2ad_writing = B_TRUE;
 	(void) zio_wait(pio);
 	dev->l2ad_writing = B_FALSE;
 
 	/*
 	 * Update the device header after the zio completes as
 	 * l2arc_write_done() may have updated the memory holding the log block
 	 * pointers in the device header.
 	 */
 	l2arc_dev_hdr_update(dev);
 
 	return (write_asize);
 }
 
 static boolean_t
 l2arc_hdr_limit_reached(void)
 {
 	int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size);
 
 	return (arc_reclaim_needed() ||
 	    (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
 }
 
 /*
  * This thread feeds the L2ARC at regular intervals.  This is the beating
  * heart of the L2ARC.
  *
  * Each iteration sleeps on l2arc_feed_thr_cv until the deadline `next',
  * selects a device with l2arc_dev_get_next(), evicts the region about to be
  * overwritten and then writes eligible ARC buffers to it.  The loop runs
  * until l2arc_thread_exit becomes non-zero; the thread then resets the flag,
  * broadcasts on the cv and exits.  The argument is unused.
  */
 static  __attribute__((noreturn)) void
 l2arc_feed_thread(void *unused)
 {
 	(void) unused;
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
 	uint64_t size, wrote;
 	clock_t begin, next = ddi_get_lbolt();
 	fstrans_cookie_t cookie;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&l2arc_feed_thr_lock);
 
 	cookie = spl_fstrans_mark();
 	while (l2arc_thread_exit == 0) {
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait_idle(&l2arc_feed_thr_cv,
 		    &l2arc_feed_thr_lock, next);
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
 		/*
 		 * Default deadline for the next wakeup: hz ticks from now.
 		 * The paths below that reach the end of the loop body or
 		 * find a read-only pool override it.
 		 */
 		next = ddi_get_lbolt() + hz;
 
 		/*
 		 * Quick check for L2ARC devices.
 		 */
 		mutex_enter(&l2arc_dev_mtx);
 		if (l2arc_ndev == 0) {
 			mutex_exit(&l2arc_dev_mtx);
 			continue;
 		}
 		mutex_exit(&l2arc_dev_mtx);
 		begin = ddi_get_lbolt();
 
 		/*
 		 * This selects the next l2arc device to write to, and in
 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
 		 * will return NULL if there are now no l2arc devices or if
 		 * they are all faulted.
 		 *
 		 * If a device is returned, its spa's config lock is also
 		 * held to prevent device removal.  l2arc_dev_get_next()
 		 * will grab and release l2arc_dev_mtx.
 		 */
 		if ((dev = l2arc_dev_get_next()) == NULL)
 			continue;
 
 		spa = dev->l2ad_spa;
 		ASSERT3P(spa, !=, NULL);
 
 		/*
 		 * If the pool is read-only then force the feed thread to
 		 * sleep a little longer.
 		 */
 		if (!spa_writeable(spa)) {
 			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		/*
 		 * Avoid contributing to memory pressure.
 		 */
 		if (l2arc_hdr_limit_reached()) {
 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
 		size = l2arc_write_size(dev);
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
 		 */
 		l2arc_evict(dev, size, B_FALSE);
 
 		/*
 		 * Write ARC buffers.
 		 */
 		wrote = l2arc_write_buffers(spa, dev, size);
 
 		/*
 		 * Calculate interval between writes.
 		 */
 		next = l2arc_write_interval(begin, size, wrote);
 		spa_config_exit(spa, SCL_L2ARC, dev);
 	}
 	spl_fstrans_unmark(cookie);
 
 	/*
 	 * Acknowledge the exit request: l2arc_stop() sleeps on
 	 * l2arc_feed_thr_cv until l2arc_thread_exit is reset to 0.
 	 */
 	l2arc_thread_exit = 0;
 	cv_broadcast(&l2arc_feed_thr_cv);
 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
 	thread_exit();
 }
 
 /*
  * Returns B_TRUE when `vd' is currently registered as an L2ARC device,
  * i.e. when l2arc_vdev_get() finds an entry for it.
  */
 boolean_t
 l2arc_vdev_present(vdev_t *vd)
 {
 	l2arc_dev_t *found = l2arc_vdev_get(vd);
 
 	return (found != NULL);
 }
 
 /*
  * Looks up the l2arc_dev_t whose l2ad_vdev is `vd'.  The global device list
  * is walked under l2arc_dev_mtx.  Returns NULL when no entry matches, which
  * means the vdev_t is not an L2ARC device.
  */
 l2arc_dev_t *
 l2arc_vdev_get(vdev_t *vd)
 {
 	l2arc_dev_t *match;
 
 	mutex_enter(&l2arc_dev_mtx);
 	match = list_head(l2arc_dev_list);
 	while (match != NULL && match->l2ad_vdev != vd)
 		match = list_next(l2arc_dev_list, match);
 	mutex_exit(&l2arc_dev_mtx);
 
 	return (match);
 }
 
 /*
  * Decides what to do with the on-disk state of an L2ARC device: mark it for
  * a rebuild, schedule a whole-device TRIM, or write a fresh device header.
  * `reopen' is B_TRUE when the device is being onlined rather than added.
  */
 static void
 l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
 {
 	spa_t *spa = dev->l2ad_spa;
 	l2arc_dev_hdr_phys_t *hdr_phys = dev->l2ad_dev_hdr;
 	uint64_t hdr_asize = dev->l2ad_dev_hdr_asize;
 
 	/* l2arc_remove_vdev() may already have cleared the spa pointer. */
 	if (spa == NULL)
 		return;
 
 	/*
 	 * Persistent L2ARC needs room for the payload of at least one log
 	 * block.  Log blocks are always written with 1022 entries, so the
 	 * largest possible payload is 1022 * SPA_MAXBLOCKSIZE = 16GB.  On
 	 * smaller devices the number of entries committed and restored per
 	 * block is scaled down to the device size so that persistence still
 	 * works; devices below l2arc_rebuild_blocks_min_l2size get no log
 	 * entries at all.
 	 */
 	if (dev->l2ad_end >= l2arc_rebuild_blocks_min_l2size) {
 		dev->l2ad_log_entries = MIN((dev->l2ad_end -
 		    dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
 		    L2ARC_LOG_BLK_MAX_ENTRIES);
 	} else {
 		dev->l2ad_log_entries = 0;
 	}
 
 	/*
 	 * The header is always read; a rebuild is only considered when the
 	 * read succeeded and the device can hold log entries.
 	 */
 	boolean_t hdr_usable = (l2arc_dev_hdr_read(dev) == 0 &&
 	    dev->l2ad_log_entries > 0);
 
 	if (!hdr_usable) {
 		if (!spa_writeable(spa))
 			return;
 
 		/*
 		 * With l2arc_trim_ahead > 0 the whole device is trimmed and
 		 * vdev_trim_l2arc_thread() writes the new header when the
 		 * TRIM finishes, so that trim_state is recorded too.  If a
 		 * later header read finds trim_state other than
 		 * VDEV_TRIM_COMPLETE while l2arc_trim_ahead > 0, the device
 		 * is trimmed again.
 		 */
 		if (l2arc_trim_ahead > 0) {
 			dev->l2ad_trim_all = B_TRUE;
 			return;
 		}
 
 		/*
 		 * Otherwise write a new header now.  Zeroing the in-memory
 		 * copy first resets dh_start_lbps.
 		 */
 		memset(hdr_phys, 0, hdr_asize);
 		l2arc_dev_hdr_update(dev);
 		return;
 	}
 
 	if (reopen) {
 		/*
 		 * Onlining (vdev_reopen) a cache device that stayed present:
 		 * when rebuild is enabled, drop every ARC buffer and log block
 		 * pointer and reclaim their space before the contents are
 		 * restored.
 		 */
 		if (!l2arc_rebuild_enabled)
 			return;
 
 		l2arc_evict(dev, 0, B_TRUE);
 
 		/* Begin a new log block. */
 		dev->l2ad_log_ent_idx = 0;
 		dev->l2ad_log_blk_payload_asize = 0;
 		dev->l2ad_log_blk_payload_start = 0;
 	}
 
 	/*
 	 * Only flag the device as awaiting a rebuild.  Rebuilding in line
 	 * would block pool import; spa_load_impl instead hands the work to an
 	 * async task that calls l2arc_spa_rebuild_start.
 	 */
 	dev->l2ad_rebuild = B_TRUE;
 }
 
 /*
  * Registers `vd' of pool `spa' as an L2ARC device.  The caller has already
  * validated and opened the vdev.  A new l2arc_dev_t is allocated and
  * initialised, its rebuild/TRIM eligibility is decided, and it is then put
  * on the global device list.
  */
 void
 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 {
 	l2arc_dev_t *adddev;
 	uint64_t hdr_asize;
 
 	ASSERT(!l2arc_vdev_present(vd));
 
 	/* Zero-filled device entry bound to this pool and vdev. */
 	adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 	adddev->l2ad_vdev = vd;
 	adddev->l2ad_spa = spa;
 
 	/*
 	 * Reserve space for the device header right after the label area:
 	 * the larger of the header structure and one 1 << ashift unit.
 	 */
 	adddev->l2ad_dev_hdr_asize =
 	    MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
 	hdr_asize = adddev->l2ad_dev_hdr_asize;
 
 	adddev->l2ad_start = VDEV_LABEL_START_SIZE + hdr_asize;
 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
 
 	/* Both the write hand and the evict hand begin at l2ad_start. */
 	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_hand = adddev->l2ad_start;
 	adddev->l2ad_trim_all = B_FALSE;
 	adddev->l2ad_writing = B_FALSE;
 	adddev->l2ad_first = B_TRUE;
 	list_link_init(&adddev->l2ad_node);
 	adddev->l2ad_dev_hdr = kmem_zalloc(hdr_asize, KM_SLEEP);
 
 	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	/* ARC buffers that are still valid on the device. */
 	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
 
 	/* Pointers to the log blocks still present on the device. */
 	list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
 	    offsetof(l2arc_lb_ptr_buf_t, node));
 
 	zfs_refcount_create(&adddev->l2ad_alloc);
 	zfs_refcount_create(&adddev->l2ad_lb_asize);
 	zfs_refcount_create(&adddev->l2ad_lb_count);
 
 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
 
 	/*
 	 * Rebuild / whole-device TRIM eligibility must be settled before the
 	 * device becomes visible on the cache device list and l2arc_dev_mtx
 	 * is released; otherwise l2arc_feed_thread() might already start
 	 * writing on the device.
 	 */
 	l2arc_rebuild_dev(adddev, B_FALSE);
 
 	/* Publish the device on the global list. */
 	mutex_enter(&l2arc_dev_mtx);
 	list_insert_head(l2arc_dev_list, adddev);
 	atomic_inc_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 }
 
 /*
  * Re-evaluates rebuild eligibility for an already registered cache vdev.
  * Called from vdev_reopen() when a cache device is onlined.
  */
 void
 l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
 {
 	l2arc_dev_t *dev = l2arc_vdev_get(vd);
 
 	ASSERT3P(dev, !=, NULL);
 
 	/*
 	 * Unlike l2arc_add_vdev(), there is no need to guard against
 	 * l2arc_feed_thread() invalidating earlier content here.  Offlining
 	 * a device does not clear its l2ad* parameters, and new writes do not
 	 * invalidate everything written before; at worst, buffers whose log
 	 * block had not yet reached the device are lost.
 	 *
 	 * For an offline->online cycle with no export in between the call
 	 * sequence is:
 	 *
 	 *   vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev()
 	 *
 	 * vdev_is_dead() is already B_FALSE after vdev_open(), whereas
 	 * l2ad_rebuild only becomes B_TRUE inside l2arc_rebuild_vdev().  In
 	 * that window additional buffers may be written to the device.
 	 */
 	l2arc_rebuild_dev(dev, reopen);
 }
 
 /*
  * Arguments handed from l2arc_remove_vdev() to l2arc_device_teardown(),
  * which frees this structure when it is done.
  */
 typedef struct {
 	l2arc_dev_t	*rva_l2arc_dev;	/* device being torn down */
 	uint64_t	rva_spa_gid;	/* spa_load_guid() of the pool */
 	uint64_t	rva_vdev_gid;	/* vdev_guid of the cache vdev */
 	boolean_t	rva_async;	/* teardown dispatched to a taskq */
 
 } remove_vdev_args_t;
 
 /*
  * Final teardown of an L2ARC device that has already been taken off the
  * global device list.  `arg' is a remove_vdev_args_t; both the device and
  * the argument structure are freed here.  l2arc_remove_vdev() calls this
  * directly or dispatches it on arc_flush_taskq.
  */
 static void
 l2arc_device_teardown(void *arg)
 {
 	remove_vdev_args_t *rva = arg;
 	l2arc_dev_t *remdev = rva->rva_l2arc_dev;
 	const hrtime_t began = gethrtime();
 	uint64_t elapsed_ms;
 
 	/* Flush the device: drop every buflist entry and ARC reference. */
 	l2arc_evict(remdev, 0, B_TRUE);
 
 	/* Release the per-device lists, lock, refcounts and header. */
 	list_destroy(&remdev->l2ad_buflist);
 	ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
 	list_destroy(&remdev->l2ad_lbptr_list);
 	mutex_destroy(&remdev->l2ad_mtx);
 	zfs_refcount_destroy(&remdev->l2ad_alloc);
 	zfs_refcount_destroy(&remdev->l2ad_lb_asize);
 	zfs_refcount_destroy(&remdev->l2ad_lb_count);
 	kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
 	vmem_free(remdev, sizeof (l2arc_dev_t));
 
 	/* Only log removals that took at least one millisecond. */
 	elapsed_ms = NSEC2MSEC(gethrtime() - began);
 	if (elapsed_ms != 0) {
 		zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms",
 		    (u_longlong_t)rva->rva_spa_gid,
 		    (u_longlong_t)rva->rva_vdev_gid,
 		    (u_longlong_t)elapsed_ms);
 	}
 
 	/* Pairs with the arc_async_flush_add() in l2arc_remove_vdev(). */
 	if (rva->rva_async)
 		arc_async_flush_remove(rva->rva_spa_gid, 2);
 
 	kmem_free(rva, sizeof (remove_vdev_args_t));
 }
 
 /*
  * Removes `vd' from the L2ARC.  Any rebuild in progress is cancelled and
  * waited for, the device is unlinked from the global list, and it is then
  * torn down: in line normally, or on arc_flush_taskq when the pool is
  * exported or destroyed.
  */
 void
 l2arc_remove_vdev(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	boolean_t asynchronous;
 	l2arc_dev_t *remdev;
 	remove_vdev_args_t *rva;
 
 	asynchronous = spa->spa_state == POOL_STATE_EXPORTED ||
 	    spa->spa_state == POOL_STATE_DESTROYED;
 
 	/* Look the device up by its vdev. */
 	remdev = l2arc_vdev_get(vd);
 	ASSERT3P(remdev, !=, NULL);
 
 	/* Capture what the final teardown needs. */
 	rva = kmem_alloc(sizeof (remove_vdev_args_t), KM_SLEEP);
 	rva->rva_l2arc_dev = remdev;
 	rva->rva_spa_gid = spa_load_guid(spa);
 	rva->rva_vdev_gid = remdev->l2ad_vdev->vdev_guid;
 	rva->rva_async = asynchronous;
 
 	/*
 	 * Request cancellation of a rebuild and, if one has begun, wait
 	 * until it has finished.
 	 */
 	mutex_enter(&l2arc_rebuild_thr_lock);
 	remdev->l2ad_rebuild_cancel = B_TRUE;
 	if (remdev->l2ad_rebuild_began == B_TRUE) {
 		while (remdev->l2ad_rebuild == B_TRUE)
 			cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
 	}
 	mutex_exit(&l2arc_rebuild_thr_lock);
 
 	/* Unlink the device from the global list. */
 	ASSERT(spa_config_held(spa, SCL_L2ARC, RW_WRITER) & SCL_L2ARC);
 	mutex_enter(&l2arc_dev_mtx);
 	list_remove(l2arc_dev_list, remdev);
 	l2arc_dev_last = NULL;		/* may have been invalidated */
 	atomic_dec_64(&l2arc_ndev);
 	if (asynchronous) {
 		/* During a pool export spa & vdev will no longer be valid */
 		remdev->l2ad_spa = NULL;
 		remdev->l2ad_vdev = NULL;
 	}
 	mutex_exit(&l2arc_dev_mtx);
 
 	if (asynchronous) {
 		arc_async_flush_t *af;
 
 		af = arc_async_flush_add(rva->rva_spa_gid, 2);
 		taskq_dispatch_ent(arc_flush_taskq, l2arc_device_teardown, rva,
 		    TQ_SLEEP, &af->af_tqent);
 	} else {
 		l2arc_device_teardown(rva);
 	}
 }
 
 /*
  * Initialises the global L2ARC state: the exit flag and device count, the
  * locks and condition variables, and the device and free-on-write lists.
  */
 void
 l2arc_init(void)
 {
 	l2arc_ndev = 0;
 	l2arc_thread_exit = 0;
 
 	/* Locks. */
 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Condition variables for the feed and rebuild threads. */
 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
 
 	/* List of L2ARC devices. */
 	l2arc_dev_list = &L2ARC_dev_list;
 	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
 	    offsetof(l2arc_dev_t, l2ad_node));
 
 	/* List of buffers to be freed once their write has completed. */
 	l2arc_free_on_write = &L2ARC_free_on_write;
 	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
 	    offsetof(l2arc_data_free_t, l2df_list_node));
 }
 
 /*
  * Destroys the locks, condition variables and lists set up by l2arc_init().
  */
 void
 l2arc_fini(void)
 {
 	cv_destroy(&l2arc_feed_thr_cv);
 	cv_destroy(&l2arc_rebuild_thr_cv);
 
 	mutex_destroy(&l2arc_feed_thr_lock);
 	mutex_destroy(&l2arc_rebuild_thr_lock);
 	mutex_destroy(&l2arc_dev_mtx);
 	mutex_destroy(&l2arc_free_on_write_mtx);
 
 	list_destroy(l2arc_dev_list);
 	list_destroy(l2arc_free_on_write);
 }
 
 /*
  * Starts the L2ARC feed thread.  Nothing is started unless SPA_MODE_WRITE
  * is set in spa_mode_global.
  */
 void
 l2arc_start(void)
 {
 	if ((spa_mode_global & SPA_MODE_WRITE) != 0) {
 		(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0,
 		    &p0, TS_RUN, defclsyspri);
 	}
 }
 
 /*
  * Asks the L2ARC feed thread to exit and waits until it has acknowledged by
  * resetting l2arc_thread_exit to 0.  Does nothing unless SPA_MODE_WRITE is
  * set in spa_mode_global, matching l2arc_start().
  */
 void
 l2arc_stop(void)
 {
 	if ((spa_mode_global & SPA_MODE_WRITE) == 0)
 		return;
 
 	mutex_enter(&l2arc_feed_thr_lock);
 	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
 	l2arc_thread_exit = 1;
 	do {
 		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
 	} while (l2arc_thread_exit != 0);
 	mutex_exit(&l2arc_feed_thr_lock);
 }
 
 /*
  * Spawns rebuild threads for the L2ARC devices of `spa'.  This is meant to
  * run from the spa async thread after pool import: threads started directly
  * from spa_import() would belong to the "zpool import" context and delay
  * process exit, and with it the import.
  */
 void
 l2arc_spa_rebuild_start(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/* Visit each cache vdev of the pool that has an L2ARC device. */
 	for (int n = 0; n < spa->spa_l2cache.sav_count; n++) {
 		l2arc_dev_t *dev =
 		    l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[n]);
 
 		/* Don't attempt a rebuild if the vdev is UNAVAIL */
 		if (dev != NULL) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			/* Start only a pending, not yet cancelled rebuild. */
 			if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
 				dev->l2ad_rebuild_began = B_TRUE;
 				(void) thread_create(NULL, 0,
 				    l2arc_dev_rebuild_thread, dev, 0, &p0,
 				    TS_RUN, minclsyspri);
 			}
 			mutex_exit(&l2arc_rebuild_thr_lock);
 		}
 	}
 }
 
 /*
  * Cancels the L2ARC rebuilds of `spa' and waits for those that have begun.
  * The first sweep flags every device as cancelled; the second sweep then
  * waits for each started rebuild to finish.
  */
 void
 l2arc_spa_rebuild_stop(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_export_thread == curthread);
 
 	/* Sweep 1: request cancellation on every device. */
 	for (int n = 0; n < spa->spa_l2cache.sav_count; n++) {
 		l2arc_dev_t *dev =
 		    l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[n]);
 
 		if (dev != NULL) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			dev->l2ad_rebuild_cancel = B_TRUE;
 			mutex_exit(&l2arc_rebuild_thr_lock);
 		}
 	}
 
 	/* Sweep 2: wait for rebuilds that were started. */
 	for (int n = 0; n < spa->spa_l2cache.sav_count; n++) {
 		l2arc_dev_t *dev =
 		    l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[n]);
 
 		if (dev != NULL) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			if (dev->l2ad_rebuild_began == B_TRUE) {
 				while (dev->l2ad_rebuild == B_TRUE) {
 					cv_wait(&l2arc_rebuild_thr_cv,
 					    &l2arc_rebuild_thr_lock);
 				}
 			}
 			mutex_exit(&l2arc_rebuild_thr_lock);
 		}
 	}
 }
 
 /*
  * Body of a per-device rebuild thread; `arg' is the l2arc_dev_t to rebuild.
  * Runs l2arc_rebuild(), ignoring its result, then clears the rebuild flags
  * under l2arc_rebuild_thr_lock and signals l2arc_rebuild_thr_cv before
  * exiting.
  */
 static __attribute__((noreturn)) void
 l2arc_dev_rebuild_thread(void *arg)
 {
 	l2arc_dev_t *dev = arg;
 
 	VERIFY(dev->l2ad_rebuild);
 
 	(void) l2arc_rebuild(dev);
 
 	/* Report completion to anyone waiting on l2arc_rebuild_thr_cv. */
 	mutex_enter(&l2arc_rebuild_thr_lock);
 	dev->l2ad_rebuild = B_FALSE;
 	dev->l2ad_rebuild_began = B_FALSE;
 	cv_signal(&l2arc_rebuild_thr_cv);
 	mutex_exit(&l2arc_rebuild_thr_lock);
 
 	thread_exit();
 }
 
 /*
  * This function implements the actual L2ARC metadata rebuild. It:
  * starts reading the log block chain and restores each block's contents
  * to memory (reconstructing arc_buf_hdr_t's).
  *
  * Operation stops under any of the following conditions:
  *
  * 1) We reach the end of the log block chain.
  * 2) We encounter *any* error condition (cksum errors, io errors)
  *
  * In addition the loop stops when memory is low (ENOMEM), when the rebuild
  * is cancelled through l2ad_rebuild_cancel (ECANCELED), or when the chain
  * is found to loop back over l2ad_evict.
  *
  * Returns 0 on success, otherwise the error that stopped the rebuild.
  */
 static int
 l2arc_rebuild(l2arc_dev_t *dev)
 {
 	vdev_t			*vd = dev->l2ad_vdev;
 	spa_t			*spa = vd->vdev_spa;
 	int			err = 0;
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	l2arc_log_blk_phys_t	*this_lb, *next_lb;
 	zio_t			*this_io = NULL, *next_io = NULL;
 	l2arc_log_blkptr_t	lbps[2];
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 	boolean_t		lock_held;
 
 	/*
 	 * Two log block buffers: one being restored, one receiving the
 	 * prefetched next block.  They are swapped at the end of each
 	 * iteration.
 	 */
 	this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
 	next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
 
 	/*
 	 * We prevent device removal while issuing reads to the device,
 	 * then during the rebuilding phases we drop this lock again so
 	 * that a spa_unload or device remove can be initiated - this is
 	 * safe, because the spa will signal us to stop before removing
 	 * our device and wait for us to stop.
 	 */
 	spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
 	lock_held = B_TRUE;
 
 	/*
 	 * Retrieve the persistent L2ARC device state.
 	 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 	 */
 	dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
 	dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
 	    L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
 	    dev->l2ad_start);
 	dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
 
 	vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
 	vd->vdev_trim_state = l2dhdr->dh_trim_state;
 
 	/*
 	 * In case the zfs module parameter l2arc_rebuild_enabled is false
 	 * we do not start the rebuild process.
 	 */
 	if (!l2arc_rebuild_enabled)
 		goto out;
 
 	/* Prepare the rebuild process */
 	memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));
 
 	/*
 	 * Start the rebuild process.  lbps[0] points at the block being
 	 * restored and lbps[1] at the block that is prefetched.
 	 */
 	for (;;) {
 		if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
 			break;
 
 		if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
 		    this_lb, next_lb, this_io, &next_io)) != 0)
 			goto out;
 
 		/*
 		 * Our memory pressure valve. If the system is running low
 		 * on memory, rather than swamping memory with new ARC buf
 		 * hdrs, we opt not to rebuild the L2ARC. At this point,
 		 * however, we have already set up our L2ARC dev to chain in
 		 * new metadata log blocks, so the user may choose to offline/
 		 * online the L2ARC dev at a later time (or re-import the pool)
 		 * to reconstruct it (when there's less memory pressure).
 		 */
 		if (l2arc_hdr_limit_reached()) {
 			ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
 			cmn_err(CE_NOTE, "System running low on memory, "
 			    "aborting L2ARC rebuild.");
 			err = SET_ERROR(ENOMEM);
 			goto out;
 		}
 
 		/* Reads are issued; drop the config lock while restoring. */
 		spa_config_exit(spa, SCL_L2ARC, vd);
 		lock_held = B_FALSE;
 
 		/*
 		 * Now that we know that the next_lb checks out alright, we
 		 * can start reconstruction from this log block.
 		 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 		 */
 		uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 		l2arc_log_blk_restore(dev, this_lb, asize);
 
 		/*
 		 * log block restored, include its pointer in the list of
 		 * pointers to log blocks present in the L2ARC device.
 		 */
 		lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
 		lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
 		    KM_SLEEP);
 		memcpy(lb_ptr_buf->lb_ptr, &lbps[0],
 		    sizeof (l2arc_log_blkptr_t));
 		mutex_enter(&dev->l2ad_mtx);
 		list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
 		ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
 		ARCSTAT_BUMP(arcstat_l2_log_blk_count);
 		zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 		zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 		mutex_exit(&dev->l2ad_mtx);
 		vdev_space_update(vd, asize, 0, 0);
 
 		/*
 		 * Protection against loops of log blocks:
 		 *
 		 *				       l2ad_hand  l2ad_evict
 		 *                                         V	      V
 		 * l2ad_start |=======================================| l2ad_end
 		 *             -----|||----|||---|||----|||
 		 *                  (3)    (2)   (1)    (0)
 		 *             ---|||---|||----|||---|||
 		 *		  (7)   (6)    (5)   (4)
 		 *
 		 * In this situation the pointer of log block (4) passes
 		 * l2arc_log_blkptr_valid() but the log block should not be
 		 * restored as it is overwritten by the payload of log block
 		 * (0). Only log blocks (0)-(3) should be restored. We check
 		 * whether l2ad_evict lies in between the payload starting
 		 * offset of the next log block (lbps[1].lbp_payload_start)
 		 * and the payload starting offset of the present log block
 		 * (lbps[0].lbp_payload_start). If true and this isn't the
 		 * first pass, we are looping from the beginning and we should
 		 * stop.
 		 */
 		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
 		    lbps[0].lbp_payload_start, dev->l2ad_evict) &&
 		    !dev->l2ad_first)
 			goto out;
 
 		kpreempt(KPREEMPT_SYNC);
 		/*
 		 * Re-take the config lock without blocking, checking for a
 		 * cancellation request before every attempt.
 		 */
 		for (;;) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			if (dev->l2ad_rebuild_cancel) {
 				mutex_exit(&l2arc_rebuild_thr_lock);
 				err = SET_ERROR(ECANCELED);
 				goto out;
 			}
 			mutex_exit(&l2arc_rebuild_thr_lock);
 			if (spa_config_tryenter(spa, SCL_L2ARC, vd,
 			    RW_READER)) {
 				lock_held = B_TRUE;
 				break;
 			}
 			/*
 			 * L2ARC config lock held by somebody in writer,
 			 * possibly due to them trying to remove us. They'll
 			 * likely to want us to shut down, so after a little
 			 * delay, we check l2ad_rebuild_cancel and retry
 			 * the lock again.
 			 */
 			delay(1);
 		}
 
 		/*
 		 * Continue with the next log block.
 		 */
 		lbps[0] = lbps[1];
 		lbps[1] = this_lb->lb_prev_lbp;
 		PTR_SWAP(this_lb, next_lb);
 		this_io = next_io;
 		next_io = NULL;
 	}
 
 	/*
 	 * Reached only when the loop ended with "break": abort the fetch
 	 * that was prefetched for the block we are not going to restore.
 	 */
 	if (this_io != NULL)
 		l2arc_log_blk_fetch_abort(this_io);
 out:
 	if (next_io != NULL)
 		l2arc_log_blk_fetch_abort(next_io);
 	vmem_free(this_lb, sizeof (*this_lb));
 	vmem_free(next_lb, sizeof (*next_lb));
 
 	if (err == ECANCELED) {
 		/*
 		 * In case the rebuild was canceled do not log to spa history
 		 * log as the pool may be in the process of being removed.
 		 *
 		 * ECANCELED is only set after the config lock was dropped
 		 * and before it is re-taken, so lock_held is B_FALSE here and
 		 * there is nothing to release.
 		 */
 		zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 		return (err);
 	} else if (!l2arc_rebuild_enabled) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "disabled");
 	} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_success);
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "successful, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
 		/*
 		 * No error but also nothing restored, meaning the lbps array
 		 * in the device header points to invalid/non-present log
 		 * blocks. Reset the header.
 		 */
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "no valid log blocks");
 		memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize);
 		l2arc_dev_hdr_update(dev);
 	} else if (err != 0) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "aborted, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	}
 
 	/* The lock is still held on every exit path taken before the drop. */
 	if (lock_held)
 		spa_config_exit(spa, SCL_L2ARC, vd);
 
 	return (err);
 }
 
 /*
  * Reads the on-disk device header of the given L2ARC device into
  * dev->l2ad_dev_hdr, byte-swapping it when its magic shows the opposite
  * endianness, and validates it against the device.
  *
  * Returns 0 on success, the I/O error when the read fails, or ENOTSUP when
  * the header does not describe a usable persistent L2ARC for this device.
  */
 static int
 l2arc_dev_hdr_read(l2arc_dev_t *dev)
 {
 	l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
 	const uint64_t hdr_asize = dev->l2ad_dev_hdr_asize;
 	uint64_t pool_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
 	abd_t *hdr_abd = abd_get_from_buf(l2dhdr, hdr_asize);
 	zio_t *rzio;
 	int error;
 
 	/* The header sits right after the vdev label area. */
 	rzio = zio_read_phys(NULL, dev->l2ad_vdev,
 	    VDEV_LABEL_START_SIZE, hdr_asize, hdr_abd,
 	    ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 	    ZIO_FLAG_SPECULATIVE, B_FALSE);
 	error = zio_wait(rzio);
 
 	abd_free(hdr_abd);
 
 	if (error != 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
 		zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
 		    "vdev guid: %llu", error,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 		return (error);
 	}
 
 	/* A byte-swapped magic means the header needs swapping. */
 	if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
 		byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
 
 	/*
 	 * Reject a device that holds no real header, a header from another
 	 * pool or vdev, or one from another persistent L2ARC version.
 	 */
 	if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
 	    l2dhdr->dh_spa_guid != pool_guid ||
 	    l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
 	    l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION)
 		goto unsupported;
 
 	/* The recorded layout must agree with the current device. */
 	if (l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
 	    l2dhdr->dh_end != dev->l2ad_end ||
 	    !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
 	    l2dhdr->dh_evict))
 		goto unsupported;
 
 	/* With l2arc_trim_ahead set, the TRIM must have completed. */
 	if (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
 	    l2arc_trim_ahead > 0)
 		goto unsupported;
 
 	return (0);
 
 unsupported:
 	ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Reads L2ARC log blocks from storage and validates their contents.
  *
  * This function implements a simple fetcher to make sure that while
  * we're processing one buffer the L2ARC is already fetching the next
  * one in the chain.
  *
  * The arguments this_lbp and next_lbp point to the current and next log block
  * address in the block chain. Similarly, this_lb and next_lb hold the
  * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
  *
  * The `this_io' and `next_io' arguments are used for block fetching.
  * When issuing the first blk IO during rebuild, you should pass NULL for
  * `this_io'. This function will then issue a sync IO to read the block and
  * also issue an async IO to fetch the next block in the block chain. The
  * fetched IO is returned in `next_io'. On subsequent calls to this
  * function, pass the value returned in `next_io' from the previous call
  * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
  * Prior to the call, you should initialize your `next_io' pointer to be
  * NULL. If no fetch IO was issued, the pointer is left set at NULL.
  *
  * On success, this function returns 0, otherwise it returns an appropriate
  * error code. On error the fetching IO is aborted and cleared before
  * returning from this function. Therefore, if we return `success', the
  * caller can assume that we have taken care of cleanup of fetch IOs.
  */
 static int
 l2arc_log_blk_read(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
     zio_t *this_io, zio_t **next_io)
 {
 	int		err = 0;
 	zio_cksum_t	cksum;
 	uint64_t	asize;
 
 	ASSERT(this_lbp != NULL && next_lbp != NULL);
 	ASSERT(this_lb != NULL && next_lb != NULL);
 	ASSERT(next_io != NULL && *next_io == NULL);
 	ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
 
 	/*
 	 * Check to see if we have issued the IO for this log block in a
 	 * previous run. If not, this is the first call, so issue it now.
 	 */
 	if (this_io == NULL) {
 		this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
 		    this_lb);
 	}
 
 	/*
 	 * Peek to see if we can start issuing the next IO immediately.
 	 */
 	if (l2arc_log_blkptr_valid(dev, next_lbp)) {
 		/*
 		 * Start issuing IO for the next log block early - this
 		 * should help keep the L2ARC device busy while we
 		 * decompress and restore this log block.
 		 */
 		*next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
 		    next_lb);
 	}
 
 	/* Wait for the IO to read this log block to complete */
 	if ((err = zio_wait(this_io)) != 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
 		zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
 		    "offset: %llu, vdev guid: %llu", err,
 		    (u_longlong_t)this_lbp->lbp_daddr,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 		goto cleanup;
 	}
 
 	/*
 	 * Make sure the buffer checks out.
 	 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 	 */
 	asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
 	fletcher_4_native(this_lb, asize, NULL, &cksum);
 	if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
 		zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
 		    "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
 		    (u_longlong_t)this_lbp->lbp_daddr,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid,
 		    (u_longlong_t)dev->l2ad_hand,
 		    (u_longlong_t)dev->l2ad_evict);
 		err = SET_ERROR(ECKSUM);
 		goto cleanup;
 	}
 
 	/* Now we can take our time decoding this buffer */
 	switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
 	case ZIO_COMPRESS_OFF:
 		break;
 	case ZIO_COMPRESS_LZ4: {
 		abd_t *abd = abd_alloc_linear(asize, B_TRUE);
 		abd_copy_from_buf_off(abd, this_lb, 0, asize);
 		abd_t dabd;
 		abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb));
 		err = zio_decompress_data(
 		    L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
 		    abd, &dabd, asize, sizeof (*this_lb), NULL);
 		abd_free(&dabd);
 		abd_free(abd);
 		if (err != 0) {
 			err = SET_ERROR(EINVAL);
 			goto cleanup;
 		}
 		break;
 	}
 	default:
 		err = SET_ERROR(EINVAL);
 		goto cleanup;
 	}
 	if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 		byteswap_uint64_array(this_lb, sizeof (*this_lb));
 	if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
 		err = SET_ERROR(EINVAL);
 		goto cleanup;
 	}
 cleanup:
 	/* Abort an in-flight fetch I/O in case of error */
 	if (err != 0 && *next_io != NULL) {
 		l2arc_log_blk_fetch_abort(*next_io);
 		*next_io = NULL;
 	}
 	return (err);
 }
 
 /*
  * Restores the payload of a log block to ARC. This creates empty ARC hdr
  * entries which only contain an l2arc hdr, essentially restoring the
  * buffers to their L2ARC evicted state. This function also updates space
  * usage on the L2ARC vdev to make sure it tracks restored buffers.
  */
 static void
 l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
     uint64_t lb_asize)
 {
 	const uint64_t	nentries = dev->l2ad_log_entries;
 	uint64_t	restored_lsize = 0;
 	uint64_t	restored_asize = 0;
 
 	/*
 	 * arc_adapt() is normally only called for data, not headers. We
 	 * may be about to allocate a significant amount of header memory
 	 * though, so give ARC the chance to grow its arc_c.
 	 */
 	arc_adapt(nentries * HDR_L2ONLY_SIZE);
 
 	/*
 	 * Walk the entries from last to first. Restore goes in the reverse
 	 * temporal direction to preserve correct temporal ordering of
 	 * buffers in the l2ad_buflist. l2arc_hdr_restore also does a
 	 * list_insert_tail instead of list_insert_head on the l2ad_buflist:
 	 *
 	 *		LIST	l2ad_buflist		LIST
 	 *		HEAD  <------ (time) ------	TAIL
 	 * direction	+-----+-----+-----+-----+-----+    direction
 	 * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
 	 * fill		+-----+-----+-----+-----+-----+
 	 *		^				^
 	 *		|				|
 	 *		|				|
 	 *	l2arc_feed_thread		l2arc_rebuild
 	 *	will place new bufs here	restores bufs here
 	 *
 	 * During l2arc_rebuild() the device is not used by
 	 * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
 	 */
 	for (int idx = nentries - 1; idx >= 0; idx--) {
 		const l2arc_log_ent_phys_t *le = &lb->lb_entries[idx];
 
 		restored_lsize += L2BLK_GET_LSIZE((le)->le_prop);
 		restored_asize += vdev_psize_to_asize(dev->l2ad_vdev,
 		    L2BLK_GET_PSIZE((le)->le_prop));
 		l2arc_hdr_restore(le, dev);
 	}
 
 	/*
 	 * Record rebuild stats:
 	 *	restored_lsize	Logical size of restored buffers in the L2ARC
 	 *	restored_asize	Aligned size of restored buffers in the L2ARC
 	 */
 	ARCSTAT_INCR(arcstat_l2_rebuild_size, restored_lsize);
 	ARCSTAT_INCR(arcstat_l2_rebuild_asize, restored_asize);
 	ARCSTAT_INCR(arcstat_l2_rebuild_bufs, nentries);
 	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
 	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
 	    restored_asize / lb_asize);
 	ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
 }
 
 /*
  * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
  * into a state indicating that it has been evicted to L2ARC.
  */
 static void
 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
 {
 	arc_buf_hdr_t		*hdr, *exists;
 	kmutex_t		*hash_lock;
 	arc_buf_contents_t	type = L2BLK_GET_TYPE((le)->le_prop);
 	/* Aligned on-device size of the buffer described by this entry. */
 	uint64_t		asize = vdev_psize_to_asize(dev->l2ad_vdev,
 	    L2BLK_GET_PSIZE((le)->le_prop));
 
 	/*
 	 * Do all the allocation before grabbing any locks, this lets us
 	 * sleep if memory is full and we don't have to deal with failed
 	 * allocations.
 	 */
 	hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
 	    dev, le->le_dva, le->le_daddr,
 	    L2BLK_GET_PSIZE((le)->le_prop), asize, le->le_birth,
 	    L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
 	    L2BLK_GET_PROTECTED((le)->le_prop),
 	    L2BLK_GET_PREFETCH((le)->le_prop),
 	    L2BLK_GET_STATE((le)->le_prop));
 
 	/*
 	 * vdev_space_update() has to be called before arc_hdr_destroy() to
 	 * avoid underflow since the latter also calls vdev_space_update().
 	 */
 	l2arc_hdr_arcstats_increment(hdr);
 	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	/*
 	 * Link the new header at the tail of the device's buffer list and
 	 * account for it in l2ad_alloc, both under l2ad_mtx.
 	 */
 	mutex_enter(&dev->l2ad_mtx);
 	list_insert_tail(&dev->l2ad_buflist, hdr);
 	(void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
 	mutex_exit(&dev->l2ad_mtx);
 
 	/*
 	 * buf_hash_insert() hands back the hash lock through hash_lock; it
 	 * is dropped at the end of this function in both outcomes.
 	 */
 	exists = buf_hash_insert(hdr, &hash_lock);
 	if (exists) {
 		/* Buffer was already cached, no need to restore it. */
 		arc_hdr_destroy(hdr);
 		/*
 		 * If the buffer is already cached, check whether it has
 		 * L2ARC metadata. If not, enter them and update the flag.
 		 * This is important in case of onlining a cache device, since
 		 * we previously evicted all L2ARC metadata from ARC.
 		 */
 		if (!HDR_HAS_L2HDR(exists)) {
 			arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
 			exists->b_l2hdr.b_dev = dev;
 			exists->b_l2hdr.b_daddr = le->le_daddr;
 			exists->b_l2hdr.b_arcs_state =
 			    L2BLK_GET_STATE((le)->le_prop);
 			/* l2arc_hdr_arcstats_update() expects a valid asize */
 			HDR_SET_L2SIZE(exists, asize);
 			/*
 			 * Same list insertion and accounting as done for
 			 * the freshly allocated header above, but applied
 			 * to the header that was already cached.
 			 */
 			mutex_enter(&dev->l2ad_mtx);
 			list_insert_tail(&dev->l2ad_buflist, exists);
 			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 			    arc_hdr_size(exists), exists);
 			mutex_exit(&dev->l2ad_mtx);
 			l2arc_hdr_arcstats_increment(exists);
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 		}
 		ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
 	}
 
 	mutex_exit(hash_lock);
 }
 
 /*
  * Starts an asynchronous read of the log block described by `lbp' from
  * vdev `vd' into the caller-supplied buffer `lb'. Log block reconstruction
  * uses this to begin reading the next block before it has finished
  * decoding and reconstructing the current one, which keeps the l2arc
  * device busy with read IO.
  *
  * The returned zio carries newly allocated bookkeeping for the IO data
  * (a callback structure and the ABD wrapping `lb'). These have to be
  * released once the zio is no longer needed (i.e. due to it having
  * completed). To abort the zio, use l2arc_log_blk_fetch_abort, which takes
  * care of disposing of the allocated buffers correctly.
  */
 static zio_t *
 l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
     l2arc_log_blk_phys_t *lb)
 {
 	/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 	uint32_t		lb_asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
 	l2arc_read_callback_t	*rcb;
 	zio_t			*root, *rio;
 
 	ASSERT(lb_asize <= sizeof (l2arc_log_blk_phys_t));
 
 	/* Callback state handed to l2arc_blk_fetch_done via the root zio. */
 	rcb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
 	rcb->l2rcb_abd = abd_get_from_buf(lb, lb_asize);
 
 	root = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, rcb,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
 
 	/* The physical read runs as a child of the root zio. */
 	rio = zio_read_phys(root, vd, lbp->lbp_daddr, lb_asize,
 	    rcb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE);
 	(void) zio_nowait(rio);
 
 	return (root);
 }
 
 /*
  * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
  * buffers allocated for it.
  */
 static void
 l2arc_log_blk_fetch_abort(zio_t *zio)
 {
 	/*
 	 * Wait for the fetch zio to finish; its result is deliberately
 	 * ignored. NOTE(review): the buffers set up by
 	 * l2arc_log_blk_fetch() are presumably released by the
 	 * l2arc_blk_fetch_done callback registered there - confirm.
 	 */
 	(void) zio_wait(zio);
 }
 
 /*
  * Refreshes the in-memory device header of an l2arc device from the
  * current device state and writes it out to the device, waiting for the
  * write to complete. A write failure is only reported via zfs_dbgmsg().
  */
 void
 l2arc_dev_hdr_update(l2arc_dev_t *dev)
 {
 	vdev_t			*vd = dev->l2ad_vdev;
 	l2arc_dev_hdr_phys_t	*hdr = dev->l2ad_dev_hdr;
 	const uint64_t		hdr_asize = dev->l2ad_dev_hdr_asize;
 	abd_t			*hdr_abd;
 	int			rc;
 
 	VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
 
 	/* Identify the header format, the pool and the device. */
 	hdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
 	hdr->dh_version = L2ARC_PERSISTENT_VERSION;
 	hdr->dh_spa_guid = spa_guid(vd->vdev_spa);
 	hdr->dh_vdev_guid = vd->vdev_guid;
 
 	/* Device layout and eviction position. */
 	hdr->dh_log_entries = dev->l2ad_log_entries;
 	hdr->dh_start = dev->l2ad_start;
 	hdr->dh_end = dev->l2ad_end;
 	hdr->dh_evict = dev->l2ad_evict;
 
 	/* Log block accounting. */
 	hdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
 	hdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
 
 	/* TRIM state of the underlying vdev. */
 	hdr->dh_trim_action_time = vd->vdev_trim_action_time;
 	hdr->dh_trim_state = vd->vdev_trim_state;
 
 	/* EVICT_FIRST is the only flag recorded here. */
 	hdr->dh_flags = dev->l2ad_first ? L2ARC_DEV_HDR_EVICT_FIRST : 0;
 
 	hdr_abd = abd_get_from_buf(hdr, hdr_asize);
 	rc = zio_wait(zio_write_phys(NULL, vd,
 	    VDEV_LABEL_START_SIZE, hdr_asize, hdr_abd, ZIO_CHECKSUM_LABEL,
 	    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
 	abd_free(hdr_abd);
 
 	if (rc == 0)
 		return;
 
 	zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
 	    "vdev guid: %llu", rc,
 	    (u_longlong_t)vd->vdev_guid);
 }
 
 /*
  * Commits a log block to the L2ARC device. This routine is invoked from
  * l2arc_write_buffers when the log block fills up.
  * This function allocates some memory to temporarily hold the serialized
  * buffer to be written. This is then released in l2arc_write_done.
  *
  * The write is issued asynchronously with `pio' as its parent zio.
  * Returns the aligned on-device size of the log block, which is also the
  * amount by which dev->l2ad_hand has been advanced.
  */
 static uint64_t
 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 {
 	l2arc_log_blk_phys_t	*lb = &dev->l2ad_log_blk;
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	uint64_t		psize, asize;
 	zio_t			*wzio;
 	l2arc_lb_abd_buf_t	*abd_buf;
 	abd_t			*abd = NULL;
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 
 	/* Only a completely filled log block may be committed. */
 	VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
 
 	/* abd_buf initially wraps the in-memory log block as the source. */
 	abd_buf = zio_buf_alloc(sizeof (*abd_buf));
 	abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
 	lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
 	lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
 
 	/* link the buffer into the block chain */
 	lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
 	lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
 
 	/*
 	 * l2arc_log_blk_commit() may be called multiple times during a single
 	 * l2arc_write_buffers() call. Save the allocated abd buffers in a list
 	 * so we can free them in l2arc_write_done() later on.
 	 */
 	list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
 
 	/* try to compress the buffer, at least one sector to save */
 	psize = zio_compress_data(ZIO_COMPRESS_LZ4,
 	    abd_buf->abd, &abd, sizeof (*lb),
 	    zio_get_compression_max_size(ZIO_COMPRESS_LZ4,
 	    dev->l2ad_vdev->vdev_ashift,
 	    dev->l2ad_vdev->vdev_ashift, sizeof (*lb)), 0);
 
 	/* a log block is never entirely zero */
 	ASSERT(psize != 0);
 	asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
 	ASSERT(asize <= sizeof (*lb));
 
 	/*
 	 * Update the start log block pointer in the device header to point
 	 * to the log block we're about to write.
 	 */
 	l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
 	l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
 	l2dhdr->dh_start_lbps[0].lbp_payload_asize =
 	    dev->l2ad_log_blk_payload_asize;
 	l2dhdr->dh_start_lbps[0].lbp_payload_start =
 	    dev->l2ad_log_blk_payload_start;
 	L2BLK_SET_LSIZE(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
 	/* The PSIZE field of a log block pointer holds the aligned size. */
 	L2BLK_SET_PSIZE(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
 	L2BLK_SET_CHECKSUM(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 	    ZIO_CHECKSUM_FLETCHER_4);
 	if (asize < sizeof (*lb)) {
 		/* compression succeeded */
 		/* zero the padding between psize and the aligned size */
 		abd_zero_off(abd, psize, asize - psize);
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_LZ4);
 	} else {
 		/* compression failed */
 		/*
 		 * Write the block uncompressed instead.
 		 * NOTE(review): this relies on zio_compress_data() having
 		 * left a usable destination ABD in `abd' even though it did
 		 * not compress - confirm.
 		 */
 		abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb));
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_OFF);
 	}
 
 	/* checksum what we're about to write */
 	abd_fletcher_4_native(abd, asize, NULL,
 	    &l2dhdr->dh_start_lbps[0].lbp_cksum);
 
 	/* The ABD wrapping the in-memory log block is no longer needed. */
 	abd_free(abd_buf->abd);
 
 	/* perform the write itself */
 	/* abd_buf (already on l2wcb_abd_list) now tracks the written ABD */
 	abd_buf->abd = abd;
 	wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
 	    asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
 	DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
 	(void) zio_nowait(wzio);
 
 	/* Advance the write hand past the log block just issued. */
 	dev->l2ad_hand += asize;
 	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	/*
 	 * Include the committed log block's pointer in the list of pointers
 	 * to log blocks present in the L2ARC device.
 	 */
 	memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0],
 	    sizeof (l2arc_log_blkptr_t));
 	mutex_enter(&dev->l2ad_mtx);
 	list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
 	ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
 	ARCSTAT_BUMP(arcstat_l2_log_blk_count);
 	zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 	zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 	mutex_exit(&dev->l2ad_mtx);
 
 	/* bump the kstats */
 	ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
 	ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
 	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
 	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
 	    dev->l2ad_log_blk_payload_asize / asize);
 
 	/* start a new log block */
 	dev->l2ad_log_ent_idx = 0;
 	dev->l2ad_log_blk_payload_asize = 0;
 	dev->l2ad_log_blk_payload_start = 0;
 
 	return (asize);
 }
 
 /*
  * Validates an L2ARC log block address to make sure that it can be read
  * from the provided L2ARC device.
  */
 boolean_t
 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
 {
 	/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 	const uint64_t lb_asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
 	/* Span from the first payload byte to the last log block byte. */
 	const uint64_t first = lbp->lbp_payload_start;
 	const uint64_t last = lbp->lbp_daddr + lb_asize - 1;
 
 	/*
 	 * A log block is valid if all of the following conditions are true:
 	 * - it fits entirely (including its payload) between l2ad_start and
 	 *   l2ad_end
 	 * - it has a valid size
 	 * - neither the log block itself nor part of its payload was evicted
 	 *   by l2arc_evict():
 	 *
 	 *		l2ad_hand          l2ad_evict
 	 *		|			 |	lbp_daddr
 	 *		|     first		 |	|  last
 	 *		|     |			 |	|  |
 	 *		V     V		         V	V  V
 	 *   l2ad_start ============================================ l2ad_end
 	 *                    --------------------------||||
 	 *				^		 ^
 	 *				|		log block
 	 *				payload
 	 */
 
 	/* Payload and log block have to lie within the device range. */
 	if (first < dev->l2ad_start || last > dev->l2ad_end)
 		return (B_FALSE);
 
 	/* The log block needs a non-zero size no larger than its struct. */
 	if (lb_asize == 0 || lb_asize > sizeof (l2arc_log_blk_phys_t))
 		return (B_FALSE);
 
 	/* The eviction check is skipped while dev->l2ad_first is set. */
 	if (dev->l2ad_first)
 		return (B_TRUE);
 
 	/*
 	 * Evicted if the hand or the evict pointer falls inside the span,
 	 * or if either end of the span falls between hand and evict.
 	 */
 	if (l2arc_range_check_overlap(first, last, dev->l2ad_hand) ||
 	    l2arc_range_check_overlap(first, last, dev->l2ad_evict) ||
 	    l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict,
 	    first) ||
 	    l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict,
 	    last))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Inserts ARC buffer header `hdr' into the current L2ARC log block on
  * the device. The buffer being inserted must be present in L2ARC.
  * Returns B_TRUE if the L2ARC log block is full and needs to be committed
  * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
  */
 static boolean_t
 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
 {
 	l2arc_log_ent_phys_t	*ent;
 	int			slot;
 
 	/* Nothing to record when the device keeps no log entries. */
 	if (dev->l2ad_log_entries == 0)
 		return (B_FALSE);
 
 	/* Claim the next free slot in the current log block. */
 	slot = dev->l2ad_log_ent_idx;
 	dev->l2ad_log_ent_idx++;
 
 	ASSERT3S(slot, <, dev->l2ad_log_entries);
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	/* Start from a zeroed entry, then describe the buffer in it. */
 	ent = &dev->l2ad_log_blk.lb_entries[slot];
 	memset(ent, 0, sizeof (*ent));
 	ent->le_dva = hdr->b_dva;
 	ent->le_birth = hdr->b_birth;
 	ent->le_daddr = hdr->b_l2hdr.b_daddr;
 
 	/* The first entry marks where this log block's payload starts. */
 	if (slot == 0)
 		dev->l2ad_log_blk_payload_start = ent->le_daddr;
 
 	L2BLK_SET_LSIZE((ent)->le_prop, HDR_GET_LSIZE(hdr));
 	L2BLK_SET_PSIZE((ent)->le_prop, HDR_GET_PSIZE(hdr));
 	L2BLK_SET_COMPRESS((ent)->le_prop, HDR_GET_COMPRESS(hdr));
 	ent->le_complevel = hdr->b_complevel;
 	L2BLK_SET_TYPE((ent)->le_prop, hdr->b_type);
 	L2BLK_SET_PROTECTED((ent)->le_prop, !!(HDR_PROTECTED(hdr)));
 	L2BLK_SET_PREFETCH((ent)->le_prop, !!(HDR_PREFETCH(hdr)));
 	L2BLK_SET_STATE((ent)->le_prop, hdr->b_l2hdr.b_arcs_state);
 
 	/* Keep a running total of the aligned payload size. */
 	dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
 	    HDR_GET_PSIZE(hdr));
 
 	/* Full once the next free slot is past the last entry. */
 	return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
 }
 
 /*
  * Checks whether a given L2ARC device address sits in a time-sequential
  * range. The trick here is that the L2ARC is a rotary buffer, so we can't
  * just do a range comparison, we need to handle the situation in which the
  * range wraps around the end of the L2ARC device. Arguments:
  *	bottom -- Lower end of the range to check (written to earlier).
  *	top    -- Upper end of the range to check (written to later).
  *	check  -- The address for which we want to determine if it sits in
  *		  between the top and bottom.
  *
  * The 3-way conditional below represents the following cases:
  *
  *	bottom < top : Sequentially ordered case:
  *	  <check>--------+-------------------+
  *	                 |  (overlap here?)  |
  *	 L2ARC dev       V                   V
  *	 |---------------<bottom>============<top>--------------|
  *
  *	bottom > top: Looped-around case:
  *	                      <check>--------+------------------+
  *	                                     |  (overlap here?) |
  *	 L2ARC dev                           V                  V
  *	 |===============<top>---------------<bottom>===========|
  *	 ^               ^
  *	 |  (or here?)   |
  *	 +---------------+---------<check>
  *
  *	top == bottom : Just a single address comparison.
  */
 boolean_t
 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
 {
 	const int at_or_above_bottom = (bottom <= check);
 	const int at_or_below_top = (check <= top);
 
 	/* Degenerate range: only the single address itself matches. */
 	if (bottom == top)
 		return (check == top);
 
 	/* Looped-around range: match on either side of the wrap. */
 	if (bottom > top)
 		return (at_or_below_top || at_or_above_bottom);
 
 	/* Sequentially ordered range: match between bottom and top. */
 	return (at_or_above_bottom && at_or_below_top);
 }
 
 /* ARC entry points exported via EXPORT_SYMBOL. */
 EXPORT_SYMBOL(arc_buf_size);
 EXPORT_SYMBOL(arc_write);
 EXPORT_SYMBOL(arc_read);
 EXPORT_SYMBOL(arc_buf_info);
 EXPORT_SYMBOL(arc_getbuf_func);
 EXPORT_SYMBOL(arc_add_prune_callback);
 EXPORT_SYMBOL(arc_remove_prune_callback);
 
 /*
  * Module parameter declarations for the ARC and L2ARC tunables. The last
  * argument of each declaration is the description string attached to the
  * parameter, so it has to match what the tunable actually does.
  */
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min,
 	spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max,
 	spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW,
 	"Balance between metadata and data on ghost hits.");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "Seconds before growing ARC size");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
 	"Percent of pagecache to reclaim ARC to");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD,
 	"Target average block size");
 
 /* The tunable is named "..._enabled": a non-zero value turns it on. */
 ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
 	"Enable compressed ARC buffers");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "Min life of prefetch block in ms");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
 	param_set_arc_int, param_get_uint, ZMOD_RW,
 	"Min life of prescient prefetched block in ms");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW,
 	"Max write bytes per interval");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW,
 	"Extra write bytes during device warmup");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW,
 	"Number of max device writes to precache");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW,
 	"Compressed l2arc_headroom multiplier");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW,
 	"TRIM ahead L2ARC write size multiplier");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW,
 	"Seconds between L2ARC writing");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW,
 	"Min feed interval in milliseconds");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
 	"Skip caching prefetched buffers");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
 	"Turbo L2ARC warmup");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
 	"No reads during writes");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW,
 	"Percent of ARC size allowed for L2ARC-only headers");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
 	"Rebuild the L2ARC when importing a pool");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW,
 	"Min size in bytes to write rebuild log blocks in L2ARC");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
 	"Cache only MFU data from ARC into L2ARC");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW,
 	"Exclude dbufs on special vdevs from being cached to L2ARC if set.");
 
 /* A percentage, as the parameter name says, not a byte count. */
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "System free memory I/O throttle in percent");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64,
 	spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes");
 
 /* A limit on dnode bytes, not a minimum. */
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64,
 	spl_param_get_u64, ZMOD_RW, "Limit on bytes of dnodes in ARC");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
 	param_set_arc_int, param_get_uint, ZMOD_RW,
 	"Percent of ARC meta buffers for dnodes");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW,
 	"Percentage of excess dnodes to try to unpin");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW,
 	"When full, ARC allocation waits for eviction of this % of alloc size");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 	"The number of headers to evict per sublist before moving to the next");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
 	"Number of arc_prune threads");
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 01f92411bcb3..0a243a24266f 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1,5443 +1,5443 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/dmu.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_tx.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/blkptr.h>
 #include <sys/range_tree.h>
 #include <sys/trace_zfs.h>
 #include <sys/callb.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/vdev.h>
 #include <cityhash.h>
 #include <sys/spa_impl.h>
 #include <sys/wmsum.h>
 #include <sys/vdev_impl.h>
 
 /*
  * Handle for the "dbufstats" kstat.  It is assigned from kstat_create() in
  * dbuf_init(), which only wires up and installs it when creation returned
  * a non-NULL handle.
  */
 static kstat_t *dbuf_ksp;
 
 /*
  * Layout of the named kstat data published through dbuf_ksp.
  *
  * The dbuf_stats instance below is initialized positionally, so the member
  * order here must stay in step with that initializer.  dbuf_init() passes
  * sizeof (dbuf_stats) / sizeof (kstat_named_t) as the entry count, so every
  * member must be a kstat_named_t or an array of them.
  */
 typedef struct dbuf_stats {
 	/*
 	 * Various statistics about the size of the dbuf cache.
 	 */
 	kstat_named_t cache_count;
 	kstat_named_t cache_size_bytes;
 	kstat_named_t cache_size_bytes_max;
 	/*
 	 * Statistics regarding the bounds on the dbuf cache size.
 	 */
 	kstat_named_t cache_target_bytes;
 	kstat_named_t cache_lowater_bytes;
 	kstat_named_t cache_hiwater_bytes;
 	/*
 	 * Total number of dbuf cache evictions that have occurred.
 	 */
 	kstat_named_t cache_total_evicts;
 	/*
 	 * The distribution of dbuf levels in the dbuf cache and
 	 * the total size of all dbufs at each level.
 	 */
 	kstat_named_t cache_levels[DN_MAX_LEVELS];
 	kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
 	/*
 	 * Statistics about the dbuf hash table.
 	 */
 	kstat_named_t hash_hits;
 	kstat_named_t hash_misses;
 	kstat_named_t hash_collisions;
 	kstat_named_t hash_elements;
 	/*
 	 * Number of sublists containing more than one dbuf in the dbuf
 	 * hash table. Keep track of the longest hash chain.
 	 */
 	kstat_named_t hash_chains;
 	kstat_named_t hash_chain_max;
 	/*
 	 * Number of times a dbuf_create() discovers that a dbuf was
 	 * already created and in the dbuf hash table.
 	 */
 	kstat_named_t hash_insert_race;
 	/*
 	 * Number of entries in the hash table dbuf and mutex arrays.
 	 */
 	kstat_named_t hash_table_count;
 	kstat_named_t hash_mutex_count;
 	/*
 	 * Statistics about the size of the metadata dbuf cache.
 	 */
 	kstat_named_t metadata_cache_count;
 	kstat_named_t metadata_cache_size_bytes;
 	kstat_named_t metadata_cache_size_bytes_max;
 	/*
 	 * For diagnostic purposes, this is incremented whenever we can't add
 	 * something to the metadata cache because it's full, and instead put
 	 * the data in the regular dbuf cache.
 	 */
 	kstat_named_t metadata_cache_overflow;
 } dbuf_stats_t;
 
 /*
  * Backing storage for the dbufstats kstat, listed in dbuf_stats_t member
  * order.
  *
  * The "cache_levels_N" and "cache_levels_bytes_N" entries only initialize
  * element 0 of each per-level array and act as placeholders: when the kstat
  * is created, dbuf_init() rewrites the name and data_type of every level
  * entry as "cache_level_%d" and "cache_level_%d_bytes".
  */
 dbuf_stats_t dbuf_stats = {
 	{ "cache_count",			KSTAT_DATA_UINT64 },
 	{ "cache_size_bytes",			KSTAT_DATA_UINT64 },
 	{ "cache_size_bytes_max",		KSTAT_DATA_UINT64 },
 	{ "cache_target_bytes",			KSTAT_DATA_UINT64 },
 	{ "cache_lowater_bytes",		KSTAT_DATA_UINT64 },
 	{ "cache_hiwater_bytes",		KSTAT_DATA_UINT64 },
 	{ "cache_total_evicts",			KSTAT_DATA_UINT64 },
 	{ { "cache_levels_N",			KSTAT_DATA_UINT64 } },
 	{ { "cache_levels_bytes_N",		KSTAT_DATA_UINT64 } },
 	{ "hash_hits",				KSTAT_DATA_UINT64 },
 	{ "hash_misses",			KSTAT_DATA_UINT64 },
 	{ "hash_collisions",			KSTAT_DATA_UINT64 },
 	{ "hash_elements",			KSTAT_DATA_UINT64 },
 	{ "hash_chains",			KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",			KSTAT_DATA_UINT64 },
 	{ "hash_insert_race",			KSTAT_DATA_UINT64 },
 	{ "hash_table_count",			KSTAT_DATA_UINT64 },
 	{ "hash_mutex_count",			KSTAT_DATA_UINT64 },
 	{ "metadata_cache_count",		KSTAT_DATA_UINT64 },
 	{ "metadata_cache_size_bytes",		KSTAT_DATA_UINT64 },
 	{ "metadata_cache_size_bytes_max",	KSTAT_DATA_UINT64 },
 	{ "metadata_cache_overflow",		KSTAT_DATA_UINT64 }
 };
 
 /*
  * wmsum counters behind the kstat values of the same names.  They are
  * adjusted through the DBUF_STAT_INCR family of macros, initialized in
  * dbuf_init(), and copied into dbuf_stats by dbuf_kstat_update() using
  * wmsum_value().
  */
 struct {
 	wmsum_t cache_count;
 	wmsum_t cache_total_evicts;
 	wmsum_t cache_levels[DN_MAX_LEVELS];
 	wmsum_t cache_levels_bytes[DN_MAX_LEVELS];
 	wmsum_t hash_hits;
 	wmsum_t hash_misses;
 	wmsum_t hash_collisions;
 	wmsum_t hash_elements;
 	wmsum_t hash_chains;
 	wmsum_t hash_insert_race;
 	wmsum_t metadata_cache_count;
 	wmsum_t metadata_cache_overflow;
 } dbuf_sums;
 
 /*
  * Counter helpers.  INCR/DECR/BUMP/BUMPDOWN adjust the wmsum counter named
  * `stat` inside dbuf_sums by `val` (or by +/-1).
  */
 #define	DBUF_STAT_INCR(stat, val)	\
 	wmsum_add(&dbuf_sums.stat, val)
 #define	DBUF_STAT_DECR(stat, val)	\
 	DBUF_STAT_INCR(stat, -(val))
 #define	DBUF_STAT_BUMP(stat)		\
 	DBUF_STAT_INCR(stat, 1)
 #define	DBUF_STAT_BUMPDOWN(stat)	\
 	DBUF_STAT_INCR(stat, -1)
 /*
  * Raise the kstat value `stat` in dbuf_stats to `v` if `v` is larger than
  * the value currently stored.  This writes dbuf_stats directly (no wmsum)
  * and retries with atomic_cas_64() until either the stored value is already
  * >= `v` or the swap succeeds.
  *
  * `v` is evaluated exactly once, and the do/while (0) wrapper makes the
  * expansion a single statement, so the macro is safe in an unbraced
  * if/else body.  The invocation must be followed by a semicolon.
  */
 #define	DBUF_STAT_MAX(stat, v) do {					\
 	uint64_t _v = (v);						\
 	uint64_t _m;							\
 	while (_v > (_m = dbuf_stats.stat.value.ui64) &&		\
 	    (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, _v)))	\
 		continue;						\
 } while (0)
 
 /* Prototypes for static helpers that are referenced before they are defined. */
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
 
 /*
  * Global data structures and functions for the dbuf cache.
  */
 /* kmem cache of dmu_buf_impl_t; created in dbuf_init() with dbuf_cons/dbuf_dest. */
 static kmem_cache_t *dbuf_kmem_cache;
 /* kmem cache of dbuf_dirty_record_t; created in dbuf_init(), not file-local. */
 kmem_cache_t *dbuf_dirty_kmem_cache;
 /* Taskq that dbuf_evict_user() dispatches asynchronous eviction callbacks to. */
 static taskq_t *dbu_evict_taskq;
 
 /*
  * State of the eviction thread started by dbuf_init().  dbuf_evict_thread()
  * sleeps on dbuf_evict_cv under dbuf_evict_lock and polls
  * dbuf_evict_thread_exit to decide when to terminate.
  */
 static kthread_t *dbuf_cache_evict_thread;
 static kmutex_t dbuf_evict_lock;
 static kcondvar_t dbuf_evict_cv;
 static boolean_t dbuf_evict_thread_exit;
 
 /*
  * There are two dbuf caches; each dbuf can only be in one of them at a time.
  *
  * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
  *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
  *    that represent the metadata that describes filesystems/snapshots/
  *    bookmarks/properties/etc. We only evict from this cache when we export a
  *    pool, to short-circuit as much I/O as possible for all administrative
  *    commands that need the metadata. There is no eviction policy for this
  *    cache, because we try to only include types in it which would occupy a
  *    very small amount of space per object but create a large impact on the
  *    performance of these commands. Instead, after it reaches a maximum size
  *    (which should only happen on very small memory systems with a very large
  *    number of filesystem objects), we stop taking new dbufs into the
  *    metadata cache, instead putting them in the normal dbuf cache.
  *
  * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
  *    are not currently held but have been recently released. These dbufs
  *    are not eligible for arc eviction until they are aged out of the cache.
  *    Dbufs that are aged out of the cache will be immediately destroyed and
  *    become eligible for arc eviction.
  *
  * Dbufs are added to these caches once the last hold is released. If a dbuf is
  * later accessed and still exists in the dbuf cache, then it will be removed
  * from the cache and later re-added to the head of the cache.
  *
  * If a given dbuf meets the requirements for the metadata cache, it will go
  * there, otherwise it will be considered for the generic LRU dbuf cache. The
  * caches and the refcounts tracking their sizes are stored in an array indexed
  * by those caches' matching enum values (from dbuf_cached_state_t).
  */
 /*
  * One dbuf cache: the multilist of cached dbufs plus a refcount used as the
  * running size of that cache (it is adjusted by db.db_size and user-data
  * size in dbuf_evict_one() and dbuf_evict_user()).
  */
 typedef struct dbuf_cache {
 	/* Created in dbuf_init() with dbuf_cache_multilist_index_func. */
 	multilist_t cache;
 	/* Aligned via ____cacheline_aligned. */
 	zfs_refcount_t size ____cacheline_aligned;
 } dbuf_cache_t;
 /* Indexed by caching status, e.g. DB_DBUF_CACHE / DB_DBUF_METADATA_CACHE. */
 dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
 
 /*
  * Size limits for the caches.  These are upper bounds applied with MIN() in
  * the *_target_bytes() helpers; the UINT64_MAX default leaves the ARC-derived
  * value as the effective target.
  */
 static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
 static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
 
 /*
  * Set the default sizes of the caches to log2 fraction of arc size:
  * the target is arc_target_bytes() >> shift.
  */
 static uint_t dbuf_cache_shift = 5;
 static uint_t dbuf_metadata_cache_shift = 6;
 
 /*
  * Set the dbuf hash mutex count as log2 shift (dynamic by default).
  * dbuf_init() treats 0 as "derive from the hash table size" and clamps any
  * other value to at most 24.
  */
 static uint_t dbuf_mutex_cache_shift = 0;
 
 /* Defined further down; declared here for earlier users. */
 static unsigned long dbuf_cache_target_bytes(void);
 static unsigned long dbuf_metadata_cache_target_bytes(void);
 
 /*
  * The LRU dbuf cache uses a three-stage eviction policy:
  *	- A low water marker designates when the dbuf eviction thread
  *	should stop evicting from the dbuf cache.
  *	- When we reach the maximum size (aka mid water mark), we
  *	signal the eviction thread to run.
  *	- The high water mark indicates when the eviction thread
  *	is unable to keep up with the incoming load and eviction must
  *	happen in the context of the calling thread.
  *
  * The dbuf cache:
  *                                                 (max size)
  *                                      low water   mid water   hi water
  * +----------------------------------------+----------+----------+
  * |                                        |          |          |
  * |                                        |          |          |
  * |                                        |          |          |
  * |                                        |          |          |
  * +----------------------------------------+----------+----------+
  *                                        stop        signal     evict
  *                                      evicting     eviction   directly
  *                                                    thread
  *
  * The high and low water marks indicate the operating range for the eviction
  * thread. The low water mark is, by default, 90% of the total size of the
  * cache and the high water mark is at 110% (both of these percentages can be
  * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
  * respectively). The eviction thread will try to ensure that the cache remains
  * within this range by waking up every second and checking if the cache is
  * above the low water mark. The thread can also be woken up by callers adding
  * elements into the cache if the cache is larger than the mid water (i.e max
  * cache size). Once the eviction thread is woken up and eviction is required,
  * it will continue evicting buffers until it's able to reduce the cache size
  * to the low water mark. If the cache size continues to grow and hits the high
  * water mark, then callers adding elements to the cache will begin to evict
  * directly from the cache until the cache is no longer above the high water
  * mark.
  */
 
 /*
  * The percentage above and below the maximum cache size.
  *
  * dbuf_cache_hiwater_bytes() adds hiwater_pct percent of the target to the
  * target; dbuf_cache_lowater_bytes() subtracts lowater_pct percent from it.
  */
 static uint_t dbuf_cache_hiwater_pct = 10;
 static uint_t dbuf_cache_lowater_pct = 10;
 
 /*
  * kmem cache constructor for dmu_buf_impl_t (registered in dbuf_init()).
  * Zeroes the object, then initializes its mutex, rwlock, condvar, cache
  * link and hold refcount.  Always returns 0; the second and third
  * arguments are ignored.
  */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {
 	dmu_buf_impl_t *dbuf = vdb;
 
 	(void) unused;
 	(void) kmflag;
 
 	/* Start from an all-zero object before setting up the primitives. */
 	memset(dbuf, 0, sizeof (*dbuf));
 
 	mutex_init(&dbuf->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
 	rw_init(&dbuf->db_rwlock, NULL, RW_NOLOCKDEP, NULL);
 	cv_init(&dbuf->db_changed, NULL, CV_DEFAULT, NULL);
 	multilist_link_init(&dbuf->db_cache_link);
 	zfs_refcount_create(&dbuf->db_holds);
 
 	return (0);
 }
 
 static void
 dbuf_dest(void *vdb, void *unused)
 {
 	(void) unused;
 	dmu_buf_impl_t *db = vdb;
 	mutex_destroy(&db->db_mtx);
 	rw_destroy(&db->db_rwlock);
 	cv_destroy(&db->db_changed);
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 	zfs_refcount_destroy(&db->db_holds);
 }
 
 /*
  * dbuf hash table routines
  *
  * The single global table below is sized and allocated in dbuf_init().
  * Buckets are singly linked through db_hash_next and are accessed under
  * DBUF_HASH_MUTEX(h, idx).
  */
 static dbuf_hash_table_t dbuf_hash_table;
 
 /*
  * Hash the (objset pointer, object, level, blkid) identity of a dbuf.
  *
  * Cityhash is used because it is fast and has good hash properties without
  * requiring any large static buffers.  The objset is hashed by address.
  */
 static uint64_t
 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
 {
 	const uintptr_t os_key = (uintptr_t)os;
 	const uint64_t lvl_key = (uint64_t)lvl;
 
 	return (cityhash4(os_key, obj, lvl_key, blkid));
 }
 
 /*
  * Fire the dbuf__state_change DTrace probe with the dbuf and a string
  * describing why its state is changing.
  */
 #define	DTRACE_SET_STATE(db, why) \
 	DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db,	\
 	    const char *, why)
 
 /*
  * True when `dbuf` has exactly the given identity: same object number,
  * objset pointer, level and block id.  Every argument is evaluated once.
  */
 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
 	((dbuf)->db.db_object == (obj) &&		\
 	(dbuf)->db_objset == (os) &&			\
 	(dbuf)->db_level == (level) &&			\
 	(dbuf)->db_blkid == (blkid))
 
 /*
  * Look up the dbuf identified by (os, obj, level, blkid) in the hash table.
  *
  * On a hit, returns the dbuf with its db_mtx held; a matching dbuf in the
  * DB_EVICTING state is skipped.  On a miss, returns NULL and, when hash_out
  * is non-NULL, stores the computed hash value there.  hash_out is left
  * untouched on a hit.
  */
 dmu_buf_impl_t *
 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
     uint64_t *hash_out)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t hv = dbuf_hash(os, obj, level, blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *db;
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	db = h->hash_table[idx];
 	while (db != NULL) {
 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
 			/* Take db_mtx before inspecting the dbuf's state. */
 			mutex_enter(&db->db_mtx);
 			if (db->db_state != DB_EVICTING) {
 				/* Hit: drop the bucket lock, keep db_mtx. */
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (db);
 			}
 			mutex_exit(&db->db_mtx);
 		}
 		db = db->db_hash_next;
 	}
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 
 	if (hash_out != NULL)
 		*hash_out = hv;
 
 	return (NULL);
 }
 
 /*
  * Return the bonus dbuf of `object` in `os` with its db_mtx held, or NULL
  * if the dnode cannot be held or has no bonus dbuf attached.  The dnode
  * hold taken here is released before returning.
  */
 static dmu_buf_impl_t *
 dbuf_find_bonus(objset_t *os, uint64_t object)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *db;
 
 	if (dnode_hold(os, object, FTAG, &dn) != 0)
 		return (NULL);
 
 	/* dn_bonus is read under dn_struct_rwlock held as reader. */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dn->dn_bonus;
 	if (db != NULL)
 		mutex_enter(&db->db_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	return (db);
 }
 
 /*
  * Add db to the hash table unless an equivalent dbuf is already present.
  *
  * If a dbuf with the same identity that is not being evicted is found, it
  * is returned with its db_mtx held and db is not inserted.  Otherwise db is
  * linked at the head of its bucket and NULL is returned; db->db_mtx is held
  * on that path as well.
  */
 static dmu_buf_impl_t *
 dbuf_hash_insert(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	objset_t *os = db->db_objset;
 	uint64_t obj = db->db.db_object;
 	uint64_t blkid = db->db_blkid;
 	int level = db->db_level;
 	uint64_t idx;
 	dmu_buf_impl_t *dbf;
 	uint32_t chain_len = 0;
 
 	ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
 	idx = db->db_hash & h->hash_table_mask;
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 
 	/* Walk the bucket, counting every entry we pass. */
 	for (dbf = h->hash_table[idx]; dbf != NULL;
 	    dbf = dbf->db_hash_next, chain_len++) {
 		if (!DBUF_EQUAL(dbf, os, obj, level, blkid))
 			continue;
 
 		mutex_enter(&dbf->db_mtx);
 		if (dbf->db_state != DB_EVICTING) {
 			/* Existing live dbuf wins; caller gets it locked. */
 			mutex_exit(DBUF_HASH_MUTEX(h, idx));
 			return (dbf);
 		}
 		mutex_exit(&dbf->db_mtx);
 	}
 
 	/* The bucket was not empty, so this insert is a collision. */
 	if (chain_len > 0) {
 		DBUF_STAT_BUMP(hash_collisions);
 		/* Going from one entry to two creates a new chain. */
 		if (chain_len == 1)
 			DBUF_STAT_BUMP(hash_chains);
 
 		DBUF_STAT_MAX(hash_chain_max, chain_len);
 	}
 
 	/* Link at the head of the bucket while holding db_mtx. */
 	mutex_enter(&db->db_mtx);
 	db->db_hash_next = h->hash_table[idx];
 	h->hash_table[idx] = db;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	DBUF_STAT_BUMP(hash_elements);
 
 	return (NULL);
 }
 
 /*
  * Decide whether this dbuf belongs in the metadata cache.  That is the case
  * when its dnode type is one of the metadata-cached types (those that store
  * data related to traversing dataset hierarchies) and the metadata cache has
  * not grown past its target size.
  */
 static boolean_t
 dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
 {
 	DB_DNODE_ENTER(db);
 	dmu_object_type_t type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	/* Not one of the types we care about. */
 	if (!DMU_OT_IS_METADATA_CACHED(type))
 		return (B_FALSE);
 
 	/* If we hit this, then we set something up wrong in dmu_ot */
 	ASSERT(DMU_OT_IS_METADATA(type));
 
 	/*
 	 * Sanity check for small-memory systems: don't allocate too
 	 * much memory for this purpose.  An over-target cache counts
 	 * as an overflow and the dbuf is rejected.
 	 */
 	if (zfs_refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
 	    dbuf_metadata_cache_target_bytes()) {
 		DBUF_STAT_BUMP(metadata_cache_overflow);
 		return (B_FALSE);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Remove an entry from the hash table.  It must be in the EVICTING state,
  * have no holds, and the caller must not hold db->db_mtx.
  */
 static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t idx;
 	dmu_buf_impl_t *dbf, **dbp;
 
 	ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
 	    db->db_blkid), ==, db->db_hash);
 	idx = db->db_hash & h->hash_table_mask;
 
 	/*
 	 * We mustn't hold db_mtx to maintain lock ordering:
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_state == DB_EVICTING);
 	ASSERT(!MUTEX_HELD(&db->db_mtx));
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	dbp = &h->hash_table[idx];
 	while ((dbf = *dbp) != db) {
 		/*
 		 * db must be on this chain.  Check for the end of the chain
 		 * before forming a pointer into dbf, not after.
 		 */
 		ASSERT(dbf != NULL);
 		dbp = &dbf->db_hash_next;
 	}
 	/* Unlink db by pointing its predecessor's link at its successor. */
 	*dbp = db->db_hash_next;
 	db->db_hash_next = NULL;
 	/* A bucket left with exactly one entry no longer counts as a chain. */
 	if (h->hash_table[idx] &&
 	    h->hash_table[idx]->db_hash_next == NULL)
 		DBUF_STAT_BUMPDOWN(hash_chains);
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	DBUF_STAT_BUMPDOWN(hash_elements);
 }
 
 /*
  * Selects which hold-count invariant dbuf_verify_user() checks:
  * DBVU_EVICTING is passed from dbuf_evict_user() while the user data is
  * being evicted; DBVU_NOT_EVICTING selects the other set of checks.
  */
 typedef enum {
 	DBVU_EVICTING,
 	DBVU_NOT_EVICTING
 } dbvu_verify_type_t;
 
 /*
  * Debug-only consistency checks for a dbuf that has user data attached.
  * Compiles to nothing unless ZFS_DEBUG is defined, and does nothing when
  * db->db_user is NULL.
  */
 static void
 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
 {
 #ifdef ZFS_DEBUG
 	int64_t holds;
 
 	if (db->db_user == NULL)
 		return;
 
 	/* Only data blocks support the attachment of user data. */
 	ASSERT(db->db_level == 0);
 
 	/* Clients must resolve a dbuf before attaching user data. */
 	ASSERT(db->db.db_data != NULL);
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 
 	holds = zfs_refcount_count(&db->db_holds);
 
 	/*
 	 * Both the evicting case and the immediate-evict case can only
 	 * rely on the generic invariant holds >= dirtycnt:
 	 *
 	 * Immediate eviction occurs when holds == dirtycnt.
 	 * For normal eviction buffers, holds is zero on
 	 * eviction, except when dbuf_fix_old_data() calls
 	 * dbuf_clear_data().  However, the hold count can grow
 	 * during eviction even though db_mtx is held (see
 	 * dmu_bonus_hold() for an example).
 	 *
 	 * Otherwise the dbuf must still be held by someone.
 	 */
 	if (verify_type == DBVU_EVICTING ||
 	    db->db_user_immediate_evict == TRUE) {
 		ASSERT3U(holds, >=, db->db_dirtycnt);
 	} else {
 		ASSERT3U(holds, >, 0);
 	}
 #endif
 }
 
 /*
  * Detach the user data from db and run its eviction callbacks.
  *
  * The caller must hold db->db_mtx.  Nothing happens when no user data is
  * attached.  If the dbuf is currently in one of the dbuf caches, the user
  * data size is subtracted from that cache's size accounting first.
  */
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
 	dmu_buf_user_t *dbu = db->db_user;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (dbu == NULL)
 		return;
 
 	dbuf_verify_user(db, DBVU_EVICTING);
 	db->db_user = NULL;
 
 #ifdef ZFS_DEBUG
 	/* Debug builds also clear the owner's back-pointer, if one was set. */
 	if (dbu->dbu_clear_on_evict_dbufp != NULL)
 		*dbu->dbu_clear_on_evict_dbufp = NULL;
 #endif
 
 	if (db->db_caching_status != DB_NO_CACHE) {
 		/*
 		 * This is a cached dbuf, so the size of the user data is
 		 * included in its cached amount. We adjust it here because the
 		 * user data has already been detached from the dbuf, and the
 		 * sync functions are not supposed to touch it (the dbuf might
 		 * not exist anymore by the time the sync functions run).
 		 */
 		uint64_t size = dbu->dbu_size;
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[db->db_caching_status].size, size, dbu);
 		/* Per-level byte stats are only kept for the LRU dbuf cache. */
 		if (db->db_caching_status == DB_DBUF_CACHE)
 			DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
 	}
 
 	/*
 	 * There are two eviction callbacks - one that we call synchronously
 	 * and one that we invoke via a taskq.  The async one is useful for
 	 * avoiding lock order reversals and limiting stack depth.
 	 *
 	 * Note that if we have a sync callback but no async callback,
 	 * it's likely that the sync callback will free the structure
 	 * containing the dbu.  In that case we need to take care to not
 	 * dereference dbu after calling the sync evict func.
 	 */
 	/* Sample the async pointer now; dbu may be gone after the sync call. */
 	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
 
 	if (dbu->dbu_evict_func_sync != NULL)
 		dbu->dbu_evict_func_sync(dbu);
 
 	if (has_async) {
 		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
 		    dbu, 0, &dbu->dbu_tqent);
 	}
 }
 
 /*
  * Report whether db holds metadata.  Indirect blocks (level > 0) and spill
  * blocks always count as metadata; for any other dbuf the answer comes from
  * the object type of its dnode.
  */
 boolean_t
 dbuf_is_metadata(dmu_buf_impl_t *db)
 {
 	boolean_t meta;
 
 	if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID)
 		return (B_TRUE);
 
 	/* The dnode type is read between DB_DNODE_ENTER and DB_DNODE_EXIT. */
 	DB_DNODE_ENTER(db);
 	meta = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 	DB_DNODE_EXIT(db);
 
 	return (meta);
 }
 
 /*
  * Decide whether the block behind db may be cached in L2ARC.
  *
  * The objset's secondary cache setting must admit the dbuf (ZFS_CACHE_ALL,
  * or ZFS_CACHE_METADATA for a metadata dbuf).  When l2arc_exclude_special
  * is set, blocks whose vdev has a special or dedup allocation bias are
  * additionally excluded, as are missing or hole block pointers.
  *
  * bp, when non-NULL, is used instead of db->db_blkptr.
  */
 boolean_t
 dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp)
 {
 	/* The secondary cache policy rules this dbuf out entirely. */
 	if (db->db_objset->os_secondary_cache != ZFS_CACHE_ALL &&
 	    (db->db_objset->os_secondary_cache != ZFS_CACHE_METADATA ||
 	    !dbuf_is_metadata(db)))
 		return (B_FALSE);
 
 	if (l2arc_exclude_special == 0)
 		return (B_TRUE);
 
 	/*
 	 * bp must be checked in the event it was passed from
 	 * dbuf_read_impl() as the result of a the BP being set from
 	 * a Direct I/O write in dbuf_read(). See comments in
 	 * dbuf_read().
 	 */
 	blkptr_t *db_bp = (bp != NULL) ? bp : db->db_blkptr;
 
 	if (db_bp == NULL || BP_IS_HOLE(db_bp))
 		return (B_FALSE);
 
 	uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva);
 	vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
 	vdev_t *vd = (vdev < rvd->vdev_children) ?
 	    rvd->vdev_child[vdev] : NULL;
 
 	/* No such top-level vdev: nothing to exclude on. */
 	if (vd == NULL)
 		return (B_TRUE);
 
 	if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL ||
 	    vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Same decision as dbuf_is_l2cacheable(), but driven by a dnode and a block
  * level instead of a dbuf.  Under ZFS_CACHE_METADATA a block qualifies when
  * level > 0 or the dnode's object type is metadata.
  */
 static inline boolean_t
 dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
 {
 	/* The secondary cache policy rules this block out entirely. */
 	if (dn->dn_objset->os_secondary_cache != ZFS_CACHE_ALL &&
 	    (dn->dn_objset->os_secondary_cache != ZFS_CACHE_METADATA ||
 	    !(level > 0 ||
 	    DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type))))
 		return (B_FALSE);
 
 	if (l2arc_exclude_special == 0)
 		return (B_TRUE);
 
 	if (bp == NULL || BP_IS_HOLE(bp))
 		return (B_FALSE);
 
 	uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
 	vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
 	vdev_t *vd = (vdev < rvd->vdev_children) ?
 	    rvd->vdev_child[vdev] : NULL;
 
 	/* No such top-level vdev: nothing to exclude on. */
 	if (vd == NULL)
 		return (B_TRUE);
 
 	if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL ||
 	    vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 
 /*
  * Map a dbuf to a sublist index of the given multilist.
  *
  * The indices returned *must* be evenly distributed between all sublists:
  * dbuf_evict_thread() assumes dbufs are spread evenly when deciding which
  * sublist to evict from and how much to evict from it.
  */
 static unsigned int
 dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
 {
 	dmu_buf_impl_t *db = obj;
 
 	/*
 	 * The hash of a dmu_buf_impl_t is assumed to stay constant for its
 	 * whole lifetime (its objset, object, level and blkid fields don't
 	 * change), so the sublist index need not be stored on insertion;
 	 * it is simply recomputed on removal.
 	 *
 	 * The low order bits of the hash are also assumed to be evenly
 	 * distributed; otherwise a multilist with a power of two number of
 	 * sublists would be used unevenly.  A full 64bit division would be
 	 * a waste of time here, so the hash is truncated to 32 bits first.
 	 */
 	unsigned int hash32 = (unsigned int)dbuf_hash(db->db_objset,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	return (hash32 % multilist_get_num_sublists(ml));
 }
 
 /*
  * Target size of the dbuf cache: the ARC target shifted right by
  * dbuf_cache_shift, capped by the tunable dbuf_cache_max_bytes.  It therefore
  * grows with the ARC target until the cap is reached.
  */
 static inline unsigned long
 dbuf_cache_target_bytes(void)
 {
 	return (MIN(dbuf_cache_max_bytes,
 	    (arc_target_bytes() >> dbuf_cache_shift)));
 }
 
 /*
  * Target size of the dbuf metadata cache: the ARC target shifted right by
  * dbuf_metadata_cache_shift, capped by the tunable
  * dbuf_metadata_cache_max_bytes.
  */
 static inline unsigned long
 dbuf_metadata_cache_target_bytes(void)
 {
 	return (MIN(dbuf_metadata_cache_max_bytes,
 	    (arc_target_bytes() >> dbuf_metadata_cache_shift)));
 }
 
 /*
  * High water mark of the dbuf cache: the target size plus
  * dbuf_cache_hiwater_pct percent of it.
  */
 static inline uint64_t
 dbuf_cache_hiwater_bytes(void)
 {
 	const uint64_t target = dbuf_cache_target_bytes();
 	const uint64_t headroom = (target * dbuf_cache_hiwater_pct) / 100;
 
 	return (target + headroom);
 }
 
 /*
  * Low water mark of the dbuf cache: the target size minus
  * dbuf_cache_lowater_pct percent of it.
  */
 static inline uint64_t
 dbuf_cache_lowater_bytes(void)
 {
 	const uint64_t target = dbuf_cache_target_bytes();
 	const uint64_t slack = (target * dbuf_cache_lowater_pct) / 100;
 
 	return (target - slack);
 }
 
 /*
  * True while the current size of the LRU dbuf cache exceeds the low water
  * mark, i.e. while the eviction thread still has work to do.
  */
 static inline boolean_t
 dbuf_cache_above_lowater(void)
 {
 	return (dbuf_cache_lowater_bytes() <
 	    zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size));
 }
 
 /*
  * Evict the oldest eligible dbuf from the dbuf cache.
  *
  * A sublist of the LRU dbuf cache is picked at random and scanned from its
  * tail towards its head for the first dbuf whose db_mtx can be taken
  * without blocking.  That dbuf is removed from the cache, its size and
  * user-data size are subtracted from the cache accounting, and it is
  * destroyed.  If no dbuf on the sublist can be locked, nothing is evicted.
  *
  * Must not be called with dbuf_evict_lock held.
  */
 static void
 dbuf_evict_one(void)
 {
 	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
 	multilist_sublist_t *mls = multilist_sublist_lock_idx(
 	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
 	/* Skip dbufs whose mutex is busy rather than waiting on them. */
 	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
 	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
 		db = multilist_sublist_prev(mls, db);
 	}
 
 	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
 	    multilist_sublist_t *, mls);
 
 	if (db != NULL) {
 		/* db_mtx is held from the tryenter above. */
 		multilist_sublist_remove(mls, db);
 		multilist_sublist_unlock(mls);
 		uint64_t size = db->db.db_size;
 		uint64_t usize = dmu_buf_user_size(&db->db);
 		/* The buffer and its user data are accounted separately. */
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[DB_DBUF_CACHE].size, size, db);
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user);
 		DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
 		DBUF_STAT_BUMPDOWN(cache_count);
 		DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize);
 		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
 		/* Mark it uncached before handing it to dbuf_destroy(). */
 		db->db_caching_status = DB_NO_CACHE;
 		dbuf_destroy(db);
 		DBUF_STAT_BUMP(cache_total_evicts);
 	} else {
 		multilist_sublist_unlock(mls);
 	}
 }
 
 /*
  * The dbuf evict thread is responsible for aging out dbufs from the
  * cache.  It sleeps on dbuf_evict_cv (with a timeout) while the cache is
  * not above its low water mark, and once it is, evicts one dbuf at a time
  * until the cache is back at or below the low water mark.  A dbuf that is
  * aged out of the cache is destroyed and becomes eligible for arc eviction.
  *
  * The thread terminates when dbuf_evict_thread_exit is set: it then clears
  * the flag, broadcasts on dbuf_evict_cv and exits.
  */
 static __attribute__((noreturn)) void
 dbuf_evict_thread(void *unused)
 {
 	(void) unused;
 	callb_cpr_t cpr;
 
 	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&dbuf_evict_lock);
 	while (!dbuf_evict_thread_exit) {
 		/* Idle until there is something to evict or we must exit. */
 		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
 			CALLB_CPR_SAFE_BEGIN(&cpr);
 			/* Timed wait: SEC2NSEC(1) timeout, MSEC2NSEC(1) resolution. */
 			(void) cv_timedwait_idle_hires(&dbuf_evict_cv,
 			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
 			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
 		}
 		mutex_exit(&dbuf_evict_lock);
 
 		/*
 		 * Keep evicting as long as we're above the low water mark
 		 * for the cache. We do this without holding the locks to
 		 * minimize lock contention.
 		 */
 		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
 			dbuf_evict_one();
 		}
 
 		mutex_enter(&dbuf_evict_lock);
 	}
 
 	/* Acknowledge the exit request to whoever is waiting on the cv. */
 	dbuf_evict_thread_exit = B_FALSE;
 	cv_broadcast(&dbuf_evict_cv);
 	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
 	thread_exit();
 }
 
 /*
  * React to the dbuf cache having reached `size` bytes.
  *
  * Above the target (max) size the eviction thread is signalled.  Above the
  * high water mark one dbuf is additionally evicted right here, in the
  * caller's context, before the signal is sent.
  */
 static void
 dbuf_evict_notify(uint64_t size)
 {
 	/*
 	 * The checks are made without holding dbuf_evict_lock: it's OK to
 	 * occasionally make the wrong decision here, and grabbing the lock
 	 * results in massive lock contention.
 	 */
 	if (size <= dbuf_cache_target_bytes())
 		return;
 
 	if (size > dbuf_cache_hiwater_bytes())
 		dbuf_evict_one();
 
 	cv_signal(&dbuf_evict_cv);
 }
 
 /*
  * ks_update callback of the dbufstats kstat.
  *
  * Refreshes the snapshot in ksp->ks_data from the wmsum counters, the cache
  * size refcounts, the current water marks and the hash table geometry.
  * Writes are rejected with EACCES; reads return 0.  The *_max entries and
  * hash_chain_max are not touched here.
  */
 static int
 dbuf_kstat_update(kstat_t *ksp, int rw)
 {
 	dbuf_stats_t *stats = ksp->ks_data;
 	dbuf_hash_table_t *table = &dbuf_hash_table;
 	int lvl;
 
 	/* This kstat is read-only. */
 	if (rw == KSTAT_WRITE)
 		return (SET_ERROR(EACCES));
 
 	/* LRU dbuf cache: population, size and bounds. */
 	stats->cache_count.value.ui64 = wmsum_value(&dbuf_sums.cache_count);
 	stats->cache_size_bytes.value.ui64 =
 	    zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
 	stats->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
 	stats->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
 	stats->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
 	stats->cache_total_evicts.value.ui64 =
 	    wmsum_value(&dbuf_sums.cache_total_evicts);
 
 	/* Per-level breakdown. */
 	for (lvl = 0; lvl < DN_MAX_LEVELS; lvl++) {
 		stats->cache_levels[lvl].value.ui64 =
 		    wmsum_value(&dbuf_sums.cache_levels[lvl]);
 		stats->cache_levels_bytes[lvl].value.ui64 =
 		    wmsum_value(&dbuf_sums.cache_levels_bytes[lvl]);
 	}
 
 	/* Hash table activity and geometry. */
 	stats->hash_hits.value.ui64 = wmsum_value(&dbuf_sums.hash_hits);
 	stats->hash_misses.value.ui64 = wmsum_value(&dbuf_sums.hash_misses);
 	stats->hash_collisions.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_collisions);
 	stats->hash_elements.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_elements);
 	stats->hash_chains.value.ui64 = wmsum_value(&dbuf_sums.hash_chains);
 	stats->hash_insert_race.value.ui64 =
 	    wmsum_value(&dbuf_sums.hash_insert_race);
 	stats->hash_table_count.value.ui64 = table->hash_table_mask + 1;
 	stats->hash_mutex_count.value.ui64 = table->hash_mutex_mask + 1;
 
 	/* Metadata cache. */
 	stats->metadata_cache_count.value.ui64 =
 	    wmsum_value(&dbuf_sums.metadata_cache_count);
 	stats->metadata_cache_size_bytes.value.ui64 =
 	    zfs_refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size);
 	stats->metadata_cache_overflow.value.ui64 =
 	    wmsum_value(&dbuf_sums.metadata_cache_overflow);
 
 	return (0);
 }
 
 /*
  * One-time setup of the dbuf layer: sizes and allocates the hash table and
  * its mutex array, creates the kmem caches, the user-eviction taskq and the
  * two dbuf caches, starts the eviction thread, initializes the wmsum
  * counters and finally creates and installs the "dbufstats" kstat.
  */
 void
 dbuf_init(void)
 {
 	uint64_t hmsize, hsize = 1ULL << 16;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 
 	/*
 	 * The hash table is big enough to fill one eighth of physical memory
 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
 	 * By default, the table will take up
 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
 	while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
 		hsize <<= 1;
 
 	/* Allocate the bucket array, halving the size if allocation fails. */
 	h->hash_table = NULL;
 	while (h->hash_table == NULL) {
 		h->hash_table_mask = hsize - 1;
 
 		h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
 		if (h->hash_table == NULL)
 			hsize >>= 1;
 
 		ASSERT3U(hsize, >=, 1ULL << 10);
 	}
 
 	/*
 	 * The hash table buckets are protected by an array of mutexes where
 	 * each mutex is responsible for protecting 128 buckets.  A minimum
 	 * array size of 8192 is targeted to avoid contention.
 	 */
 	if (dbuf_mutex_cache_shift == 0)
 		hmsize = MAX(hsize >> 7, 1ULL << 13);
 	else
 		hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
 
 	/* Same halve-on-failure scheme for the mutex array. */
 	h->hash_mutexes = NULL;
 	while (h->hash_mutexes == NULL) {
 		h->hash_mutex_mask = hmsize - 1;
 
 		h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
 		    KM_SLEEP);
 		if (h->hash_mutexes == NULL)
 			hmsize >>= 1;
 	}
 
 	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 	dbuf_dirty_kmem_cache = kmem_cache_create("dbuf_dirty_record_t",
 	    sizeof (dbuf_dirty_record_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	for (int i = 0; i < hmsize; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL);
 
 	dbuf_stats_init(h);
 
 	/*
 	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
 	 * configuration is not required.
 	 */
 	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
 
 	/* Create every dbuf cache: its multilist and its size refcount. */
 	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
 		multilist_create(&dbuf_caches[dcs].cache,
 		    sizeof (dmu_buf_impl_t),
 		    offsetof(dmu_buf_impl_t, db_cache_link),
 		    dbuf_cache_multilist_index_func);
 		zfs_refcount_create(&dbuf_caches[dcs].size);
 	}
 
 	/* Start the eviction thread once its lock and condvar exist. */
 	dbuf_evict_thread_exit = B_FALSE;
 	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
 	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
 	    NULL, 0, &p0, TS_RUN, minclsyspri);
 
 	/*
 	 * NOTE(review): the wmsum counters are initialized after the eviction
 	 * thread has been created; presumably safe because the caches are
 	 * still empty at this point -- confirm.
 	 */
 	wmsum_init(&dbuf_sums.cache_count, 0);
 	wmsum_init(&dbuf_sums.cache_total_evicts, 0);
 	for (int i = 0; i < DN_MAX_LEVELS; i++) {
 		wmsum_init(&dbuf_sums.cache_levels[i], 0);
 		wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
 	}
 	wmsum_init(&dbuf_sums.hash_hits, 0);
 	wmsum_init(&dbuf_sums.hash_misses, 0);
 	wmsum_init(&dbuf_sums.hash_collisions, 0);
 	wmsum_init(&dbuf_sums.hash_elements, 0);
 	wmsum_init(&dbuf_sums.hash_chains, 0);
 	wmsum_init(&dbuf_sums.hash_insert_race, 0);
 	wmsum_init(&dbuf_sums.metadata_cache_count, 0);
 	wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);
 
 	dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (dbuf_ksp != NULL) {
 		/* Replace the placeholder per-level names with real ones. */
 		for (int i = 0; i < DN_MAX_LEVELS; i++) {
 			snprintf(dbuf_stats.cache_levels[i].name,
 			    KSTAT_STRLEN, "cache_level_%d", i);
 			dbuf_stats.cache_levels[i].data_type =
 			    KSTAT_DATA_UINT64;
 			snprintf(dbuf_stats.cache_levels_bytes[i].name,
 			    KSTAT_STRLEN, "cache_level_%d_bytes", i);
 			dbuf_stats.cache_levels_bytes[i].data_type =
 			    KSTAT_DATA_UINT64;
 		}
 		dbuf_ksp->ks_data = &dbuf_stats;
 		dbuf_ksp->ks_update = dbuf_kstat_update;
 		kstat_install(dbuf_ksp);
 	}
 }
 
 /*
  * Tear down the global dbuf state: the hash table and its mutex array, the
  * kmem caches, the eviction taskq and thread, the per-state cache
  * multilists, the kstat and the wmsum counters.
  */
 void
 dbuf_fini(void)
 {
 	dbuf_hash_table_t *tbl = &dbuf_hash_table;

 	dbuf_stats_destroy();

 	/* Both masks hold (number of entries - 1), hence the "+ 1" below. */
 	for (int m = 0; m < (tbl->hash_mutex_mask + 1); m++)
 		mutex_destroy(&tbl->hash_mutexes[m]);

 	vmem_free(tbl->hash_table,
 	    (tbl->hash_table_mask + 1) * sizeof (void *));
 	vmem_free(tbl->hash_mutexes,
 	    (tbl->hash_mutex_mask + 1) * sizeof (kmutex_t));

 	kmem_cache_destroy(dbuf_kmem_cache);
 	kmem_cache_destroy(dbuf_dirty_kmem_cache);
 	taskq_destroy(dbu_evict_taskq);

 	/*
 	 * Raise the exit flag, then keep signalling and sleeping on the
 	 * condvar until the flag reads false again (presumably cleared by
 	 * the eviction thread on its way out -- confirm against
 	 * dbuf_evict_thread()).
 	 */
 	mutex_enter(&dbuf_evict_lock);
 	dbuf_evict_thread_exit = B_TRUE;
 	while (dbuf_evict_thread_exit) {
 		cv_signal(&dbuf_evict_cv);
 		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
 	}
 	mutex_exit(&dbuf_evict_lock);

 	mutex_destroy(&dbuf_evict_lock);
 	cv_destroy(&dbuf_evict_cv);

 	for (dbuf_cached_state_t state = 0; state < DB_CACHE_MAX; state++) {
 		zfs_refcount_destroy(&dbuf_caches[state].size);
 		multilist_destroy(&dbuf_caches[state].cache);
 	}

 	if (dbuf_ksp != NULL) {
 		kstat_delete(dbuf_ksp);
 		dbuf_ksp = NULL;
 	}

 	/* Release every wmsum counter, including the per-level arrays. */
 	wmsum_fini(&dbuf_sums.cache_count);
 	wmsum_fini(&dbuf_sums.cache_total_evicts);
 	for (int lvl = 0; lvl < DN_MAX_LEVELS; lvl++) {
 		wmsum_fini(&dbuf_sums.cache_levels[lvl]);
 		wmsum_fini(&dbuf_sums.cache_levels_bytes[lvl]);
 	}
 	wmsum_fini(&dbuf_sums.hash_hits);
 	wmsum_fini(&dbuf_sums.hash_misses);
 	wmsum_fini(&dbuf_sums.hash_collisions);
 	wmsum_fini(&dbuf_sums.hash_elements);
 	wmsum_fini(&dbuf_sums.hash_chains);
 	wmsum_fini(&dbuf_sums.hash_insert_race);
 	wmsum_fini(&dbuf_sums.metadata_cache_count);
 	wmsum_fini(&dbuf_sums.metadata_cache_overflow);
 }
 
 /*
  * Other stuff.
  */
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only consistency check of a dbuf.  The caller must hold db_mtx.
  * The checks only run when ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags; they
  * are expressed purely as assertions, so the function has no result.
  */
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dbuf_dirty_record_t *dr;
 	/* Same width as dr_txg so the ordering check never truncates. */
 	uint64_t txg_prev;

 	ASSERT(MUTEX_HELD(&db->db_mtx));

 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
 		return;

 	ASSERT(db->db_objset != NULL);
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn == NULL) {
 		ASSERT(db->db_parent == NULL);
 		ASSERT(db->db_blkptr == NULL);
 	} else {
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 		    db->db_blkid == DMU_SPILL_BLKID ||
 		    !avl_is_empty(&dn->dn_dbufs));
 	}
 	/* The expected db_offset depends on the kind of block. */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
 	} else if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT0(db->db.db_offset);
 	} else {
 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
 	}

 	/*
 	 * Every dirty record must point back at this dbuf, and the list
 	 * must be in strictly descending txg order.
 	 */
 	if ((dr = list_head(&db->db_dirty_records)) != NULL) {
 		ASSERT(dr->dr_dbuf == db);
 		txg_prev = dr->dr_txg;
 		for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;
 		    dr = list_next(&db->db_dirty_records, dr)) {
 			ASSERT(dr->dr_dbuf == db);
 			ASSERT(txg_prev > dr->dr_txg);
 			txg_prev = dr->dr_txg;
 		}
 	}

 	/*
 	 * We can't assert that db_size matches dn_datablksz because it
 	 * can be momentarily different when another thread is doing
 	 * dnode_set_blksz().
 	 */
 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dr = db->db_data_pending;
 		/*
 		 * It should only be modified in syncing context, so
 		 * make sure we only have one copy of the data.
 		 */
 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
 	}

 	/* verify db->db_blkptr */
 	if (db->db_blkptr) {
 		if (db->db_parent == dn->dn_dbuf) {
 			/* db is pointed to by the dnode */
 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
 			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
 			if (db->db_blkid != DMU_SPILL_BLKID)
 				ASSERT3P(db->db_blkptr, ==,
 				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		} else {
 			/* db is pointed to by an indirect block */
 			int epb __maybe_unused = db->db_parent->db.db_size >>
 			    SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
 			ASSERT3U(db->db_parent->db.db_object, ==,
 			    db->db.db_object);
 			/*
 			 * dnode_grow_indblksz() can make this fail if we don't
 			 * have the parent's rwlock.  XXX indblksz no longer
 			 * grows.  safe to do this now?
 			 */
 			if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
 				ASSERT3P(db->db_blkptr, ==,
 				    ((blkptr_t *)db->db_parent->db.db_data +
 				    db->db_blkid % epb));
 			}
 		}
 	}
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
 	    (db->db_buf == NULL || db->db_buf->b_data) &&
 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
 		/*
 		 * If the blkptr isn't set but they have nonzero data,
 		 * it had better be dirty, otherwise we'll lose that
 		 * data when we evict this buffer.
 		 *
 		 * There is an exception to this rule for indirect blocks; in
 		 * this case, if the indirect block is a hole, we fill in a few
 		 * fields on each of the child blocks (importantly, birth time)
 		 * to prevent hole birth times from being lost when you
 		 * partially fill in a hole.
 		 */
 		if (db->db_dirtycnt == 0) {
 			if (db->db_level == 0) {
 				uint64_t *buf = db->db.db_data;
 				int i;

 				/* Walk the data as 64-bit words. */
 				for (i = 0; i < db->db.db_size >> 3; i++) {
 					ASSERT(buf[i] == 0);
 				}
 			} else {
 				blkptr_t *bps = db->db.db_data;
 				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
 				    db->db.db_size);
 				/*
 				 * We want to verify that all the blkptrs in the
 				 * indirect block are holes, but we may have
 				 * automatically set up a few fields for them.
 				 * We iterate through each blkptr and verify
 				 * they only have those fields set.
 				 */
 				for (int i = 0;
 				    i < db->db.db_size / sizeof (blkptr_t);
 				    i++) {
 					blkptr_t *bp = &bps[i];
 					ASSERT(ZIO_CHECKSUM_IS_ZERO(
 					    &bp->blk_cksum));
 					ASSERT(
 					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
 					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
 					    DVA_IS_EMPTY(&bp->blk_dva[2]));
 					ASSERT0(bp->blk_fill);
 					ASSERT0(bp->blk_pad[0]);
 					ASSERT0(bp->blk_pad[1]);
 					ASSERT(!BP_IS_EMBEDDED(bp));
 					ASSERT(BP_IS_HOLE(bp));
 					ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
 				}
 			}
 		}
 	}
 	DB_DNODE_EXIT(db);
 }
 #endif
 
 /*
  * Detach the data pointer from a dbuf whose db_buf has already been
  * cleared.  The caller must hold db_mtx.  Any state other than DB_NOFILL
  * becomes DB_UNCACHED; DB_NOFILL is left as it is.
  */
 static void
 dbuf_clear_data(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));

 	dbuf_evict_user(db);
 	ASSERT3P(db->db_buf, ==, NULL);
 	db->db.db_data = NULL;

 	/* A NOFILL dbuf keeps its state. */
 	if (db->db_state == DB_NOFILL)
 		return;

 	db->db_state = DB_UNCACHED;
 	DTRACE_SET_STATE(db, "clear data");
 }
 
 /*
  * Attach an ARC buffer to a dbuf: db_buf takes the buffer and db_data
  * points at its payload.  The caller must hold db_mtx, and the buffer and
  * its b_data must both be non-NULL.
  */
 static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	/* Preconditions. */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(buf != NULL);
 	ASSERT(buf->b_data != NULL);

 	db->db_buf = buf;
 	db->db.db_data = buf->b_data;
 }
 
 /*
  * Allocate an ARC buffer for this dbuf: same pool, tagged with the dbuf,
  * using the dbuf's buffer-contents type and its current db_size.
  */
 static arc_buf_t *
 dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
 {
 	arc_buf_contents_t kind = DBUF_GET_BUFC_TYPE(db);

 	return (arc_alloc_buf(db->db_objset->os_spa, db, kind,
 	    db->db.db_size));
 }
 
 /*
  * Loan out an arc_buf for read and return it.
  *
  * If the dbuf's buffer has not been released and the dbuf has at most one
  * hold, the buffer itself is loaned and the dbuf is left without data.
  * Otherwise a fresh loaned buffer is filled with a copy of the dbuf's data.
  * Must not be called on a bonus dbuf.
  */
 arc_buf_t *
 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 {
 	arc_buf_t *loaned;

 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);

 	if (!arc_released(db->db_buf) &&
 	    zfs_refcount_count(&db->db_holds) <= 1) {
 		/* Hand over the dbuf's own buffer. */
 		loaned = db->db_buf;
 		arc_loan_inuse_buf(loaned, db);
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
 		mutex_exit(&db->db_mtx);
 		return (loaned);
 	}

 	/* Capture what we need under the lock, then copy without it. */
 	int blksz = db->db.db_size;
 	spa_t *spa = db->db_objset->os_spa;

 	mutex_exit(&db->db_mtx);
 	loaned = arc_loan_buf(spa, B_FALSE, blksz);
 	memcpy(loaned->b_data, db->db.db_data, blksz);
 	return (loaned);
 }
 
 /*
  * Calculate which level n block references the data at the level 0 offset
  * provided.
  *
  * When both block shifts are known, the level 0 blkid is
  * offset >> datablkshift, and each extra level divides by the number of
  * block pointers per indirect block, 2^(indblkshift - SPA_BLKPTRSHIFT).
  * So the level n blkid is
  *
  *	offset >> (datablkshift + level * (indblkshift - SPA_BLKPTRSHIFT))
  *
  * If that shift would be at least the width of offset, the result is 0.
  * When either shift is zero, the offset is expected to lie inside the
  * single data block and the answer is 0.
  */
 uint64_t
 dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
 {
 	/* No usable shifts: everything lives in block 0. */
 	if (dn->dn_datablkshift == 0 || dn->dn_indblkshift == 0) {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
 	}

 	const unsigned exp = dn->dn_datablkshift +
 	    level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);

 	/* Shifting by the full width or more is not allowed in C. */
 	if (exp >= 8 * sizeof (offset)) {
 		/* This only happens on the highest indirection level */
 		ASSERT3U(level, ==, dn->dn_nlevels - 1);
 		return (0);
 	}

 	ASSERT3U(exp, <, 8 * sizeof (offset));

 	return (offset >> exp);
 }
 
 /*
  * Lock the parent of the provided dbuf.  This should be used when modifying
  * or reading db_blkptr.
  *
  * With a parent dbuf, the parent's db_rwlock is taken in mode rw.  Without
  * one, but with a dataset behind the objset, the dataset's ds_bp_rwlock is
  * taken instead, using tag.  The return value records which lock was taken
  * and must be passed to dmu_buf_unlock_parent().
  */
 db_lock_type_t
 dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)
 {
 	if (db->db_parent != NULL) {
 		rw_enter(&db->db_parent->db_rwlock, rw);
 		return (DLT_PARENT);
 	}

 	if (dmu_objset_ds(db->db_objset) != NULL) {
 		rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
 		    tag);
 		return (DLT_OBJSET);
 	}

 	/*
 	 * We only return a DLT_NONE lock when it's the top-most indirect block
 	 * of the meta-dnode of the MOS.
 	 */
 	return (DLT_NONE);
 }
 
 /*
  * Release the lock taken by dmu_buf_lock_parent().
  *
  * The lock type has to be passed in because the block may move from being
  * the topmost indirect block in a dnode (no parent) to not being topmost
  * via an indirection increase.  Deciding from db_parent here could then
  * release a lock that was never taken, which would panic.
  */
 void
 dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)
 {
 	switch (type) {
 	case DLT_PARENT:
 		rw_exit(&db->db_parent->db_rwlock);
 		break;
 	case DLT_OBJSET:
 		rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
 		break;
 	default:
 		/* Nothing was locked. */
 		break;
 	}
 }
 
 /*
  * Completion callback handed to arc_read() by dbuf_read_impl().
  *
  * zio	- the read zio, or NULL.
  * zb, bp	- unused.
  * buf	- the buffer that was read, or NULL on I/O error.
  * vdb	- the dbuf the read was issued for.
  *
  * Moves the dbuf out of DB_READ (to DB_CACHED, or DB_UNCACHED on error),
  * wakes waiters on db_changed, and releases a hold taken with a NULL tag
  * (presumably the one from dbuf_add_ref(db, NULL) in dbuf_read_impl()).
  */
 static void
 dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *vdb)
 {
 	(void) zb, (void) bp;
 	dmu_buf_impl_t *db = vdb;

 	mutex_enter(&db->db_mtx);
 	ASSERT3U(db->db_state, ==, DB_READ);

 	/*
 	 * All reads are synchronous, so we must have a hold on the dbuf
 	 */
 	ASSERT(zfs_refcount_count(&db->db_holds) > 0);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	if (buf == NULL) {
 		/* i/o error */
 		ASSERT(zio == NULL || zio->io_error != 0);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
 		db->db_state = DB_UNCACHED;
 		DTRACE_SET_STATE(db, "i/o error");
 	} else if (db->db_level == 0 && db->db_freed_in_flight) {
 		/*
 		 * freed in flight: the block was freed while the read was
 		 * outstanding (see dbuf_free_range()), so present zeroes
 		 * instead of what was read.
 		 */
 		ASSERT(zio == NULL || zio->io_error == 0);
 		arc_release(buf, db);
 		memset(buf->b_data, 0, db->db.db_size);
 		arc_buf_freeze(buf);
 		db->db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 		DTRACE_SET_STATE(db, "freed in flight");
 	} else {
 		/* success */
 		ASSERT(zio == NULL || zio->io_error == 0);
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 		DTRACE_SET_STATE(db, "successful read");
 	}
 	/* Wake threads waiting in dbuf_read()/dbuf_noread(). */
 	cv_broadcast(&db->db_changed);
 	/* NOTE(review): presumably also drops db_mtx, per its name. */
 	dbuf_rele_and_unlock(db, NULL, B_FALSE);
 }
 
 /*
  * Shortcut for performing reads on bonus dbufs: allocate a buffer sized
  * for the dnode's slot count and fill it from the on-disk dnode's bonus
  * area.  The caller must hold db_mtx and the dnode handle.
  *
  * Always returns 0.
  */
 static int
 dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
 {
 	int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 	int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);

 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(DB_DNODE_HELD(db));
 	ASSERT3U(bonuslen, <=, db->db.db_size);

 	db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
 	arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);

 	/* Zero the buffer unless the copy below will fill all of it. */
 	if (bonuslen < max_bonuslen) {
 		memset(db->db.db_data, 0, max_bonuslen);
 	}
 	if (bonuslen != 0) {
 		memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen);
 	}

 	db->db_state = DB_CACHED;
 	DTRACE_SET_STATE(db, "bonus buffer filled");

 	return (0);
 }
 
 /*
  * Fill in a few fields on every block pointer held in an indirect dbuf
  * whose own block pointer (dbbp) is a hole: logical size, type, level
  * (one below dbbp's) and logical birth.  Children of a level 1 block get
  * the dnode's data block size; deeper children inherit dbbp's size.
  */
 static void
 dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
 {
 	uint32_t indbs = 1ULL << dn->dn_indblkshift;
 	int n_bps = indbs >> SPA_BLKPTRSHIFT;
 	blkptr_t *bp = db->db.db_data;
 	blkptr_t *bp_end = bp + n_bps;

 	while (bp < bp_end) {
 		ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
 		BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
 		    dn->dn_datablksz : BP_GET_LSIZE(dbbp));
 		BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
 		BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
 		BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
 		bp++;
 	}
 }
 
 /*
  * Handle reads on dbufs that are holes, if necessary.  The dbuf's mutex
  * must be held.
  *
  * When the block is a hole, the dbuf gets a freshly allocated, zeroed
  * buffer and becomes DB_CACHED, and 0 is returned.  Otherwise nothing is
  * changed and ENOENT is returned.
  */
 static int
 dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));

 	int is_hole = (bp == NULL || BP_IS_HOLE(bp));

 	/*
 	 * For level 0 blocks only, if the above check fails:
 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
 	 * processes the delete record and clears the bp while we are waiting
 	 * for the dn_mtx (resulting in a "no" from block_freed).
 	 */
 	if (!is_hole && db->db_level == 0) {
 		is_hole = dnode_block_freed(dn, db->db_blkid) ||
 		    BP_IS_HOLE(bp);
 	}

 	if (!is_hole)
 		return (ENOENT);

 	dbuf_set_data(db, dbuf_alloc_arcbuf(db));
 	memset(db->db.db_data, 0, db->db.db_size);

 	/* An indirect hole with a birth time seeds its child pointers. */
 	if (bp != NULL && db->db_level > 0) {
 		if (BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) != 0)
 			dbuf_handle_indirect_hole(db, dn, bp);
 	}

 	db->db_state = DB_CACHED;
 	DTRACE_SET_STATE(db, "hole read satisfied");
 	return (0);
 }
 
 /*
  * This function ensures that, when doing a decrypting read of a block,
  * we make sure we have decrypted the dnode associated with it. We must do
  * this so that we ensure we are fully authenticating the checksum-of-MACs
  * tree from the root of the objset down to this block. Indirect blocks are
  * always verified against their secure checksum-of-MACs assuming that the
  * dnode containing them is correct. Now that we are doing a decrypting read,
  * we can be sure that the key is loaded and verify that assumption. This is
  * especially important considering that we always read encrypted dnode
  * blocks as raw data (without verifying their MACs) to start, and
  * decrypt / authenticate them when we need to read an encrypted bonus buffer.
  *
  * Returns 0 when nothing had to be done or the dnode buffer was
  * untransformed successfully, otherwise the error from arc_untransform()
  * (with EACCES forgiven for non-encrypted object types, see below).
  */
 static int
 dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 {
 	objset_t *os = db->db_objset;
 	dmu_buf_impl_t *dndb;
 	arc_buf_t *dnbuf;
 	zbookmark_phys_t zb;
 	int err;

 	/*
 	 * Nothing to do for raw reads, unencrypted objsets, raw receives,
 	 * or a dnode that has no backing dbuf.
 	 */
 	if ((flags & DB_RF_NO_DECRYPT) != 0 ||
 	    !os->os_encrypted || os->os_raw_receive ||
 	    (dndb = dn->dn_dbuf) == NULL)
 		return (0);

 	/* Cheap check without the lock; it is repeated under db_mtx below. */
 	dnbuf = dndb->db_buf;
 	if (!arc_is_encrypted(dnbuf))
 		return (0);

 	mutex_enter(&dndb->db_mtx);

 	/*
 	 * Since dnode buffer is modified by sync process, there can be only
 	 * one copy of it.  It means we can not modify (decrypt) it while it
 	 * is being written.  I don't see how this may happen now, since
 	 * encrypted dnode writes by receive should be completed before any
 	 * plain-text reads due to txg wait, but better be safe than sorry.
 	 */
 	while (1) {
 		if (!arc_is_encrypted(dnbuf)) {
 			mutex_exit(&dndb->db_mtx);
 			return (0);
 		}
 		/* Wait while the pending dirty record still uses dnbuf. */
 		dbuf_dirty_record_t *dr = dndb->db_data_pending;
 		if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
 			break;
 		cv_wait(&dndb->db_changed, &dndb->db_mtx);
 	};

 	SET_BOOKMARK(&zb, dmu_objset_id(os),
 	    DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
 	err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);

 	/*
 	 * An error code of EACCES tells us that the key is still not
 	 * available. This is ok if we are only reading authenticated
 	 * (and therefore non-encrypted) blocks.
 	 */
 	if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
 	    !DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
 	    (db->db_blkid == DMU_BONUS_BLKID &&
 	    !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
 		err = 0;

 	mutex_exit(&dndb->db_mtx);

 	return (err);
 }
 
 /*
  * Start filling an uncached dbuf: from the dnode for a bonus buffer, with
  * zeroes for a hole, or by issuing an ARC read.
  *
  * db	- dbuf to fill; held by the caller, db_mtx held, state DB_UNCACHED
  *	  or DB_NOFILL, no db_buf attached.
  * dn	- the dbuf's dnode.
  * zio	- parent zio passed to arc_read().
  * flags	- DB_RF_* flags; DB_RF_CANFAIL and DB_RF_NO_DECRYPT are used here.
  * dblt	- parent lock type returned by dmu_buf_lock_parent().
  * bp	- block pointer to read from; may be NULL for a hole.
  * tag	- tag the parent lock was taken with.
  *
  * Drops db_mtx and the parent lock specified by dblt and tag before
  * returning.  Returns 0, EIO, or the result of arc_read().
  */
 static int
 dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
     db_lock_type_t dblt, blkptr_t *bp, const void *tag)
 {
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_FLAG_NOWAIT;
 	int err, zio_flags;

 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db_parent == NULL ||
 	    RW_LOCK_HELD(&db->db_parent->db_rwlock));

 	/* Bonus buffers are copied out of the dnode; no I/O is needed. */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		err = dbuf_read_bonus(db, dn);
 		goto early_unlock;
 	}

 	/* Holes are satisfied with a zeroed buffer; 0 means "handled". */
 	err = dbuf_read_hole(db, dn, bp);
 	if (err == 0)
 		goto early_unlock;

 	ASSERT(bp != NULL);

 	/*
 	 * Any attempt to read a redacted block should result in an error. This
 	 * will never happen under normal conditions, but can be useful for
 	 * debugging purposes.
 	 */
 	if (BP_IS_REDACTED(bp)) {
 		ASSERT(dsl_dataset_feature_is_active(
 		    db->db_objset->os_dsl_dataset,
 		    SPA_FEATURE_REDACTED_DATASETS));
 		err = SET_ERROR(EIO);
 		goto early_unlock;
 	}

 	SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
 	    db->db.db_object, db->db_level, db->db_blkid);

 	/*
 	 * All bps of an encrypted os should have the encryption bit set.
 	 * If this is not true it indicates tampering and we report an error.
 	 */
 	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
 		spa_log_error(db->db_objset->os_spa, &zb,
 		    BP_GET_LOGICAL_BIRTH(bp));
 		err = SET_ERROR(EIO);
 		goto early_unlock;
 	}

 	/* From here on the dbuf is DB_READ until dbuf_read_done() runs. */
 	db->db_state = DB_READ;
 	DTRACE_SET_STATE(db, "read issued");
 	mutex_exit(&db->db_mtx);

 	if (!DBUF_IS_CACHEABLE(db))
 		aflags |= ARC_FLAG_UNCACHED;
 	else if (dbuf_is_l2cacheable(db, bp))
 		aflags |= ARC_FLAG_L2CACHE;

 	/* Extra hold with a NULL tag for the duration of the read. */
 	dbuf_add_ref(db, NULL);

 	zio_flags = (flags & DB_RF_CANFAIL) ?
 	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;

 	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 		zio_flags |= ZIO_FLAG_RAW;

 	/*
 	 * The zio layer will copy the provided blkptr later, but we need to
 	 * do this now so that we can release the parent's rwlock. We have to
 	 * do that now so that if dbuf_read_done is called synchronously (on
 	 * an l1 cache hit) we don't acquire the db_mtx while holding the
 	 * parent's rwlock, which would be a lock ordering violation.
 	 */
 	blkptr_t copy = *bp;
 	dmu_buf_unlock_parent(db, dblt, tag);
 	return (arc_read(zio, db->db_objset->os_spa, &copy,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
 	    &aflags, &zb));

 	/* Common exit for paths that never issue a read. */
 early_unlock:
 	mutex_exit(&db->db_mtx);
 	dmu_buf_unlock_parent(db, dblt, tag);
 	return (err);
 }
 
 /*
  * This is our just-in-time copy function.  It makes a copy of buffers that
  * have been modified in a previous transaction group before we access them in
  * the current active group.
  *
  * This function is used in three places: when we are dirtying a buffer for the
  * first time in a txg, when we are freeing a range in a dnode that includes
  * this buffer, and when we are accessing a buffer which was received compressed
  * and later referenced in a WRITE_BYREF record.
  *
  * Note that when we are called from dbuf_free_range() we do not put a hold on
  * the buffer, we just traverse the active dbuf list for the dnode.
  *
  * The caller must hold db_mtx.  Only level 0 dbufs that have data and do
  * not belong to the meta-dnode object are accepted.
  */
 static void
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
 	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);

 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
 	ASSERT(db->db_level == 0);
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

 	/*
 	 * Nothing to do unless the newest dirty record still shares the
 	 * dbuf's current data (db_data for bonus buffers, db_buf otherwise).
 	 */
 	if (dr == NULL ||
 	    (dr->dt.dl.dr_data !=
 	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
 		return;

 	/*
 	 * If the last dirty record for this dbuf has not yet synced
 	 * and it is referencing the dbuf data, either:
 	 *	reset the reference to point to a new copy,
 	 * or (if there are no active holders)
 	 *	just null out the current db_data pointer.
 	 */
 	ASSERT3U(dr->dr_txg, >=, txg - 2);
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		/* Bonus data is a plain kmem allocation, not an ARC buffer. */
 		dnode_t *dn = DB_DNODE(db);
 		int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 		dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
 		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
 		memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
 	} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		/*
 		 * There are more holds than dirty records: give the dirty
 		 * record its own copy, allocated with the same encryption
 		 * and compression properties as the current buffer.
 		 */
 		dnode_t *dn = DB_DNODE(db);
 		int size = arc_buf_size(db->db_buf);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 		enum zio_compress compress_type =
 		    arc_get_compression(db->db_buf);
 		uint8_t complevel = arc_get_complevel(db->db_buf);

 		if (arc_is_encrypted(db->db_buf)) {
 			boolean_t byteorder;
 			uint8_t salt[ZIO_DATA_SALT_LEN];
 			uint8_t iv[ZIO_DATA_IV_LEN];
 			uint8_t mac[ZIO_DATA_MAC_LEN];

 			arc_get_raw_params(db->db_buf, &byteorder, salt,
 			    iv, mac);
 			dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
 			    dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
 			    mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
 			    compress_type, complevel);
 		} else if (compress_type != ZIO_COMPRESS_OFF) {
 			ASSERT3U(type, ==, ARC_BUFC_DATA);
 			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
 			    size, arc_buf_lsize(db->db_buf), compress_type,
 			    complevel);
 		} else {
 			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
 		}
 		memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
 	} else {
 		/* The dirty record keeps the buffer; the dbuf lets go of it. */
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
 	}
 }
 
 /*
  * Make sure the dbuf's data is available, reading it if necessary.
  *
  * db	- dbuf to read; the caller must hold it.
  * pio	- optional parent zio.  If it is NULL and a read from disk may be
  *	  needed, a root zio is created here and waited on before returning.
  * flags	- DB_RF_* flags.
  *
  * Returns 0 on success or an error.  If the caller supplied a pio and an
  * error occurred, a null child zio carrying that error is attached to it.
  * Every path that leaves the function pairs the DB_DNODE_ENTER() at the top
  * with a DB_DNODE_EXIT().
  */
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 {
 	dnode_t *dn;
 	boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
 	int err;

 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));

 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);

 	/*
 	 * Ensure that this block's dnode has been decrypted if the caller
 	 * has requested decrypted data.
 	 */
 	err = dbuf_read_verify_dnode_crypt(db, dn, flags);
 	if (err != 0) {
 		/* Balance DB_DNODE_ENTER(); "done" does not do it for us. */
 		DB_DNODE_EXIT(db);
 		goto done;
 	}

 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0;

 	mutex_enter(&db->db_mtx);
 	if (flags & DB_RF_PARTIAL_FIRST)
 		db->db_partial_read = B_TRUE;
 	else if (!(flags & DB_RF_PARTIAL_MORE))
 		db->db_partial_read = B_FALSE;
 	miss = (db->db_state != DB_CACHED);

 	if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 		/*
 		 * Another reader came in while the dbuf was in flight between
 		 * UNCACHED and CACHED.  Either a writer will finish filling
 		 * the buffer, sending the dbuf to CACHED, or the first reader's
 		 * request will reach the read_done callback and send the dbuf
 		 * to CACHED.  Otherwise, a failure occurred and the dbuf will
 		 * be sent to UNCACHED.
 		 */
 		if (flags & DB_RF_NEVERWAIT) {
 			mutex_exit(&db->db_mtx);
 			DB_DNODE_EXIT(db);
 			goto done;
 		}
 		do {
 			ASSERT(db->db_state == DB_READ ||
 			    (flags & DB_RF_HAVESTRUCT) == 0);
 			DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
 			    zio_t *, pio);
 			cv_wait(&db->db_changed, &db->db_mtx);
 		} while (db->db_state == DB_READ || db->db_state == DB_FILL);
 		if (db->db_state == DB_UNCACHED) {
 			err = SET_ERROR(EIO);
 			mutex_exit(&db->db_mtx);
 			DB_DNODE_EXIT(db);
 			goto done;
 		}
 	}

 	if (db->db_state == DB_CACHED) {
 		/*
 		 * If the arc buf is compressed or encrypted and the caller
 		 * requested uncompressed data, we need to untransform it
 		 * before returning. We also call arc_untransform() on any
 		 * unauthenticated blocks, which will verify their MAC if
 		 * the key is now available.
 		 */
 		if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
 		    (arc_is_encrypted(db->db_buf) ||
 		    arc_is_unauthenticated(db->db_buf) ||
 		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
 			spa_t *spa = dn->dn_objset->os_spa;
 			zbookmark_phys_t zb;

 			SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
 			    db->db.db_object, db->db_level, db->db_blkid);
 			dbuf_fix_old_data(db, spa_syncing_txg(spa));
 			err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
 			dbuf_set_data(db, db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
 	} else {
 		ASSERT(db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL);
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 		blkptr_t *bp;

 		/*
 		 * If a block clone or Direct I/O write has occurred we will
 		 * get the dirty records overridden BP so we get the most
 		 * recent data.
 		 */
 		err = dmu_buf_get_bp_from_dbuf(db, &bp);

 		if (!err) {
 			/* Only create our own root zio if I/O may happen. */
 			if (pio == NULL && (db->db_state == DB_NOFILL ||
 			    (bp != NULL && !BP_IS_HOLE(bp)))) {
 				spa_t *spa = dn->dn_objset->os_spa;
 				pio =
 				    zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 				need_wait = B_TRUE;
 			}

 			err =
 			    dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
 		} else {
 			mutex_exit(&db->db_mtx);
 			dmu_buf_unlock_parent(db, dblt, FTAG);
 		}
 		/* dbuf_read_impl drops db_mtx and parent's rwlock. */
 		miss = (db->db_state != DB_CACHED);
 	}

 	if (err == 0 && prefetch) {
 		dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
 		    flags & DB_RF_HAVESTRUCT);
 	}
 	DB_DNODE_EXIT(db);

 	/*
 	 * If we created a zio we must execute it to avoid leaking it, even if
 	 * it isn't attached to any work due to an error in dbuf_read_impl().
 	 */
 	if (need_wait) {
 		if (err == 0)
 			err = zio_wait(pio);
 		else
 			(void) zio_wait(pio);
 		pio = NULL;
 	}

 done:
 	if (miss)
 		DBUF_STAT_BUMP(hash_misses);
 	else
 		DBUF_STAT_BUMP(hash_hits);
 	/* Report the error to a caller-supplied parent zio. */
 	if (pio && err != 0) {
 		zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 		zio->io_error = err;
 		zio_nowait(zio);
 	}

 	return (err);
 }
 
 /*
  * Prepare a held, non-bonus dbuf for use without reading its contents.
  * Waits for any in-flight read or fill to finish first.  An uncached dbuf
  * then gets a newly allocated buffer and enters DB_FILL; a DB_NOFILL dbuf
  * has its data cleared; any other dbuf is expected to be DB_CACHED already.
  */
 static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

 	mutex_enter(&db->db_mtx);

 	/* Let in-flight reads and fills settle. */
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);

 	switch (db->db_state) {
 	case DB_UNCACHED:
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
 		dbuf_set_data(db, dbuf_alloc_arcbuf(db));
 		db->db_state = DB_FILL;
 		DTRACE_SET_STATE(db, "assigning filled buffer");
 		break;
 	case DB_NOFILL:
 		dbuf_clear_data(db);
 		break;
 	default:
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 		break;
 	}

 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * Undo an override on a level 0 dirty record: free the block the record
  * was overridden with (unless it is a hole or came from nopwrite), reset
  * the override flags, and release the record's buffer from the ARC.
  * Does nothing for bonus buffers or records that are not overridden.
  * The caller must hold the dbuf's db_mtx.
  */
 void
 dbuf_unoverride(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 	uint64_t txg = dr->dr_txg;

 	ASSERT(MUTEX_HELD(&db->db_mtx));

 	/*
 	 * This assert is valid because dmu_sync() expects to be called by
 	 * a zilog's get_data while holding a range lock.  This call only
 	 * comes from dbuf_dirty() callers who must also hold a range lock.
 	 */
 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 	ASSERT(db->db_level == 0);

 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 		return;

 	ASSERT(db->db_data_pending != dr);

 	/* free this block */
 	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
 		zio_free(db->db_objset->os_spa, txg, bp);

 	/*
 	 * Block clone and Direct I/O records carry no data buffer of their
 	 * own, so point the record back at the dbuf's buffer.
 	 */
 	if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {
 		ASSERT0P(dr->dt.dl.dr_data);
 		dr->dt.dl.dr_data = db->db_buf;
 	}
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	dr->dt.dl.dr_nopwrite = B_FALSE;
 	dr->dt.dl.dr_brtwrite = B_FALSE;
 	dr->dt.dl.dr_diowrite = B_FALSE;
 	dr->dt.dl.dr_has_raw_params = B_FALSE;

 	/*
 	 * In the event that Direct I/O was used, we do not
 	 * need to release the buffer from the ARC.
 	 *
 	 * Release the already-written buffer, so we leave it in
 	 * a consistent dirty state.  Note that all callers are
 	 * modifying the buffer, so they will immediately do
 	 * another (redundant) arc_release().  Therefore, leave
 	 * the buf thawed to save the effort of freezing &
 	 * immediately re-thawing it.
 	 */
 	if (dr->dt.dl.dr_data)
 		arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
  * Evict (if it is unreferenced) or clear (if it is referenced) any level-0
  * data blocks in the free range, so that any future readers will find
  * empty blocks.
  *
  * dn		- dnode whose cached dbufs are examined.
  * start_blkid	- first block id of the range.
  * end_blkid	- last block id of the range (inclusive); clamped to
  *		  dn_maxblkid unless the range refers to the spill block.
  * tx		- transaction the free is part of.
  */
 void
 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db_search;
 	dmu_buf_impl_t *db, *db_next;
 	uint64_t txg = tx->tx_txg;
 	avl_index_t where;
 	dbuf_dirty_record_t *dr;

 	if (end_blkid > dn->dn_maxblkid &&
 	    !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
 		end_blkid = dn->dn_maxblkid;
 	dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,
 	    (u_longlong_t)end_blkid);

 	/*
 	 * Build a search key for the dnode's dbuf tree.  Only the level,
 	 * block id and state are initialized.
 	 */
 	db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
 	db_search->db_level = 0;
 	db_search->db_blkid = start_blkid;
 	db_search->db_state = DB_SEARCH;

 	mutex_enter(&dn->dn_dbufs_mtx);
 	/* The key itself is never expected to be found; we want "where". */
 	db = avl_find(&dn->dn_dbufs, db_search, &where);
 	ASSERT3P(db, ==, NULL);

 	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

 	for (; db != NULL; db = db_next) {
 		/* Fetch the successor first; db may be destroyed below. */
 		db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

 		if (db->db_level != 0 || db->db_blkid > end_blkid) {
 			break;
 		}
 		ASSERT3U(db->db_blkid, >=, start_blkid);

 		/* found a level 0 buffer in the range */
 		mutex_enter(&db->db_mtx);
 		if (dbuf_undirty(db, tx)) {
 			/* mutex has been dropped and dbuf destroyed */
 			continue;
 		}

 		/* Nothing cached, so there is nothing to clear. */
 		if (db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL ||
 		    db->db_state == DB_EVICTING) {
 			ASSERT(db->db.db_data == NULL);
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 			/* will be handled in dbuf_read_done or dbuf_rele */
 			db->db_freed_in_flight = TRUE;
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (zfs_refcount_count(&db->db_holds) == 0) {
 			ASSERT(db->db_buf);
 			/* NOTE(review): no mutex_exit here; dbuf_destroy()
 			 * presumably consumes db_mtx -- confirm. */
 			dbuf_destroy(db);
 			continue;
 		}
 		/* The dbuf is referenced */

 		dr = list_head(&db->db_dirty_records);
 		if (dr != NULL) {
 			if (dr->dr_txg == txg) {
 				/*
 				 * This buffer is "in-use", re-adjust the file
 				 * size to reflect that this buffer may
 				 * contain new data when we sync.
 				 */
 				if (db->db_blkid != DMU_SPILL_BLKID &&
 				    db->db_blkid > dn->dn_maxblkid)
 					dn->dn_maxblkid = db->db_blkid;
 				dbuf_unoverride(dr);
 			} else {
 				/*
 				 * This dbuf is not dirty in the open context.
 				 * Either uncache it (if it is not referenced
 				 * in the open context) or reset its contents
 				 * to empty.
 				 */
 				dbuf_fix_old_data(db, txg);
 			}
 		}
 		/* clear the contents if it is cached */
 		if (db->db_state == DB_CACHED) {
 			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
 			rw_enter(&db->db_rwlock, RW_WRITER);
 			memset(db->db.db_data, 0, db->db.db_size);
 			rw_exit(&db->db_rwlock);
 			arc_buf_freeze(db->db_buf);
 		}

 		mutex_exit(&db->db_mtx);
 	}

 	mutex_exit(&dn->dn_dbufs_mtx);
 	kmem_free(db_search, sizeof (dmu_buf_impl_t));
 }
 
 /*
  * Change the size of db's data buffer to size bytes within tx.
  *
  * The dbuf is dirtied first, then a fresh ARC buffer of the new size is
  * allocated.  The old contents are copied over (truncated when shrinking,
  * zero-padded when growing), the new buffer is installed and the old one is
  * destroyed.  The dirty record's accounted size and the objset's space
  * estimate are adjusted by the difference.
  */
 void
 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 {
 	int osize = db->db.db_size;
 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	DB_DNODE_ENTER(db);
 	dnode_t *dn = DB_DNODE(db);
 
 	/*
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
 	 */
 	dmu_buf_will_dirty(&db->db, tx);
 
 	VERIFY3P(db->db_buf, !=, NULL);
 
 	/* Allocate the replacement buffer and remember the current one. */
 	arc_buf_t *buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
 	arc_buf_t *old_buf = db->db_buf;
 
 	if (size > osize) {
 		/* Growing: keep all of the old data and zero the tail. */
 		memcpy(buf->b_data, old_buf->b_data, osize);
 		memset((uint8_t *)buf->b_data + osize, 0, size - osize);
 	} else {
 		/* Same size or shrinking: keep only what still fits. */
 		memcpy(buf->b_data, old_buf->b_data, size);
 	}
 
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, buf);
 	arc_buf_destroy(old_buf, db);
 	db->db.db_size = size;
 
 	/* The head record is the one dmu_buf_will_dirty() just added. */
 	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
 	VERIFY(dr != NULL);
 	if (db->db_level == 0)
 		dr->dt.dl.dr_data = buf;
 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 	ASSERT3U(dr->dr_accounted, ==, osize);
 	dr->dr_accounted = size;
 	mutex_exit(&db->db_mtx);
 
 	dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * Release db's ARC buffer via arc_release(); the return value is ignored.
  *
  * The assertions require that this runs in the pool's syncing context, that
  * the objset's physical buffer is already released (or the dataset is on
  * the synced list), and that the parent dbuf's buffer, if there is a
  * parent, has been released first.
  */
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
 	/* Only referenced by the assertions below. */
 	objset_t *os __maybe_unused = db->db_objset;
 
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 	ASSERT(arc_released(os->os_phys_buf) ||
 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
 
 	(void) arc_release(db->db_buf, db);
 }
 
 /*
  * Called when a dbuf that already owns a dirty record for this TXG is
  * dirtied once more.  The caller must hold db_mtx.
  */
 static void
 dbuf_redirty(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/* Only level-0, non-bonus buffers have anything to reset. */
 	if (db->db_level != 0 || db->db_blkid == DMU_BONUS_BLKID)
 		return;
 
 	/*
 	 * The buffer may already have been written out; put the record
 	 * back into its not-overridden state.
 	 */
 	dbuf_unoverride(dr);
 
 	/* Meta-dnode blocks and NOFILL dbufs are not thawed here. */
 	if (db->db.db_object == DMU_META_DNODE_OBJECT ||
 	    db->db_state == DB_NOFILL)
 		return;
 
 	/* The initial dirty already released the buffer; thaw is enough. */
 	ASSERT(arc_released(db->db_buf));
 	arc_buf_thaw(db->db_buf);
 }
 
 /*
  * Create a dirty record for level-0 block blkid of dn in tx's txg without
  * going through a dbuf (an assertion checks that no dbuf exists for the
  * block).  The record is zero-allocated with kmem_zalloc(), so dr_dbuf
  * stays NULL.
  *
  * The record is attached to the dnode's per-txg dirty list when the dnode
  * has a single level, otherwise to the dirty record of the level-1 parent,
  * which is held, read and dirtied here.
  *
  * Returns the new record, or NULL if the parent indirect could not be held
  * or read; in that case the record has been freed.
  */
 dbuf_dirty_record_t *
 dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
 {
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
 	dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
 	ASSERT(dn->dn_maxblkid >= blkid);
 
 	dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
 	list_link_init(&dr->dr_dirty_node);
 	list_link_init(&dr->dr_dbuf_node);
 	dr->dr_dnode = dn;
 	dr->dr_txg = tx->tx_txg;
 	dr->dt.dll.dr_blkid = blkid;
 	dr->dr_accounted = dn->dn_datablksz;
 
 	/*
 	 * There should not be any dbuf for the block that we're dirtying.
 	 * Otherwise the buffer contents could be inconsistent between the
 	 * dbuf and the lightweight dirty record.
 	 */
 	ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
 	    NULL));
 
 	/* The block is being written, so drop it from this txg's frees. */
 	mutex_enter(&dn->dn_mtx);
 	int txgoff = tx->tx_txg & TXG_MASK;
 	if (dn->dn_free_ranges[txgoff] != NULL) {
 		zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
 	}
 
 	if (dn->dn_nlevels == 1) {
 		/* No indirect level: the record hangs off the dnode itself. */
 		ASSERT3U(blkid, <, dn->dn_nblkptr);
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_setdirty(dn, tx);
 	} else {
 		mutex_exit(&dn->dn_mtx);
 
 		/* Hold the level-1 indirect covering blkid. */
 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 		dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
 		    1, blkid >> epbs, FTAG);
 		rw_exit(&dn->dn_struct_rwlock);
 		if (parent_db == NULL) {
 			kmem_free(dr, sizeof (*dr));
 			return (NULL);
 		}
 		int err = dbuf_read(parent_db, NULL,
 		    (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
 		if (err != 0) {
 			dbuf_rele(parent_db, FTAG);
 			kmem_free(dr, sizeof (*dr));
 			return (NULL);
 		}
 
 		/* Dirty the parent and link this record as its child. */
 		dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
 		dbuf_rele(parent_db, FTAG);
 		mutex_enter(&parent_dr->dt.di.dr_mtx);
 		ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
 		list_insert_tail(&parent_dr->dt.di.dr_children, dr);
 		mutex_exit(&parent_dr->dt.di.dr_mtx);
 		dr->dr_parent = parent_dr;
 	}
 
 	dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
 
 	return (dr);
 }
 
 /*
  * Mark db dirty in tx's transaction group and return its dirty record.
  *
  * If db already has a dirty record for this txg, that record is refreshed
  * through dbuf_redirty() and returned.  Otherwise a new record is allocated,
  * linked into db_dirty_records, an extra hold tagged with the txg is taken
  * on db, and the record is attached either to the dnode's per-txg dirty
  * list (bonus, spill and top-level blocks) or to the dirty record of the
  * parent indirect, which is dirtied recursively.
  *
  * The caller must already hold db (asserted).  db_mtx is acquired and
  * released internally.
  */
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	objset_t *os;
 	dbuf_dirty_record_t *dr, *dr_next, *dr_head;
 	int txgoff = tx->tx_txg & TXG_MASK;
 	boolean_t drop_struct_rwlock = B_FALSE;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	DMU_TX_DIRTY_BUF(tx, db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/*
 	 * Shouldn't dirty a regular buffer in syncing context.  Private
 	 * objects may be dirtied in syncing context, but only if they
 	 * were already pre-dirtied in open context.
 	 */
 #ifdef ZFS_DEBUG
 	if (dn->dn_objset->os_dsl_dataset != NULL) {
 		rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
 		    RW_READER, FTAG);
 	}
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    dn->dn_objset->os_dsl_dataset == NULL);
 	if (dn->dn_objset->os_dsl_dataset != NULL)
 		rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
 #endif
 	/*
 	 * We make this assert for private objects as well, but after we
 	 * check if we're already dirty.  They are allowed to re-dirty
 	 * in syncing context.
 	 */
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * XXX make this true for indirects too?  The problem is that
 	 * transactions created with dmu_tx_create_assigned() from
 	 * syncing context don't bother holding ahead.
 	 */
 	ASSERT(db->db_level != 0 ||
 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
 	    db->db_state == DB_NOFILL);
 
 	/* Record the dirtying context and the newest dirty txg on the dnode. */
 	mutex_enter(&dn->dn_mtx);
 	dnode_set_dirtyctx(dn, tx, db);
 	if (tx->tx_txg > dn->dn_dirty_txg)
 		dn->dn_dirty_txg = tx->tx_txg;
 	mutex_exit(&dn->dn_mtx);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		dn->dn_have_spill = B_TRUE;
 
 	/*
 	 * If this buffer is already dirty, we're done.
 	 */
 	dr_head = list_head(&db->db_dirty_records);
 	ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||
 	    db->db.db_object == DMU_META_DNODE_OBJECT);
 	dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);
 	if (dr_next && dr_next->dr_txg == tx->tx_txg) {
 		DB_DNODE_EXIT(db);
 
 		dbuf_redirty(dr_next);
 		mutex_exit(&db->db_mtx);
 		return (dr_next);
 	}
 
 	/*
 	 * Only valid if not already dirty.
 	 */
 	ASSERT(dn->dn_object == 0 ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
 
 	/*
 	 * We should only be dirtying in syncing context if it's the
 	 * mos or we're initializing the os or it's a special object.
 	 * However, we are allowed to dirty in syncing context provided
 	 * we already dirtied it in open context.  Hence we must make
 	 * this assertion only if we're not already dirty.
 	 */
 	os = dn->dn_objset;
 	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
 #ifdef ZFS_DEBUG
 	if (dn->dn_objset->os_dsl_dataset != NULL)
 		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
 	if (dn->dn_objset->os_dsl_dataset != NULL)
 		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
 #endif
 	ASSERT(db->db.db_size != 0);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	/* Bonus buffers and NOFILL dbufs are not charged here. */
 	if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
 		dmu_objset_willuse_space(os, db->db.db_size, tx);
 	}
 
 	/*
 	 * If this buffer is dirty in an old transaction group we need
 	 * to make a copy of it so that the changes we make in this
 	 * transaction group won't leak out when we sync the older txg.
 	 */
 	dr = kmem_cache_alloc(dbuf_dirty_kmem_cache, KM_SLEEP);
 	memset(dr, 0, sizeof (*dr));
 	list_link_init(&dr->dr_dirty_node);
 	list_link_init(&dr->dr_dbuf_node);
 	dr->dr_dnode = dn;
 	if (db->db_level == 0) {
 		void *data_old = db->db_buf;
 
 		if (db->db_state != DB_NOFILL) {
 			if (db->db_blkid == DMU_BONUS_BLKID) {
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db.db_data;
 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
 				/*
 				 * Release the data buffer from the cache so
 				 * that we can modify it without impacting
 				 * possible other users of this cached data
 				 * block.  Note that indirect blocks and
 				 * private objects are not released until the
 				 * syncing state (since they are only modified
 				 * then).
 				 */
 				arc_release(db->db_buf, db);
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db_buf;
 			}
 			ASSERT(data_old != NULL);
 		}
 		dr->dt.dl.dr_data = data_old;
 	} else {
 		/* Indirect record: set up the list of child dirty records. */
 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
 		list_create(&dr->dt.di.dr_children,
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 	if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
 		dr->dr_accounted = db->db.db_size;
 	}
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	list_insert_before(&db->db_dirty_records, dr_next, dr);
 
 	/*
 	 * We could have been freed_in_flight between the dbuf_noread
 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
 	 * happened after the free.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_blkid != DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_free_ranges[txgoff] != NULL) {
 			zfs_range_tree_clear(dn->dn_free_ranges[txgoff],
 			    db->db_blkid, 1);
 		}
 		mutex_exit(&dn->dn_mtx);
 		db->db_freed_in_flight = FALSE;
 	}
 
 	/*
 	 * This buffer is now part of this txg
 	 */
 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
 	db->db_dirtycnt += 1;
 	ASSERT3U(db->db_dirtycnt, <=, 3);
 
 	mutex_exit(&db->db_mtx);
 
 	/* Bonus and spill records always hang directly off the dnode. */
 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		dnode_setdirty(dn, tx);
 		DB_DNODE_EXIT(db);
 		return (dr);
 	}
 
 	/* Take dn_struct_rwlock as reader unless we already hold it as writer. */
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		drop_struct_rwlock = B_TRUE;
 	}
 
 	/*
 	 * If we are overwriting a dedup BP, then unless it is snapshotted,
 	 * when we get to syncing context we will need to decrement its
 	 * refcount in the DDT.  Prefetch the relevant DDT block so that
 	 * syncing context won't have to wait for the i/o.
 	 */
 	if (db->db_blkptr != NULL) {
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 		ddt_prefetch(os->os_spa, db->db_blkptr);
 		dmu_buf_unlock_parent(db, dblt, FTAG);
 	}
 
 	/*
 	 * We need to hold the dn_struct_rwlock to make this assertion,
 	 * because it protects dn_phys / dn_next_nlevels from changing.
 	 */
 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
 	    dn->dn_phys->dn_nlevels > db->db_level ||
 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
 
 
 	if (db->db_level == 0) {
 		ASSERT(!db->db_objset->os_raw_receive ||
 		    dn->dn_maxblkid >= db->db_blkid);
 		dnode_new_blkid(dn, db->db_blkid, tx,
 		    drop_struct_rwlock, B_FALSE);
 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
 	}
 
 	if (db->db_level+1 < dn->dn_nlevels) {
 		/* There is a parent indirect: dirty it and link under it. */
 		dmu_buf_impl_t *parent = db->db_parent;
 		dbuf_dirty_record_t *di;
 		int parent_held = FALSE;
 
 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 			parent = dbuf_hold_level(dn, db->db_level + 1,
 			    db->db_blkid >> epbs, FTAG);
 			ASSERT(parent != NULL);
 			parent_held = TRUE;
 		}
 		if (drop_struct_rwlock)
 			rw_exit(&dn->dn_struct_rwlock);
 		ASSERT3U(db->db_level + 1, ==, parent->db_level);
 		di = dbuf_dirty(parent, tx);
 		if (parent_held)
 			dbuf_rele(parent, FTAG);
 
 		mutex_enter(&db->db_mtx);
 		/*
 		 * Since we've dropped the mutex, it's possible that
 		 * dbuf_undirty() might have changed this out from under us.
 		 */
 		if (list_head(&db->db_dirty_records) == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
 			ASSERT(!list_link_active(&dr->dr_dirty_node));
 			list_insert_tail(&di->dt.di.dr_children, dr);
 			mutex_exit(&di->dt.di.dr_mtx);
 			dr->dr_parent = di;
 		}
 		mutex_exit(&db->db_mtx);
 	} else {
 		/* Top-level block: the record goes on the dnode's dirty list. */
 		ASSERT(db->db_level + 1 == dn->dn_nlevels);
 		ASSERT(db->db_blkid < dn->dn_nblkptr);
 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		if (drop_struct_rwlock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}
 
 	dnode_setdirty(dn, tx);
 	DB_DNODE_EXIT(db);
 	return (dr);
 }
 
 /*
  * Dispose of dirty record dr and detach it from its dbuf: free the record's
  * separate data copy if it has one, unlink the record from the dbuf's dirty
  * list, free the record and drop the dbuf's dirty count.
  */
 static void
 dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	/* A data pointer that differs from the dbuf's is a separate copy. */
 	if (dr->dt.dl.dr_data != db->db.db_data) {
 		int max_bonuslen =
 		    DN_SLOTS_TO_BONUSLEN(dr->dr_dnode->dn_num_slots);
 
 		kmem_free(dr->dt.dl.dr_data, max_bonuslen);
 		arc_space_return(max_bonuslen, ARC_SPACE_BONUS);
 	}
 
 	db->db_data_pending = NULL;
 
 	/* The record is expected to be the last one on the dbuf's list. */
 	ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
 	list_remove(&db->db_dirty_records, dr);
 
 	if (db->db_level != 0) {
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
 	kmem_cache_free(dbuf_dirty_kmem_cache, dr);
 
 	ASSERT3U(db->db_dirtycnt, >, 0);
 	db->db_dirtycnt--;
 }
 
 /*
  * Undirty a buffer in the transaction group referenced by the given
  * transaction.  Return whether this evicted the dbuf.
  *
  * Only level-0, non-bonus dbufs are accepted (asserted) and the caller must
  * hold db_mtx (asserted).  When B_TRUE is returned the dbuf has been handed
  * to dbuf_destroy() and must not be touched again; callers in this file
  * treat db_mtx as dropped in that case.  B_FALSE is also returned when the
  * dbuf has no dirty record for this txg.
  */
 boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	uint64_t txg = tx->tx_txg;
 	boolean_t brtwrite;
 	boolean_t diowrite;
 
 	ASSERT(txg != 0);
 
 	/*
 	 * Due to our use of dn_nlevels below, this can only be called
 	 * in open context, unless we are operating on the MOS.
 	 * From syncing context, dn_nlevels may be different from the
 	 * dn_nlevels used when dbuf was dirtied.
 	 */
 	ASSERT(db->db_objset ==
 	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
 	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
 	dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
 	if (dr == NULL)
 		return (B_FALSE);
 	ASSERT(dr->dr_dbuf == db);
 
 	/* Snapshot the flags now; dr is freed before they are last used. */
 	brtwrite = dr->dt.dl.dr_brtwrite;
 	diowrite = dr->dt.dl.dr_diowrite;
 	if (brtwrite) {
 		ASSERT3B(diowrite, ==, B_FALSE);
 		/*
 		 * We are freeing a block that we cloned in the same
 		 * transaction group.
 		 */
 		blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_remove(dmu_objset_spa(db->db_objset),
 			    bp, tx);
 		}
 	}
 
 	dnode_t *dn = dr->dr_dnode;
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	ASSERT(db->db.db_size != 0);
 
 	/* Give back the dirty space that was accounted for this record. */
 	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
 	    dr->dr_accounted, txg);
 
 	list_remove(&db->db_dirty_records, dr);
 
 	/*
 	 * Note that there are three places in dbuf_dirty()
 	 * where this dirty record may be put on a list.
 	 * Make sure to do a list_remove corresponding to
 	 * every one of those list_insert calls.
 	 */
 	if (dr->dr_parent) {
 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
 	    db->db_level + 1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 		mutex_exit(&dn->dn_mtx);
 	}
 
 	if (db->db_state != DB_NOFILL && !brtwrite) {
 		dbuf_unoverride(dr);
 
 		/* Destroy the record's buffer only if it is a separate one. */
 		if (dr->dt.dl.dr_data != db->db_buf) {
 			ASSERT(db->db_buf != NULL);
 			ASSERT(dr->dt.dl.dr_data != NULL);
 			arc_buf_destroy(dr->dt.dl.dr_data, db);
 		}
 	}
 
 	kmem_cache_free(dbuf_dirty_kmem_cache, dr);
 
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 
 	/* Drop the txg-tagged hold; destroy the dbuf if it was the last. */
 	if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
 		ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||
 		    arc_released(db->db_buf));
 		dbuf_destroy(db);
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Common implementation behind the "will dirty" entry points: make sure the
  * dbuf has been read (using the dbuf_read() flags supplied by the caller)
  * and then dirty it in tx.
  *
  * A dbuf that is already dirty in this txg takes a fast path through
  * dbuf_redirty(), except for a level-0 block-clone record, which is fully
  * undirtied and dirtied again after the read.  The result of dbuf_read()
  * is ignored.
  */
 static void
 dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	boolean_t undirty = B_FALSE;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
 	/*
 	 * Quick check for dirtiness to improve performance for some workloads
 	 * (e.g. file deletion with indirect blocks cached).
 	 */
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
 		/*
 		 * It's possible that the dbuf is already dirty but not cached,
 		 * because there are some calls to dbuf_dirty() that don't
 		 * go through dmu_buf_will_dirty().
 		 */
 		dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 		if (dr != NULL) {
 			if (db->db_level == 0 &&
 			    dr->dt.dl.dr_brtwrite) {
 				/*
 				 * Block cloning: If we are dirtying a cloned
 				 * level 0 block, we cannot simply redirty it,
 				 * because this dr has no associated data.
 				 * We will go through a full undirtying below,
 				 * before dirtying it again.
 				 */
 				undirty = B_TRUE;
 			} else {
 				/* This dbuf is already dirty and cached. */
 				dbuf_redirty(dr);
 				mutex_exit(&db->db_mtx);
 				return;
 			}
 		}
 	}
 	mutex_exit(&db->db_mtx);
 
 	/* Tell dbuf_read() when dn_struct_rwlock is already write-held. */
 	DB_DNODE_ENTER(db);
 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
 		flags |= DB_RF_HAVESTRUCT;
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
 	 * want to make sure dbuf_read() will read the pending cloned block and
 	 * not the underlying block that is being replaced. dbuf_undirty() will
 	 * do brt_pending_remove() before removing the dirty record.
 	 */
 	(void) dbuf_read(db, NULL, flags);
 	if (undirty) {
 		mutex_enter(&db->db_mtx);
 		/* The caller's hold means the undirty must not evict db. */
 		VERIFY(!dbuf_undirty(db, tx));
 		mutex_exit(&db->db_mtx);
 	}
 	(void) dbuf_dirty(db, tx);
 }
 
 /*
  * Read in and dirty db_fake in tx.  The read is requested with
  * DB_RF_MUST_SUCCEED and DB_RF_NOPREFETCH.
  */
 void
 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	int read_flags = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 
 	dmu_buf_will_dirty_impl(db_fake, read_flags, tx);
 }
 
 /*
  * Report whether db_fake has a dirty record for tx's transaction group.
  * The lookup is done under db_mtx; the answer may be stale by the time the
  * caller looks at it, since the mutex is dropped before returning.
  */
 boolean_t
 dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	boolean_t dirty;
 
 	mutex_enter(&db->db_mtx);
 	dirty = (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL);
 	mutex_exit(&db->db_mtx);
 
 	return (dirty);
 }
 
 /*
  * Find the block pointer that describes the newest contents of db.
  *
  * Usually that is db_blkptr, with anything newer cached in the dbuf.  A
  * pending block clone or a Direct I/O write that has not synced yet keeps
  * its block pointer in the newest dirty record instead, and that one is
  * returned through *bp.
  *
  * The caller must hold db_mtx.  Returns 0 on success, or EIO when a NOFILL
  * dbuf has a dirty record that is not a block clone; *bp is then left
  * pointing at db_blkptr.
  */
 int
 dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/* Default answer; only level-0 dbufs can be overridden below. */
 	*bp = db->db_blkptr;
 	if (db->db_level != 0)
 		return (0);
 
 	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
 	if (dr == NULL)
 		return (0);
 
 	if (db->db_state == DB_NOFILL) {
 		/* Block clone */
 		if (!dr->dt.dl.dr_brtwrite)
 			return (EIO);
 		*bp = &dr->dt.dl.dr_overridden_by;
 		return (0);
 	}
 
 	if (db->db_state == DB_UNCACHED && dr->dt.dl.dr_diowrite) {
 		/* Direct I/O write */
 		*bp = &dr->dt.dl.dr_overridden_by;
 	}
 
 	return (0);
 }
 
 /*
  * Direct I/O reads can read directly from the ARC, but the data has
  * to be untransformed in order to copy it over into user pages.
  *
  * The caller must hold db_mtx and the dbuf must be DB_CACHED (both
  * asserted).  Returns 0 on success, otherwise the error reported by
  * dbuf_read_verify_dnode_crypt() or arc_untransform().
  */
 int
 dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)
 {
 	int err = 0;
 	DB_DNODE_ENTER(db);
 	dnode_t *dn = DB_DNODE(db);
 
 	ASSERT3S(db->db_state, ==, DB_CACHED);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/*
 	 * Ensure that this block's dnode has been decrypted if
 	 * the caller has requested decrypted data.
 	 */
 	err = dbuf_read_verify_dnode_crypt(db, dn, 0);
 
 	/*
 	 * If the arc buf is compressed or encrypted and the caller
 	 * requested uncompressed data, we need to untransform it
 	 * before returning. We also call arc_untransform() on any
 	 * unauthenticated blocks, which will verify their MAC if
 	 * the key is now available.
 	 */
 	if (err == 0 && db->db_buf != NULL &&
 	    (arc_is_encrypted(db->db_buf) ||
 	    arc_is_unauthenticated(db->db_buf) ||
 	    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
 		zbookmark_phys_t zb;
 
 		SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
 		    db->db.db_object, db->db_level, db->db_blkid);
 		dbuf_fix_old_data(db, spa_syncing_txg(spa));
 		err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
 		/* Re-install the buffer; this happens even if err != 0. */
 		dbuf_set_data(db, db->db_buf);
 	}
 	DB_DNODE_EXIT(db);
 	/* Counted as a hash hit whether or not an error occurred. */
 	DBUF_STAT_BUMP(hash_hits);
 
 	return (err);
 }
 
 /*
  * Prepare a level-0 dbuf for a block clone or a Direct I/O write in tx.
  *
  * Any modification made to the block so far in this txg is undone, the
  * dbuf's cached data is dropped, the dbuf is moved to DB_NOFILL and it is
  * dirtied again in tx.  Must be called in open context on a held dbuf that
  * is neither a bonus buffer nor part of the meta-dnode (asserted).
  */
 void
 dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	/*
 	 * Block clones and Direct I/O writes always happen in open-context.
 	 */
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	ASSERT0(db->db_level);
 	ASSERT(!dmu_tx_is_syncing(tx));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
 
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 
 	/*
 	 * We are going to clone or issue a Direct I/O write on this block, so
 	 * undirty modifications done to this block so far in this txg. This
 	 * includes writes and clones into this block.
 	 *
 	 * If there is a dirty record associated with this txg from a previous
 	 * Direct I/O write then space accounting cleanup takes place. It is
 	 * important to go ahead and free up the space accounting through
 	 * dbuf_undirty() -> dbuf_unoverride() -> zio_free(). Space accounting
 	 * for determining if a write can occur in zfs_write() happens through
 	 * dmu_tx_assign(). This can cause an issue with Direct I/O writes in
 	 * the case of overwriting the same block, because all DVA allocations
 	 * are being done in open-context. Constantly allowing Direct I/O
 	 * overwrites to the same block can exhaust the pool's available space
 	 * leading to ENOSPC errors at the DVA allocation part of the ZIO
 	 * pipeline, which will eventually suspend the pool. By cleaning up
 	 * space accounting now, the ENOSPC error can be avoided.
 	 *
 	 * Since we are undirtying the record in open-context, we must have a
 	 * hold on the db, so it should never be evicted after calling
 	 * dbuf_undirty().
 	 */
 	VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);
 	ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
 
 	if (db->db_buf != NULL) {
 		/*
 		 * If there is an associated ARC buffer with this dbuf we can
 		 * only destroy it if the previous dirty record does not
 		 * reference it.
 		 */
 		dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
 		if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)
 			arc_buf_destroy(db->db_buf, db);
 
 		/*
 		 * Setting the dbuf's data pointers to NULL will force all
 		 * future reads down to the devices to get the most up to date
 		 * version of the data after a Direct I/O write has completed.
 		 */
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
 	}
 
 	ASSERT3P(db->db_buf, ==, NULL);
 	ASSERT3P(db->db.db_data, ==, NULL);
 
 	db->db_state = DB_NOFILL;
 	DTRACE_SET_STATE(db,
 	    "allocating NOFILL buffer for clone or direct I/O write");
 
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
 }
 
 /*
  * Put db_fake into the DB_NOFILL state and dirty it in tx.
  *
  * The state is overwritten under db_mtx without looking at the previous
  * state; dbuf_noread() and dbuf_dirty() are then called with the mutex
  * dropped.  The dirty record returned by dbuf_dirty() is discarded.
  */
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	mutex_enter(&db->db_mtx);
 	db->db_state = DB_NOFILL;
 	DTRACE_SET_STATE(db, "allocating NOFILL buffer");
 	mutex_exit(&db->db_mtx);
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
 }
 
 /*
  * Announce that the caller is about to fill a level-0 dbuf in tx, and
  * dirty it.
  *
  * canfail says whether the fill may be abandoned.  If the dbuf currently
  * holds a pending clone (DB_NOFILL) or an unsynced Direct I/O write for
  * this txg:
  *  - with canfail set and a dirty record present, the request is handed to
  *    dmu_buf_will_dirty() instead, so the existing data stays reachable;
  *  - otherwise a DB_NOFILL dbuf is undirtied and reset to DB_UNCACHED.
  * In every other case the dbuf goes through dbuf_noread() and dbuf_dirty().
  */
 void
 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(db->db_level == 0);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
 	    dmu_tx_private_ok(tx));
 
 	mutex_enter(&db->db_mtx);
 	dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 	if (db->db_state == DB_NOFILL ||
 	    (db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {
 		/*
 		 * If the fill can fail we should have a way to return back to
 		 * the cloned or Direct I/O write data.
 		 */
 		if (canfail && dr) {
 			mutex_exit(&db->db_mtx);
 			dmu_buf_will_dirty(db_fake, tx);
 			return;
 		}
 		/*
 		 * Block cloning: We will be completely overwriting a block
 		 * cloned in this transaction group, so let's undirty the
 		 * pending clone and mark the block as uncached. This will be
 		 * as if the clone was never done.
 		 */
 		if (db->db_state == DB_NOFILL) {
 			/* dr may be freed by this call; it is not used again. */
 			VERIFY(!dbuf_undirty(db, tx));
 			db->db_state = DB_UNCACHED;
 		}
 	}
 	mutex_exit(&db->db_mtx);
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
 }
 
 /*
  * This function is effectively the same as dmu_buf_will_dirty(), but
  * indicates the caller expects raw encrypted data in the db, and provides
  * the crypt params (byteorder, salt, iv, mac) which should be stored in the
  * blkptr_t when this dbuf is written.  This is only used for blocks of
  * dnodes, during raw receive.
  *
  * salt, iv and mac must point to at least ZIO_DATA_SALT_LEN,
  * ZIO_DATA_IV_LEN and ZIO_DATA_MAC_LEN bytes respectively; that many bytes
  * are copied into the dirty record.
  */
 void
 dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dbuf_dirty_record_t *dr;
 
 	/*
 	 * dr_has_raw_params is only processed for blocks of dnodes
 	 * (see dbuf_sync_dnode_leaf_crypt()).
 	 */
 	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
 	ASSERT0(db->db_level);
 	ASSERT(db->db_objset->os_raw_receive);
 
 	/* Dirty the dbuf, reading it without decryption. */
 	dmu_buf_will_dirty_impl(db_fake,
 	    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
 
 	/*
 	 * NOTE(review): the record is looked up and modified without taking
 	 * db_mtx here - presumably safe for the raw-receive caller; confirm.
 	 */
 	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 
 	ASSERT3P(dr, !=, NULL);
 	ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
 
 	dr->dt.dl.dr_has_raw_params = B_TRUE;
 	dr->dt.dl.dr_byteorder = byteorder;
 	memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);
 	memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);
 	memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);
 }
 
 /*
  * Install *bp as the override block pointer of db's newest dirty record,
  * which must belong to tx's txg (asserted).  The copy stored in the record
  * gets its logical birth set to that txg and the record is marked
  * DR_OVERRIDDEN.  Only level-0 dbufs outside the meta-dnode are accepted.
  */
 static void
 dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT);
 	ASSERT0(db->db_level);
 
 	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
 	ASSERT3P(dr, !=, NULL);
 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 
 	struct dirty_leaf *dl = &dr->dt.dl;
 	ASSERT0(dl->dr_has_raw_params);
 
 	/* Copy the caller's bp into the record, then stamp the copy. */
 	dl->dr_overridden_by = *bp;
 	dl->dr_override_state = DR_OVERRIDDEN;
 	BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
 /*
  * Finish a fill that was started on dbuf.
  *
  * If the dbuf is in DB_FILL:
  *  - a level-0 dbuf that was freed while being filled is zeroed and becomes
  *    DB_CACHED; the failed flag is ignored in this case;
  *  - otherwise, if failed is set, the dirty record for tx is undone and the
  *    dbuf's buffer is destroyed and cleared;
  *  - otherwise the dbuf becomes DB_CACHED.
  * Waiters on db_changed are woken in all three cases.  In any other state
  * the dbuf is simply marked DB_CACHED.
  *
  * Returns B_TRUE only when the failure was actually applied, B_FALSE
  * otherwise.
  */
 boolean_t
 dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 
 	if (db->db_state == DB_FILL) {
 		if (db->db_level == 0 && db->db_freed_in_flight) {
 			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 			/* we were freed while filling */
 			/* XXX dbuf_undirty? */
 			memset(db->db.db_data, 0, db->db.db_size);
 			db->db_freed_in_flight = FALSE;
 			db->db_state = DB_CACHED;
 			DTRACE_SET_STATE(db,
 			    "fill done handling freed in flight");
 			failed = B_FALSE;
 		} else if (failed) {
 			/* The caller's hold means the undirty must not evict. */
 			VERIFY(!dbuf_undirty(db, tx));
 			arc_buf_destroy(db->db_buf, db);
 			db->db_buf = NULL;
 			/*
 			 * NOTE(review): db_state is not assigned on this
 			 * path; presumably dbuf_clear_data() resets it -
 			 * confirm.
 			 */
 			dbuf_clear_data(db);
 			DTRACE_SET_STATE(db, "fill failed");
 		} else {
 			db->db_state = DB_CACHED;
 			DTRACE_SET_STATE(db, "fill done");
 		}
 		cv_broadcast(&db->db_changed);
 	} else {
 		db->db_state = DB_CACHED;
 		failed = B_FALSE;
 	}
 	mutex_exit(&db->db_mtx);
 	return (failed);
 }
 
 /*
  * Dirty a level-0 dbuf in tx so that it is written as an embedded block
  * pointer carrying the given payload.
  *
  * The dbuf is first put through dmu_buf_will_not_fill().  data, comp,
  * uncompressed_size and compressed_size are passed on to
  * encode_embedded_bp_compressed(), which builds the block pointer in the
  * dirty record's dr_overridden_by.  etype, the dnode's object type, level 0
  * and byteorder are then set on it, the record is marked DR_OVERRIDDEN and
  * its logical birth is set to the record's txg.
  */
 void
 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
 	struct dirty_leaf *dl;
 	dmu_object_type_t type;
 	dbuf_dirty_record_t *dr;
 
 	if (etype == BP_EMBEDDED_TYPE_DATA) {
 		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
 		    SPA_FEATURE_EMBEDDED_DATA));
 	}
 
 	/* Fetch the object type while the dnode reference is pinned. */
 	DB_DNODE_ENTER(db);
 	type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	ASSERT0(db->db_level);
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	dmu_buf_will_not_fill(dbuf, tx);
 
 	/* The head record is expected to be the one for tx (asserted). */
 	dr = list_head(&db->db_dirty_records);
 	ASSERT3P(dr, !=, NULL);
 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 	dl = &dr->dt.dl;
 	ASSERT0(dl->dr_has_raw_params);
 	encode_embedded_bp_compressed(&dl->dr_overridden_by,
 	    data, comp, uncompressed_size, compressed_size);
 	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
 	BP_SET_TYPE(&dl->dr_overridden_by, type);
 	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
 	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
 
 	dl->dr_override_state = DR_OVERRIDDEN;
 	BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
 /*
  * Dirty a level-0 dbuf in tx so that it is written as a redacted block
  * pointer.  The dbuf goes through dmu_buf_will_not_fill(), then a block
  * pointer carrying the dnode's object type, level 0, birth txg of tx, the
  * redacted marker and the dbuf's size is installed as the override of the
  * dirty record.  The redacted-datasets feature must be active on the
  * dataset (asserted).
  */
 void
 dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
 
 	ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS));
 
 	/* Fetch the object type while the dnode reference is pinned. */
 	DB_DNODE_ENTER(db);
 	dmu_object_type_t type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	ASSERT0(db->db_level);
 	dmu_buf_will_not_fill(dbuf, tx);
 
 	/* Start from an all-zero block pointer and fill in the fields. */
 	blkptr_t bp = { { { {0} } } };
 	BP_SET_TYPE(&bp, type);
 	BP_SET_LEVEL(&bp, 0);
 	BP_SET_BIRTH(&bp, tx->tx_txg, 0);
 	BP_SET_REDACTED(&bp);
 	BPE_SET_LSIZE(&bp, dbuf->db_size);
 
 	dbuf_override_impl(db, &bp, tx);
 }
 
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
  *
  * db must be a held, level-0, non-bonus dbuf whose size matches the logical
  * size of buf, and tx->tx_txg must be nonzero (all asserted below).  In both
  * paths buf is consumed: it either becomes the dbuf's data or is destroyed
  * after its contents have been copied.  The dbuf is dirtied in tx.
  */
 void
 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
 	/* Validate buf before any other assertion inspects it. */
 	ASSERT(buf != NULL);
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(db->db_level == 0);
 	ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
 	ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
 	ASSERT(tx->tx_txg != 0);
 
 	arc_return_buf(buf, db);
 	ASSERT(arc_released(buf));
 
 	mutex_enter(&db->db_mtx);
 
 	/* Wait out any read or fill that is currently in progress. */
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 
 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
 	    db->db_state == DB_NOFILL);
 
 	if (db->db_state == DB_CACHED &&
 	    zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
 		/*
 		 * In practice, we will never have a case where we have an
 		 * encrypted arc buffer while additional holds exist on the
 		 * dbuf. We don't handle this here so we simply assert that
 		 * fact instead.
 		 */
 		ASSERT(!arc_is_encrypted(buf));
 		mutex_exit(&db->db_mtx);
 		/* Copy path: dirty the existing data and overwrite it. */
 		(void) dbuf_dirty(db, tx);
 		memcpy(db->db.db_data, buf->b_data, db->db.db_size);
 		arc_buf_destroy(buf, db);
 		return;
 	}
 
 	if (db->db_state == DB_CACHED) {
 		dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
 
 		ASSERT(db->db_buf != NULL);
 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
 			/* Already dirty in this txg: swap the record's data. */
 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
 
 			if (!arc_released(db->db_buf)) {
 				ASSERT(dr->dt.dl.dr_override_state ==
 				    DR_OVERRIDDEN);
 				arc_release(db->db_buf, db);
 			}
 			dr->dt.dl.dr_data = buf;
 			arc_buf_destroy(db->db_buf, db);
 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
 			/* No dirty record refers to db_buf; discard it. */
 			arc_release(db->db_buf, db);
 			arc_buf_destroy(db->db_buf, db);
 		}
 		db->db_buf = NULL;
 	} else if (db->db_state == DB_NOFILL) {
 		/*
 		 * We will be completely replacing the cloned block.  In case
 		 * it was cloned in this transaction group, let's undirty the
 		 * pending clone and mark the block as uncached. This will be
 		 * as if the clone was never done.
 		 */
 		VERIFY(!dbuf_undirty(db, tx));
 		db->db_state = DB_UNCACHED;
 	}
 	ASSERT(db->db_buf == NULL);
 	dbuf_set_data(db, buf);
 	db->db_state = DB_FILL;
 	DTRACE_SET_STATE(db, "filling assigned arcbuf");
 	mutex_exit(&db->db_mtx);
 	(void) dbuf_dirty(db, tx);
 	dmu_buf_fill_done(&db->db, tx, B_FALSE);
 }
 
 /*
  * Tear down and free a dbuf that has no remaining holds.
  *
  * Must be called with db->db_mtx held and db_holds at zero (both asserted).
  * The mutex is dropped partway through, once db_state has been set to
  * DB_EVICTING, and is not re-taken.  The function releases any attached ARC
  * buffer or bonus data, removes the dbuf from the dbuf cache accounting,
  * and, for non-bonus dbufs, unlinks it from the dnode's dn_dbufs tree and
  * the dbuf hash table and drops its hold on the dnode.  If the parent is an
  * indirect dbuf (not the dnode's own dbuf), the reference on the parent is
  * released.  Finally the dbuf itself is returned to dbuf_kmem_cache.
  */
 void
 dbuf_destroy(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *parent = db->db_parent;
 	dmu_buf_impl_t *dndb;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
 
 	if (db->db_buf != NULL) {
 		arc_buf_destroy(db->db_buf, db);
 		db->db_buf = NULL;
 	}
 
 	/* Bonus data is kmem-allocated rather than backed by an arc_buf_t. */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		int slots = DB_DNODE(db)->dn_num_slots;
 		int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
 		if (db->db.db_data != NULL) {
 			kmem_free(db->db.db_data, bonuslen);
 			arc_space_return(bonuslen, ARC_SPACE_BONUS);
 			db->db_state = DB_UNCACHED;
 			DTRACE_SET_STATE(db, "buffer cleared");
 		}
 	}
 
 	dbuf_clear_data(db);
 
 	/* Undo dbuf cache membership and the associated size/stat counters. */
 	if (multilist_link_active(&db->db_cache_link)) {
 		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
 		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
 		multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
 
 		ASSERT0(dmu_buf_user_size(&db->db));
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[db->db_caching_status].size,
 		    db->db.db_size, db);
 
 		if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
 			DBUF_STAT_BUMPDOWN(metadata_cache_count);
 		} else {
 			DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
 			DBUF_STAT_BUMPDOWN(cache_count);
 			DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
 			    db->db.db_size);
 		}
 		db->db_caching_status = DB_NO_CACHE;
 	}
 
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_data_pending == NULL);
 	ASSERT(list_is_empty(&db->db_dirty_records));
 
 	db->db_state = DB_EVICTING;
 	DTRACE_SET_STATE(db, "buffer eviction started");
 	db->db_blkptr = NULL;
 
 	/*
 	 * Now that db_state is DB_EVICTING, nobody else can find this via
 	 * the hash table.  We can now drop db_mtx, which allows us to
 	 * acquire the dn_dbufs_mtx.
 	 */
 	mutex_exit(&db->db_mtx);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/* Remember the dnode's own dbuf; it is compared with parent below. */
 	dndb = dn->dn_dbuf;
 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		/* Only take dn_dbufs_mtx if this thread does not hold it. */
 		boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
 		if (needlock)
 			mutex_enter_nested(&dn->dn_dbufs_mtx,
 			    NESTED_SINGLE);
 		avl_remove(&dn->dn_dbufs, db);
 		membar_producer();
 		DB_DNODE_EXIT(db);
 		if (needlock)
 			mutex_exit(&dn->dn_dbufs_mtx);
 		/*
 		 * Decrementing the dbuf count means that the hold corresponding
 		 * to the removed dbuf is no longer discounted in dnode_move(),
 		 * so the dnode cannot be moved until after we release the hold.
 		 * The membar_producer() ensures visibility of the decremented
 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
 		 * release any lock.
 		 */
 		mutex_enter(&dn->dn_mtx);
 		dnode_rele_and_unlock(dn, db, B_TRUE);
 #ifdef USE_DNODE_HANDLE
 		db->db_dnode_handle = NULL;
 #else
 		db->db_dnode = NULL;
 #endif
 
 		dbuf_hash_remove(db);
 	} else {
 		DB_DNODE_EXIT(db);
 	}
 
 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
 
 	db->db_parent = NULL;
 
 	/* Sanity-check that the dbuf is fully detached before freeing it. */
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);
 	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 
 	/*
 	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
 	if (parent && parent != dndb) {
 		mutex_enter(&parent->db_mtx);
 		dbuf_rele_and_unlock(parent, db, B_TRUE);
 	}
 
 	kmem_cache_free(dbuf_kmem_cache, db);
 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 }
 
 /*
  * Locate the block pointer for block (level, blkid) of dnode dn, together
  * with the dbuf that contains that block pointer.
  *
  * Both *parentp and *bpp are reset to NULL on entry.  Returns 0 on success,
  * ENOENT when the block has no parent yet (or, with fail_sparse set, lies
  * beyond dn_maxblkid at this level), or the error returned by
  * dbuf_hold_impl()/dbuf_read() on the parent indirect block.
  *
  * When *parentp is non-NULL on a successful return, a reference has been
  * taken on it (tag NULL) which the caller must release.
  *
  * Note: While bpp will always be updated if the function returns success,
  * parentp will not be updated if the dnode does not have dn_dbuf filled in;
  * this happens when the dnode is the meta-dnode, or {user|group|project}used
  * object.  For the spill block, *bpp may legitimately be NULL on success if
  * the dnode has no spill block pointer.
  */
 __attribute__((always_inline))
 static inline int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
     dmu_buf_impl_t **parentp, blkptr_t **bpp)
 {
 	*parentp = NULL;
 	*bpp = NULL;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 
 	/* The spill block pointer lives in the dnode itself. */
 	if (blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_have_spill &&
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 			*bpp = DN_SPILL_BLKPTR(dn->dn_phys);
 		else
 			*bpp = NULL;
 		dbuf_add_ref(dn->dn_dbuf, NULL);
 		*parentp = dn->dn_dbuf;
 		mutex_exit(&dn->dn_mtx);
 		return (0);
 	}
 
 	/* An on-disk level count of 0 is treated as a single level. */
 	int nlevels =
 	    (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
 	/* log2 of the number of block pointers per indirect block. */
 	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 	ASSERT3U(level * epbs, <, 64);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	/*
 	 * This assertion shouldn't trip as long as the max indirect block size
 	 * is less than 1M.  The reason for this is that up to that point,
 	 * the number of levels required to address an entire object with blocks
 	 * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64.  In
 	 * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
 	 * (i.e. we can address the entire object), objects will all use at most
 	 * N-1 levels and the assertion won't overflow.  However, once epbs is
 	 * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66.  Then, 4 levels will not be
 	 * enough to address an entire object, so objects will have 5 levels,
 	 * but then this assertion will overflow.
 	 *
 	 * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
 	 * need to redo this logic to handle overflows.
 	 */
 	ASSERT(level >= nlevels ||
 	    ((nlevels - level - 1) * epbs) +
 	    highbit64(dn->dn_phys->dn_nblkptr) <= 64);
 	if (level >= nlevels ||
 	    blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
 	    ((nlevels - level - 1) * epbs)) ||
 	    (fail_sparse &&
 	    blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
 		/* the buffer has no parent yet */
 		return (SET_ERROR(ENOENT));
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
 		int err;
 
 		err = dbuf_hold_impl(dn, level + 1,
 		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
 
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
 		if (err) {
 			/* Drop the hold taken above; report no parent. */
 			dbuf_rele(*parentp, NULL);
 			*parentp = NULL;
 			return (err);
 		}
 		rw_enter(&(*parentp)->db_rwlock, RW_READER);
 		/* Index into the parent's array of block pointers. */
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
 		if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
 			ASSERT(BP_IS_HOLE(*bpp));
 		rw_exit(&(*parentp)->db_rwlock);
 		return (0);
 	} else {
 		/* the block is referenced from the dnode */
 		ASSERT3U(level, ==, nlevels-1);
 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
 		    blkid < dn->dn_phys->dn_nblkptr);
 		if (dn->dn_dbuf) {
 			dbuf_add_ref(dn->dn_dbuf, NULL);
 			*parentp = dn->dn_dbuf;
 		}
 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
 		return (0);
 	}
 }
 
 /*
  * Allocate and initialize a dbuf for block (level, blkid) of dnode dn.
  *
  * parent and blkptr are stored as given; hash is the precomputed hash value
  * for this dbuf.  dn_struct_rwlock must be held (asserted).
  *
  * A bonus dbuf is returned immediately without being placed in the hash
  * table or in dn_dbufs.  Any other dbuf is inserted into the hash table and
  * the dnode's dn_dbufs tree under dn_dbufs_mtx; if dbuf_hash_insert()
  * reports that an equivalent dbuf already exists, the new allocation is
  * freed and the existing dbuf is returned instead.  A newly inserted dbuf
  * takes a reference on an indirect parent (a parent other than dn_dbuf) and
  * a hold on the dnode.
  *
  * NOTE(review): dbuf_hold_impl() drops db_mtx on the returned dbuf without
  * taking it first, so dbuf_hash_insert() presumably returns with db_mtx
  * held in both outcomes -- confirm against dbuf_hash_insert().
  */
 static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
 {
 	objset_t *os = dn->dn_objset;
 	dmu_buf_impl_t *db, *odb;
 
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_type != DMU_OT_NONE);
 
 	db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
 
 	list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),
 	    offsetof(dbuf_dirty_record_t, dr_dbuf_node));
 
 	db->db_objset = os;
 	db->db.db_object = dn->dn_object;
 	db->db_level = level;
 	db->db_blkid = blkid;
 	db->db_dirtycnt = 0;
 #ifdef USE_DNODE_HANDLE
 	db->db_dnode_handle = dn->dn_handle;
 #else
 	db->db_dnode = dn;
 #endif
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 	db->db_hash = hash;
 
 	db->db_user = NULL;
 	db->db_user_immediate_evict = FALSE;
 	db->db_freed_in_flight = FALSE;
 	db->db_pending_evict = FALSE;
 
 	/* Size and offset depend on whether this is bonus, spill or regular. */
 	if (blkid == DMU_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
 		db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		db->db.db_offset = DMU_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
 		DTRACE_SET_STATE(db, "bonus buffer created");
 		db->db_caching_status = DB_NO_CACHE;
 		/* the bonus dbuf is not placed in the hash table */
 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 		return (db);
 	} else if (blkid == DMU_SPILL_BLKID) {
 		/* Without a block pointer, fall back to the minimum size. */
 		db->db.db_size = (blkptr != NULL) ?
 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
 		db->db.db_offset = 0;
 	} else {
 		/* Indirect blocks use the indirect size, data the data size. */
 		int blocksize =
 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
 
 	/*
 	 * Hold the dn_dbufs_mtx while we get the new dbuf
 	 * in the hash table *and* added to the dbufs list.
 	 * This prevents a possible deadlock with someone
 	 * trying to look up this dbuf before it's added to the
 	 * dn_dbufs list.
 	 */
 	mutex_enter(&dn->dn_dbufs_mtx);
 	db->db_state = DB_EVICTING; /* not worth logging this state change */
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
 		mutex_exit(&dn->dn_dbufs_mtx);
 		kmem_cache_free(dbuf_kmem_cache, db);
 		DBUF_STAT_BUMP(hash_insert_race);
 		return (odb);
 	}
 	avl_add(&dn->dn_dbufs, db);
 
 	db->db_state = DB_UNCACHED;
 	DTRACE_SET_STATE(db, "regular buffer created");
 	db->db_caching_status = DB_NO_CACHE;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 
 	/* An indirect parent is referenced for the lifetime of the child. */
 	if (parent && parent != dn->dn_dbuf)
 		dbuf_add_ref(parent, db);
 
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    zfs_refcount_count(&dn->dn_holds) > 0);
 	(void) zfs_refcount_add(&dn->dn_holds, db);
 
 	dprintf_dbuf(db, "db=%p\n", db);
 
 	return (db);
 }
 
 /*
  * Public wrapper around dbuf_findbp(): copy out the block pointer for block
  * (level, blkid) of dn, plus optional geometry information, without handing
  * a dbuf back to the caller.
  *
  * On success (return 0) *bp holds a copy of the block pointer and, when the
  * respective pointers are non-NULL, *datablkszsec and *indblkshift are set
  * from the on-disk dnode.  On failure the error from dbuf_findbp() is
  * returned and no output is written.  The dnode must be held and
  * dn_struct_rwlock must be locked as (at least) a reader.
  */
 int
 dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
     blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)
 {
 	dmu_buf_impl_t *parent = NULL;
 	blkptr_t *found_bp;
 	int error;
 
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
 	error = dbuf_findbp(dn, level, blkid, B_FALSE, &parent, &found_bp);
 	if (error != 0)
 		return (error);
 
 	ASSERT3P(found_bp, !=, NULL);
 	*bp = *found_bp;
 
 	/* The parent was only needed to reach the block pointer. */
 	if (parent != NULL)
 		dbuf_rele(parent, NULL);
 
 	if (datablkszsec != NULL)
 		*datablkszsec = dn->dn_phys->dn_datablkszsec;
 	if (indblkshift != NULL)
 		*indblkshift = dn->dn_phys->dn_indblkshift;
 
 	return (0);
 }
 
 /*
  * State carried through an asynchronous prefetch chain.  One instance is
  * allocated per issued prefetch in dbuf_prefetch_impl(), passed as the
  * private argument to the arc_read() callbacks, and freed by
  * dbuf_prefetch_fini().
  */
 typedef struct dbuf_prefetch_arg {
 	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
 	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
 	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
 	int dpa_curlevel; /* The current level that we're reading */
 	dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
 	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
 	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
 	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
 	dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
 	void *dpa_arg; /* prefetch completion arg */
 } dbuf_prefetch_arg_t;
 
 /*
  * Finish a prefetch: invoke the completion callback, if one was supplied,
  * with the target block's level and blkid and the io_done flag, then free
  * the prefetch state.  dpa must not be used after this returns.
  */
 static void
 dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
 {
 	if (dpa->dpa_cb != NULL) {
 		int64_t target_level = dpa->dpa_zb.zb_level;
 		uint64_t target_blkid = dpa->dpa_zb.zb_blkid;
 
 		dpa->dpa_cb(dpa->dpa_arg, target_level, target_blkid,
 		    io_done);
 	}
 
 	kmem_free(dpa, sizeof (dbuf_prefetch_arg_t));
 }
 
 /*
  * arc_read() completion callback for the final (target) prefetch read.
  * Destroys the returned buffer, if any, and finishes the prefetch with
  * io_done set to B_TRUE.
  */
 static void
 dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
     const blkptr_t *iobp, arc_buf_t *abuf, void *private)
 {
 	dbuf_prefetch_arg_t *dpa = private;
 
 	/* These callback parameters are not needed here. */
 	(void) zio;
 	(void) zb;
 	(void) iobp;
 
 	if (abuf != NULL)
 		arc_buf_destroy(abuf, private);
 
 	dbuf_prefetch_fini(dpa, B_TRUE);
 }
 
 /*
  * Actually issue the prefetch read for the block given.
  *
  * If bp is a hole, embedded, or redacted there is nothing to read, so the
  * prefetch is finished immediately with io_done = B_FALSE.  Otherwise an
  * asynchronous arc_read() is issued under dpa->dpa_zio, and
  * dbuf_issue_final_prefetch_done() finishes the prefetch when it completes.
  * In both cases ownership of dpa passes out of the caller's hands.
  */
 static void
 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
 {
 	ASSERT(!BP_IS_REDACTED(bp) ||
 	    dsl_dataset_feature_is_active(
 	    dpa->dpa_dnode->dn_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS));
 
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) {
 		/*
 		 * A void function must not return an expression, so call
 		 * the finisher and return separately.
 		 */
 		dbuf_prefetch_fini(dpa, B_FALSE);
 		return;
 	}
 
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
 	arc_flags_t aflags =
 	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
 	    ARC_FLAG_NO_BUF;
 
 	/* dnodes are always read as raw and then converted later */
 	if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
 	    dpa->dpa_curlevel == 0)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
 	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
 	ASSERT(dpa->dpa_zio != NULL);
 	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
 	    dbuf_issue_final_prefetch_done, dpa,
 	    dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
 }
 
 /*
  * Called when an indirect block above our prefetch target is read in.  This
  * will either read in the next indirect block down the tree or issue the actual
  * prefetch if the next block down is our target.
  *
  * private is the dbuf_prefetch_arg_t for this prefetch.  abuf is the
  * indirect block that was just read, or NULL if the read failed; when
  * non-NULL it is destroyed before this function returns.  zio is NULL when
  * arc_read() completed without issuing a physical read (see below).
  *
  * The prefetch is finished here (dbuf_prefetch_fini()) on error, when the
  * dbuf for the current level cannot be held, or when the next block pointer
  * is a hole or redacted.  Otherwise dpa is handed on to the next read in
  * the chain.
  */
 static void
 dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
     const blkptr_t *iobp, arc_buf_t *abuf, void *private)
 {
 	(void) zb, (void) iobp;
 	dbuf_prefetch_arg_t *dpa = private;
 
 	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
 	ASSERT3S(dpa->dpa_curlevel, >, 0);
 
 	/* No buffer means the read of this indirect block failed. */
 	if (abuf == NULL) {
 		ASSERT(zio == NULL || zio->io_error != 0);
 		dbuf_prefetch_fini(dpa, B_TRUE);
 		return;
 	}
 	ASSERT(zio == NULL || zio->io_error == 0);
 
 	/*
 	 * The dpa_dnode is only valid if we are called with a NULL
 	 * zio. This indicates that the arc_read() returned without
 	 * first calling zio_read() to issue a physical read. Once
 	 * a physical read is made the dpa_dnode must be invalidated
 	 * as the locks guarding it may have been dropped. If the
 	 * dpa_dnode is still valid, then we want to add it to the dbuf
 	 * cache. To do so, we must hold the dbuf associated with the block
 	 * we just prefetched, read its contents so that we associate it
 	 * with an arc_buf_t, and then release it.
 	 */
 	if (zio != NULL) {
 		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
 		if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
 			ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
 		} else {
 			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
 		}
 		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
 
 		dpa->dpa_dnode = NULL;
 	} else if (dpa->dpa_dnode != NULL) {
 		/* blkid, at the current level, of the block just read. */
 		uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
 		    (dpa->dpa_epbs * (dpa->dpa_curlevel -
 		    dpa->dpa_zb.zb_level));
 		dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
 		    dpa->dpa_curlevel, curblkid, FTAG);
 		if (db == NULL) {
 			arc_buf_destroy(abuf, private);
 			dbuf_prefetch_fini(dpa, B_TRUE);
 			return;
 		}
 		(void) dbuf_read(db, NULL,
 		    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
 		dbuf_rele(db, FTAG);
 	}
 
 	/* Step one level down and pick that block's bp out of abuf. */
 	dpa->dpa_curlevel--;
 	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
 	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
 
 	ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
 	    dsl_dataset_feature_is_active(
 	    dpa->dpa_dnode->dn_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS)));
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
 		arc_buf_destroy(abuf, private);
 		dbuf_prefetch_fini(dpa, B_TRUE);
 		return;
 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
 		/* We have reached the target block's level. */
 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
 		dbuf_issue_final_prefetch(dpa, bp);
 	} else {
 		/* Still above the target: read the next indirect block. */
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
 
 		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
 		if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
 			iter_aflags |= ARC_FLAG_L2CACHE;
 
 		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
 
 		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
 		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
 
 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
 		    bp, dbuf_prefetch_indirect_done, dpa,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 		    &iter_aflags, &zb);
 	}
 
 	arc_buf_destroy(abuf, private);
 }
 
 /*
  * Issue prefetch reads for the given block on the given level.  If the indirect
  * blocks above that block are not in memory, we will read them in
  * asynchronously.  As a result, this call never blocks waiting for a read to
  * complete. Note that the prefetch might fail if the dataset is encrypted and
  * the encryption key is unmapped before the IO completes.
  *
  * dn_struct_rwlock must be held (asserted).  prio and aflags are applied to
  * the final read of the target block.  Returns 1 if a prefetch was issued
  * and 0 otherwise.  If cb is non-NULL it is called exactly once with
  * (arg, level, blkid, io_done): immediately with io_done = B_FALSE when
  * nothing is issued, or from dbuf_prefetch_fini() once the issued chain
  * finishes.
  */
 int
 dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
     zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
     void *arg)
 {
 	blkptr_t bp;
 	int epbs, nlevels, curlevel;
 	uint64_t curblkid;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
 	if (blkid > dn->dn_maxblkid)
 		goto no_issue;
 
 	if (level == 0 && dnode_block_freed(dn, blkid))
 		goto no_issue;
 
 	/*
 	 * This dnode hasn't been written to disk yet, so there's nothing to
 	 * prefetch.
 	 */
 	nlevels = dn->dn_phys->dn_nlevels;
 	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
 		goto no_issue;
 
 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
 		goto no_issue;
 
 	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
 	    level, blkid, NULL);
 	if (db != NULL) {
 		mutex_exit(&db->db_mtx);
 		/*
 		 * This dbuf already exists.  It is either CACHED, or
 		 * (we assume) about to be read or filled.
 		 */
 		goto no_issue;
 	}
 
 	/*
 	 * Find the closest ancestor (indirect block) of the target block
 	 * that is present in the cache.  In this indirect block, we will
 	 * find the bp that is at curlevel, curblkid.
 	 */
 	curlevel = level;
 	curblkid = blkid;
 	while (curlevel < nlevels - 1) {
 		int parent_level = curlevel + 1;
 		uint64_t parent_blkid = curblkid >> epbs;
 		/* Distinct name: must not shadow the lookup result above. */
 		dmu_buf_impl_t *parent_db;
 
 		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
 		    FALSE, TRUE, FTAG, &parent_db) == 0) {
 			blkptr_t *bpp = parent_db->db_buf->b_data;
 			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
 			dbuf_rele(parent_db, FTAG);
 			break;
 		}
 
 		curlevel = parent_level;
 		curblkid = parent_blkid;
 	}
 
 	if (curlevel == nlevels - 1) {
 		/* No cached indirect blocks found. */
 		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
 		bp = dn->dn_phys->dn_blkptr[curblkid];
 	}
 	ASSERT(!BP_IS_REDACTED(&bp) ||
 	    dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
 	    SPA_FEATURE_REDACTED_DATASETS));
 	if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
 		goto no_issue;
 
 	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
 
 	/* Root zio that parents every read issued for this prefetch. */
 	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
 	    ZIO_FLAG_CANFAIL);
 
 	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
 	    dn->dn_object, level, blkid);
 	dpa->dpa_curlevel = curlevel;
 	dpa->dpa_prio = prio;
 	dpa->dpa_aflags = aflags;
 	dpa->dpa_spa = dn->dn_objset->os_spa;
 	dpa->dpa_dnode = dn;
 	dpa->dpa_epbs = epbs;
 	dpa->dpa_zio = pio;
 	dpa->dpa_cb = cb;
 	dpa->dpa_arg = arg;
 
 	if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
 		dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
 	else if (dnode_level_is_l2cacheable(&bp, dn, level))
 		dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
 
 	/*
 	 * If we have the indirect just above us, no need to do the asynchronous
 	 * prefetch chain; we'll just run the last step ourselves.  If we're at
 	 * a higher level, though, we want to issue the prefetches for all the
 	 * indirect blocks asynchronously, so we can go on with whatever we were
 	 * doing.
 	 */
 	if (curlevel == level) {
 		ASSERT3U(curblkid, ==, blkid);
 		dbuf_issue_final_prefetch(dpa, &bp);
 	} else {
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
 
 		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
 		if (dnode_level_is_l2cacheable(&bp, dn, level))
 			iter_aflags |= ARC_FLAG_L2CACHE;
 
 		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
 		    dn->dn_object, curlevel, curblkid);
 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
 		    &bp, dbuf_prefetch_indirect_done, dpa,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 		    &iter_aflags, &zb);
 	}
 	/*
 	 * We use pio here instead of dpa_zio since it's possible that
 	 * dpa may have already been freed.
 	 */
 	zio_nowait(pio);
 	return (1);
 no_issue:
 	if (cb != NULL)
 		cb(arg, level, blkid, B_FALSE);
 	return (0);
 }
 
 /*
  * Convenience form of dbuf_prefetch_impl() for callers that do not need a
  * completion callback.  Returns whatever dbuf_prefetch_impl() returns.
  */
 int
 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
     arc_flags_t aflags)
 {
 	int issued;
 
 	/* NULL callback and NULL callback argument. */
 	issued = dbuf_prefetch_impl(dn, level, blkid, prio, aflags,
 	    NULL, NULL);
 
 	return (issued);
 }
 
 /*
  * Helper for dbuf_hold_impl(): give db a private copy of the buffer that
  * its pending dirty record (db_data_pending) currently points at.
  *
  * The replacement buffer is allocated to match the source: with
  * arc_alloc_raw_buf() when the source is encrypted, with
  * arc_alloc_compressed_buf() when it is compressed, and with
  * arc_alloc_buf() otherwise.  The source contents are then copied into
  * the new buffer under db_rwlock held as writer.
  *
  * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
  */
 noinline static void
 dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
 {
 	arc_buf_t *src = db->db_data_pending->dt.dl.dr_data;
 	arc_buf_t *copy;
 	enum zio_compress comp = arc_get_compression(src);
 	uint8_t comp_level = arc_get_complevel(src);
 
 	if (arc_is_encrypted(src)) {
 		boolean_t byteorder;
 		uint8_t salt[ZIO_DATA_SALT_LEN];
 		uint8_t iv[ZIO_DATA_IV_LEN];
 		uint8_t mac[ZIO_DATA_MAC_LEN];
 
 		/* Carry the raw parameters over to the new buffer. */
 		arc_get_raw_params(src, &byteorder, salt, iv, mac);
 		copy = arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
 		    dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
 		    dn->dn_type, arc_buf_size(src), arc_buf_lsize(src),
 		    comp, comp_level);
 	} else if (comp != ZIO_COMPRESS_OFF) {
 		copy = arc_alloc_compressed_buf(dn->dn_objset->os_spa, db,
 		    arc_buf_size(src), arc_buf_lsize(src), comp, comp_level);
 	} else {
 		copy = arc_alloc_buf(dn->dn_objset->os_spa, db,
 		    DBUF_GET_BUFC_TYPE(db), db->db.db_size);
 	}
 	dbuf_set_data(db, copy);
 
 	rw_enter(&db->db_rwlock, RW_WRITER);
 	memcpy(db->db.db_data, src->b_data, arc_buf_size(src));
 	rw_exit(&db->db_rwlock);
 }
 
 /*
  * Find or create the dbuf for block (level, blkid) of dn and take a hold on
  * it for tag.
  *
  * Returns with db_holds incremented, and db_mtx not held.
  * Note: dn_struct_rwlock must be held.
  *
  * On success returns 0 and stores the dbuf in *dbp.  *dbp is set to NULL on
  * entry and stays NULL on failure.
  *
  * fail_sparse: when set, return ENOENT instead of creating a dbuf if the
  *	block's pointer cannot be found or is a hole.
  * fail_uncached: when set, return ENOENT unless the dbuf already exists and
  *	is in state DB_CACHED; no dbuf is created.
  *
  * Errors other than ENOENT from dbuf_findbp() are returned to the caller
  * regardless of fail_sparse.
  */
 int
 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
     boolean_t fail_sparse, boolean_t fail_uncached,
     const void *tag, dmu_buf_impl_t **dbp)
 {
 	dmu_buf_impl_t *db, *parent = NULL;
 	uint64_t hv;
 
 	/* If the pool has been created, verify the tx_sync_lock is not held */
 	spa_t *spa = dn->dn_objset->os_spa;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	if (dp != NULL) {
 		ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
 	}
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT3U(dn->dn_nlevels, >, level);
 
 	*dbp = NULL;
 
 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
 
 	if (db == NULL) {
 		blkptr_t *bp = NULL;
 		int err;
 
 		if (fail_uncached)
 			return (SET_ERROR(ENOENT));
 
 		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
 			if (err == 0 && bp && BP_IS_HOLE(bp))
 				err = SET_ERROR(ENOENT);
 			if (err) {
 				if (parent)
 					dbuf_rele(parent, NULL);
 				return (err);
 			}
 		}
 		/* ENOENT here only means there is no parent/bp yet. */
 		if (err && err != ENOENT)
 			return (err);
 		db = dbuf_create(dn, level, blkid, parent, bp, hv);
 	}
 
 	if (fail_uncached && db->db_state != DB_CACHED) {
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (db->db_buf != NULL) {
 		arc_buf_access(db->db_buf);
 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
 	}
 
 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
 
 	/*
 	 * If this buffer is currently syncing out, and we are
 	 * still referencing it from db_data, we need to make a copy
 	 * of it in case we decide we want to dirty it again in this txg.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    db->db_state == DB_CACHED && db->db_data_pending) {
 		dbuf_dirty_record_t *dr = db->db_data_pending;
 		if (dr->dt.dl.dr_data == db->db_buf) {
 			ASSERT3P(db->db_buf, !=, NULL);
 			dbuf_hold_copy(dn, db);
 		}
 	}
 
 	/*
 	 * A dbuf sitting in one of the dbuf caches has no holds; take it
 	 * out of the cache and undo its size accounting before holding it.
 	 */
 	if (multilist_link_active(&db->db_cache_link)) {
 		ASSERT(zfs_refcount_is_zero(&db->db_holds));
 		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
 		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
 		multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
 
 		uint64_t size = db->db.db_size;
 		uint64_t usize = dmu_buf_user_size(&db->db);
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[db->db_caching_status].size, size, db);
 		(void) zfs_refcount_remove_many(
 		    &dbuf_caches[db->db_caching_status].size, usize,
 		    db->db_user);
 
 		if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
 			DBUF_STAT_BUMPDOWN(metadata_cache_count);
 		} else {
 			DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
 			DBUF_STAT_BUMPDOWN(cache_count);
 			DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
 			    size + usize);
 		}
 		db->db_caching_status = DB_NO_CACHE;
 	}
 	(void) zfs_refcount_add(&db->db_holds, tag);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
 	if (parent)
 		dbuf_rele(parent, NULL);
 
 	ASSERT3P(DB_DNODE(db), ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
 	ASSERT3U(db->db_level, ==, level);
 	*dbp = db;
 
 	return (0);
 }
 
 /*
  * Hold the level-0 dbuf for block blkid of dn.  Returns the held dbuf, or
  * NULL if dbuf_hold_level() fails.
  */
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)
 {
 	const int data_level = 0;
 
 	return (dbuf_hold_level(dn, data_level, blkid, tag));
 }
 
 /*
  * Hold the dbuf for block (level, blkid) of dn via dbuf_hold_impl(), with
  * both fail_sparse and fail_uncached disabled.  Returns the held dbuf, or
  * NULL if dbuf_hold_impl() reports any error.
  */
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)
 {
 	dmu_buf_impl_t *held;
 
 	if (dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &held) != 0)
 		return (NULL);
 
 	return (held);
 }
 
 /*
  * Create the bonus dbuf for dn and store it in dn->dn_bonus.  The caller
  * must hold dn_struct_rwlock as writer, and the dnode must not already have
  * a bonus dbuf (both asserted).
  */
 void
 dbuf_create_bonus(dnode_t *dn)
 {
 	uint64_t bonus_hash;
 
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_bonus == NULL);
 
 	bonus_hash = dbuf_hash(dn->dn_objset, dn->dn_object, 0,
 	    DMU_BONUS_BLKID);
 	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
 	    bonus_hash);
 }
 
 /*
  * Resize a spill-block dbuf in transaction tx.
  *
  * Returns ENOTSUP if db_fake is not the spill block, otherwise 0.  A blksz
  * of 0 selects SPA_MINBLOCKSIZE; any other value is rounded up to a
  * multiple of SPA_MINBLOCKSIZE.  The size before rounding must not exceed
  * the pool's maximum block size (asserted).
  */
 int
 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *spill = (dmu_buf_impl_t *)db_fake;
 	uint64_t newsz;
 
 	if (spill->db_blkid != DMU_SPILL_BLKID)
 		return (SET_ERROR(ENOTSUP));
 
 	newsz = (blksz != 0) ? blksz : SPA_MINBLOCKSIZE;
 	ASSERT3U(newsz, <=,
 	    spa_maxblocksize(dmu_objset_spa(spill->db_objset)));
 	newsz = P2ROUNDUP(newsz, SPA_MINBLOCKSIZE);
 
 	dbuf_new_size(spill, newsz, tx);
 
 	return (0);
 }
 
 /*
  * Free the spill block of dn in transaction tx by freeing the one-block
  * range [DMU_SPILL_BLKID, DMU_SPILL_BLKID].
  */
 void
 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
 	const uint64_t spill_blkid = DMU_SPILL_BLKID;
 
 	dbuf_free_range(dn, spill_blkid, spill_blkid, tx);
 }
 
 #pragma weak dmu_buf_add_ref = dbuf_add_ref
 /*
  * Add a hold for tag to db.  The value returned by zfs_refcount_add() is
  * verified to be greater than 1; presumably this means the dbuf must already
  * be held by someone when this is called.  Also exported under the weak
  * alias dmu_buf_add_ref.
  */
 void
 dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)
 {
 	int64_t total_holds;
 
 	total_holds = zfs_refcount_add(&db->db_holds, tag);
 	VERIFY3S(total_holds, >, 1);
 }
 
 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
 /*
  * Try to add a hold for tag to db_fake, but only if the dbuf currently
  * found for (os, obj, blkid) is that same dbuf and its hold count exceeds
  * its dirty count.  Returns B_TRUE if the hold was added and B_FALSE
  * otherwise.  Also exported under the weak alias dmu_buf_try_add_ref.
  */
 boolean_t
 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
     const void *tag)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dmu_buf_impl_t *found_db;
 	boolean_t added = B_FALSE;
 
 	found_db = (blkid == DMU_BONUS_BLKID) ?
 	    dbuf_find_bonus(os, obj) : dbuf_find(os, obj, 0, blkid, NULL);
 
 	if (found_db == NULL)
 		return (B_FALSE);
 
 	/* found_db->db_mtx is dropped below once the check is done. */
 	if (found_db == db && dbuf_refcount(db) > db->db_dirtycnt) {
 		(void) zfs_refcount_add(&db->db_holds, tag);
 		added = B_TRUE;
 	}
 	mutex_exit(&found_db->db_mtx);
 
 	return (added);
 }
 
 /*
  * Release the hold that tag has on db.
  *
  * If you call dbuf_rele() you had better not be referencing the dnode handle
  * unless you have some other direct or indirect hold on the dnode. (An indirect
  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
  * dnode's parent dbuf evicting its dnode handles.
  */
 void
 dbuf_rele(dmu_buf_impl_t *db, const void *tag)
 {
 	/* dbuf_rele_and_unlock() requires db_mtx to be held on entry. */
 	mutex_enter(&db->db_mtx);
 	dbuf_rele_and_unlock(db, tag, B_FALSE);
 }
 
 /*
  * Public dmu_buf_t form of dbuf_rele(): release the hold that tag has on
  * the dbuf underlying db.
  */
 void
 dmu_buf_rele(dmu_buf_t *db, const void *tag)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db;
 
 	dbuf_rele(impl, tag);
 }
 
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
  * db_dirtycnt and db_holds to be updated atomically.  The 'evicting'
  * argument should be set if we are already in the dbuf-evicting code
  * path, in which case we don't want to recursively evict.  This allows us to
  * avoid deeply nested stacks that would have a call flow similar to this:
  *
  * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
  *	^						|
  *	|						|
  *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
  *
  */
 void
 dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
 {
 	int64_t holds;
 	uint64_t size;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	DBUF_VERIFY(db);
 
 	/*
 	 * Remove the reference to the dbuf before removing its hold on the
 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
 	 * buffer has a corresponding dnode hold.
 	 */
 	holds = zfs_refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
 	/*
 	 * We can't freeze indirects if there is a possibility that they
 	 * may be modified in the current syncing context.
 	 */
 	if (db->db_buf != NULL &&
 	    holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
 		arc_buf_freeze(db->db_buf);
 	}
 
 	/*
 	 * For a level-0 dbuf flagged for immediate user eviction, evict the
 	 * user as soon as the only remaining holds are the dirty-record ones.
 	 */
 	if (holds == db->db_dirtycnt &&
 	    db->db_level == 0 && db->db_user_immediate_evict)
 		dbuf_evict_user(db);
 
 	/*
 	 * The last hold is gone: release the dnode (bonus buffer), destroy
 	 * the dbuf, or park it in one of the dbuf caches.
 	 */
 	if (holds == 0) {
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			dnode_t *dn;
 			/* Sampled now; db must not be touched after unlock. */
 			boolean_t evict_dbuf = db->db_pending_evict;
 
 			/*
 			 * If the dnode moves here, we cannot cross this
 			 * barrier until the move completes.
 			 */
 			DB_DNODE_ENTER(db);
 
 			dn = DB_DNODE(db);
 			atomic_dec_32(&dn->dn_dbufs_count);
 
 			/*
 			 * Decrementing the dbuf count means that the bonus
 			 * buffer's dnode hold is no longer discounted in
 			 * dnode_move(). The dnode cannot move until after
 			 * the dnode_rele() below.
 			 */
 			DB_DNODE_EXIT(db);
 
 			/*
 			 * Do not reference db after its lock is dropped.
 			 * Another thread may evict it.
 			 */
 			mutex_exit(&db->db_mtx);
 
 			if (evict_dbuf)
 				dnode_evict_bonus(dn);
 
 			/* 'db' is used here only as the hold tag value. */
 			dnode_rele(dn, db);
 		} else if (db->db_buf == NULL) {
 			/*
 			 * This is a special case: we never associated this
 			 * dbuf with any data allocated from the ARC.
 			 */
 			ASSERT(db->db_state == DB_UNCACHED ||
 			    db->db_state == DB_NOFILL);
 			dbuf_destroy(db);
 		} else if (arc_released(db->db_buf)) {
 			/*
 			 * This dbuf has anonymous data associated with it.
 			 */
 			dbuf_destroy(db);
 		} else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
 		    db->db_pending_evict) {
 			/*
 			 * Neither cacheable nor a partial read, or an
 			 * eviction is pending: destroy instead of caching.
 			 */
 			dbuf_destroy(db);
 		} else if (!multilist_link_active(&db->db_cache_link)) {
 			ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 
 			/* Pick the metadata cache or the regular dbuf cache. */
 			dbuf_cached_state_t dcs =
 			    dbuf_include_in_metadata_cache(db) ?
 			    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
 			db->db_caching_status = dcs;
 
 			multilist_insert(&dbuf_caches[dcs].cache, db);
 			/*
 			 * Snapshot the sizes and the level while db_mtx is
 			 * still held; the statistics below use these copies
 			 * after the lock has been dropped.  'size' is the
 			 * value returned by the second add.
 			 */
 			uint64_t db_size = db->db.db_size;
 			uint64_t dbu_size = dmu_buf_user_size(&db->db);
 			(void) zfs_refcount_add_many(
 			    &dbuf_caches[dcs].size, db_size, db);
 			size = zfs_refcount_add_many(
 			    &dbuf_caches[dcs].size, dbu_size, db->db_user);
 			uint8_t db_level = db->db_level;
 			mutex_exit(&db->db_mtx);
 
 			if (dcs == DB_DBUF_METADATA_CACHE) {
 				DBUF_STAT_BUMP(metadata_cache_count);
 				DBUF_STAT_MAX(metadata_cache_size_bytes_max,
 				    size);
 			} else {
 				DBUF_STAT_BUMP(cache_count);
 				DBUF_STAT_MAX(cache_size_bytes_max, size);
 				DBUF_STAT_BUMP(cache_levels[db_level]);
 				DBUF_STAT_INCR(cache_levels_bytes[db_level],
 				    db_size + dbu_size);
 			}
 
 			/*
 			 * Skip the notification when the caller is already
 			 * on the eviction path, to avoid recursive eviction.
 			 */
 			if (dcs == DB_DBUF_CACHE && !evicting)
 				dbuf_evict_notify(size);
 		}
 	} else {
 		/* Other holds remain; only the lock needs to be dropped. */
 		mutex_exit(&db->db_mtx);
 	}
 }
 
 #pragma weak dmu_buf_refcount = dbuf_refcount
 /*
  * Report the current count of the dbuf's hold refcount.  The weak alias
  * above makes the same function reachable as dmu_buf_refcount.
  */
 uint64_t
 dbuf_refcount(dmu_buf_impl_t *db)
 {
 	uint64_t nholds = zfs_refcount_count(&db->db_holds);
 
 	return (nholds);
 }
 
 /*
  * Return the number of holds on the dbuf in excess of db_dirtycnt.
  * Both values are sampled while db_mtx is held.
  */
 uint64_t
 dmu_buf_user_refcount(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	uint64_t user_holds;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
 	user_holds = zfs_refcount_count(&db->db_holds);
 	user_holds -= db->db_dirtycnt;
 	mutex_exit(&db->db_mtx);
 
 	return (user_holds);
 }
 
 /*
  * Compare-and-swap the user attached to a dbuf.  If the currently
  * attached user equals old_user it is replaced with new_user; otherwise
  * nothing is changed.  In both cases the user that was attached on entry
  * is returned, so a return value equal to old_user means the swap
  * happened.  The whole operation runs under db_mtx.
  */
 void *
 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
     dmu_buf_user_t *new_user)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dmu_buf_user_t *observed;
 
 	mutex_enter(&db->db_mtx);
 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
 	observed = db->db_user;
 	if (observed == old_user)
 		db->db_user = new_user;
 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
 	mutex_exit(&db->db_mtx);
 
 	return (observed);
 }
 
 /*
  * Attach 'user' to the dbuf, expecting that no user is attached yet
  * (the expected old user passed to dmu_buf_replace_user() is NULL).
  * Returns whatever dmu_buf_replace_user() returns.
  */
 void *
 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	void *previous = dmu_buf_replace_user(db_fake, NULL, user);
 
 	return (previous);
 }
 
 /*
  * Same as dmu_buf_set_user(), but first marks the dbuf with
  * db_user_immediate_evict.  The flag is set unconditionally and without
  * taking db_mtx, before the attach is attempted.
  */
 void *
 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	((dmu_buf_impl_t *)db_fake)->db_user_immediate_evict = TRUE;
 
 	return (dmu_buf_set_user(db_fake, user));
 }
 
 /*
  * Detach 'user' from the dbuf by swapping it for NULL, provided it is
  * the user currently attached.  Returns whatever dmu_buf_replace_user()
  * returns.
  */
 void *
 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	void *previous = dmu_buf_replace_user(db_fake, user, NULL);
 
 	return (previous);
 }
 
 /*
  * Return the user currently attached to the dbuf (db_user), after
  * running the not-evicting user verification.  No lock is taken here.
  */
 void *
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	void *user;
 
 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
 	user = db->db_user;
 
 	return (user);
 }
 
 /*
  * Return the dbu_size recorded on the dbuf's attached user, read
  * atomically, or 0 when no user is attached.
  */
 uint64_t
 dmu_buf_user_size(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	if (db->db_user != NULL)
 		return (atomic_load_64(&db->db_user->dbu_size));
 
 	return (0);
 }
 
 /*
  * Atomically add 'nadd' to the dbu_size of the user attached to this dbuf.
  *
  * The assertions (debug builds) require that db_caching_status is
  * DB_NO_CACHE, that a user is attached, and that the current size is
  * below UINT64_MAX - nadd so the addition cannot wrap.
  */
 void
 dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 	ASSERT3P(db->db_user, !=, NULL);
 	ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
 	atomic_add_64(&db->db_user->dbu_size, nadd);
 }
 
 /*
  * Atomically subtract 'nsub' from the dbu_size of the user attached to
  * this dbuf.
  *
  * The assertions (debug builds) require that db_caching_status is
  * DB_NO_CACHE, that a user is attached, and that the current size is at
  * least nsub so the subtraction cannot underflow.
  */
 void
 dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 	ASSERT3P(db->db_user, !=, NULL);
 	ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
 	atomic_sub_64(&db->db_user->dbu_size, nsub);
 }
 
 /*
  * Wait on the user-eviction taskq (dbu_evict_taskq) via taskq_wait().
  */
 void
 dmu_buf_user_evict_wait(void)
 {
 	taskq_wait(dbu_evict_taskq);
 }
 
 /*
  * Accessor: return the db_blkptr of the dbuf behind a dmu_buf_t handle.
  * No lock is taken.
  */
 blkptr_t *
 dmu_buf_get_blkptr(dmu_buf_t *db)
 {
 	return (((dmu_buf_impl_t *)db)->db_blkptr);
 }
 
 /*
  * Accessor: return the db_objset of the dbuf behind a dmu_buf_t handle.
  * No lock is taken.
  */
 objset_t *
 dmu_buf_get_objset(dmu_buf_t *db)
 {
 	return (((dmu_buf_impl_t *)db)->db_objset);
 }
 
 /*
  * Make sure db->db_blkptr points at the slot that holds this dbuf's block
  * pointer.  Nothing is done if db_blkptr is already set.
  *
  * The caller must hold db_mtx.  Note that on the path that has to take a
  * hold on the parent indirect dbuf the mutex is dropped and re-acquired,
  * so state protected by db_mtx may have changed on return.
  */
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
 	/* ASSERT(dmu_tx_is_syncing(tx) */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_blkptr != NULL)
 		return;
 
 	/* Spill block: use the dnode's spill blkptr slot and zero it. */
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
 		BP_ZERO(db->db_blkptr);
 		return;
 	}
 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
 		/*
 		 * This buffer was allocated at a time when there was
 		 * no available blkptrs from the dnode, or it was
 		 * inappropriate to hook it in (i.e., nlevels mismatch).
 		 */
 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
 		ASSERT(db->db_parent == NULL);
 		/* Top-level block: the blkptr lives in the dnode itself. */
 		db->db_parent = dn->dn_dbuf;
 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
 		DBUF_VERIFY(db);
 	} else {
 		dmu_buf_impl_t *parent = db->db_parent;
 		/* log2 of the number of block pointers per indirect block */
 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 		ASSERT(dn->dn_phys->dn_nlevels > 1);
 		if (parent == NULL) {
 			/*
 			 * No parent recorded yet: drop db_mtx, hold the
 			 * parent indirect (tagged with db) under
 			 * dn_struct_rwlock as reader, then re-take db_mtx.
 			 */
 			mutex_exit(&db->db_mtx);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			parent = dbuf_hold_level(dn, db->db_level + 1,
 			    db->db_blkid >> epbs, db);
 			rw_exit(&dn->dn_struct_rwlock);
 			mutex_enter(&db->db_mtx);
 			db->db_parent = parent;
 		}
 		/*
 		 * The slot within the parent is selected by the low 'epbs'
 		 * bits of this dbuf's block id.
 		 */
 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
 		    (db->db_blkid & ((1ULL << epbs) - 1));
 		DBUF_VERIFY(db);
 	}
 }
 
 /*
  * Sync a dirty bonus buffer: copy the dirty record's data into the bonus
  * area of the dnode's dn_phys, run the debug-only zero-tail verification,
  * undirty the record, and release the hold tagged with the tx's txg.
  *
  * Must be called with db_mtx held; the still-locked dbuf is handed to
  * dbuf_rele_and_unlock() at the end.
  */
 static void
 dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	void *data = dr->dt.dl.dr_data;
 
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_blkid == DMU_BONUS_BLKID);
 	ASSERT(data != NULL);
 
 	dnode_t *dn = dr->dr_dnode;
 	/* The copy length must fit in the slots this dnode occupies. */
 	ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
 	    DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
 	memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));
 
 	dbuf_sync_leaf_verify_bonus_dnode(dr);
 
 	dbuf_undirty_bonus(dr);
 	/* The hold being dropped is tagged with the txg value. */
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 }
 
 /*
  * When syncing out a block of dnodes, adjust the block to deal with
  * encryption.  Normally, we make sure the block is decrypted before writing
  * it.  If we have crypt params, then we are writing a raw (encrypted) block,
  * from a raw receive.  In this case, set the ARC buf's crypt params so
  * that the BP will be filled with the correct byteorder, salt, iv, and mac.
  *
  * Must be called with db_mtx held, on a level-0 dbuf of the meta-dnode
  * object (both are asserted below).
  */
 static void
 dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
 {
 	int err;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
 	ASSERT3U(db->db_level, ==, 0);
 
 	/* Not a raw receive but the buffer is encrypted: untransform it. */
 	if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
 		zbookmark_phys_t zb;
 
 		/*
 		 * Unfortunately, there is currently no mechanism for
 		 * syncing context to handle decryption errors. An error
 		 * here is only possible if an attacker maliciously
 		 * changed a dnode block and updated the associated
 		 * checksums going up the block tree.
 		 */
 		SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
 		    db->db.db_object, db->db_level, db->db_blkid);
 		err = arc_untransform(db->db_buf, db->db_objset->os_spa,
 		    &zb, B_TRUE);
 		if (err)
 			panic("Invalid dnode block MAC");
 	} else if (dr->dt.dl.dr_has_raw_params) {
 		/*
 		 * The dirty record carries raw parameters: release the
 		 * buffer and convert it to raw using the byteorder, salt,
 		 * iv and mac saved on the record.
 		 */
 		(void) arc_release(dr->dt.dl.dr_data, db);
 		arc_convert_to_raw(dr->dt.dl.dr_data,
 		    dmu_objset_id(db->db_objset),
 		    dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
 		    dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
 	}
 }
 
 /*
  * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
  * is critical that we not allow the compiler to inline this function into
  * dbuf_sync_list() thereby drastically bloating the stack usage.
  *
  * Syncs one dirty indirect block: makes sure its data is read in and its
  * block pointer slot is known, builds its write zio, syncs all of its
  * dirty children, and only then issues its own zio.
  */
 noinline static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn = dr->dr_dnode;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
 
 	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		/* db_mtx is dropped around the read and re-taken after it. */
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 		mutex_enter(&db->db_mtx);
 	}
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 	ASSERT(db->db_buf != NULL);
 
 	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
 
 	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 
 	/* Populates dr->dr_zio; the zio is not issued yet. */
 	dbuf_write(dr, db->db_buf, tx);
 
 	zio_t *zio = dr->dr_zio;
 	/* Sync every dirty child one level down while holding dr_mtx. */
 	mutex_enter(&dr->dt.di.dr_mtx);
 	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	/* Issue this block's write only after the children were synced. */
 	zio_nowait(zio);
 }
 
 /*
  * Verify that the size of the data in our bonus buffer does not exceed
  * its recorded size.
  *
  * The purpose of this verification is to catch any cases in development
  * where the size of a phys structure (e.g. space_map_phys_t) grows and,
  * due to incorrect feature management, older pools expect to read more
  * data even though they didn't actually write it to begin with.
  *
  * For example, this would catch an error in the feature logic where we
  * open an older pool and we expect to write the space map histogram of
  * a space map with size SPACE_MAP_SIZE_V0.
  *
  * The whole body is compiled only when ZFS_DEBUG is defined; otherwise
  * this function does nothing.
  */
 static void
 dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
 {
 #ifdef ZFS_DEBUG
 	dnode_t *dn = dr->dr_dnode;
 
 	/*
 	 * Encrypted bonus buffers can have data past their bonuslen.
 	 * Skip the verification of these blocks.
 	 */
 	if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
 		return;
 
 	uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
 	uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 	ASSERT3U(bonuslen, <=, maxbonuslen);
 
 	/*
 	 * dr_data is treated as a plain byte pointer here: the range
 	 * [bonuslen, maxbonuslen) past it is what gets checked.
 	 */
 	arc_buf_t *datap = dr->dt.dl.dr_data;
 	char *datap_end = ((char *)datap) + bonuslen;
 	char *datap_max = ((char *)datap) + maxbonuslen;
 
 	/* ensure that everything is zero after our data */
 	for (; datap_end < datap_max; datap_end++)
 		ASSERT(*datap_end == 0);
 #endif
 }
 
 /*
  * Locate the block pointer slot that a lightweight dirty record (one
  * without a dbuf) refers to: one of the dnode's own blkptrs when the
  * object has a single level, otherwise an entry in the data of the
  * level-1 parent indirect dbuf.
  */
 static blkptr_t *
 dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
 {
 	/* This must be a lightweight dirty record. */
 	ASSERT3P(dr->dr_dbuf, ==, NULL);
 	dnode_t *dn = dr->dr_dnode;
 
 	if (dn->dn_phys->dn_nlevels != 1) {
 		dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
 		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 		/* Sanity-check that the parent really covers this block. */
 		VERIFY3U(parent_db->db_level, ==, 1);
 		VERIFY3P(DB_DNODE(parent_db), ==, dn);
 		VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
 
 		blkptr_t *entries = parent_db->db.db_data;
 		uint64_t slot = dr->dt.dll.dr_blkid & ((1 << epbs) - 1);
 
 		return (entries + slot);
 	}
 
 	/* Single-level object: the slot is in the dnode itself. */
 	VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
 	return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
 }
 
 /*
  * Ready callback for the zio created by dbuf_sync_lightweight().
  * io_private is the lightweight dirty record.
  *
  * On success it charges the change in allocated size to the dnode, raises
  * dn_maxblkid if this block lies beyond it, sets the BP fill count, and
  * copies the new BP into its slot while holding the parent dbuf's
  * db_rwlock as writer.  Nothing is done if the zio carries an error.
  */
 static void
 dbuf_lightweight_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_error != 0)
 		return;
 
 	dnode_t *dn = dr->dr_dnode;
 
 	/* Space accounting: new BP size minus the size of the old slot. */
 	blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
 	int64_t delta = bp_get_dsize_sync(spa, bp) -
 	    bp_get_dsize_sync(spa, bp_orig);
 	dnode_diduse_space(dn, delta);
 
 	uint64_t blkid = dr->dt.dll.dr_blkid;
 	mutex_enter(&dn->dn_mtx);
 	if (blkid > dn->dn_phys->dn_maxblkid) {
 		ASSERT0(dn->dn_objset->os_raw_receive);
 		dn->dn_phys->dn_maxblkid = blkid;
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	/* Fill is 0 for a hole and 1 otherwise; embedded BPs are skipped. */
 	if (!BP_IS_EMBEDDED(bp)) {
 		uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
 		BP_SET_FILL(bp, fill);
 	}
 
 	/*
 	 * The slot lives either in the dnode's own dbuf (no parent record)
 	 * or in the parent indirect's dbuf; take that dbuf's rwlock while
 	 * overwriting the slot.
 	 */
 	dmu_buf_impl_t *parent_db;
 	EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
 	if (dr->dr_parent == NULL) {
 		parent_db = dn->dn_dbuf;
 	} else {
 		parent_db = dr->dr_parent->dr_dbuf;
 	}
 	rw_enter(&parent_db->db_rwlock, RW_WRITER);
 	*bp_orig = *bp;
 	rw_exit(&parent_db->db_rwlock);
 }
 
 /*
  * Done callback for the zio created by dbuf_sync_lightweight().
  * io_private is the lightweight dirty record, which is consumed here:
  * block birth/kill accounting is performed unless the write was a
  * rewrite or nopwrite, the dirty space is returned to the pool, and the
  * record's abd and the record itself are freed.  The zio must not have
  * failed.
  */
 static void
 dbuf_lightweight_done(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 
 	VERIFY0(zio->io_error);
 
 	objset_t *os = dr->dr_dnode->dn_objset;
 	dmu_tx_t *tx = os->os_synctx;
 
 	if ((zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) == 0) {
 		/* A new block was written: retire the old BP, record the new. */
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 
 		(void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, zio->io_bp, tx);
 	} else {
 		/* Rewrite/nopwrite: the BP must be unchanged. */
 		ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
 	}
 
 	dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
 	    zio->io_txg);
 
 	abd_free(dr->dt.dll.dr_abd);
 	kmem_free(dr, sizeof (*dr));
 }
 
 /*
  * Issue the write for a lightweight dirty record (one without a dbuf).
  * The write zio is created as a child of the dnode's zio for single-level
  * objects, or of the parent dirty record's zio otherwise, and is issued
  * with zio_nowait().
  */
 noinline static void
 dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dnode_t *dn = dr->dr_dnode;
 	zio_t *pio = (dn->dn_phys->dn_nlevels == 1) ?
 	    dn->dn_zio : dr->dr_parent->dr_zio;
 
 	zbookmark_phys_t zb = {
 		.zb_objset = dmu_objset_id(dn->dn_objset),
 		.zb_object = dn->dn_object,
 		.zb_level = 0,
 		.zb_blkid = dr->dt.dll.dr_blkid,
 	};
 
 	/*
 	 * See comment in dbuf_write().  Writing through a private copy of
 	 * the BP means zio->io_bp_orig still holds the old BP when
 	 * dbuf_lightweight_done() runs.
 	 */
 	dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
 
 	zio_t *wzio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
 	    dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
 	    dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
 	    &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
 	    dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
 	    ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
 	dr->dr_zio = wzio;
 
 	zio_nowait(wzio);
 }
 
 /*
  * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
  * critical that we not allow the compiler to inline this function into
  * dbuf_sync_list() thereby drastically bloating the stack usage.
  *
  * Syncs one dirty level-0 dbuf.  Bonus buffers are copied into the dnode
  * and finished immediately; for everything else a write zio is built with
  * dbuf_write() and either issued right away or, for meta-dnode blocks,
  * queued back on the dnode's dirty-record list for this txg.
  */
 noinline static void
 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	arc_buf_t **datap = &dr->dt.dl.dr_data;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn = dr->dr_dnode;
 	objset_t *os;
 	uint64_t txg = tx->tx_txg;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * To be synced, we must be dirtied.  But we might have been freed
 	 * after the dirty.
 	 */
 	if (db->db_state == DB_UNCACHED) {
 		/* This buffer has been freed since it was dirtied */
 		ASSERT3P(db->db.db_data, ==, NULL);
 	} else if (db->db_state == DB_FILL) {
 		/* This buffer was freed and is now being re-filled */
 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
 	} else if (db->db_state == DB_READ) {
 		/*
 		 * This buffer was either cloned or had a Direct I/O write
 		 * occur and has an in-flight read on the BP. It is safe to
 		 * issue the write here, because the read has already been
 		 * issued and the contents won't change.
 		 *
 		 * We can verify the case of both the clone and Direct I/O
 		 * write by making sure the first dirty record for the dbuf
 		 * has no ARC buffer associated with it.
 		 */
 		dbuf_dirty_record_t *dr_head =
 		    list_head(&db->db_dirty_records);
 		ASSERT3P(db->db_buf, ==, NULL);
 		ASSERT3P(db->db.db_data, ==, NULL);
 		ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL);
 		ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
 	} else {
 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
 	}
 	DBUF_VERIFY(db);
 
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 			/*
 			 * In the previous transaction group, the bonus buffer
 			 * was entirely used to store the attributes for the
 			 * dnode which overrode the dn_spill field.  However,
 			 * when adding more attributes to the file a spill
 			 * block was required to hold the extra attributes.
 			 *
 			 * Make sure to clear the garbage left in the dn_spill
 			 * field from the previous attributes in the bonus
 			 * buffer.  Otherwise, after writing out the spill
 			 * block to the new allocated dva, it will free
 			 * the old block pointed to by the invalid dn_spill.
 			 */
 			db->db_blkptr = NULL;
 		}
 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
 		mutex_exit(&dn->dn_mtx);
 	}
 
 	/*
 	 * If this is a bonus buffer, simply copy the bonus data into the
 	 * dnode.  It will be written out when the dnode is synced (and it
 	 * will be synced, since it must have been dirty for dbuf_sync to
 	 * be called).
 	 */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dr->dr_dbuf == db);
 		/* db_mtx is still held; dbuf_sync_bonus() takes it over. */
 		dbuf_sync_bonus(dr, tx);
 		return;
 	}
 
 	os = dn->dn_objset;
 
 	/*
 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
 	 * operation to sneak in. As a result, we need to ensure that we
 	 * don't check the dr_override_state until we have returned from
 	 * dbuf_check_blkptr.
 	 */
 	dbuf_check_blkptr(dn, db);
 
 	/*
 	 * If this buffer is in the middle of an immediate write, wait for the
 	 * synchronous IO to complete.
 	 *
 	 * This is also valid even with Direct I/O writes setting a dirty
 	 * records override state into DR_IN_DMU_SYNC, because all
 	 * Direct I/O writes happen in open-context.
 	 */
 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		cv_wait(&db->db_changed, &db->db_mtx);
 	}
 
 	/*
 	 * If this is a dnode block, ensure it is appropriately encrypted
 	 * or decrypted, depending on what we are writing to it this txg.
 	 */
 	if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
 		dbuf_prepare_encrypted_dnode_leaf(dr);
 
 	if (*datap != NULL && *datap == db->db_buf &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    zfs_refcount_count(&db->db_holds) > 1) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there
 		 * are active holds and db_data still references it),
 		 * then make a copy before we start the write so that
 		 * any modifications from the open txg will not leak
 		 * into this write.
 		 *
 		 * NOTE: this copy does not need to be made for
 		 * objects only modified in the syncing context (e.g.
 		 * meta-dnode (DMU_META_DNODE_OBJECT) blocks).
 		 */
 		int psize = arc_buf_size(*datap);
 		int lsize = arc_buf_lsize(*datap);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		enum zio_compress compress_type = arc_get_compression(*datap);
 		uint8_t complevel = arc_get_complevel(*datap);
 
 		/*
 		 * Allocate a new buffer with the same encryption and
 		 * compression properties as the one being replaced.
 		 */
 		if (arc_is_encrypted(*datap)) {
 			boolean_t byteorder;
 			uint8_t salt[ZIO_DATA_SALT_LEN];
 			uint8_t iv[ZIO_DATA_IV_LEN];
 			uint8_t mac[ZIO_DATA_MAC_LEN];
 
 			arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
 			*datap = arc_alloc_raw_buf(os->os_spa, db,
 			    dmu_objset_id(os), byteorder, salt, iv, mac,
 			    dn->dn_type, psize, lsize, compress_type,
 			    complevel);
 		} else if (compress_type != ZIO_COMPRESS_OFF) {
 			ASSERT3U(type, ==, ARC_BUFC_DATA);
 			*datap = arc_alloc_compressed_buf(os->os_spa, db,
 			    psize, lsize, compress_type, complevel);
 		} else {
 			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
 		}
 		/* The dirty record now owns a private copy of the data. */
 		memcpy((*datap)->b_data, db->db.db_data, psize);
 	}
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 
 	dbuf_write(dr, *datap, tx);
 
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 		/*
 		 * Meta-dnode blocks are not issued here; the record is put
 		 * back on the dnode's dirty list for this txg instead.
 		 */
 		list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 	} else {
 		zio_nowait(dr->dr_zio);
 	}
 }
 
 /*
  * Drain a list of dirty records, handing each one to the lightweight,
  * indirect, or leaf sync routine.  'level' is the level every regular
  * (non-bonus, non-spill) dbuf on the list is required to have.  May be
  * called recursively from dbuf_sync_indirect().
  */
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
 	for (dr = list_head(list); dr != NULL; dr = list_head(list)) {
 		if (dr->dr_zio != NULL) {
 			/*
 			 * If we find an already initialized zio then we
 			 * are processing the meta-dnode, and we have finished.
 			 * The dbufs for all dnodes are put back on the list
 			 * during processing, so that we can zio_wait()
 			 * these IOs after initiating all child IOs.
 			 */
 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
 
 		list_remove(list, dr);
 
 		/* A record without a dbuf is a lightweight one. */
 		if (dr->dr_dbuf == NULL) {
 			dbuf_sync_lightweight(dr, tx);
 			continue;
 		}
 
 		uint64_t blkid = dr->dr_dbuf->db_blkid;
 		if (!(blkid == DMU_BONUS_BLKID || blkid == DMU_SPILL_BLKID)) {
 			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
 		}
 
 		if (dr->dr_dbuf->db_level == 0)
 			dbuf_sync_leaf(dr, tx);
 		else
 			dbuf_sync_indirect(dr, tx);
 	}
 }
 
 /*
  * "Ready" handler for dbuf writes; 'vdb' is the dbuf being written and
  * 'buf' is unused.
  *
  * It charges the change in allocated space to the dnode, raises
  * dn_maxblkid for level-0 blocks that lie beyond it, computes the fill
  * count to store in the BP, and finally copies the zio's BP into
  * db->db_blkptr while holding the parent lock as writer.
  */
 static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	(void) buf;
 	dmu_buf_impl_t *db = vdb;
 	dnode_t *dn;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	spa_t *spa = zio->io_spa;
 	int64_t delta;
 	uint64_t fill = 0;
 	int i;
 
 	ASSERT3P(db->db_blkptr, !=, NULL);
 	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/*
 	 * Charge only the change since the previous invocation for this
 	 * zio; io_prev_space_delta remembers what was already charged.
 	 */
 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
 	if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_type) ||
 		    (db->db_blkid == DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
 		    BP_IS_EMBEDDED(bp));
 		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(bp)) &&
 		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
 	}
 #endif
 
 	if (db->db_level == 0) {
 		mutex_enter(&dn->dn_mtx);
 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
 		    db->db_blkid != DMU_SPILL_BLKID) {
 			ASSERT0(db->db_objset->os_raw_receive);
 			dn->dn_phys->dn_maxblkid = db->db_blkid;
 		}
 		mutex_exit(&dn->dn_mtx);
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			/*
 			 * Block of dnodes: walk it in DNODE_MIN_SIZE steps.
 			 * Every allocated dnode counts once toward fill, has
 			 * its block pointers verified, and has its extra
 			 * slots skipped.
 			 */
 			i = 0;
 			while (i < db->db.db_size) {
 				dnode_phys_t *dnp =
 				    (void *)(((char *)db->db.db_data) + i);
 
 				i += DNODE_MIN_SIZE;
 				if (dnp->dn_type != DMU_OT_NONE) {
 					fill++;
 					for (int j = 0; j < dnp->dn_nblkptr;
 					    j++) {
 						(void) zfs_blkptr_verify(spa,
 						    &dnp->dn_blkptr[j],
 						    BLK_CONFIG_SKIP,
 						    BLK_VERIFY_HALT);
 					}
 					if (dnp->dn_flags &
 					    DNODE_FLAG_SPILL_BLKPTR) {
 						(void) zfs_blkptr_verify(spa,
 						    DN_SPILL_BLKPTR(dnp),
 						    BLK_CONFIG_SKIP,
 						    BLK_VERIFY_HALT);
 					}
 					i += dnp->dn_extra_slots *
 					    DNODE_MIN_SIZE;
 				}
 			}
 		} else {
 			/* Ordinary data block: fill is 0 for a hole, else 1. */
 			if (BP_IS_HOLE(bp)) {
 				fill = 0;
 			} else {
 				fill = 1;
 			}
 		}
 	} else {
 		/*
 		 * Indirect block: fill is the sum of the fill counts of all
 		 * non-hole child block pointers, each of which is verified.
 		 */
 		blkptr_t *ibp = db->db.db_data;
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
 			if (BP_IS_HOLE(ibp))
 				continue;
 			(void) zfs_blkptr_verify(spa, ibp,
 			    BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
 			fill += BP_GET_FILL(ibp);
 		}
 	}
 	DB_DNODE_EXIT(db);
 
 	if (!BP_IS_EMBEDDED(bp))
 		BP_SET_FILL(bp, fill);
 
 	mutex_exit(&db->db_mtx);
 
 	/* Publish the new BP into its slot under the parent lock. */
 	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
 	*db->db_blkptr = *bp;
 	dmu_buf_unlock_parent(db, dblt, FTAG);
 }
 
 /*
  * This function gets called just prior to running through the compression
  * stage of the zio pipeline.  If this indirect block consists solely of
  * holes we want it to be compressed away to a hole, so any information
  * still recorded about those holes is zeroed out before compression is
  * attempted.  The 'zio' and 'buf' arguments are unused.
  */
 static void
 dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	(void) zio, (void) buf;
 	dmu_buf_impl_t *db = vdb;
 	unsigned int epbs;
 
 	ASSERT3U(db->db_level, >, 0);
 	DB_DNODE_ENTER(db);
 	epbs = DB_DNODE(db)->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	DB_DNODE_EXIT(db);
 	ASSERT3U(epbs, <, 31);
 
 	/* Bail out at the first child that is not a hole. */
 	const uint64_t nentries = 1ULL << epbs;
 	blkptr_t *entries = db->db.db_data;
 	for (uint64_t idx = 0; idx < nentries; idx++) {
 		if (!BP_IS_HOLE(&entries[idx]))
 			return;
 	}
 
 	/*
 	 * We only found holes.  Grab the rwlock to prevent anybody from
 	 * reading the blocks we're about to zero out, then clear the whole
 	 * block so that it may get compressed away.
 	 */
 	rw_enter(&db->db_rwlock, RW_WRITER);
 	memset(db->db.db_data, 0, db->db.db_size);
 	rw_exit(&db->db_rwlock);
 }
 
 /*
  * "Done" handler for dbuf writes; 'vdb' is the dbuf that was written and
  * 'buf' is unused.
  *
  * It performs block birth/kill accounting (skipped for rewrites and
  * nopwrites), removes the pending dirty record from the dbuf, frees any
  * private copy of the data or the indirect record's child list, drops the
  * dirty count and the txg-tagged hold, returns the dirty space to the
  * pool, and frees the dirty record.
  */
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	(void) buf;
 	dmu_buf_impl_t *db = vdb;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	blkptr_t *bp = db->db_blkptr;
 	objset_t *os = db->db_objset;
 	dmu_tx_t *tx = os->os_synctx;
 
 	ASSERT0(zio->io_error);
 	ASSERT(db->db_blkptr == bp);
 
 	/*
 	 * For nopwrites and rewrites we ensure that the bp matches our
 	 * original and bypass all the accounting.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 	DBUF_VERIFY(db);
 
 	/* The record being completed is the one marked pending at sync. */
 	dbuf_dirty_record_t *dr = db->db_data_pending;
 	dnode_t *dn = dr->dr_dnode;
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	ASSERT(dr->dr_dbuf == db);
 	ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
 	list_remove(&db->db_dirty_records, dr);
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
 	}
 #endif
 
 	if (db->db_level == 0) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 
 		/* no dr_data if this is a NO_FILL or Direct I/O */
 		if (dr->dt.dl.dr_data != NULL &&
 		    dr->dt.dl.dr_data != db->db_buf) {
 			/*
 			 * The record holds a buffer distinct from db_buf;
 			 * it is no longer needed once the write is done.
 			 */
 			ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);
 			ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);
 			arc_buf_destroy(dr->dt.dl.dr_data, db);
 		}
 	} else {
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
 			int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
 			    SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_blkid, <=,
 			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
 			    db->db.db_size);
 		}
 		/* Tear down the indirect record's child list and its lock. */
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
 
 	cv_broadcast(&db->db_changed);
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
 	/* Drops the txg-tagged hold; db_mtx is handed over locked. */
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 
 	/* Only 'dr' is used from here on; 'db' is not touched again. */
 	dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
 	    zio->io_txg);
 
 	kmem_cache_free(dbuf_dirty_kmem_cache, dr);
 }
 
 /*
  * zio ready callback for nofill writes: io_private is the dbuf itself,
  * which is passed on to dbuf_write_ready() with no ARC buffer.
  */
 static void
 dbuf_write_nofill_ready(zio_t *zio)
 {
 	dmu_buf_impl_t *db = zio->io_private;
 
 	dbuf_write_ready(zio, NULL, db);
 }
 
 /*
  * zio done callback for nofill writes: io_private is the dbuf itself,
  * which is passed on to dbuf_write_done() with no ARC buffer.
  */
 static void
 dbuf_write_nofill_done(zio_t *zio)
 {
 	dmu_buf_impl_t *db = zio->io_private;
 
 	dbuf_write_done(zio, NULL, db);
 }
 
 /*
  * zio ready callback for overridden writes: io_private is the dirty
  * record, whose dbuf is passed on to dbuf_write_ready() with no ARC
  * buffer.
  */
 static void
 dbuf_write_override_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 
 	dbuf_write_ready(zio, NULL, dr->dr_dbuf);
 }
 
 /*
  * zio done callback for overridden writes; io_private is the dirty
  * record.
  *
  * If the BP the zio ended up with differs from the override BP recorded
  * on the dirty record, the override BP is freed (unless it is a hole)
  * and the record's buffer is released.  Common completion then runs via
  * dbuf_write_done(), and the zio's abd, if any, is freed.
  */
 static void
 dbuf_write_override_done(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
 
 	mutex_enter(&db->db_mtx);
 	if (!BP_EQUAL(zio->io_bp, obp)) {
 		if (!BP_IS_HOLE(obp))
 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
 		arc_release(dr->dt.dl.dr_data, db);
 	}
 	mutex_exit(&db->db_mtx);
 
 	/* dbuf_write_done() frees the dirty record; 'dr' is dead after it. */
 	dbuf_write_done(zio, NULL, db);
 
 	if (zio->io_abd != NULL)
 		abd_free(zio->io_abd);
 }
 
 /*
  * Argument bundle passed through spa_remap_blkptr() to
  * dbuf_remap_impl_callback().
  */
 typedef struct dbuf_remap_impl_callback_arg {
 	objset_t	*drica_os;		/* objset owning the BP */
 	uint64_t	drica_blk_birth;	/* logical birth txg of the BP */
 	dmu_tx_t	*drica_tx;		/* syncing tx */
 } dbuf_remap_impl_callback_arg_t;
 
 /*
  * Callback handed to spa_remap_blkptr() for a (vdev, offset, size)
  * segment.  For the pool's meta objset the segment is marked obsolete on
  * the indirect vdev; for any other objset the remap is reported to the
  * owning dataset together with the block's birth txg.  'arg' is a
  * dbuf_remap_impl_callback_arg_t.  Must run in syncing context.
  */
 static void
 dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
     void *arg)
 {
 	dbuf_remap_impl_callback_arg_t *drica = arg;
 	spa_t *spa = dmu_objset_spa(drica->drica_os);
 
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	if (drica->drica_os != spa_meta_objset(spa)) {
 		dsl_dataset_block_remapped(dmu_objset_ds(drica->drica_os),
 		    vdev, offset, size, drica->drica_blk_birth,
 		    drica->drica_tx);
 		return;
 	}
 
 	spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size,
 	    drica->drica_tx);
 }
 
 /*
  * Try to remap a single block pointer.  spa_remap_blkptr() is run on a
  * local copy of *bp; if it reports that the copy was changed, the
  * livelist pending lists are updated where applicable and the copy is
  * stored back into *bp, under 'rw' as writer when 'rw' is non-NULL.
  * Must run in syncing context.
  */
 static void
 dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 {
 	blkptr_t bp_copy = *bp;
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
 	dbuf_remap_impl_callback_arg_t drica;
 
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	drica.drica_os = dn->dn_objset;
 	drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
 	drica.drica_tx = tx;
 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
 	    &drica)) {
 		/*
 		 * If the blkptr being remapped is tracked by a livelist,
 		 * then we need to make sure the livelist reflects the update.
 		 * First, cancel out the old blkptr by appending a 'FREE'
 		 * entry. Next, add an 'ALLOC' to track the new version. This
 		 * way we avoid trying to free an inaccurate blkptr at delete.
 		 * Note that embedded blkptrs are not tracked in livelists.
 		 */
 		if (dn->dn_objset != spa_meta_objset(spa)) {
 			dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
 			if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
 			    BP_GET_LOGICAL_BIRTH(bp) >
 			    ds->ds_dir->dd_origin_txg) {
 				ASSERT(!BP_IS_EMBEDDED(bp));
 				ASSERT(dsl_dir_is_clone(ds->ds_dir));
 				ASSERT(spa_feature_is_enabled(spa,
 				    SPA_FEATURE_LIVELIST));
 				/* Old BP goes to frees, remapped BP to allocs. */
 				bplist_append(&ds->ds_dir->dd_pending_frees,
 				    bp);
 				bplist_append(&ds->ds_dir->dd_pending_allocs,
 				    &bp_copy);
 			}
 		}
 
 		/*
 		 * The db_rwlock prevents dbuf_read_impl() from
 		 * dereferencing the BP while we are changing it.  To
 		 * avoid lock contention, only grab it when we are actually
 		 * changing the BP.
 		 */
 		if (rw != NULL)
 			rw_enter(rw, RW_WRITER);
 		*bp = bp_copy;
 		if (rw != NULL)
 			rw_exit(rw);
 	}
 }
 
 /*
  * Remap any existing BPs to concrete vdevs, if possible.
  *
  * Nothing is done unless the device-removal feature is active.  For an
  * indirect dbuf every block pointer in the block is offered for remap;
  * for a level-0 dbuf of the meta-dnode object the block pointers of each
  * dnode stored in the block are offered.  Other dbufs are left alone.
  * Must run in syncing context.
  */
 static void
 dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(db->db_objset);
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
 		return;
 
 	if (db->db_level > 0) {
 		/* Indirect block: the data is an array of block pointers. */
 		blkptr_t *bp = db->db.db_data;
 		int idx = 0;
 
 		while (idx < db->db.db_size >> SPA_BLKPTRSHIFT) {
 			dbuf_remap_impl(dn, bp + idx, &db->db_rwlock, tx);
 			idx++;
 		}
 		return;
 	}
 
 	if (db->db.db_object != DMU_META_DNODE_OBJECT)
 		return;
 
 	/* Block of dnodes: step from one dnode to the next. */
 	dnode_phys_t *dnp = db->db.db_data;
 	ASSERT3U(dn->dn_type, ==, DMU_OT_DNODE);
 	int slot = 0;
 
 	while (slot < db->db.db_size >> DNODE_SHIFT) {
 		dnode_phys_t *cur = &dnp[slot];
 
 		for (int j = 0; j < cur->dn_nblkptr; j++) {
 			krwlock_t *lock = NULL;
 
 			if (dn->dn_dbuf != NULL)
 				lock = &dn->dn_dbuf->db_rwlock;
 			dbuf_remap_impl(dn, &cur->dn_blkptr[j], lock, tx);
 		}
 		/* Skip over the extra slots a large dnode occupies. */
 		slot += cur->dn_extra_slots + 1;
 	}
 }
 
 
 /*
  * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
  * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
  *
  * Must be called with a syncing tx (asserted).  The zio is created as a
  * child of the parent indirect block's pending write, or of dn->dn_zio when
  * the parent is the dnode itself.  Three kinds of write are built:
  *   - an overridden level-0 block, whose BP was supplied by open context;
  *   - a write with no data buffer (data == NULL), issued with
  *     ZIO_FLAG_NODATA;
  *   - the ordinary case, handed to arc_write().
  */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn = dr->dr_dnode;
 	objset_t *os;
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *pio; /* parent I/O */
 	int wp_flag = 0;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	os = dn->dn_objset;
 
 	if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
 		/*
 		 * Private object buffers are released here rather than in
 		 * dbuf_dirty() since they are only modified in the syncing
 		 * context and we don't want the overhead of making multiple
 		 * copies of the data.
 		 */
 		if (BP_IS_HOLE(db->db_blkptr))
 			arc_buf_thaw(data);
 		else
 			dbuf_release_bp(db);
 		dbuf_remap(dn, db, tx);
 	}
 
 	if (parent != dn->dn_dbuf) {
 		/* Our parent is an indirect block. */
 		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
 		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
 		/*
 		 * We're about to modify our parent's db_data by modifying
 		 * our block pointer, so the parent must be released.
 		 */
 		ASSERT(arc_released(parent->db_buf));
 		pio = parent->db_data_pending->dr_zio;
 	} else {
 		/* Our parent is the dnode itself. */
 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
 		    db->db_blkid != DMU_SPILL_BLKID) ||
 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
 		if (db->db_blkid != DMU_SPILL_BLKID)
 			ASSERT3P(db->db_blkptr, ==,
 			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		pio = dn->dn_zio;
 	}
 
 	ASSERT(db->db_level == 0 || data == db->db_buf);
 	ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
 	ASSERT(pio);
 
 	/* Objsets without a dataset are bookmarked under DMU_META_OBJSET. */
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	/* Tell the write policy about spill blocks and data-less writes. */
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		wp_flag = WP_SPILL;
 	wp_flag |= (data == NULL) ? WP_NOFILL : 0;
 
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 
 	/*
 	 * We copy the blkptr now (rather than when we instantiate the dirty
 	 * record), because its value can change between open context and
 	 * syncing context. We do not need to hold dn_struct_rwlock to read
 	 * db_blkptr because we are in syncing context.
 	 */
 	dr->dr_bp_copy = *db->db_blkptr;
 
 	if (db->db_level == 0 &&
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * The BP for this block has been provided by open context
 		 * (by dmu_sync(), dmu_write_direct(),
 		 *  or dmu_buf_write_embedded()).
 		 */
 		abd_t *contents = (data != NULL) ?
 		    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
 
 		dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
 		    contents, db->db.db_size, db->db.db_size, &zp,
 		    dbuf_write_override_ready, NULL,
 		    dbuf_write_override_done,
 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		/*
 		 * Under db_mtx, clear the override state and hand the
 		 * recorded BP and its properties to the new zio.
 		 */
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
-		    dr->dt.dl.dr_brtwrite);
+		    dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
+		    dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (data == NULL) {
 		/*
 		 * There is no data buffer to write; the write policy is
 		 * expected to have turned checksumming off for this block.
 		 */
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(pio, os->os_spa, txg,
 		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, NULL,
 		    dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
 
 		/*
 		 * For indirect blocks, we want to setup the children
 		 * ready callback so that we can properly handle an indirect
 		 * block that only contains holes.
 		 */
 		arc_write_done_func_t *children_ready_cb = NULL;
 		if (db->db_level != 0)
 			children_ready_cb = dbuf_write_children_ready;
 
 		dr->dr_zio = arc_write(pio, os->os_spa, txg,
 		    &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
 		    dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready,
 		    children_ready_cb, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
 
 /*
  * dbuf and dmu_buf entry points published through EXPORT_SYMBOL.
  */
 EXPORT_SYMBOL(dbuf_find);
 EXPORT_SYMBOL(dbuf_is_metadata);
 EXPORT_SYMBOL(dbuf_destroy);
 EXPORT_SYMBOL(dbuf_loan_arcbuf);
 EXPORT_SYMBOL(dbuf_whichblock);
 EXPORT_SYMBOL(dbuf_read);
 EXPORT_SYMBOL(dbuf_unoverride);
 EXPORT_SYMBOL(dbuf_free_range);
 EXPORT_SYMBOL(dbuf_new_size);
 EXPORT_SYMBOL(dbuf_release_bp);
 EXPORT_SYMBOL(dbuf_dirty);
 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
 EXPORT_SYMBOL(dmu_buf_will_dirty);
 EXPORT_SYMBOL(dmu_buf_is_dirty);
 EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
 EXPORT_SYMBOL(dmu_buf_will_not_fill);
 EXPORT_SYMBOL(dmu_buf_will_fill);
 EXPORT_SYMBOL(dmu_buf_fill_done);
 EXPORT_SYMBOL(dmu_buf_rele);
 EXPORT_SYMBOL(dbuf_assign_arcbuf);
 EXPORT_SYMBOL(dbuf_prefetch);
 EXPORT_SYMBOL(dbuf_hold_impl);
 EXPORT_SYMBOL(dbuf_hold);
 EXPORT_SYMBOL(dbuf_hold_level);
 EXPORT_SYMBOL(dbuf_create_bonus);
 EXPORT_SYMBOL(dbuf_spill_set_blksz);
 EXPORT_SYMBOL(dbuf_rm_spill);
 EXPORT_SYMBOL(dbuf_add_ref);
 EXPORT_SYMBOL(dbuf_rele);
 EXPORT_SYMBOL(dbuf_rele_and_unlock);
 EXPORT_SYMBOL(dbuf_refcount);
 EXPORT_SYMBOL(dbuf_sync_list);
 EXPORT_SYMBOL(dmu_buf_set_user);
 EXPORT_SYMBOL(dmu_buf_set_user_ie);
 EXPORT_SYMBOL(dmu_buf_get_user);
 EXPORT_SYMBOL(dmu_buf_get_blkptr);
 
 /*
  * Tunables registered through ZFS_MODULE_PARAM.  Each registration gives a
  * scope, a variable-name prefix, the parameter name, its type, an access
  * flag (ZMOD_RW or ZMOD_RD), and a description string.
  */
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
 	"Maximum size in bytes of the dbuf cache.");
 
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
 	"Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");
 
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
 	"Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
 	"Maximum size in bytes of dbuf metadata cache.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,
 	"Set size of dbuf cache to log2 fraction of arc size.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
 	"Set size of dbuf metadata cache to log2 fraction of arc size.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
 	"Set size of dbuf cache mutex array as log2 shift.");
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index bddb90f295af..2b52ae139bac 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1,2950 +1,2969 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2019, 2023, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
  * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_prop.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/trace_zfs.h>
 #include <sys/zfs_racct.h>
 #include <sys/zfs_rlock.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
 #endif
 
 /*
  * Enable/disable the nopwrite feature (enabled by default).
  */
 static int zfs_nopwrite_enabled = 1;
 
 /*
  * Tunable to control percentage of dirtied L1 blocks from frees allowed into
  * one TXG. After this threshold is crossed, additional dirty blocks from frees
  * will wait until the next TXG.
  * A value of zero will disable this throttle.  The default is 30 percent.
  */
 static uint_t zfs_per_txg_dirty_frees_percent = 30;
 
 /*
  * Enable/disable forcing a txg sync when dirty checking for holes with
  * lseek().  This is enabled by default to ensure accurate hole reporting,
  * but it can result in a significant performance penalty for
  * lseek(SEEK_HOLE) heavy workloads.  Disabling this option will result in
  * holes never being reported in dirty files, which is always safe.
  */
 static int zfs_dmu_offset_next_sync = 1;
 
 /*
  * Limit the amount we can prefetch with one call to this amount.  This
  * helps to limit the amount of memory that can be used by prefetching.
  * Larger objects should be prefetched a bit at a time.
  * The default is 8 MiB on _ILP32 builds and 8 * SPA_MAXBLOCKSIZE otherwise.
  */
 #ifdef _ILP32
 uint_t dmu_prefetch_max = 8 * 1024 * 1024;
 #else
 uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
 #endif
 
 /*
  * Override copies= for dedup state objects.  0 means the traditional
  * behaviour, i.e. the default for the containing objset (which is 3 for
  * the MOS).
  */
 uint_t dmu_ddt_copies = 0;
 
 /*
  * Table of per-object-type information with DMU_OT_NUMTYPES entries.  Each
  * row supplies a DMU_BSWAP_* byteswap class, three boolean flags, and a
  * human-readable type name.
  * NOTE(review): rows are presumably indexed by DMU object type number, and
  * the meaning of the three flags comes from the dmu_object_type_info_t
  * declaration, which is not visible here -- confirm before reordering or
  * editing rows.
  */
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "object array"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "packed nvlist"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "packed nvlist size"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj"			},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj header"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map header"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, TRUE,  "ZIL intent log"	},
 	{DMU_BSWAP_DNODE,  TRUE,  FALSE, TRUE,  "DMU dnode"		},
 	{DMU_BSWAP_OBJSET, TRUE,  TRUE,  FALSE, "DMU objset"		},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL directory"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL directory child map"},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset snap map"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL props"		},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL dataset"		},
 	{DMU_BSWAP_ZNODE,  TRUE,  FALSE, FALSE, "ZFS znode"		},
 	{DMU_BSWAP_OLDACL, TRUE,  FALSE, TRUE,  "ZFS V0 ACL"		},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "ZFS plain file"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS directory"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "ZFS master node"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS delete queue"	},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "zvol object"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "zvol prop"		},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "other uint8[]"		},
 	{DMU_BSWAP_UINT64, FALSE, FALSE, TRUE,  "other uint64[]"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "other ZAP"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "persistent error log"	},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "SPA history"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA history offsets"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "Pool properties"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL permissions"	},
 	{DMU_BSWAP_ACL,    TRUE,  FALSE, TRUE,  "ZFS ACL"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "ZFS SYSACL"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "FUID table"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "FUID table size"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset next clones"},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan work queue"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project used" },
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project quota"},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "snapshot refcount tags"},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT ZAP algorithm"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT statistics"	},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,	"System attributes"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA master node"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr registration"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr layouts"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan translations"	},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "deduplicated block"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL deadlist map"	},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL deadlist map hdr"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dir clones"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj subobj"		}
 };
 
 /*
  * Byteswap routine and short name for each of the DMU_BSWAP_NUMFUNCS
  * byteswap classes.
  * NOTE(review): the row order presumably has to match the DMU_BSWAP_*
  * constants used in dmu_ot[] -- confirm against their declaration.
  */
 dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 	{	byteswap_uint8_array,	"uint8"		},
 	{	byteswap_uint16_array,	"uint16"	},
 	{	byteswap_uint32_array,	"uint32"	},
 	{	byteswap_uint64_array,	"uint64"	},
 	{	zap_byteswap,		"zap"		},
 	{	dnode_buf_byteswap,	"dnode"		},
 	{	dmu_objset_byteswap,	"objset"	},
 	{	zfs_znode_byteswap,	"znode"		},
 	{	zfs_oldacl_byteswap,	"oldacl"	},
 	{	zfs_acl_byteswap,	"acl"		}
 };
 
 /*
  * Hold the level-0 dbuf of dnode dn that covers byte offset, without
  * reading its contents.  On success *dbp refers to the held buffer and 0 is
  * returned.  If dbuf_hold() yields no dbuf, *dbp is set to NULL and EIO is
  * returned.
  */
 int
 dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
     const void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *held;
 
 	/* Look up the block id and take the hold under dn_struct_rwlock. */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	held = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), tag);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (held != NULL) {
 		*dbp = &held->db;
 		return (0);
 	}
 
 	*dbp = NULL;
 	return (SET_ERROR(EIO));
 }
 
 /*
  * Hold, without reading, the level-0 dbuf covering byte offset of the
  * object identified by <os, object>.  If the dnode cannot be held, that
  * error is returned and *dbp is not written.  If no dbuf can be held, *dbp
  * is set to NULL and EIO is returned.  The temporary dnode hold is dropped
  * before returning.
  */
 int
 dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *held;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err)
 		return (err);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	held = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), tag);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	if (held != NULL) {
 		*dbp = &held->db;
 		return (err);
 	}
 
 	*dbp = NULL;
 	return (SET_ERROR(EIO));
 }
 
 /*
  * Hold the level-0 dbuf covering byte offset of dnode dn and read it in.
  * DMU_READ_NO_PREFETCH and DMU_READ_NO_DECRYPT in flags are translated to
  * the matching DB_RF_* read flags.  If the read fails, the hold is released
  * and *dbp is set to NULL.  Returns 0 or an error.
  */
 int
 dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
     const void *tag, dmu_buf_t **dbp, int flags)
 {
 	int read_flags = DB_RF_CANFAIL;
 	int err;
 
 	if ((flags & DMU_READ_NO_PREFETCH) != 0)
 		read_flags |= DB_RF_NOPREFETCH;
 	if ((flags & DMU_READ_NO_DECRYPT) != 0)
 		read_flags |= DB_RF_NO_DECRYPT;
 
 	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
 	if (err != 0)
 		return (err);
 
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 	err = dbuf_read(db, NULL, read_flags);
 	if (err != 0) {
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 	}
 
 	return (err);
 }
 
 /*
  * Hold the level-0 dbuf covering byte offset of the object <os, object> and
  * read it in.  DMU_READ_NO_PREFETCH and DMU_READ_NO_DECRYPT in flags are
  * translated to the matching DB_RF_* read flags.  If the read fails, the
  * hold is released and *dbp is set to NULL.  Returns 0 or an error.
  */
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp, int flags)
 {
 	int read_flags = DB_RF_CANFAIL;
 	int err;
 
 	if ((flags & DMU_READ_NO_PREFETCH) != 0)
 		read_flags |= DB_RF_NOPREFETCH;
 	if ((flags & DMU_READ_NO_DECRYPT) != 0)
 		read_flags |= DB_RF_NO_DECRYPT;
 
 	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
 	if (err != 0)
 		return (err);
 
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 	err = dbuf_read(db, NULL, read_flags);
 	if (err != 0) {
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 	}
 
 	return (err);
 }
 
 /*
  * Report the maximum bonus buffer length, DN_OLD_MAX_BONUSLEN.
  */
 int
 dmu_bonus_max(void)
 {
 	int maxlen = DN_OLD_MAX_BONUSLEN;
 
 	return (maxlen);
 }
 
 /*
  * Set the bonus length of the dnode that owns bonus buffer db_fake.
  * Returns EINVAL if newsize is negative or larger than the buffer, or if
  * db_fake is not the dnode's bonus buffer; returns 0 otherwise.
  */
 int
 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	int error = 0;
 
 	if (newsize < 0 || newsize > db_fake->db_size)
 		return (SET_ERROR(EINVAL));
 
 	DB_DNODE_ENTER(db);
 	dnode_t *dn = DB_DNODE(db);
 
 	if (dn->dn_bonus == db)
 		dnode_setbonuslen(dn, newsize, tx);
 	else
 		error = SET_ERROR(EINVAL);
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 /*
  * Set the bonus type of the dnode that owns bonus buffer db_fake.
  * Returns EINVAL if type is not a valid object type or if db_fake is not
  * the dnode's bonus buffer; returns 0 otherwise.
  */
 int
 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	int error = 0;
 
 	if (!DMU_OT_IS_VALID(type))
 		return (SET_ERROR(EINVAL));
 
 	DB_DNODE_ENTER(db);
 	dnode_t *dn = DB_DNODE(db);
 
 	if (dn->dn_bonus == db)
 		dnode_setbonus_type(dn, type, tx);
 	else
 		error = SET_ERROR(EINVAL);
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 /*
  * Return the dn_bonustype of the dnode that owns db_fake.  The dnode is
  * accessed between DB_DNODE_ENTER() and DB_DNODE_EXIT().
  */
 dmu_object_type_t
 dmu_get_bonustype(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	DB_DNODE_ENTER(db);
 	dmu_object_type_t bonustype = DB_DNODE(db)->dn_bonustype;
 	DB_DNODE_EXIT(db);
 
 	return (bonustype);
 }
 
 /*
  * Remove the spill block of the object <os, object> in transaction tx.
  * Returns the dnode_hold() error if the object's dnode cannot be held, and
  * 0 otherwise.
  */
 int
 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	/*
 	 * dn is only valid when the hold succeeds, so bail out before
 	 * touching it if dnode_hold() reports an error.
 	 */
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	dbuf_rm_spill(dn, tx);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_rm_spill(dn, tx);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 /*
  * Look up and hold the bonus buffer of the provided dnode.  If the dnode
  * does not have a bonus dbuf yet, a new one is created.  The buffer is then
  * read in; DMU_READ_NO_PREFETCH and DMU_READ_NO_DECRYPT in flags are
  * translated to the matching DB_RF_* read flags.
  *
  * Returns 0 with *dbp set on success.  If the read fails, the bonus buffer
  * is evicted and released, *dbp is set to NULL, and the error is returned.
  */
 int
 dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
     uint32_t flags)
 {
 	uint32_t read_flags = DB_RF_MUST_SUCCEED;
 	dmu_buf_impl_t *bonus;
 	int err;
 
 	if ((flags & DMU_READ_NO_PREFETCH) != 0)
 		read_flags |= DB_RF_NOPREFETCH;
 	if ((flags & DMU_READ_NO_DECRYPT) != 0)
 		read_flags |= DB_RF_NO_DECRYPT;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_bonus == NULL) {
 		/*
 		 * Creating the bonus dbuf is done as writer.  If the
 		 * upgrade fails the lock is dropped and retaken, so
 		 * dn_bonus has to be checked again afterwards.
 		 */
 		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
 			rw_exit(&dn->dn_struct_rwlock);
 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		}
 		if (dn->dn_bonus == NULL)
 			dbuf_create_bonus(dn);
 	}
 	bonus = dn->dn_bonus;
 
 	/* The first hold on the bonus buf also takes a hold on the dnode. */
 	if (zfs_refcount_add(&bonus->db_holds, tag) == 1) {
 		VERIFY(dnode_add_ref(dn, bonus));
 		atomic_inc_32(&dn->dn_dbufs_count);
 	}
 
 	/*
 	 * dn_struct_rwlock is only dropped once the bonus dbuf's hold has
 	 * been added and the dbuf count incremented, so that dnode_move()
 	 * sees a dnode hold for every dbuf.
 	 */
 	rw_exit(&dn->dn_struct_rwlock);
 
 	err = dbuf_read(bonus, NULL, read_flags);
 	if (err == 0) {
 		*dbp = &bonus->db;
 		return (0);
 	}
 
 	dnode_evict_bonus(dn);
 	dbuf_rele(bonus, tag);
 	*dbp = NULL;
 	return (err);
 }
 
 /*
  * Hold the bonus buffer of the object <os, object>.  The dnode is held only
  * for the duration of the call, and the bonus buffer is requested with
  * DMU_READ_NO_PREFETCH.  Returns 0 or an error.
  */
 int
 dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err == 0) {
 		err = dmu_bonus_hold_by_dnode(dn, tag, dbp,
 		    DMU_READ_NO_PREFETCH);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (err);
 }
 
 /*
  * Hold and read the spill block of dnode dn.  Returns ENOENT, EIO, or 0.
  *
  * This interface will allocate a blank spill dbuf when the dnode does not
  * already have a spill block.  Callers that only want an existing spill
  * dbuf should use dmu_spill_hold_existing() instead.
  *
  * dn_struct_rwlock is taken as reader around the hold unless the caller
  * passes DB_RF_HAVESTRUCT.  flags is passed on to dbuf_read() unchanged.
  */
 int
 dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	boolean_t take_lock = ((flags & DB_RF_HAVESTRUCT) == 0);
 	dmu_buf_impl_t *spill = NULL;
 	int err;
 
 	if (take_lock)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	spill = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
 
 	if (take_lock)
 		rw_exit(&dn->dn_struct_rwlock);
 
 	if (spill == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	err = dbuf_read(spill, NULL, flags);
 	if (err != 0) {
 		dbuf_rele(spill, tag);
 		*dbp = NULL;
 		return (err);
 	}
 
 	*dbp = &spill->db;
 	return (err);
 }
 
 /*
  * Hold the spill block of the dnode that owns bonus buffer bonus, but only
  * if one already exists.  Returns EINVAL when the pool version is older
  * than SPA_VERSION_SA, ENOENT when the dnode has no spill block, and
  * otherwise the result of dmu_spill_hold_by_dnode().
  */
 int
 dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dnode_t *dn = DB_DNODE(db);
 
 	if (spa_version(dn->dn_objset->os_spa) >= SPA_VERSION_SA) {
 		/* dn_have_spill is checked under dn_struct_rwlock. */
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		if (dn->dn_have_spill) {
 			err = dmu_spill_hold_by_dnode(dn,
 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
 		} else {
 			err = SET_ERROR(ENOENT);
 		}
 		rw_exit(&dn->dn_struct_rwlock);
 	} else {
 		err = SET_ERROR(EINVAL);
 	}
 
 	DB_DNODE_EXIT(db);
 	return (err);
 }
 
 /*
  * Hold the spill block of the dnode that owns bonus buffer bonus, via
  * dmu_spill_hold_by_dnode().  The read is allowed to fail
  * (DB_RF_CANFAIL); DMU_READ_NO_DECRYPT in flags adds DB_RF_NO_DECRYPT.
  */
 int
 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	uint32_t read_flags = DB_RF_CANFAIL;
 	int err;
 
 	if ((flags & DMU_READ_NO_DECRYPT) != 0)
 		read_flags |= DB_RF_NO_DECRYPT;
 
 	DB_DNODE_ENTER(db);
 	err = dmu_spill_hold_by_dnode(DB_DNODE(db), read_flags, tag, dbp);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Hold every level-0 dbuf of dnode dn that overlaps the byte range
  * [offset, offset + length).  When read is set, the buffers are also read
  * in and the call waits for that I/O.  On success, *dbpp is a newly
  * allocated array of *numbufsp held buffers, and 0 is returned; the array
  * is released with dmu_buf_rele_array().  On failure all holds taken so
  * far are released, the array is freed, and an error is returned.
  *
  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
  * and can induce severe lock contention when writing to several files
  * whose dnodes are in the same block.
  */
 int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp,
     uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	zstream_t *zs = NULL;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio = NULL;
 	boolean_t missed = B_FALSE;
 
 	ASSERT(!read || length <= DMU_MAX_ACCESS);
 
 	/*
 	 * Note: We directly notify the prefetch code of this read, so that
 	 * we can tell it about the multi-block read.  dbuf_read() only knows
 	 * about the one block it is accessing.
 	 */
 	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
 	    DB_RF_NOPREFETCH;
 
 	if ((flags & DMU_READ_NO_DECRYPT) != 0)
 		dbuf_flags |= DB_RF_NO_DECRYPT;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		/*
 		 * Count the blocks between the start of the range rounded
 		 * down and the end of the range rounded up to the block
 		 * size.
 		 */
 		int blkshift = dn->dn_datablkshift;
 		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
 		    P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t))
 		    >> blkshift;
 	} else {
 		/*
 		 * dn_datablkshift is zero: the range has to fit within a
 		 * single block of dn_datablksz bytes.
 		 */
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
 			    "%llx/%llx (size=%u access=%llu+%llu)",
 			    (longlong_t)dn->dn_objset->
 			    os_dsl_dataset->ds_object,
 			    (longlong_t)dn->dn_object, dn->dn_datablksz,
 			    (longlong_t)offset, (longlong_t)length);
 			rw_exit(&dn->dn_struct_rwlock);
 			return (SET_ERROR(EIO));
 		}
 		nblks = 1;
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
 	/* All demand reads are issued as children of one root zio. */
 	if (read)
 		zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	if ((flags & DMU_READ_NO_PREFETCH) == 0) {
 		/*
 		 * Prepare the zfetch before initiating the demand reads, so
 		 * that if multiple threads block on same indirect block, we
 		 * base predictions on the original less racy request order.
 		 */
 		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read,
 		    B_TRUE);
 	}
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
 			/*
 			 * Unwind: finish the prefetch stream, drop the lock,
 			 * release the holds taken so far, and let the root
 			 * zio complete on its own.
 			 */
 			if (zs) {
 				dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
 				    B_TRUE);
 			}
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			if (read)
 				zio_nowait(zio);
 			return (SET_ERROR(EIO));
 		}
 
 		/*
 		 * Initiate async demand data read.
 		 * We check the db_state after calling dbuf_read() because
 		 * (1) dbuf_read() may change the state to CACHED due to a
 		 * hit in the ARC, and (2) on a cache miss, a child will
 		 * have been added to "zio" but not yet completed, so the
 		 * state will not yet be CACHED.
 		 */
 		if (read) {
 			/*
 			 * Flag the last buffer when the request stops short
 			 * of its end and it is not the object's last block.
 			 */
 			if (i == nblks - 1 && blkid + i < dn->dn_maxblkid &&
 			    offset + length < db->db.db_offset +
 			    db->db.db_size) {
 				if (offset <= db->db.db_offset)
 					dbuf_flags |= DB_RF_PARTIAL_FIRST;
 				else
 					dbuf_flags |= DB_RF_PARTIAL_MORE;
 			}
 			(void) dbuf_read(db, zio, dbuf_flags);
 			if (db->db_state != DB_CACHED)
 				missed = B_TRUE;
 		}
 		dbp[i] = &db->db;
 	}
 
 	/*
 	 * If we are doing O_DIRECT we still hold the dbufs, even for reads,
 	 * but we do not issue any reads here. We do not want to account for
 	 * writes in this case.
 	 *
 	 * O_DIRECT write/read accounting takes place in
 	 * dmu_{write/read}_abd().
 	 */
 	if (!read && ((flags & DMU_DIRECTIO) == 0))
 		zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
 
 	if (zs)
 		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (read) {
 		/* wait for async read i/o */
 		err = zio_wait(zio);
 		if (err) {
 			dmu_buf_rele_array(dbp, nblks, tag);
 			return (err);
 		}
 
 		/* wait for other io to complete */
 		for (i = 0; i < nblks; i++) {
 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL)
 				cv_wait(&db->db_changed, &db->db_mtx);
 			/* A buffer that ended up uncached fails the call. */
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 			mutex_exit(&db->db_mtx);
 			if (err) {
 				dmu_buf_rele_array(dbp, nblks, tag);
 				return (err);
 			}
 		}
 	}
 
 	*numbufsp = nblks;
 	*dbpp = dbp;
 	return (0);
 }
 
 /*
  * Hold the dbufs covering [offset, offset + length) of the object
  * <os, object>, via dmu_buf_hold_array_by_dnode() with DMU_READ_PREFETCH.
  * The dnode is held only for the duration of the call.
  */
 int
 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err == 0) {
 		err = dmu_buf_hold_array_by_dnode(dn, offset, length, read,
 		    tag, numbufsp, dbpp, DMU_READ_PREFETCH);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (err);
 }
 
 /*
  * Hold the dbufs covering [offset, offset + length) of the dnode that owns
  * db_fake, via dmu_buf_hold_array_by_dnode() with DMU_READ_PREFETCH.  The
  * dnode is accessed between DB_DNODE_ENTER() and DB_DNODE_EXIT().
  */
 int
 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     uint64_t length, boolean_t read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dmu_buf_impl_t *owner = (dmu_buf_impl_t *)db_fake;
 	int rc;
 
 	DB_DNODE_ENTER(owner);
 	rc = dmu_buf_hold_array_by_dnode(DB_DNODE(owner), offset, length,
 	    read, tag, numbufsp, dbpp, DMU_READ_PREFETCH);
 	DB_DNODE_EXIT(owner);
 
 	return (rc);
 }
 
 /*
  * Release every non-NULL buffer in the numbufs-entry array dbp_fake with
  * the given tag, then free the array itself.  A zero numbufs is a no-op:
  * nothing is released and nothing is freed.
  */
 void
 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag)
 {
 	dmu_buf_impl_t **bufs = (dmu_buf_impl_t **)dbp_fake;
 
 	if (numbufs == 0)
 		return;
 
 	for (int idx = 0; idx < numbufs; idx++) {
 		if (bufs[idx] == NULL)
 			continue;
 		dbuf_rele(bufs[idx], tag);
 	}
 
 	kmem_free(bufs, sizeof (dmu_buf_t *) * numbufs);
 }
 
 /*
  * Issue prefetch I/Os for the given blocks of an object.  If level is
  * greater than 0, the indirect blocks prefetched are those that point to
  * the blocks containing the data from offset to offset + len.  If the range
  * is too long, the first dmu_prefetch_max bytes are prefetched as requested
  * and the rest only at a higher level, also fitting within
  * dmu_prefetch_max.  This should primarily help random reads, since long
  * sequential reads are covered by the speculative prefetcher.
  *
  * When dmu_prefetch_max or len is zero, only the object's dnode is
  * prefetched.  If the dnode cannot be held, nothing is prefetched.
  *
  * Note that indirect blocks above the blocks being prefetched that are not
  * in cache will be read in asynchronously.  The dnode read done by
  * dnode_hold() is currently synchronous.
  */
 void
 dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 
 	if (dmu_prefetch_max == 0 || len == 0) {
 		dmu_prefetch_dnode(os, object, pri);
 		return;
 	}
 
 	if (dnode_hold(os, object, FTAG, &dn) == 0) {
 		dmu_prefetch_by_dnode(dn, level, offset, len, pri);
 		dnode_rele(dn, FTAG);
 	}
 }
 
 /*
  * Issue prefetch I/Os for the blocks of dnode dn at the given level that
  * cover [offset, offset + len).  The request is split in two: as many
  * blocks at the requested level as fit within dmu_prefetch_max, followed by
  * the remainder at a higher level chosen so that it also fits.  All
  * prefetches are issued while holding dn_struct_rwlock as reader.
  */
 void
 dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	int64_t level2 = level;
 	uint64_t start, end, start2, end2;
 
 	/*
 	 * Depending on len we may do two prefetches: blocks [start, end) at
 	 * level, and following blocks [start2, end2) at higher level2.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift != 0) {
 		/*
 		 * The object has multiple blocks.  Calculate the full range
 		 * of blocks [start, end2) and then split it into two parts,
 		 * so that the first [start, end) fits into dmu_prefetch_max.
 		 */
 		start = dbuf_whichblock(dn, level, offset);
 		end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1;
 		uint8_t ibs = dn->dn_indblkshift;
 		/* Block size shift at the requested level. */
 		uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs;
 		/* dmu_prefetch_max expressed in blocks, rounded up. */
 		uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs;
 		start2 = end = MIN(end2, start + limit);
 
 		/*
 		 * Find level2 where [start2, end2) fits into dmu_prefetch_max.
 		 */
 		uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
 		limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
 		/*
 		 * Each step up converts the block range into the range of
 		 * blocks one level higher, rounding both ends up.
 		 */
 		do {
 			level2++;
 			start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
 			end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps;
 		} while (end2 - start2 > limit);
 	} else {
 		/* There is only one block.  Prefetch it or nothing. */
 		start = start2 = end2 = 0;
 		end = start + (level == 0 && offset < dn->dn_datablksz);
 	}
 
 	for (uint64_t i = start; i < end; i++)
 		dbuf_prefetch(dn, level, i, pri, 0);
 	for (uint64_t i = start2; i < end2; i++)
 		dbuf_prefetch(dn, level2, i, pri, 0);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * State shared between dmu_prefetch_wait_by_dnode() and the
  * dmu_prefetch_done() callback while waiting for prefetches to finish.
  */
 typedef struct {
 	kmutex_t	dpa_lock;	/* held while updating/waiting */
 	kcondvar_t	dpa_cv;		/* signalled when count hits zero */
 	uint64_t	dpa_pending_io;	/* prefetches not yet reported done */
 } dmu_prefetch_arg_t;
 
 /*
  * Prefetch completion callback.  arg is the dmu_prefetch_arg_t shared with
  * the waiter: one pending I/O is retired, and waiters are woken once none
  * remain.  Only level-0 prefetches are expected (asserted).
  */
 static void
 dmu_prefetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t issued)
 {
 	dmu_prefetch_arg_t *waiter = arg;
 	(void) level; (void) blkid; (void)issued;
 
 	ASSERT0(level);
 
 	mutex_enter(&waiter->dpa_lock);
 	ASSERT3U(waiter->dpa_pending_io, >, 0);
 	waiter->dpa_pending_io--;
 	if (waiter->dpa_pending_io == 0)
 		cv_broadcast(&waiter->dpa_cv);
 	mutex_exit(&waiter->dpa_lock);
 }
 
 /*
  * Prefetch every level-0 block of dnode dn that covers
  * [offset, offset + len) and wait until dmu_prefetch_done() has been called
  * for each of them.
  */
 static void
 dmu_prefetch_wait_by_dnode(dnode_t *dn, uint64_t offset, uint64_t len)
 {
 	dmu_prefetch_arg_t waiter;
 
 	mutex_init(&waiter.dpa_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&waiter.dpa_cv, NULL, CV_DEFAULT, NULL);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	uint64_t first = dbuf_whichblock(dn, 0, offset);
 	uint64_t end = dbuf_whichblock(dn, 0, offset + len - 1) + 1;
 
 	/* One completion callback is expected per block in the range. */
 	waiter.dpa_pending_io = end - first;
 
 	uint64_t blkid = first;
 	while (blkid < end) {
 		(void) dbuf_prefetch_impl(dn, 0, blkid,
 		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_prefetch_done, &waiter);
 		blkid++;
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* Sleep until every L0 prefetch has reported completion. */
 	mutex_enter(&waiter.dpa_lock);
 	while (waiter.dpa_pending_io > 0)
 		cv_wait(&waiter.dpa_cv, &waiter.dpa_lock);
 	mutex_exit(&waiter.dpa_lock);
 
 	mutex_destroy(&waiter.dpa_lock);
 	cv_destroy(&waiter.dpa_cv);
 }
 
 /*
  * Issue prefetch I/Os for the given L0 block range and wait for the I/O
  * to complete.  dmu_prefetch_max is not enforced; the entire range is
  * prefetched.  The blocks are read from disk into the ARC but no
  * decompression occurs (i.e., the dbuf cache is not required).
  *
  * Returns the dnode_hold() error if the object cannot be held, EINTR if a
  * signal is noticed between chunks, and 0 otherwise.
  */
 int
 dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, uint64_t size)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Work in chunks (16 indirects worth) so that a signal can interrupt
 	 * a long request.  Without indirect blocks a chunk is one data block.
 	 */
 	uint64_t chunksize = dn->dn_datablksz;
 	if (dn->dn_indblkshift) {
 		uint64_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1);
 		chunksize = (nbps * 16) << dn->dn_datablkshift;
 	}
 
 	while (size > 0 && err == 0) {
 		uint64_t todo = MIN(size, chunksize);
 
 		dmu_prefetch_wait_by_dnode(dn, offset, todo);
 
 		offset += todo;
 		size -= todo;
 
 		if (issig())
 			err = SET_ERROR(EINTR);
 	}
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 /*
  * Prefetch the meta-dnode block that contains the dnode of 'object'.
  * Object 0 and object numbers at or beyond DN_MAX_OBJECT are ignored.
  */
 void
 dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
 {
 	dnode_t *mdn;
 	uint64_t dnode_off, blkid;
 
 	if (object == 0 || object >= DN_MAX_OBJECT)
 		return;
 
 	mdn = DMU_META_DNODE(os);
 	/* byte offset of this object's dnode_phys_t in the meta-dnode */
 	dnode_off = object * sizeof (dnode_phys_t);
 
 	rw_enter(&mdn->dn_struct_rwlock, RW_READER);
 	blkid = dbuf_whichblock(mdn, 0, dnode_off);
 	dbuf_prefetch(mdn, 0, blkid, pri, 0);
 	rw_exit(&mdn->dn_struct_rwlock);
 }
 
 /*
  * Get the next "chunk" of file data to free.  We traverse the file from
  * the end so that the file gets shorter over time (if we crash in the
  * middle, this will leave us in a better state).  We find allocated file
  * data by simply searching the allocated level 1 indirects.
  *
  * On input, *start should be the first offset that does not need to be
  * freed (e.g. "offset + length").  On return, *start will be the first
  * offset that should be freed and l1blks is set to the number of level 1
  * indirect blocks found within the chunk.
  *
  * The chunk never begins below 'minimum'.  Returns 0 on success, or the
  * error reported by dnode_next_offset() (other than ESRCH, which simply
  * ends the search); in that case *l1blks holds the count reached so far.
  */
 static int
 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
 {
 	uint64_t blks;
 	/* most L1 blocks we are willing to cover with a single chunk */
 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
 	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange = (uint64_t)dn->dn_datablksz *
 	    EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
 	ASSERT3U(minimum, <=, *start);
 
 	/* dn_nlevels == 1 means we don't have any L1 blocks */
 	if (dn->dn_nlevels <= 1) {
 		*l1blks = 0;
 		*start = minimum;
 		return (0);
 	}
 
 	/*
 	 * Check if we can free the entire range assuming that all of the
 	 * L1 blocks in this range have data. If we can, we use this
 	 * worst case value as an estimate so we can avoid having to look
 	 * at the object's actual data.
 	 */
 	uint64_t total_l1blks =
 	    (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
 	    iblkrange;
 	if (total_l1blks <= maxblks) {
 		*l1blks = total_l1blks;
 		*start = minimum;
 		return (0);
 	}
 	/* presumably required by the P2ALIGN_TYPED() alignment used below */
 	ASSERT(ISP2(iblkrange));
 
 	/* Walk backwards one allocated L1 at a time, up to maxblks of them. */
 	for (blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
 		/*
 		 * dnode_next_offset(BACKWARDS) will find an allocated L1
 		 * indirect block at or before the input offset.  We must
 		 * decrement *start so that it is at the end of the region
 		 * to search.
 		 */
 		(*start)--;
 
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
 		/* if there are no indirect blocks before start, we are done */
 		if (err == ESRCH) {
 			*start = minimum;
 			break;
 		} else if (err != 0) {
 			*l1blks = blks;
 			return (err);
 		}
 
 		/* set start to the beginning of this L1 indirect */
 		*start = P2ALIGN_TYPED(*start, iblkrange, uint64_t);
 	}
 	/* aligning down to an L1 boundary may have gone below the minimum */
 	if (*start < minimum)
 		*start = minimum;
 	*l1blks = blks;
 
 	return (0);
 }
 
 /*
  * Report whether the vfs backing this objset has its unmounted flag set.
  * Only objsets of type DMU_OST_ZFS in kernel builds can answer true;
  * everything else yields B_FALSE.
  * dmu_free_long_range_impl() uses this to abort a long free when the
  * filesystem is being unmounted.
  */
 static boolean_t
 dmu_objset_zfs_unmounting(objset_t *os)
 {
 	boolean_t unmounting = B_FALSE;
 
 #ifdef _KERNEL
 	if (dmu_objset_type(os) == DMU_OST_ZFS)
 		unmounting = zfs_get_vfs_flag_unmounted(os);
 #else
 	(void) os;
 #endif
 	return (unmounting);
 }
 
 /*
  * Free the range [offset, offset + length) of dn, one chunk per
  * transaction, working backwards from the end of the range (chunks are
  * chosen by get_next_chunk()).  A length of DMU_OBJECT_END, or a range
  * extending past the object, is clamped to the end of the object.
  *
  * Returns 0 on success, EINVAL if dn is NULL, EINTR if the objset is being
  * unmounted, or the error from get_next_chunk() / dmu_tx_assign().
  * Chunks freed before an error is returned stay freed.
  */
 static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     uint64_t length)
 {
 	uint64_t object_size;
 	int err;
 	uint64_t dirty_frees_threshold;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	if (dn == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/* size implied by the highest block id; nothing to free beyond it */
 	object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	if (offset >= object_size)
 		return (0);
 
 	/*
 	 * Per-txg budget for dirty long-free data.  A percentage above 100
 	 * is not honored; 1/20th of zfs_dirty_data_max is used instead.
 	 */
 	if (zfs_per_txg_dirty_frees_percent <= 100)
 		dirty_frees_threshold =
 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 	else
 		dirty_frees_threshold = zfs_dirty_data_max / 20;
 
 	/* clamp the request to the end of the object */
 	if (length == DMU_OBJECT_END || offset + length > object_size)
 		length = object_size - offset;
 
 	while (length != 0) {
 		uint64_t chunk_end, chunk_begin, chunk_len;
 		uint64_t l1blks;
 		dmu_tx_t *tx;
 
 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
 			return (SET_ERROR(EINTR));
 
 		chunk_end = chunk_begin = offset + length;
 
 		/* move chunk_begin backwards to the beginning of this chunk */
 		err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
 		if (err)
 			return (err);
 		ASSERT3U(chunk_begin, >=, offset);
 		ASSERT3U(chunk_begin, <=, chunk_end);
 
 		chunk_len = chunk_end - chunk_begin;
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 
 		/*
 		 * Mark this transaction as typically resulting in a net
 		 * reduction in space used.
 		 */
 		dmu_tx_mark_netfree(tx);
 		err = dmu_tx_assign(tx, DMU_TX_WAIT);
 		if (err) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
 
 		uint64_t txg = dmu_tx_get_txg(tx);
 
 		/* snapshot how much long-free data this txg already carries */
 		mutex_enter(&dp->dp_lock);
 		uint64_t long_free_dirty =
 		    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
 		mutex_exit(&dp->dp_lock);
 
 		/*
 		 * To avoid filling up a TXG with just frees, wait for
 		 * the next TXG to open before freeing more chunks if
 		 * we have reached the threshold of frees.
 		 *
 		 * Nothing has been freed in this tx, and length is left
 		 * untouched, so the same chunk is retried afterwards.
 		 */
 		if (dirty_frees_threshold != 0 &&
 		    long_free_dirty >= dirty_frees_threshold) {
 			DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay);
 			dmu_tx_commit(tx);
 			txg_wait_open(dp, 0, B_TRUE);
 			continue;
 		}
 
 		/*
 		 * In order to prevent unnecessary write throttling, for each
 		 * TXG, we track the cumulative size of L1 blocks being dirtied
 		 * in dnode_free_range() below. We compare this number to a
 		 * tunable threshold, past which we prevent new L1 dirty freeing
 		 * blocks from being added into the open TXG (see the
 		 * dirty_frees_threshold check above). The threshold
 		 * prevents write throttle activation due to dirty freeing L1
 		 * blocks taking up a large percentage of zfs_dirty_data_max.
 		 */
 		mutex_enter(&dp->dp_lock);
 		dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
 		    l1blks << dn->dn_indblkshift;
 		mutex_exit(&dp->dp_lock);
 		DTRACE_PROBE3(free__long__range,
 		    uint64_t, long_free_dirty, uint64_t, chunk_len,
 		    uint64_t, txg);
 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 
 		dmu_tx_commit(tx);
 
 		/* the range shrinks from its tail; offset stays fixed */
 		length -= chunk_len;
 	}
 	return (0);
 }
 
 /*
  * Free [offset, offset + length) of the given object via
  * dmu_free_long_range_impl().  Returns 0 on success, or the error from
  * dnode_hold() or from the implementation.
  */
 int
 dmu_free_long_range(objset_t *os, uint64_t object,
     uint64_t offset, uint64_t length)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error != 0)
 		return (error);
 
 	error = dmu_free_long_range_impl(os, dn, offset, length);
 	if (error == 0) {
 		/*
 		 * When the entire file has been freed, maxblkid must be
 		 * reset to zero so that (a) later calls to
 		 * dmu_free_long_range_impl() take the fast path, and
 		 * (b) dnode_reallocate() can verify that the entire file
 		 * has been freed.
 		 */
 		if (offset == 0 && length == DMU_OBJECT_END)
 			dn->dn_maxblkid = 0;
 	}
 
 	dnode_rele(dn, FTAG);
 	return (error);
 }
 
 /*
  * Free all of an object's data with dmu_free_long_range() and then free
  * the object itself in a transaction of its own.  Returns 0 on success or
  * the first error hit along the way.
  */
 int
 dmu_free_long_object(objset_t *os, uint64_t object)
 {
 	dmu_tx_t *tx;
 	int error;
 
 	error = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
 	if (error != 0)
 		return (error);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, object);
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (error != 0) {
 		/* never assigned, so the tx is aborted, not committed */
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	error = dmu_object_free(os, object, tx);
 	dmu_tx_commit(tx);
 
 	return (error);
 }
 
 /*
  * Free 'size' bytes starting at 'offset' of the given object within the
  * caller's transaction.  size may be DMU_OBJECT_END; otherwise
  * offset + size must not overflow (asserted in debug builds).
  * Returns 0, or the dnode_hold() error if the object cannot be held.
  */
 int
 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	ASSERT(offset < UINT64_MAX);
 	ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset);
 
 	dnode_free_range(dn, offset, size, tx);
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Copy 'size' bytes of dn, starting at 'offset', into 'buf'.
  *
  * Returns 0 on success, or the error from dmu_read_abd() (Direct I/O path)
  * or dmu_buf_hold_array_by_dnode() (buffered path).
  */
 static int
 dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	int error = 0;
 
 	/*
 	 * Objects with an odd block size cannot have data beyond their
 	 * first block: zero the part of buf past it and shrink the
 	 * request accordingly.  Should the tail block optimization ever
 	 * be implemented, it will need to be handled here as well.
 	 */
 	if (dn->dn_maxblkid == 0) {
 		uint64_t readable = 0;
 
 		if (offset <= dn->dn_datablksz)
 			readable = MIN(size, dn->dn_datablksz - offset);
 		memset((char *)buf + readable, 0, size - readable);
 		size = readable;
 	}
 
 	if (size == 0)
 		return (0);
 
 	/* Direct I/O is used only when requested and properly aligned. */
 	if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) &&
 	    zfs_dio_aligned(offset, size, PAGESIZE)) {
 		abd_t *data = abd_get_from_buf(buf, size);
 
 		error = dmu_read_abd(dn, offset, size, data, flags);
 		abd_free(data);
 		return (error);
 	}
 
 	while (size != 0) {
 		dmu_buf_t **dbp;
 		int numbufs;
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 
 		/*
 		 * Holding a whole array of buffers at once, rather than
 		 * one block at a time, lets the reads proceed in parallel.
 		 */
 		error = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 		    TRUE, FTAG, &numbufs, &dbp, flags);
 		if (error != 0)
 			break;
 
 		for (int i = 0; i < numbufs; i++) {
 			dmu_buf_t *db = dbp[i];
 			int64_t skip;
 			uint64_t tocpy;
 
 			ASSERT(size > 0);
 
 			/* where in this buffer the request picks up */
 			skip = offset - db->db_offset;
 			tocpy = MIN(db->db_size - skip, size);
 
 			ASSERT(db->db_data != NULL);
 			(void) memcpy(buf, (char *)db->db_data + skip, tocpy);
 
 			buf = (char *)buf + tocpy;
 			offset += tocpy;
 			size -= tocpy;
 		}
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Read 'size' bytes at 'offset' of the given object into 'buf'.
  * Returns 0 on success, or the error from dnode_hold() or
  * dmu_read_impl().
  */
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error == 0) {
 		error = dmu_read_impl(dn, offset, size, buf, flags);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Same as dmu_read(), for callers that already have the dnode in hand:
  * the request is passed straight through to dmu_read_impl().
  */
 int
 dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
     uint32_t flags)
 {
 	int error = dmu_read_impl(dn, offset, size, buf, flags);
 
 	return (error);
 }
 
 /*
  * Copy 'size' bytes from 'buf' into the already-held buffers dbp[0 ..
  * numbufs), beginning at object offset 'offset', dirtying each buffer in
  * tx.  A buffer that is overwritten in full is prepared with
  * dmu_buf_will_fill(); one that is only partly written is prepared with
  * dmu_buf_will_dirty().
  */
 static void
 dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	for (int i = 0; i < numbufs; i++) {
 		dmu_buf_t *db = dbp[i];
 		int64_t skip;
 		uint64_t tocpy;
 
 		ASSERT(size > 0);
 
 		/* where in this buffer the write begins */
 		skip = offset - db->db_offset;
 		tocpy = MIN(db->db_size - skip, size);
 
 		/* only the first and last buffers may be partial */
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy != db->db_size)
 			dmu_buf_will_dirty(db, tx);
 		else
 			dmu_buf_will_fill(db, tx, B_FALSE);
 
 		ASSERT(db->db_data != NULL);
 		(void) memcpy((char *)db->db_data + skip, buf, tocpy);
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx, B_FALSE);
 
 		buf = (char *)buf + tocpy;
 		offset += tocpy;
 		size -= tocpy;
 	}
 }
 
 /*
  * Write 'size' bytes from 'buf' at 'offset' of the given object in tx.
  * A zero-length write is a no-op.  Failure to hold the buffers is fatal
  * (VERIFY0).
  */
 void
 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size != 0) {
 		VERIFY0(dmu_buf_hold_array(os, object, offset, size,
 		    FALSE, FTAG, &numbufs, &dbp));
 		dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 }
 
 /*
  * Write 'size' bytes from 'buf' at 'offset' of dn in tx, honoring 'flags'.
  * With DMU_DIRECTIO set and a suitably aligned buffer, offset and size,
  * the write goes through dmu_write_abd(); otherwise it is copied into
  * held dbufs.  Returns 0, or the dmu_write_abd() error on the Direct I/O
  * path.
  *
  * ZFS itself does not use this interface; it exists for Lustre, which is
  * built on top of the DMU interfaces.
  */
 int
 dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx, uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return (0);
 
 	/* Direct I/O is used only when requested and properly aligned. */
 	if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
 	    zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
 		abd_t *data = abd_get_from_buf((void *)buf, size);
 		int error =
 		    dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
 
 		abd_free(data);
 		return (error);
 	}
 
 	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (0);
 }
 
 /*
  * dmu_write_by_dnode_flags() with no flags set.
  */
 int
 dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	int error = dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0);
 
 	return (error);
 }
 
 /*
  * Hold every buffer covering [offset, offset + size) of the object and
  * mark each with dmu_buf_will_not_fill() in tx.  A zero size is a no-op;
  * failure to hold the buffers is fatal (VERIFY).
  */
 void
 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return;
 
 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 
 	for (int idx = 0; idx < numbufs; idx++)
 		dmu_buf_will_not_fill(dbp[idx], tx);
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 /*
  * Hold (without reading) the buffer at 'offset' of the object and pass
  * the embedded payload on to dmu_buf_write_embedded() in tx.  etype and
  * comp must be valid embedded-type and compression values (asserted in
  * debug builds); failure to hold the buffer is fatal (VERIFY0).
  */
 void
 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	bp_embedded_type_t embed_type;
 	enum zio_compress compress;
 
 	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
 	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
 	VERIFY0(dmu_buf_hold_noread(os, object, offset,
 	    FTAG, &db));
 
 	embed_type = (bp_embedded_type_t)etype;
 	compress = (enum zio_compress)comp;
 	dmu_buf_write_embedded(db, data, embed_type, compress,
 	    uncompressed_size, compressed_size, byteorder, tx);
 
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Hold every buffer covering [offset, offset + size) of the object and
  * call dmu_buf_redact() on each in tx.  Failure to hold the buffers is
  * fatal (VERIFY0).
  */
 void
 dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int idx = 0;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG,
 	    &numbufs, &dbp));
 
 	while (idx < numbufs) {
 		dmu_buf_redact(dbp[idx], tx);
 		idx++;
 	}
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 #ifdef _KERNEL
 /*
  * Read 'size' bytes of dn, starting at zfs_uio_offset(uio), into the uio.
  * A UIO_DIRECT uio is handed off to dmu_read_uio_direct().
  * Returns 0 on success, or the first error from holding the buffers or
  * moving the data.
  */
 int
 dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int error;
 
 	if (uio->uio_extflg & UIO_DIRECT)
 		return (dmu_read_uio_direct(dn, uio, size));
 
 	/*
 	 * Holding the whole array of buffers at once, rather than one
 	 * block at a time, lets the reads proceed in parallel.
 	 */
 	error = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
 	    TRUE, FTAG, &numbufs, &dbp, 0);
 	if (error != 0)
 		return (error);
 
 	for (int idx = 0; idx < numbufs; idx++) {
 		dmu_buf_t *db = dbp[idx];
 		int64_t skip;
 		uint64_t tocpy;
 
 		ASSERT(size > 0);
 
 		/* where in this buffer the uio currently points */
 		skip = zfs_uio_offset(uio) - db->db_offset;
 		tocpy = MIN(db->db_size - skip, size);
 
 		ASSERT(db->db_data != NULL);
 		error = zfs_uio_fault_move((char *)db->db_data + skip, tocpy,
 		    UIO_READ, uio);
 		if (error != 0)
 			break;
 
 		size -= tocpy;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (error);
 }
 
 /*
  * Read 'size' bytes into the uio buffer from the object that zdb belongs
  * to (zdb->db_object), starting at zfs_uio_offset(uio).
  *
  * A caller that already holds a dbuf of the target object (e.g. its
  * bonus buffer) should prefer this over dmu_read_uio(): the dnode_t is
  * reached through the dbuf, so it does not have to be looked up.
  */
 int
 dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	int error = 0;
 
 	if (size != 0) {
 		DB_DNODE_ENTER(db);
 		error = dmu_read_uio_dnode(DB_DNODE(db), uio, size);
 		DB_DNODE_EXIT(db);
 	}
 
 	return (error);
 }
 
 /*
  * Read 'size' bytes into the uio buffer from the specified object,
  * starting at offset zfs_uio_offset(uio).  A zero size is a no-op.
  * Returns 0 on success, or the error from dnode_hold() or
  * dmu_read_uio_dnode().
  */
 int
 dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
 {
 	dnode_t *dn;
 	int error;
 
 	if (size == 0)
 		return (0);
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error == 0) {
 		error = dmu_read_uio_dnode(dn, uio, size);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Write 'size' bytes from the uio into dn, starting at
  * zfs_uio_offset(uio), in transaction tx.
  *
  * For a UIO_DIRECT uio of at least one block, suitably aligned data is
  * handed to dmu_write_uio_direct(); everything else is copied into held
  * dbufs.  The function loops back to 'top' until the UIO_DIRECT request
  * has been consumed.  Returns 0 on success or the first error hit.
  */
 int
 dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 	uint64_t write_size;
 
 top:
 	/* write_size is the part handled by this pass; size is what is left */
 	write_size = size;
 
 	/*
 	 * We only allow Direct I/O writes to happen if we are block
 	 * sized aligned. Otherwise, we pass the write off to the ARC.
 	 */
 	if ((uio->uio_extflg & UIO_DIRECT) &&
 	    (write_size >= dn->dn_datablksz)) {
 		if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
 		    dn->dn_datablksz)) {
 			/* offset and size both aligned: all of it is direct */
 			return (dmu_write_uio_direct(dn, uio, size, tx));
 		} else if (write_size > dn->dn_datablksz &&
 		    zfs_dio_offset_aligned(zfs_uio_offset(uio),
 		    dn->dn_datablksz)) {
 			/*
 			 * Aligned start, unaligned length: write the whole
 			 * blocks directly, then go around for the tail.
 			 */
 			write_size =
 			    dn->dn_datablksz * (write_size / dn->dn_datablksz);
 			err = dmu_write_uio_direct(dn, uio, write_size, tx);
 			if (err == 0) {
 				size -= write_size;
 				goto top;
 			} else {
 				return (err);
 			}
 		} else {
 			/*
 			 * Unaligned start: do a short buffered write first.
 			 * NOTE(review): P2PHASE() looks like it yields the
 			 * distance past the previous block boundary rather
 			 * than the distance to the next one, so alignment
 			 * may take several passes -- confirm this is
 			 * intended.
 			 */
 			write_size =
 			    P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
 		}
 	}
 
 	err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
 	if (err)
 		return (err);
 
 	for (int i = 0; i < numbufs; i++) {
 		uint64_t tocpy;
 		int64_t bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(write_size > 0);
 
 		/* remember the uio position so a reverted fill can be undone */
 		offset_t off = zfs_uio_offset(uio);
 		bufoff = off - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, write_size);
 
 		/* only the first and last buffers may be partial */
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx, B_TRUE);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		ASSERT(db->db_data != NULL);
 		err = zfs_uio_fault_move((char *)db->db_data + bufoff,
 		    tocpy, UIO_WRITE, uio);
 
 		if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
 			/* The fill was reverted.  Undo any uio progress. */
 			zfs_uio_advance(uio, off - zfs_uio_offset(uio));
 		}
 
 		if (err)
 			break;
 
 		write_size -= tocpy;
 		size -= tocpy;
 	}
 
 	/* on success this pass must have consumed everything it held */
 	IMPLY(err == 0, write_size == 0);
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	/* a Direct I/O request may have more data left for another pass */
 	if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
 		goto top;
 	}
 
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio buffer to the object that zdb belongs
  * to (zdb->db_object), starting at offset zfs_uio_offset(uio).
  *
  * A caller that already holds a dbuf of the target object (e.g. its
  * bonus buffer) should prefer this over dmu_write_uio(): the dnode_t is
  * reached through the dbuf, so it does not have to be looked up.
  */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	int error = 0;
 
 	if (size != 0) {
 		DB_DNODE_ENTER(db);
 		error = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx);
 		DB_DNODE_EXIT(db);
 	}
 
 	return (error);
 }
 
 /*
  * Write 'size' bytes from the uio buffer to the specified object,
  * starting at offset zfs_uio_offset(uio).  A zero size is a no-op.
  * Returns 0 on success, or the error from dnode_hold() or
  * dmu_write_uio_dnode().
  */
 int
 dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	if (size == 0)
 		return (0);
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error == 0) {
 		error = dmu_write_uio_dnode(dn, uio, size, tx);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 #endif /* _KERNEL */
 
 /*
  * Walk an array of nbps block pointers and ask the ARC about each
  * non-hole entry.  The logical size of a block reported as cached only
  * in L2 is added to *l2sz; the logical size of any other cached block
  * is added to *l1sz.  A NULL array is ignored.
  */
 static void
 dmu_cached_bps(spa_t *spa, blkptr_t *bps, uint_t nbps,
     uint64_t *l1sz, uint64_t *l2sz)
 {
 	if (bps == NULL)
 		return;
 
 	for (size_t idx = 0; idx < nbps; idx++) {
 		blkptr_t *bp = &bps[idx];
 		uint64_t *bucket;
 		int where;
 
 		if (BP_IS_HOLE(bp))
 			continue;
 
 		where = arc_cached(spa, bp);
 		if (where == 0)
 			continue;
 
 		if ((where & (ARC_CACHED_IN_L1 | ARC_CACHED_IN_L2)) ==
 		    ARC_CACHED_IN_L2)
 			bucket = l2sz;
 		else
 			bucket = l1sz;
 
 		*bucket += BP_GET_LSIZE(bp);
 	}
 }
 
 /*
  * Estimate DMU object cached size.
  *
  * On return *l1sz and *l2sz hold the totals accumulated by
  * dmu_cached_bps() over the block pointers of every readable L1 block.
  * Both are left at zero, and 0 is returned, if the object cannot be held
  * or has fewer than two levels.  The only error returned is EINTR, when
  * a signal is noticed while walking the L1 blocks; the totals gathered
  * up to that point are left in place.
  */
 int
 dmu_object_cached_size(objset_t *os, uint64_t object,
     uint64_t *l1sz, uint64_t *l2sz)
 {
 	dnode_t *dn;
 	dmu_object_info_t doi;
 	int err = 0;
 
 	*l1sz = *l2sz = 0;
 
 	/* a dnode_hold() failure is deliberately reported as "nothing cached" */
 	if (dnode_hold(os, object, FTAG, &dn) != 0)
 		return (0);
 
 	/* without L1 blocks there are no block pointers to examine */
 	if (dn->dn_nlevels < 2) {
 		dnode_rele(dn, FTAG);
 		return (0);
 	}
 
 	dmu_object_info_from_dnode(dn, &doi);
 
 	/*
 	 * NOTE(review): this loop advances by dmu_prefetch_max, so it would
 	 * never terminate if that value were 0 -- confirm it cannot be.
 	 */
 	for (uint64_t off = 0; off < doi.doi_max_offset;
 	    off += dmu_prefetch_max) {
 		/* dbuf_read doesn't prefetch L1 blocks. */
 		dmu_prefetch_by_dnode(dn, 1, off,
 		    dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/*
 	 * Hold all valid L1 blocks, asking ARC the status of each BP
 	 * contained in each such L1 block.
 	 */
 	uint_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1);
 	uint64_t l1blks = 1 + (dn->dn_maxblkid / nbps);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	for (uint64_t blk = 0; blk < l1blks; blk++) {
 		dmu_buf_impl_t *db = NULL;
 
 		if (issig()) {
 			/*
 			 * On interrupt, get out, and bubble up EINTR
 			 */
 			err = EINTR;
 			break;
 		}
 
 		/*
 		 * If we get an i/o error here, the L1 can't be read,
 		 * and nothing under it could be cached, so we just
 		 * continue. Ignoring the error from dbuf_hold_impl
 		 * or from dbuf_read is then a reasonable choice.
 		 */
 		err = dbuf_hold_impl(dn, 1, blk, B_TRUE, B_FALSE, FTAG, &db);
 		if (err != 0) {
 			/*
 			 * ignore error and continue
 			 */
 			err = 0;
 			continue;
 		}
 
 		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
 		if (err == 0) {
 			/* the L1's data is an array of nbps block pointers */
 			dmu_cached_bps(dmu_objset_spa(os), db->db.db_data,
 			    nbps, l1sz, l2sz);
 		}
 		/*
 		 * error may be ignored, and we continue
 		 */
 		err = 0;
 		dbuf_rele(db, FTAG);
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /*
  * Borrow an anonymous arc buffer of 'size' bytes from the ARC, on behalf
  * of the pool that owns the objset of 'handle'.
  */
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 	spa_t *spa = db->db_objset->os_spa;
 
 	return (arc_loan_buf(spa, B_FALSE, size));
 }
 
 /*
  * Free a loaned arc buffer: hand the loan back to the ARC with
  * arc_return_buf() and then destroy the buffer.
  */
 void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
 	arc_buf_destroy(buf, FTAG);
 }
 
 /*
  * A "lightweight" write is cheaper than a regular write (e.g.
  * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()) because it skips
  * the CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t.  The
  * price is that the data can be neither read nor overwritten until the
  * transaction's txg has been synced, which makes it a fit for workloads
  * known to be (temporarily) write-only, like "zfs receive".
  *
  * Exactly one block is written, starting at the given byte offset.  On
  * success 0 is returned and the abd has been consumed (the caller must
  * not free it).  If no lightweight dirty record can be created, EIO is
  * returned and the abd is left untouched.
  */
 int
 dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
     const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx)
 {
 	uint64_t blkid = dbuf_whichblock(dn, 0, offset);
 	dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, blkid, tx);
 
 	if (dr != NULL) {
 		dr->dt.dll.dr_abd = abd;
 		dr->dt.dll.dr_props = *zp;
 		dr->dt.dll.dr_flags = flags;
 		return (0);
 	}
 
 	return (SET_ERROR(EIO));
 }
 
 /*
  * Hand a loaned arc buffer over to the dbuf at 'offset' when that is
  * possible; when it is not, copy its contents with dmu_write() and give
  * the loaned buffer back.  Either way the caller no longer owns 'buf'
  * after a successful return.
  *
  * Returns 0 on success, or EIO if the dbuf cannot be held.
  */
 int
 dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	uint64_t object = dn->dn_object;
 	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
 	dmu_buf_impl_t *db;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), FTAG);
 	rw_exit(&dn->dn_struct_rwlock);
 	if (db == NULL)
 		return (SET_ERROR(EIO));
 
 	/*
 	 * Direct assignment requires an aligned offset and an arc buf of
 	 * exactly the dbuf's size; anything else takes the copy path.
 	 */
 	if (offset != db->db.db_offset || blksz != db->db.db_size) {
 		/* compressed bufs must always be assignable to their dbuf */
 		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
 		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
 
 		dbuf_rele(db, FTAG);
 		dmu_write(os, object, offset, blksz, buf->b_data, tx);
 		dmu_return_arcbuf(buf);
 		return (0);
 	}
 
 	zfs_racct_write(os->os_spa, blksz, 1, 0);
 	dbuf_assign_arcbuf(db, buf, tx);
 	dbuf_rele(db, FTAG);
 
 	return (0);
 }
 
 /*
  * dmu_assign_arcbuf_by_dnode() for callers that hold a dbuf of the target
  * object instead of its dnode; the dnode is reached through 'handle'.
  */
 int
 dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 	int error;
 
 	DB_DNODE_ENTER(db);
 	error = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx);
 	DB_DNODE_EXIT(db);
 
 	return (error);
 }
 
 /*
  * "Ready" callback for a dmu_sync write; varg is the dmu_sync_arg_t.
  * Nothing is done if the zio has failed.  Otherwise the zio's block
  * pointer is touched up: a hole gets its logical size recorded, and any
  * other non-embedded BP (which must be level 0) gets a fill count of 1.
  */
 void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	dmu_sync_arg_t *dsa = varg;
 	blkptr_t *bp;
 
 	(void) buf;
 
 	if (zio->io_error != 0)
 		return;
 
 	bp = zio->io_bp;
 
 	if (BP_IS_HOLE(bp)) {
 		dbuf_dirty_record_t *dr = dsa->dsa_dr;
 		dmu_buf_t *db;
 
 		/* without a dirty record, fall back to the zgd's dbuf */
 		if (dr != NULL)
 			db = &(dr->dr_dbuf->db);
 		else
 			db = dsa->dsa_zgd->zgd_db;
 
 		/*
 		 * A block of zeros may compress to a hole, but the
 		 * block size still needs to be known for replay.
 		 */
 		BP_SET_LSIZE(bp, db->db_size);
 		return;
 	}
 
 	if (!BP_IS_EMBEDDED(bp)) {
 		ASSERT(BP_GET_LEVEL(bp) == 0);
 		BP_SET_FILL(bp, 1);
 	}
 }
 
 /*
  * "Ready" callback for the write issued by dmu_sync_late_arrival():
  * forwards to dmu_sync_ready() with the zio's private dmu_sync_arg_t and
  * no arc buffer.
  */
 static void
 dmu_sync_late_arrival_ready(zio_t *zio)
 {
 	void *varg = zio->io_private;
 
 	dmu_sync_ready(zio, NULL, varg);
 }
 
 /*
  * "Done" callback for a dmu_sync write that has a dirty record; varg is
  * the dmu_sync_arg_t, which is freed here.
  *
  * On success the resulting block pointer is recorded in the dirty record
  * as its override (state DR_OVERRIDDEN); on failure the record goes back
  * to DR_NOT_OVERRIDDEN.  Waiters on db_changed are woken and the caller's
  * dsa_done callback, if any, is invoked with the zio's error.
  *
  * dsa_dr is dereferenced unconditionally, so this must not be used for
  * the late-arrival case, where dmu_sync_late_arrival() sets dsa_dr to
  * NULL and installs dmu_sync_late_arrival_done() instead.
  */
 void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	(void) buf;
 	dmu_sync_arg_t *dsa = varg;
 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	zgd_t *zgd = dsa->dsa_zgd;
 
 	/*
 	 * Record the vdev(s) backing this blkptr so they can be flushed after
 	 * the writes for the lwb have completed.
 	 */
 	if (zgd && zio->io_error == 0) {
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 	}
 
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
 		ASSERT0(dr->dt.dl.dr_has_raw_params);
 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
 		if (dr->dt.dl.dr_nopwrite) {
 			/* sanity-check that the nopwrite really was legal */
 			blkptr_t *bp = zio->io_bp;
 			blkptr_t *bp_orig = &zio->io_bp_orig;
 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
 
 			ASSERT(BP_EQUAL(bp, bp_orig));
 			VERIFY(BP_EQUAL(bp, db->db_blkptr));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
 			VERIFY(zio_checksum_table[chksum].ci_flags &
 			    ZCHECKSUM_FLAG_NOPWRITE);
 		}
 		/* remember the written BP and the copy counts it was made with */
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+		dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;
 
 		/*
 		 * Old style holes are filled with all zeros, whereas
 		 * new-style holes maintain their lsize, type, level,
 		 * and birth time (see zio_write_compress). While we
 		 * need to reset the BP_SET_LSIZE() call that happened
 		 * in dmu_sync_ready for old style holes, we do *not*
 		 * want to wipe out the information contained in new
 		 * style holes. Thus, only zero out the block pointer if
 		 * it's an old style hole.
 		 */
 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
 		    BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0)
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	}
 
 	/* wake anyone waiting for the override state to settle */
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 
 	if (dsa->dsa_done)
 		dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 /*
  * "Done" callback for the write issued by dmu_sync_late_arrival().
  * zio->io_private is the dmu_sync_arg_t set up there.  This commits the
  * tx created for the write, reports the zio's error through dsa_done,
  * and releases the abd and the dmu_sync_arg_t.
  */
 static void
 dmu_sync_late_arrival_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	dmu_sync_arg_t *dsa = zio->io_private;
 	zgd_t *zgd = dsa->dsa_zgd;
 
 	if (zio->io_error == 0) {
 		/*
 		 * Record the vdev(s) backing this blkptr so they can be
 		 * flushed after the writes for the lwb have completed.
 		 */
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 
 		if (!BP_IS_HOLE(bp)) {
 			blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
 			ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
 			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
 			ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
 			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 			/* schedule the just-written block to be freed in its txg */
 			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 		}
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
 
 	/* unlike dmu_sync_done(), dsa_done is called without a NULL check */
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	abd_free(zio->io_abd);
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 /*
  * Late-arrival path of dmu_sync(): instead of touching the dirty record,
  * read the dbuf and write its current contents out to zgd->zgd_bp as a
  * child of pio, inside a new tx.
  *
  * Returns 0 once the write has been issued (completion is reported via
  * 'done'), the dbuf_read() error if the data cannot be read, or EIO if
  * the tx cannot be assigned without waiting.  zp->zp_nopwrite is cleared
  * as a side effect.
  */
 static int
 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
     zio_prop_t *zp, zbookmark_phys_t *zb)
 {
 	dmu_sync_arg_t *dsa;
 	dmu_tx_t *tx;
 	int error;
 
 	error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
 	    DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 	if (error != 0)
 		return (error);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
 	/*
 	 * This transaction does not produce any dirty data or log blocks, so
 	 * it should not be throttled.  All other cases wait for TXG sync, by
 	 * which time the log block we are writing will be obsolete, so we can
 	 * skip waiting and just return error here instead.
 	 */
 	if (dmu_tx_assign(tx, DMU_TX_NOWAIT | DMU_TX_NOTHROTTLE) != 0) {
 		dmu_tx_abort(tx);
 		/* Make zl_get_data do txg_wait_synced() */
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * In order to prevent the zgd's lwb from being free'd prior to
 	 * dmu_sync_late_arrival_done() being called, we have to ensure
 	 * the lwb's "max txg" takes this tx's txg into account.
 	 */
 	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
 
 	/* freed by dmu_sync_late_arrival_done(); no dirty record is involved */
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = NULL;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = tx;
 
 	/*
 	 * Since we are currently syncing this txg, it's nontrivial to
 	 * determine what BP to nopwrite against, so we disable nopwrite.
 	 *
 	 * When syncing, the db_blkptr is initially the BP of the previous
 	 * txg.  We can not nopwrite against it because it will be changed
 	 * (this is similar to the non-late-arrival case where the dbuf is
 	 * dirty in a future txg).
 	 *
 	 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
 	 * We can not nopwrite against it because although the BP will not
 	 * (typically) be changed, the data has not yet been persisted to this
 	 * location.
 	 *
 	 * Finally, when dbuf_write_done() is called, it is theoretically
 	 * possible to always nopwrite, because the data that was written in
 	 * this txg is the same data that we are trying to write.  However we
 	 * would need to check that this dbuf is not dirty in any future
 	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
 	 * don't nopwrite in this case.
 	 */
 	zp->zp_nopwrite = B_FALSE;
 
 	/* the abd wrapping db_data is released in the done callback */
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
 	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
 }
 
 /*
  * Intent log support: sync the block associated with db to disk.
  * N.B. and XXX: the caller is responsible for making sure that the
  * data isn't changing while dmu_sync() is writing it.
  *
  * Return values:
  *
  *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	EALREADY: this block is already in the process of being synced.
  *		The caller should track its progress (somehow).
  *
  *	EIO: could not do the I/O.
  *		The caller should do a txg_wait_synced().
  *
  *	0: the I/O has been initiated.
  *		The caller should log this blkptr in the done callback.
  *		It is possible that the I/O will fail, in which case
  *		the error will be reported to the done callback and
  *		propagated to pio from zio_done().
  */
 int
 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
 	objset_t *os = db->db_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr, *dr_next;
 	dmu_sync_arg_t *dsa;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 
 	ASSERT(pio != NULL);
 	ASSERT(txg != 0);
 
 	/* Bookmark naming this block; it is handed to the write zio below. */
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	/* Work out the write properties for a WP_DMU_SYNC write. */
 	DB_DNODE_ENTER(db);
 	dmu_write_policy(os, DB_DNODE(db), db->db_level, WP_DMU_SYNC, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * If we're frozen (running ziltest), we always need to generate a bp.
 	 */
 	if (txg > spa_freeze_txg(os->os_spa))
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 
 	/*
 	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
 	 * and us.  If we determine that this txg is not yet syncing,
 	 * but it begins to sync a moment later, that's OK because the
 	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
 	 */
 	mutex_enter(&db->db_mtx);
 
 	if (txg <= spa_last_synced_txg(os->os_spa)) {
 		/*
 		 * This txg has already synced.  There's nothing to do.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EEXIST));
 	}
 
 	if (txg <= spa_syncing_txg(os->os_spa)) {
 		/*
 		 * This txg is currently syncing, so we can't mess with
 		 * the dirty record anymore; just write a new log block.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 	}
 
 	/* Look up the dirty record belonging to exactly this txg. */
 	dr = dbuf_find_dirty_eq(db, txg);
 
 	if (dr == NULL) {
 		/*
 		 * There's no dr for this dbuf, so it must have been freed.
 		 * There's no need to log writes to freed blocks, so we're done.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* Any following record must belong to an older txg (asserted). */
 	dr_next = list_next(&db->db_dirty_records, dr);
 	ASSERT(dr_next == NULL || dr_next->dr_txg < txg);
 
 	if (db->db_blkptr != NULL) {
 		/*
 		 * We need to fill in zgd_bp with the current blkptr so that
 		 * the nopwrite code can check if we're writing the same
 		 * data that's already on disk.  We can only nopwrite if we
 		 * are sure that after making the copy, db_blkptr will not
 		 * change until our i/o completes.  We ensure this by
 		 * holding the db_mtx, and only allowing nopwrite if the
 		 * block is not already dirty (see below).  This is verified
 		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
 		 * not changed.
 		 */
 		*zgd->zgd_bp = *db->db_blkptr;
 	}
 
 	/*
 	 * Assume the on-disk data is X, the current syncing data (in
 	 * txg - 1) is Y, and the current in-memory data is Z (currently
 	 * in dmu_sync).
 	 *
 	 * We usually want to perform a nopwrite if X and Z are the
 	 * same.  However, if Y is different (i.e. the BP is going to
 	 * change before this write takes effect), then a nopwrite will
 	 * be incorrect - we would override with X, which could have
 	 * been freed when Y was written.
 	 *
 	 * (Note that this is not a concern when we are nop-writing from
 	 * syncing context, because X and Y must be identical, because
 	 * all previous txgs have been synced.)
 	 *
 	 * Therefore, we disable nopwrite if the current BP could change
 	 * before this TXG.  There are two ways it could change: by
 	 * being dirty (dr_next is non-NULL), or by being freed
 	 * (dnode_block_freed()).  This behavior is verified by
 	 * zio_done(), which VERIFYs that the override BP is identical
 	 * to the on-disk BP.
 	 */
 	if (dr_next != NULL) {
 		zp.zp_nopwrite = B_FALSE;
 	} else {
 		DB_DNODE_ENTER(db);
 		if (dnode_block_freed(DB_DNODE(db), db->db_blkid))
 			zp.zp_nopwrite = B_FALSE;
 		DB_DNODE_EXIT(db);
 	}
 
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * We have already issued a sync write for this buffer,
 		 * or this buffer has already been synced.  It could not
 		 * have been dirtied since, or we would have cleared the state.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EALREADY));
 	}
 
 	/*
 	 * Claim the dirty record while db_mtx is still held, so that a
 	 * second dmu_sync() of this buffer takes the EALREADY path above.
 	 */
 	ASSERT0(dr->dt.dl.dr_has_raw_params);
 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 	mutex_exit(&db->db_mtx);
 
 	/*
 	 * Argument block passed to arc_write() alongside dmu_sync_ready and
 	 * dmu_sync_done; dsa_tx is left NULL on this path.
 	 */
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = dr;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = NULL;
 
 	/* Issue the write as a child of pio without waiting for it. */
 	zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
 	    dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db),
 	    dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL,
 	    dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL,
 	    &zb));
 
 	return (0);
 }
 
 /*
  * Hold the dnode of `object`, hand it to dnode_set_nlevels() together with
  * nlevels and tx, and drop the hold again.
  *
  * Returns the dnode_hold() error if the hold fails, otherwise whatever
  * dnode_set_nlevels() returned.
  */
 int
 dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error == 0) {
 		error = dnode_set_nlevels(dn, nlevels, tx);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Hold the dnode of `object`, pass it to dnode_set_blksz() with the
  * requested size and ibs, and drop the hold again.
  *
  * Returns the dnode_hold() error if the hold fails, otherwise whatever
  * dnode_set_blksz() returned.
  */
 int
 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error == 0) {
 		error = dnode_set_blksz(dn, size, ibs, tx);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Hold the dnode of `object` and call dnode_new_blkid() on it with maxblkid
  * while holding dn_struct_rwlock as writer.
  *
  * Returns the dnode_hold() error if the hold fails, 0 otherwise.
  */
 int
 dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error == 0) {
 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE);
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Store `checksum` in the dnode of `object` (dn_checksum) and mark the dnode
  * dirty in tx.  Failing to hold the dnode is fatal (VERIFY0).
  */
 void
 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's checksum function.  This
 	 * check ensures that the receiving system can understand the
 	 * checksum function transmitted.
 	 */
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	/*
 	 * NOTE(review): this looks redundant with the LEGACY_FUNCTIONS
 	 * assertion above if ZIO_CHECKSUM_LEGACY_FUNCTIONS is no larger than
 	 * ZIO_CHECKSUM_FUNCTIONS -- confirm against the enum definition.
 	 */
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
 	dn->dn_checksum = checksum;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * Store `compress` in the dnode of `object` (dn_compress) and mark the dnode
  * dirty in tx.  Failing to hold the dnode is fatal (VERIFY0).
  */
 void
 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's compression function.  This
 	 * check ensures that the receiving system can understand the
 	 * compression function transmitted.
 	 */
 	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	dn->dn_compress = compress;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * When the "redundant_metadata" property is set to "most", only indirect
  * blocks of this level and higher will have an additional ditto block.
  * dmu_write_policy() compares the block's level against this value.
  */
 static const int zfs_redundant_metadata_most_ditto_level = 2;
 
 /*
  * Compute the write properties for one block and store them in *zp.
  *
  *	os	objset supplying the default checksum, compression, dedup,
  *		copies, redundant_metadata and encryption settings
  *	dn	dnode the block belongs to; when NULL the block is treated
  *		as type DMU_OT_OBJSET
  *	level	indirection level of the block
  *	wp	WP_* flags; WP_SPILL, WP_NOFILL, WP_DMU_SYNC and
  *		WP_DIRECT_WR are examined here
  *	zp	output, filled in at the end of the function
  *
  * gang_copies is derived alongside copies and reported in zp_gang_copies.
  * NOTE(review): presumably the redundancy used for gang headers -- confirm
  * against the zio gang code, which is not visible here.
  */
 void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
 	    (wp & WP_SPILL));
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	uint8_t complevel = os->os_complevel;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
 	boolean_t dedup = B_FALSE;
 	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	boolean_t encrypt = B_FALSE;
 	int copies = os->os_copies;
+	int gang_copies = os->os_copies;
 
 	/*
 	 * We maintain different write policies for each of the following
 	 * types of data:
 	 *	 1. metadata
 	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
 	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
 		/*
 		 * XXX -- we should design a compression algorithm
 		 * that specializes in arrays of bps.
 		 */
 		compress = zio_compress_select(os->os_spa,
 		    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
 
 		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
 		 * ZBT-style checksum, then it's suitable for metadata
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
 		if (!(zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_METADATA) ||
 		    (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_EMBEDDED))
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
 
 		/*
 		 * Extra ditto copies for metadata, chosen by the
 		 * redundant_metadata setting of the objset.
 		 */
 		switch (os->os_redundant_metadata) {
 		case ZFS_REDUNDANT_METADATA_ALL:
 			copies++;
+			gang_copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_MOST:
 			if (level >= zfs_redundant_metadata_most_ditto_level ||
 			    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
 				copies++;
+			if (level + 1 >=
+			    zfs_redundant_metadata_most_ditto_level ||
+			    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
+				gang_copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_SOME:
-			if (DMU_OT_IS_CRITICAL(type))
+			if (DMU_OT_IS_CRITICAL(type)) {
 				copies++;
+				gang_copies++;
+			} else if (DMU_OT_IS_METADATA(type)) {
+				gang_copies++;
+			}
 			break;
 		case ZFS_REDUNDANT_METADATA_NONE:
 			break;
 		}
 
 		if (dmu_ddt_copies > 0) {
 			/*
 			 * If this tuneable is set, and this is a write for a
 			 * dedup entry store (zap or log), then we treat it
 			 * something like ZFS_REDUNDANT_METADATA_MOST on a
 			 * regular dataset: this many copies, and one more for
 			 * "higher" indirect blocks. This specific exception is
 			 * necessary because dedup objects are stored in the
 			 * MOS, which always has the highest possible copies.
 			 */
 			dmu_object_type_t stype =
 			    dn ? dn->dn_storage_type : DMU_OT_NONE;
 			if (stype == DMU_OT_NONE)
 				stype = type;
 			if (stype == DMU_OT_DDT_ZAP) {
 				copies = dmu_ddt_copies;
 				if (level >=
 				    zfs_redundant_metadata_most_ditto_level)
 					copies++;
 			}
 		}
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
 		/*
 		 * If we're writing preallocated blocks, we aren't actually
 		 * writing them so don't set any policy properties.  These
 		 * blocks are currently only used by an external subsystem
 		 * outside of zfs (i.e. dump) and not written by the zio
 		 * pipeline.
 		 */
 		compress = ZIO_COMPRESS_OFF;
 		checksum = ZIO_CHECKSUM_OFF;
 	} else {
 		/* Level-0 data: per-dnode settings refine the objset ones. */
 		compress = zio_compress_select(os->os_spa, dn->dn_compress,
 		    compress);
 		complevel = zio_complevel_select(os->os_spa, compress,
 		    complevel, complevel);
 
 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
 		    zio_checksum_select(dn->dn_checksum, checksum) :
 		    dedup_checksum;
 
 		/*
 		 * Determine dedup setting.  If we are in dmu_sync(),
 		 * we won't actually dedup now because that's all
 		 * done in syncing context; but we do want to use the
 		 * dedup checksum.  If the checksum is not strong
 		 * enough to ensure unique signatures, force
 		 * dedup_verify.
 		 */
 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
 			if (!(zio_checksum_table[checksum].ci_flags &
 			    ZCHECKSUM_FLAG_DEDUP))
 				dedup_verify = B_TRUE;
 		}
 
 		/*
 		 * Enable nopwrite if we have secure enough checksum
 		 * algorithm (see comment in zio_nop_write) and
 		 * compression is enabled.  We don't enable nopwrite if
 		 * dedup is enabled as the two features are mutually
 		 * exclusive.
 		 */
 		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE) &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
+
+		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+		    (os->os_redundant_metadata ==
+		    ZFS_REDUNDANT_METADATA_MOST &&
+		    zfs_redundant_metadata_most_ditto_level <= 1))
+			gang_copies++;
 	}
 
 	/*
 	 * All objects in an encrypted objset are protected from modification
 	 * via a MAC. Encrypted objects store their IV and salt in the last DVA
 	 * in the bp, so we cannot use all copies. Encrypted objects are also
 	 * not subject to nopwrite since writing the same data will still
 	 * result in a new ciphertext. Only encrypted blocks can be dedup'd
 	 * to avoid ambiguity in the dedup code since the DDT does not store
 	 * object types.
 	 */
 	if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
 		encrypt = B_TRUE;
 
 		if (DMU_OT_IS_ENCRYPTED(type)) {
 			copies = MIN(copies, SPA_DVAS_PER_BP - 1);
+			gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1);
 			nopwrite = B_FALSE;
 		} else {
 			dedup = B_FALSE;
 		}
 
 		if (level <= 0 &&
 		    (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) {
 			compress = ZIO_COMPRESS_EMPTY;
 		}
 	}
 
 	/*
 	 * Publish the result.  Copy counts are clamped to
 	 * spa_max_replication(), and the salt, IV and MAC are zeroed here.
 	 */
 	zp->zp_compress = compress;
 	zp->zp_complevel = complevel;
 	zp->zp_checksum = checksum;
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
 	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
+	zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
 	zp->zp_nopwrite = nopwrite;
 	zp->zp_encrypt = encrypt;
 	zp->zp_byteorder = ZFS_HOST_BYTEORDER;
 	zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
 	memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
 	memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
 	memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
 	zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
 	    os->os_zpl_special_smallblock : 0;
 	zp->zp_storage_type = dn ? dn->dn_storage_type : DMU_OT_NONE;
 
 	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
 }
 
 /*
  * Find the next data region (or hole, when `hole` is set) in an object,
  * starting from *off.
  *
  * Holes can only be reported accurately once all dirty data is on disk, and
  * forcing that is very expensive when seeking through a dirty file.  The
  * compromise: a clean dnode is searched directly, while a dirty dnode is
  * reported as having no holes by returning EBUSY, which is always safe.
  */
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	boolean_t retried = B_FALSE;
 	dnode_t *dn;
 	int error;
 
 	for (;;) {
 		error = dnode_hold(os, object, FTAG, &dn);
 		if (error != 0)
 			return (error);
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 		if (!dnode_is_dirty(dn)) {
 			int flags = DNODE_FIND_HAVELOCK;
 
 			if (hole)
 				flags |= DNODE_FIND_HOLE;
 			error = dnode_next_offset(dn, flags, off, 1, 1, 0);
 			break;
 		}
 
 		if (!zfs_dmu_offset_next_sync) {
 			error = SET_ERROR(EBUSY);
 			break;
 		}
 
 		/*
 		 * zfs_dmu_offset_next_sync asks for real hole reporting, so
 		 * the dirty dnode has to be synced out first.  A caller
 		 * holding an RL_READER rangelock over 0-UINT64_MAX needs only
 		 * one retry; callers without it are tolerated by giving up
 		 * with EBUSY (no holes reported) after that single retry.
 		 */
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 
 		if (retried)
 			return (SET_ERROR(EBUSY));
 
 		txg_wait_synced(dmu_objset_pool(os), 0);
 		retried = B_TRUE;
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	return (error);
 }
 
 /*
  * Copy the level-0 block pointers backing [offset, offset + length) of an
  * object into bps[], one per held dbuf.
  *
  * On entry *nbpsp is the capacity of bps[] (only ASSERTed, not checked at
  * run time); on success it is updated to the number of entries filled in.
  *
  * Returns 0 on success, or:
  *	ENXIO	dmu_buf_hold_array() reported ESRCH
  *	EAGAIN	a buffer has a dirty record that is not a block clone
  *		(dr_brtwrite), or its block was born after the last
  *		synced txg
  *	EINVAL	a block pointer is non-hole metadata, or was born after
  *		spa_freeze_txg()
  * Any other dmu_buf_hold_array() error is returned unchanged.
  */
 int
 dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
     blkptr_t *bps, size_t *nbpsp)
 {
 	dmu_buf_t **dbp, *dbuf;
 	dmu_buf_impl_t *db;
 	blkptr_t *bp;
 	int error, numbufs;
 
 	error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
 	    &numbufs, &dbp);
 	if (error != 0) {
 		if (error == ESRCH) {
 			error = SET_ERROR(ENXIO);
 		}
 		return (error);
 	}
 
 	ASSERT3U(numbufs, <=, *nbpsp);
 
 	for (int i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 
 		/* db_mtx covers the dirty-record list and db_blkptr reads. */
 		mutex_enter(&db->db_mtx);
 
 		if (!list_is_empty(&db->db_dirty_records)) {
 			dbuf_dirty_record_t *dr;
 
 			dr = list_head(&db->db_dirty_records);
 			if (dr->dt.dl.dr_brtwrite) {
 				/*
 				 * This is very special case where we clone a
 				 * block and in the same transaction group we
 				 * read its BP (most likely to clone the clone).
 				 */
 				bp = &dr->dt.dl.dr_overridden_by;
 			} else {
 				/*
 				 * The block was modified in the same
 				 * transaction group.
 				 */
 				mutex_exit(&db->db_mtx);
 				error = SET_ERROR(EAGAIN);
 				goto out;
 			}
 		} else {
 			bp = db->db_blkptr;
 		}
 
 		mutex_exit(&db->db_mtx);
 
 		if (bp == NULL) {
 			/*
 			 * The file size was increased, but the block was never
 			 * written, otherwise we would either have the block
 			 * pointer or the dirty record and would not get here.
 			 * It is effectively a hole, so report it as such.
 			 */
 			BP_ZERO(&bps[i]);
 			continue;
 		}
 		/*
 		 * Make sure we clone only data blocks.
 		 */
 		if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		/*
 		 * If the block was allocated in transaction group that is not
 		 * yet synced, we could clone it, but we couldn't write this
 		 * operation into ZIL, or it may be impossible to replay, since
 		 * the block may appear not yet allocated at that point.
 		 */
 		if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 		if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
 			error = SET_ERROR(EAGAIN);
 			goto out;
 		}
 
 		bps[i] = *bp;
 	}
 
 	/* Success: report how many entries of bps[] were filled in. */
 	*nbpsp = numbufs;
 out:
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (error);
 }
 
 /*
  * Clone the block pointers bps[0..nbps) into the level-0 blocks covering
  * [offset, offset + length) of an object, within transaction tx.
  *
  * Every non-hole bp must have a logical size equal to the size of its dbuf;
  * this is checked for all buffers before any of them is touched, and a
  * mismatch returns EXDEV.  Failing to hold the buffers is fatal (VERIFY0).
  * Returns 0 on success.
  */
 int
 dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
     dmu_tx_t *tx, const blkptr_t *bps, size_t nbps)
 {
 	spa_t *spa;
 	dmu_buf_t **dbp, *dbuf;
 	dmu_buf_impl_t *db;
 	struct dirty_leaf *dl;
 	dbuf_dirty_record_t *dr;
 	const blkptr_t *bp;
 	int error = 0, i, numbufs;
 
 	spa = os->os_spa;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
 	    &numbufs, &dbp));
 	ASSERT3U(nbps, ==, numbufs);
 
 	/*
 	 * Before we start cloning make sure that the dbufs sizes match new BPs
 	 * sizes. If they don't, that's a no-go, as we are not able to shrink
 	 * dbufs.
 	 */
 	for (i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
 		ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT);
 		ASSERT0(db->db_level);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(db->db_blkid != DMU_SPILL_BLKID);
 
 		if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) {
 			error = SET_ERROR(EXDEV);
 			goto out;
 		}
 	}
 
 	for (i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
 		dmu_buf_will_clone_or_dio(dbuf, tx);
 
 		mutex_enter(&db->db_mtx);
 
 		/* The head dirty record must belong to tx's txg. */
 		dr = list_head(&db->db_dirty_records);
 		VERIFY(dr != NULL);
 		ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 		dl = &dr->dt.dl;
 		ASSERT0(dl->dr_has_raw_params);
 		dl->dr_overridden_by = *bp;
 		/*
 		 * Restamp the copied bp with this txg as its (logical) birth,
 		 * except for holes whose logical birth is 0.
 		 */
 		if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
 			if (!BP_IS_EMBEDDED(bp)) {
 				BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg,
 				    BP_GET_BIRTH(bp));
 			} else {
 				BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
 				    dr->dr_txg);
 			}
 		}
 		dl->dr_brtwrite = B_TRUE;
 		dl->dr_override_state = DR_OVERRIDDEN;
 
 		mutex_exit(&db->db_mtx);
 
 		/*
 		 * When data is embedded into BP there is no need to create
 		 * BRT entry as there is no data block. Just copy the BP as
 		 * it contains the data.
 		 */
 		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_add(spa, bp, tx);
 		}
 	}
 out:
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (error);
 }
 
 /*
  * Fill *doi from the fields of dnode dn and of its dn_phys.  This function
  * takes no locks itself.
  */
 void
 __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	dnode_phys_t *phys = dn->dn_phys;
 
 	/* Block sizes; an indirect block shift of 0 reports size 0. */
 	doi->doi_data_block_size = dn->dn_datablksz;
 	if (dn->dn_indblkshift)
 		doi->doi_metadata_block_size = 1ULL << dn->dn_indblkshift;
 	else
 		doi->doi_metadata_block_size = 0;
 
 	/* Types, bonus buffer and dnode geometry. */
 	doi->doi_type = dn->dn_type;
 	doi->doi_bonus_type = dn->dn_bonustype;
 	doi->doi_bonus_size = dn->dn_bonuslen;
 	doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
 	doi->doi_indirection = dn->dn_nlevels;
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
 	doi->doi_nblkptr = dn->dn_nblkptr;
 
 	/* Used bytes in 512-byte units, rounded to the nearest unit. */
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(phys) + 256) >> 9;
 	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 
 	/* The fill count is the sum over the dnode's block pointers. */
 	doi->doi_fill_count = 0;
 	for (int idx = 0; idx < phys->dn_nblkptr; idx++)
 		doi->doi_fill_count += BP_GET_FILL(&phys->dn_blkptr[idx]);
 }
 
 /*
  * Locked wrapper around __dmu_object_info_from_dnode(): fills *doi while
  * holding dn_struct_rwlock as reader and dn_mtx.
  */
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	/* dn_struct_rwlock is taken first and released last. */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	mutex_enter(&dn->dn_mtx);
 
 	__dmu_object_info_from_dnode(dn, doi);
 
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Look up a DMU object and, when doi is non-NULL, describe it in *doi.
  * With a NULL doi the return value alone tells whether the object exists.
  *
  * Returns the dnode_hold() error on failure, 0 otherwise.
  */
 int
 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	if (doi != NULL)
 		dmu_object_info_from_dnode(dn, doi);
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Same result as dmu_object_info(), but cheaper when the caller already
  * holds a dbuf of the object: the dnode is reached through the dbuf.
  */
 void
 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dnode;
 
 	DB_DNODE_ENTER(dbuf);
 	dnode = DB_DNODE(dbuf);
 	dmu_object_info_from_dnode(dnode, doi);
 	DB_DNODE_EXIT(dbuf);
 }
 
 /*
  * Cheapest variant, for callers that only want sizes: reports the data
  * block size in *blksize and the space used, in 512-byte units, in
  * *nblk512.
  */
 void
 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
     u_longlong_t *nblk512)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dnode;
 
 	DB_DNODE_ENTER(dbuf);
 	dnode = DB_DNODE(dbuf);
 
 	*blksize = dnode->dn_datablksz;
 	/* the slots occupied by the dnode itself are counted as well */
 	*nblk512 = ((DN_USED_BYTES(dnode->dn_phys) + SPA_MINBLOCKSIZE/2) >>
 	    SPA_MINBLOCKSHIFT) + dnode->dn_num_slots;
 	DB_DNODE_EXIT(dbuf);
 }
 
 /*
  * Report in *dnsize the size of the dnode behind a held dbuf, computed as
  * dn_num_slots << DNODE_SHIFT.
  */
 void
 dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dnode;
 
 	DB_DNODE_ENTER(dbuf);
 	dnode = DB_DNODE(dbuf);
 	*dnsize = dnode->dn_num_slots << DNODE_SHIFT;
 	DB_DNODE_EXIT(dbuf);
 }
 
 /*
  * Byte-swap, in place, every 64-bit word of the size-byte buffer at vbuf.
  * size must be a multiple of 8 (asserted).
  */
 void
 byteswap_uint64_array(void *vbuf, size_t size)
 {
 	uint64_t *buf = vbuf;
 	size_t count = size >> 3;
 
 	ASSERT((size & 7) == 0);
 
 	/*
 	 * The index has the same type as count: no signed/unsigned
 	 * comparison, and no signed overflow for very large buffers.
 	 */
 	for (size_t i = 0; i < count; i++)
 		buf[i] = BSWAP_64(buf[i]);
 }
 
 /*
  * Byte-swap, in place, every 32-bit word of the size-byte buffer at vbuf.
  * size must be a multiple of 4 (asserted).
  */
 void
 byteswap_uint32_array(void *vbuf, size_t size)
 {
 	uint32_t *buf = vbuf;
 	size_t count = size >> 2;
 
 	ASSERT((size & 3) == 0);
 
 	/*
 	 * The index has the same type as count: no signed/unsigned
 	 * comparison, and no signed overflow for very large buffers.
 	 */
 	for (size_t i = 0; i < count; i++)
 		buf[i] = BSWAP_32(buf[i]);
 }
 
 /*
  * Byte-swap, in place, every 16-bit word of the size-byte buffer at vbuf.
  * size must be a multiple of 2 (asserted).
  */
 void
 byteswap_uint16_array(void *vbuf, size_t size)
 {
 	uint16_t *buf = vbuf;
 	size_t count = size >> 1;
 
 	ASSERT((size & 1) == 0);
 
 	/*
 	 * The index has the same type as count: no signed/unsigned
 	 * comparison, and no signed overflow for very large buffers.
 	 */
 	for (size_t i = 0; i < count; i++)
 		buf[i] = BSWAP_16(buf[i]);
 }
 
 /*
  * Single bytes have no byte order, so there is nothing to swap; both
  * arguments are deliberately unused.
  */
 void
 byteswap_uint8_array(void *vbuf, size_t size)
 {
 	(void) vbuf;
 	(void) size;
 }
 
 /*
  * Run the init routines of the subsystems listed below, in this order.
  * dmu_fini() calls the matching fini routines.
  */
 void
 dmu_init(void)
 {
 	abd_init();
 	zfs_dbgmsg_init();
 	sa_cache_init();
 	dmu_objset_init();
 	dnode_init();
 	zfetch_init();
 	dmu_tx_init();
 	l2arc_init();
 	arc_init();
 	dbuf_init();
 }
 
 /*
  * Run the fini routine of every subsystem that dmu_init() initialized.
  * The order is not a strict mirror of dmu_init(); see the note on
  * arc_fini() below.
  */
 void
 dmu_fini(void)
 {
 	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	dmu_tx_fini();
 	zfetch_fini();
 	dbuf_fini();
 	dnode_fini();
 	dmu_objset_fini();
 	sa_cache_fini();
 	zfs_dbgmsg_fini();
 	abd_fini();
 }
 
 /* Names from the DMU that are published through EXPORT_SYMBOL(). */
 EXPORT_SYMBOL(dmu_bonus_hold);
 EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
 EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
 EXPORT_SYMBOL(dmu_buf_rele_array);
 EXPORT_SYMBOL(dmu_prefetch);
 EXPORT_SYMBOL(dmu_prefetch_by_dnode);
 EXPORT_SYMBOL(dmu_prefetch_dnode);
 EXPORT_SYMBOL(dmu_free_range);
 EXPORT_SYMBOL(dmu_free_long_range);
 EXPORT_SYMBOL(dmu_free_long_object);
 EXPORT_SYMBOL(dmu_read);
 EXPORT_SYMBOL(dmu_read_by_dnode);
 EXPORT_SYMBOL(dmu_read_uio);
 EXPORT_SYMBOL(dmu_read_uio_dbuf);
 EXPORT_SYMBOL(dmu_read_uio_dnode);
 EXPORT_SYMBOL(dmu_write);
 EXPORT_SYMBOL(dmu_write_by_dnode);
 EXPORT_SYMBOL(dmu_write_by_dnode_flags);
 EXPORT_SYMBOL(dmu_write_uio);
 EXPORT_SYMBOL(dmu_write_uio_dbuf);
 EXPORT_SYMBOL(dmu_write_uio_dnode);
 EXPORT_SYMBOL(dmu_prealloc);
 EXPORT_SYMBOL(dmu_object_info);
 EXPORT_SYMBOL(dmu_object_info_from_dnode);
 EXPORT_SYMBOL(dmu_object_info_from_db);
 EXPORT_SYMBOL(dmu_object_size_from_db);
 EXPORT_SYMBOL(dmu_object_dnsize_from_db);
 EXPORT_SYMBOL(dmu_object_set_nlevels);
 EXPORT_SYMBOL(dmu_object_set_blocksize);
 EXPORT_SYMBOL(dmu_object_set_maxblkid);
 EXPORT_SYMBOL(dmu_object_set_checksum);
 EXPORT_SYMBOL(dmu_object_set_compress);
 EXPORT_SYMBOL(dmu_offset_next);
 EXPORT_SYMBOL(dmu_write_policy);
 EXPORT_SYMBOL(dmu_sync);
 EXPORT_SYMBOL(dmu_request_arcbuf);
 EXPORT_SYMBOL(dmu_return_arcbuf);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf);
 EXPORT_SYMBOL(dmu_buf_hold);
 EXPORT_SYMBOL(dmu_ot);
 
 /*
  * Module parameters; each one is declared ZMOD_RW and carries its own
  * description string.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW,
 	"Enable NOP writes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW,
 	"Percentage of dirtied blocks from frees in one TXG");
 
 ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
 	"Enable forcing txg sync to find holes");
 
 ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
 	"Limit one prefetch call to this size");
 
 ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
 	"Override copies= for dedup objects");
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index 91e3ca1cf277..a636ae73bbd7 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1,3846 +1,3849 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2019, 2024, Klara, Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2022 Axcient.
  */
 
 #include <sys/arc.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_recv.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zvol.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_znode.h>
 #include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/zfeature.h>
 #include <sys/bqueue.h>
 #include <sys/objlist.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 #include <sys/zfs_file.h>
 
 /*
  * File-scope receive settings.  NOTE(review): their consumers are outside
  * this view; the names suggest queue sizing, write batching and
  * corrective-receive behavior -- confirm before relying on that reading.
  */
 static uint_t zfs_recv_queue_length = SPA_MAXBLOCKSIZE;
 static uint_t zfs_recv_queue_ff = 20;
 static uint_t zfs_recv_write_batch_size = 1024 * 1024;
 static int zfs_recv_best_effort_corrective = 0;
 
 static const void *const dmu_recv_tag = "dmu_recv_tag";
 const char *const recv_clone_name = "%recv";
 
 /* Tri-state type of the or_need_sync field in struct receive_writer_arg. */
 typedef enum {
 	ORNS_NO,
 	ORNS_YES,
 	ORNS_MAYBE
 } or_need_sync_t;
 
 /* Forward declaration; the definition appears later in this file. */
 static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len,
     void *buf);
 
 /*
  * One record taken from the receive stream: its header plus the payload,
  * held either in `payload` or, for WRITE and SPILL records, in `abd`.
  * NOTE(review): `node` presumably links the record into a bqueue_t --
  * confirm against the code that enqueues these.
  */
 struct receive_record_arg {
 	dmu_replay_record_t header;
 	void *payload; /* Pointer to a buffer containing the payload */
 	/*
 	 * If the record is a WRITE or SPILL, pointer to the abd containing the
 	 * payload.
 	 */
 	abd_t *abd;
 	int payload_size;
 	uint64_t bytes_read; /* bytes read from stream when record created */
 	boolean_t eos_marker; /* Marks the end of the stream */
 	bqueue_node_t node;
 };
 
 /*
  * State of the receive writer: the objset being written, the record queue,
  * completion signalling towards the main thread, stream flags, progress
  * counters, the pending write batch, and the parameters of the last
  * DRR_OBJECT_RANGE record.
  */
 struct receive_writer_arg {
 	objset_t *os;
 	boolean_t byteswap;
 	bqueue_t q;
 
 	/*
 	 * These three members are used to signal to the main thread when
 	 * we're done.
 	 */
 	kmutex_t mutex;
 	kcondvar_t cv;
 	boolean_t done;
 
 	int err;
 	const char *tofs;
 	boolean_t heal;
 	boolean_t resumable;
 	boolean_t raw;   /* DMU_BACKUP_FEATURE_RAW set */
 	boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
 	boolean_t full;  /* this is a full send stream */
 	uint64_t last_object;
 	uint64_t last_offset;
 	uint64_t max_object; /* highest object ID referenced in stream */
 	uint64_t bytes_read; /* bytes read when current record created */
 
 	list_t write_batch;
 
 	/* Encryption parameters for the last received DRR_OBJECT_RANGE */
 	boolean_t or_crypt_params_present;
 	uint64_t or_firstobj;
 	uint64_t or_numslots;
 	uint8_t or_salt[ZIO_DATA_SALT_LEN];
 	uint8_t or_iv[ZIO_DATA_IV_LEN];
 	uint8_t or_mac[ZIO_DATA_MAC_LEN];
 	boolean_t or_byteorder;
 	zio_t *heal_pio;
 
 	/* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */
 	or_need_sync_t or_need_sync;
 };
 
 /*
  * Argument bundle for the receive-begin checks: origin name, receive
  * cookie, credentials, process and crypto parameters.  redact_check()
  * reads drba_cookie from it.
  */
 typedef struct dmu_recv_begin_arg {
 	const char *drba_origin;
 	dmu_recv_cookie_t *drba_cookie;
 	cred_t *drba_cred;
 	proc_t *drba_proc;
 	dsl_crypto_params_t *drba_dcp;
 } dmu_recv_begin_arg_t;
 
 /*
  * Byte-swap a replay record in place.  drr_type and drr_payloadlen are
  * swapped first; the already-swapped drr_type then selects which per-type
  * fields of drr_u get swapped.  Unknown record types get no per-type
  * swapping.  Every record other than DRR_BEGIN also has its
  * drr_checksum.drr_checksum swapped.
  */
 static void
 byteswap_record(dmu_replay_record_t *drr)
 {
 /* Swap one 64-bit / 32-bit member of drr->drr_u in place. */
 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
 	drr->drr_type = BSWAP_32(drr->drr_type);
 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
 
 	switch (drr->drr_type) {
 	case DRR_BEGIN:
 		DO64(drr_begin.drr_magic);
 		DO64(drr_begin.drr_versioninfo);
 		DO64(drr_begin.drr_creation_time);
 		DO32(drr_begin.drr_type);
 		DO32(drr_begin.drr_flags);
 		DO64(drr_begin.drr_toguid);
 		DO64(drr_begin.drr_fromguid);
 		break;
 	case DRR_OBJECT:
 		DO64(drr_object.drr_object);
 		DO32(drr_object.drr_type);
 		DO32(drr_object.drr_bonustype);
 		DO32(drr_object.drr_blksz);
 		DO32(drr_object.drr_bonuslen);
 		DO32(drr_object.drr_raw_bonuslen);
 		DO64(drr_object.drr_toguid);
 		DO64(drr_object.drr_maxblkid);
 		break;
 	case DRR_FREEOBJECTS:
 		DO64(drr_freeobjects.drr_firstobj);
 		DO64(drr_freeobjects.drr_numobjs);
 		DO64(drr_freeobjects.drr_toguid);
 		break;
 	case DRR_WRITE:
 		DO64(drr_write.drr_object);
 		DO32(drr_write.drr_type);
 		DO64(drr_write.drr_offset);
 		DO64(drr_write.drr_logical_size);
 		DO64(drr_write.drr_toguid);
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
 		DO64(drr_write.drr_key.ddk_prop);
 		DO64(drr_write.drr_compressed_size);
 		break;
 	case DRR_WRITE_EMBEDDED:
 		DO64(drr_write_embedded.drr_object);
 		DO64(drr_write_embedded.drr_offset);
 		DO64(drr_write_embedded.drr_length);
 		DO64(drr_write_embedded.drr_toguid);
 		DO32(drr_write_embedded.drr_lsize);
 		DO32(drr_write_embedded.drr_psize);
 		break;
 	case DRR_FREE:
 		DO64(drr_free.drr_object);
 		DO64(drr_free.drr_offset);
 		DO64(drr_free.drr_length);
 		DO64(drr_free.drr_toguid);
 		break;
 	case DRR_SPILL:
 		DO64(drr_spill.drr_object);
 		DO64(drr_spill.drr_length);
 		DO64(drr_spill.drr_toguid);
 		DO64(drr_spill.drr_compressed_size);
 		DO32(drr_spill.drr_type);
 		break;
 	case DRR_OBJECT_RANGE:
 		DO64(drr_object_range.drr_firstobj);
 		DO64(drr_object_range.drr_numslots);
 		DO64(drr_object_range.drr_toguid);
 		break;
 	case DRR_REDACT:
 		DO64(drr_redact.drr_object);
 		DO64(drr_redact.drr_offset);
 		DO64(drr_redact.drr_length);
 		DO64(drr_redact.drr_toguid);
 		break;
 	case DRR_END:
 		DO64(drr_end.drr_toguid);
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
 		break;
 	default:
 		break;
 	}
 
 	/* DRR_BEGIN is the one record type whose checksum is not swapped. */
 	if (drr->drr_type != DRR_BEGIN) {
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
 	}
 
 #undef DO64
 #undef DO32
 }
 
 /*
  * Return B_TRUE if guid appears among the first num_snaps entries of snaps,
  * and B_FALSE otherwise (including when num_snaps is 0).
  */
 static boolean_t
 redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid)
 {
 	/* Index with the count's own type to avoid a signed/unsigned compare. */
 	for (uint64_t i = 0; i < num_snaps; i++) {
 		if (snaps[i] == guid)
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Check that the new stream we're trying to receive is redacted with respect to
  * a subset of the snapshots that the origin was redacted with respect to.  For
  * the reasons behind this, see the man page on redacted zfs sends and receives.
  *
  * Returns B_TRUE when every guid in redact_snaps[0 .. num_redact_snaps) is
  * also present in origin_snaps[0 .. origin_num_snaps), and B_FALSE otherwise.
  * An empty redact_snaps list is therefore always compatible.
  */
 static boolean_t
 compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps,
     uint64_t *redact_snaps, uint64_t num_redact_snaps)
 {
 	/*
 	 * Short circuit the comparison; if we are redacted with respect to
 	 * more snapshots than the origin, we can't be redacted with respect
 	 * to a subset.
 	 */
 	if (num_redact_snaps > origin_num_snaps) {
 		return (B_FALSE);
 	}
 
 	/* The index shares the count's type so it cannot overflow first. */
 	for (uint64_t i = 0; i < num_redact_snaps; i++) {
 		if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
 		    redact_snaps[i])) {
 			return (B_FALSE);
 		}
 	}
 	return (B_TRUE);
 }
 
 /*
  * Decide whether the stream described by drba may be received on top of the
  * redacted dataset "origin".  Returns B_TRUE if the receive is acceptable and
  * B_FALSE if it is not.
  */
 static boolean_t
 redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin)
 {
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	struct drr_begin *drrb = drc->drc_drrb;
 	int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 	uint64_t *origin_snaps;
 	uint64_t origin_num_snaps;
 	uint64_t *snaps;
 	uint_t nsnaps;
 
 	/* A full send stream is safe no matter what. */
 	if (drrb->drr_fromguid == 0)
 		return (B_TRUE);
 
 	/* Fetch the list of snapshots the origin is redacted against. */
 	VERIFY(dsl_dataset_get_uint64_array_feature(origin,
 	    SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps));
 
 	/*
 	 * A stream sent from a redaction bookmark, or from the redacted
 	 * version of the dataset, is safe as long as that bookmark or
 	 * dataset is compatible with the origin's redaction list.
 	 */
 	if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 	    BEGINNV_REDACT_FROM_SNAPS, &snaps, &nsnaps) == 0) {
 		return (compatible_redact_snaps(origin_snaps,
 		    origin_num_snaps, snaps, nsnaps));
 	}
 
 	/*
 	 * A redacted stream must be redacted with respect to a subset of
 	 * what the origin is redacted with respect to.  See case number 2
 	 * in the zfs man page section on redacted zfs send.
 	 */
 	if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
 		if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_SNAPS, &snaps, &nsnaps) != 0)
 			return (B_FALSE);
 		return (compatible_redact_snaps(origin_snaps,
 		    origin_num_snaps, snaps, nsnaps));
 	}
 
 	/*
 	 * The stream isn't redacted but the origin is, so the stream's
 	 * snapshot must be one of those the origin is redacted with respect
 	 * to.  See case number 1 in the zfs man page section on redacted
 	 * zfs send.
 	 */
 	return (redact_snaps_contains(origin_snaps, origin_num_snaps,
 	    drrb->drr_toguid));
 }
 
 /*
  * A dataset that was previously received with --large-block cannot take an
  * incremental that was sent without --large-block; doing so would force a
  * read-modify-write or require re-aggregating a string of WRITE records.
  *
  * Returns 0 when the stream is acceptable for ds, otherwise
  * ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH.
  */
 static int
 recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags)
 {
 	/* Nothing to enforce unless the dataset already uses large blocks. */
 	if (!dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS))
 		return (0);
 
 	/* The stream carries large blocks too, so the two agree. */
 	if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)
 		return (0);
 
 	return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH));
 }
 
 /*
  * Checks made by dmu_recv_begin_check() when the target filesystem (ds)
  * already exists.  fromguid is the stream's drr_fromguid and featureflags
  * holds the stream's feature flags.
  *
  * Three cases are handled: a healing receive (drc_heal set), an incremental
  * receive (fromguid != 0) and a full receive over an existing dataset.
  *
  * Returns 0 if the receive may proceed, otherwise an error code.  On the
  * incremental path a successful check also sets drc_fromsnapobj in the
  * receive cookie.
  */
 static int
 recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
     uint64_t fromguid, uint64_t featureflags)
 {
 	uint64_t obj;
 	uint64_t children;
 	int error;
 	dsl_dataset_t *snap;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0;
 	boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
 	boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0;
 
 	/* Temporary clone name must not exist. */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
 	    8, 1, &obj);
 	if (error != ENOENT)
 		return (error == 0 ? SET_ERROR(EBUSY) : error);
 
 	/* Resume state must not be set. */
 	if (dsl_dataset_has_resume_receive_state(ds))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * New snapshot name must not exist if we're not healing it.  When
 	 * healing, the lookup must succeed; obj then holds the value found
 	 * under that snapshot name and is used further down to hold the
 	 * snapshot.
 	 */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 	    drba->drba_cookie->drc_tosnap, 8, 1, &obj);
 	if (drba->drba_cookie->drc_heal) {
 		if (error != 0)
 			return (error);
 	} else if (error != ENOENT) {
 		return (error == 0 ? SET_ERROR(EEXIST) : error);
 	}
 
 	/* Must not have children if receiving a ZVOL. */
 	error = zap_count(dp->dp_meta_objset,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
 	if (error != 0)
 		return (error);
 	if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS &&
 	    children > 0)
 		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
 
 	/*
 	 * Check snapshot limit before receiving. We'll recheck again at the
 	 * end, but might as well abort before receiving if we're already over
 	 * the limit.
 	 *
 	 * Note that we do not check the file system limit with
 	 * dsl_dir_fscount_check because the temporary %clones don't count
 	 * against that limit.
 	 */
 	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
 	    NULL, drba->drba_cred, drba->drba_proc);
 	if (error != 0)
 		return (error);
 
 	if (drba->drba_cookie->drc_heal) {
 		/* Encryption is incompatible with embedded data. */
 		if (encrypted && embed)
 			return (SET_ERROR(EINVAL));
 
 		/* Healing is not supported when in 'force' mode. */
 		if (drba->drba_cookie->drc_force)
 			return (SET_ERROR(EINVAL));
 
 		/* Must have keys loaded if doing encrypted non-raw recv. */
 		if (encrypted && !raw) {
 			if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object,
 			    NULL, NULL) != 0)
 				return (SET_ERROR(EACCES));
 		}
 
 		/* Hold the snapshot being healed (obj came from the lookup). */
 		error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * When not doing best effort corrective recv healing can only
 		 * be done if the send stream is for the same snapshot as the
 		 * one we are trying to heal.
 		 */
 		if (zfs_recv_best_effort_corrective == 0 &&
 		    drba->drba_cookie->drc_drrb->drr_toguid !=
 		    dsl_dataset_phys(snap)->ds_guid) {
 			dsl_dataset_rele(snap, FTAG);
 			return (SET_ERROR(ENOTSUP));
 		}
 		dsl_dataset_rele(snap, FTAG);
 	} else if (fromguid != 0) {
 		/* Sanity check the incremental recv */
 		/* NB: this obj shadows the function-scope obj declared above. */
 		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 
 		/* Can't perform a raw receive on top of a non-raw receive */
 		if (!encrypted && raw)
 			return (SET_ERROR(EINVAL));
 
 		/* Encryption is incompatible with embedded data */
 		if (encrypted && embed)
 			return (SET_ERROR(EINVAL));
 
 		/* Find snapshot in this dir that matches fromguid. */
 		while (obj != 0) {
 			error = dsl_dataset_hold_obj(dp, obj, FTAG,
 			    &snap);
 			if (error != 0)
 				return (SET_ERROR(ENODEV));
 			if (snap->ds_dir != ds->ds_dir) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 			/* On a match, leave the loop with snap still held. */
 			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
 				break;
 			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 			dsl_dataset_rele(snap, FTAG);
 		}
 		/* Reached the end of the snapshot chain without a match. */
 		if (obj == 0)
 			return (SET_ERROR(ENODEV));
 
 		if (drba->drba_cookie->drc_force) {
 			drba->drba_cookie->drc_fromsnapobj = obj;
 		} else {
 			/*
 			 * If we are not forcing, there must be no
 			 * changes since fromsnap. Raw sends have an
 			 * additional constraint that requires that
 			 * no "noop" snapshots exist between fromsnap
 			 * and tosnap for the IVset checking code to
 			 * work properly.
 			 */
 			if (dsl_dataset_modified_since_snap(ds, snap) ||
 			    (raw &&
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj !=
 			    snap->ds_object)) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ETXTBSY));
 			}
 			drba->drba_cookie->drc_fromsnapobj =
 			    ds->ds_prev->ds_object;
 		}
 
 		/* snap is still held here; every exit below releases it. */
 		if (dsl_dataset_feature_is_active(snap,
 		    SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba,
 		    snap)) {
 			dsl_dataset_rele(snap, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = recv_check_large_blocks(snap, featureflags);
 		if (error != 0) {
 			dsl_dataset_rele(snap, FTAG);
 			return (error);
 		}
 
 		dsl_dataset_rele(snap, FTAG);
 	} else {
 		/* If full and not healing then must be forced. */
 		if (!drba->drba_cookie->drc_force)
 			return (SET_ERROR(EEXIST));
 
 		/*
 		 * We don't support using zfs recv -F to blow away
 		 * encrypted filesystems. This would require the
 		 * dsl dir to point to the old encryption key and
 		 * the new one at the same time during the receive.
 		 */
 		if ((!encrypted && raw) || encrypted)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Perform the same encryption checks we would if
 		 * we were creating a new dataset from scratch.
 		 */
 		if (!raw) {
 			boolean_t will_encrypt;
 
 			error = dmu_objset_create_crypt_check(
 			    ds->ds_dir->dd_parent, drba->drba_dcp,
 			    &will_encrypt);
 			if (error != 0)
 				return (error);
 
 			if (will_encrypt && embed)
 				return (SET_ERROR(EINVAL));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Check that any feature flags used in the data stream we're receiving are
  * supported by the pool we are receiving into.
  *
  * Note that some of the features we explicitly check here have additional
  * (implicit) features they depend on, but those dependencies are enforced
  * through the zfeature_register() calls declaring the features that we
  * explicitly check.
  *
  * Returns 0 when the stream can be received into spa,
  * ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE when the stream carries unsupported
  * feature flags, and ENOTSUP when the pool lacks something the stream needs.
  */
 static int
 recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa)
 {
 	/*
 	 * Stream features that are only receivable when the paired pool
 	 * feature is enabled.  LZ4 compressed, ZSTD compressed, embedded,
 	 * large blocks and large_dnodes are listed because we don't attempt
 	 * to decompress / un-embed / split up the blocks / dnodes during the
 	 * receive process.  Redacted streams require redacted datasets, and
 	 * LONGNAME streams require the longname feature on the target.
 	 */
 	static const struct {
 		uint64_t	rf_stream_flag;
 		int		rf_pool_feature;	/* SPA_FEATURE_* value */
 	} required[] = {
 		{ DMU_BACKUP_FEATURE_LZ4, SPA_FEATURE_LZ4_COMPRESS },
 		{ DMU_BACKUP_FEATURE_ZSTD, SPA_FEATURE_ZSTD_COMPRESS },
 		{ DMU_BACKUP_FEATURE_EMBED_DATA, SPA_FEATURE_EMBEDDED_DATA },
 		{ DMU_BACKUP_FEATURE_LARGE_BLOCKS, SPA_FEATURE_LARGE_BLOCKS },
 		{ DMU_BACKUP_FEATURE_LARGE_DNODE, SPA_FEATURE_LARGE_DNODE },
 		{ DMU_BACKUP_FEATURE_LARGE_MICROZAP,
 		    SPA_FEATURE_LARGE_MICROZAP },
 		{ DMU_BACKUP_FEATURE_REDACTED, SPA_FEATURE_REDACTED_DATASETS },
 		{ DMU_BACKUP_FEATURE_LONGNAME, SPA_FEATURE_LONGNAME },
 	};
 
 	/* Reject the stream if it has any unsupported feature flags. */
 	if (!DMU_STREAM_SUPPORTED(featureflags))
 		return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE));
 
 	/* Verify pool version supports SA if SA_SPILL feature set */
 	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
 	    spa_version(spa) < SPA_VERSION_SA)
 		return (SET_ERROR(ENOTSUP));
 
 	for (size_t i = 0; i < sizeof (required) / sizeof (required[0]); i++) {
 		if ((featureflags & required[i].rf_stream_flag) &&
 		    !spa_feature_is_enabled(spa, required[i].rf_pool_feature))
 			return (SET_ERROR(ENOTSUP));
 	}
 
 	return (0);
 }
 
 /*
  * Validate a non-resuming receive before any state is changed.  arg is the
  * dmu_recv_begin_arg_t describing the receive and tx identifies the pool.
  *
  * If the target filesystem exists, the checks are delegated to
  * recv_begin_check_existing_impl().  If it does not exist (ENOENT), the
  * stream must be a full send or a clone, and the parent of the target is
  * checked instead (encryption rules, filesystem and snapshot limits, parent
  * type, and the clone origin when one was supplied).
  *
  * Returns 0 if the receive may begin, otherwise an error code.  Every hold
  * taken here is released before returning.
  */
 static int
 dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
 	uint64_t fromguid = drrb->drr_fromguid;
 	int flags = drrb->drr_flags;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	int error;
 	uint64_t featureflags = drba->drba_cookie->drc_featureflags;
 	dsl_dataset_t *ds;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 	ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
 
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM ||
 	    drrb->drr_type >= DMU_OST_NUMTYPES ||
 	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
 		return (SET_ERROR(EINVAL));
 
 	error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa);
 	if (error != 0)
 		return (error);
 
 	/* Resumable receives require extensible datasets */
 	if (drba->drba_cookie->drc_resumable &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
 		return (SET_ERROR(ENOTSUP));
 
 	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 		/* raw receives require the encryption feature */
 		if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION))
 			return (SET_ERROR(ENOTSUP));
 
 		/* embedded data is incompatible with encryption and raw recv */
 		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
 			return (SET_ERROR(EINVAL));
 
 		/* raw receives require spill block allocation flag */
 		if (!(flags & DRR_FLAG_SPILL_BLOCK))
 			return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
 	} else {
 		/*
 		 * We support unencrypted datasets below encrypted ones now,
 		 * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing
 		 * with a dataset we may encrypt.
 		 */
 		if (drba->drba_dcp == NULL ||
 		    drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) {
 			dsflags |= DS_HOLD_FLAG_DECRYPT;
 		}
 	}
 
 	error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
 	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
 
 		/* Can't recv a clone into an existing fs */
 		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = recv_begin_check_existing_impl(drba, ds, fromguid,
 		    featureflags);
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 	} else if (error == ENOENT) {
 		/* target fs does not exist; must be a full backup or clone */
 		char buf[ZFS_MAX_DATASET_NAME_LEN];
 		objset_t *os;
 
 		/* healing recv must be done "into" an existing snapshot */
 		if (drba->drba_cookie->drc_heal == B_TRUE)
 			return (SET_ERROR(ENOTSUP));
 
 		/*
 		 * If it's a non-clone incremental, we are missing the
 		 * target fs, so fail the recv.
 		 */
 		if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) ||
 		    drba->drba_origin))
 			return (SET_ERROR(ENOENT));
 
 		/*
 		 * If we're receiving a full send as a clone, and it doesn't
 		 * contain all the necessary free records and freeobject
 		 * records, reject it.
 		 */
 		if (fromguid == 0 && drba->drba_origin != NULL &&
 		    !(flags & DRR_FLAG_FREERECORDS))
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Open the parent of tofs.  Note that this hold is taken
 		 * without any hold flags, so it must always be dropped with
 		 * dsl_dataset_rele() and never with
 		 * dsl_dataset_rele_flags(ds, dsflags, ...).
 		 */
 		ASSERT3U(strlen(tofs), <, sizeof (buf));
 		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
 		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
 		if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
 		    drba->drba_origin == NULL) {
 			boolean_t will_encrypt;
 
 			/*
 			 * Check that we aren't breaking any encryption rules
 			 * and that we have all the parameters we need to
 			 * create an encrypted dataset if necessary. If we are
 			 * making an encrypted dataset the stream can't have
 			 * embedded data.
 			 */
 			error = dmu_objset_create_crypt_check(ds->ds_dir,
 			    drba->drba_dcp, &will_encrypt);
 			if (error != 0) {
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 
 			if (will_encrypt &&
 			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 		}
 
 		/*
 		 * Check filesystem and snapshot limits before receiving. We'll
 		 * recheck snapshot limits again at the end (we create the
 		 * filesystems and increment those counts during begin_sync).
 		 */
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 		    drba->drba_cred, drba->drba_proc);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
 		    drba->drba_cred, drba->drba_proc);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		/* can't recv below anything but filesystems (eg. no ZVOLs) */
 		error = dmu_objset_from_ds(ds, &os);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 		if (dmu_objset_type(os) != DMU_OST_ZFS) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
 		}
 
 		if (drba->drba_origin != NULL) {
 			dsl_dataset_t *origin;
 			error = dsl_dataset_hold_flags(dp, drba->drba_origin,
 			    dsflags, FTAG, &origin);
 			if (error != 0) {
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 			if (!origin->ds_is_snapshot) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
 			    fromguid != 0) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 
 			if (origin->ds_dir->dd_crypto_obj != 0 &&
 			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 
 			/*
 			 * If the origin is redacted we need to verify that this
 			 * send stream can safely be received on top of the
 			 * origin.
 			 */
 			if (dsl_dataset_feature_is_active(origin,
 			    SPA_FEATURE_REDACTED_DATASETS)) {
 				if (!redact_check(drba, origin)) {
 					dsl_dataset_rele_flags(origin, dsflags,
 					    FTAG);
 					dsl_dataset_rele(ds, FTAG);
 					return (SET_ERROR(EINVAL));
 				}
 			}
 
 			/*
 			 * NOTE(review): this checks the parent dataset (ds),
 			 * whereas the existing-filesystem path checks the
 			 * "from" snapshot; confirm whether origin was
 			 * intended here.
 			 */
 			error = recv_check_large_blocks(ds, featureflags);
 			if (error != 0) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 
 			dsl_dataset_rele_flags(origin, dsflags, FTAG);
 		}
 
 		dsl_dataset_rele(ds, FTAG);
 		error = 0;
 	}
 	return (error);
 }
 
 /*
  * Set up the dataset that a non-resuming receive will write into.
  * Presumably the sync-side counterpart of dmu_recv_begin_check() (same
  * argument type, and it relies on drc_fromsnapobj set there) -- confirm
  * against the caller that registers the pair.
  *
  * If the target filesystem can be held, a temporary clone named
  * recv_clone_name is created under it (or, for a healing receive, the
  * existing snapshot drc_tosnap is looked up).  Otherwise a new dataset is
  * created at tofs.  The resulting dataset is owned with dmu_recv_tag, has
  * resume state recorded when the receive is resumable, is marked
  * inconsistent, and is stored together with its objset in the receive cookie.
  */
 static void
 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	struct drr_begin *drrb = drc->drc_drrb;
 	const char *tofs = drc->drc_tofs;
 	uint64_t featureflags = drc->drc_featureflags;
 	dsl_dataset_t *ds, *newds;
 	objset_t *os;
 	uint64_t dsobj;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	int error;
 	uint64_t crflags = 0;
 	dsl_crypto_params_t dummy_dcp = { 0 };
 	dsl_crypto_params_t *dcp = drba->drba_dcp;
 
 	if (drrb->drr_flags & DRR_FLAG_CI_DATA)
 		crflags |= DS_FLAG_CI_DATASET;
 
 	if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
 		dsflags |= DS_HOLD_FLAG_DECRYPT;
 
 	/*
 	 * Raw, non-incremental recvs always use a dummy dcp with
 	 * the raw cmd set. Raw incremental recvs do not use a dcp
 	 * since the encryption parameters are already set in stone.
 	 */
 	if (dcp == NULL && drrb->drr_fromguid == 0 &&
 	    drba->drba_origin == NULL) {
 		ASSERT3P(dcp, ==, NULL);
 		dcp = &dummy_dcp;
 
 		if (featureflags & DMU_BACKUP_FEATURE_RAW)
 			dcp->cp_cmd = DCP_CMD_RAW_RECV;
 	}
 
 	error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
 	if (error == 0) {
 		/* Create temporary clone unless we're doing corrective recv */
 		dsl_dataset_t *snap = NULL;
 
 		/* snap stays NULL unless a "from" snapshot was recorded. */
 		if (drba->drba_cookie->drc_fromsnapobj != 0) {
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    drba->drba_cookie->drc_fromsnapobj, FTAG, &snap));
 			ASSERT3P(dcp, ==, NULL);
 		}
 		if (drc->drc_heal) {
 			/* When healing we want to use the provided snapshot */
 			VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap,
 			    &dsobj));
 		} else {
 			dsobj = dsl_dataset_create_sync(ds->ds_dir,
 			    recv_clone_name, snap, crflags, drba->drba_cred,
 			    dcp, tx);
 		}
 		if (drba->drba_cookie->drc_fromsnapobj != 0)
 			dsl_dataset_rele(snap, FTAG);
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 	} else {
 		/*
 		 * The hold on tofs failed (for any reason, not only ENOENT),
 		 * so create a new dataset named by the last component of
 		 * tofs, cloned from drba_origin when one was given.
 		 */
 		dsl_dir_t *dd;
 		const char *tail;
 		dsl_dataset_t *origin = NULL;
 
 		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
 
 		if (drba->drba_origin != NULL) {
 			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
 			    FTAG, &origin));
 			ASSERT3P(dcp, ==, NULL);
 		}
 
 		/* Create new dataset. */
 		dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1,
 		    origin, crflags, drba->drba_cred, dcp, tx);
 		if (origin != NULL)
 			dsl_dataset_rele(origin, FTAG);
 		dsl_dir_rele(dd, FTAG);
 		drc->drc_newfs = B_TRUE;
 	}
 	/* Take ownership of whichever dataset was selected above. */
 	VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag,
 	    &newds));
 	if (dsl_dataset_feature_is_active(newds,
 	    SPA_FEATURE_REDACTED_DATASETS)) {
 		/*
 		 * If the origin dataset is redacted, the child will be redacted
 		 * when we create it.  We clear the new dataset's
 		 * redaction info; if it should be redacted, we'll fill
 		 * in its information later.
 		 */
 		dsl_dataset_deactivate_feature(newds,
 		    SPA_FEATURE_REDACTED_DATASETS, tx);
 	}
 	VERIFY0(dmu_objset_from_ds(newds, &os));
 
 	/*
 	 * For a resumable receive, record the resume state as ZAP entries on
 	 * the new dataset: the stream's guids and name, the initial resume
 	 * object/offset/bytes values, and one marker per relevant stream
 	 * feature.
 	 */
 	if (drc->drc_resumable) {
 		dsl_dataset_zapify(newds, tx);
 		if (drrb->drr_fromguid != 0) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
 			    8, 1, &drrb->drr_fromguid, tx));
 		}
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
 		    8, 1, &drrb->drr_toguid, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
 		    1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
 		uint64_t one = 1;
 		uint64_t zero = 0;
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
 		    8, 1, &one, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
 		    8, 1, &zero, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
 		    8, 1, &zero, tx));
 		if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
 			    8, 1, &one, tx));
 		}
 		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
 			    8, 1, &one, tx));
 		}
 		if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
 			    8, 1, &one, tx));
 		}
 		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK,
 			    8, 1, &one, tx));
 		}
 
 		/* Save the redaction-bookmark snapshot list if one was sent. */
 		uint64_t *redact_snaps;
 		uint_t numredactsnaps;
 		if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_FROM_SNAPS, &redact_snaps,
 		    &numredactsnaps) == 0) {
 			VERIFY0(zap_add(mos, dsobj,
 			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS,
 			    sizeof (*redact_snaps), numredactsnaps,
 			    redact_snaps, tx));
 		}
 	}
 
 	/*
 	 * Usually the os->os_encrypted value is tied to the presence of a
 	 * DSL Crypto Key object in the dd. However, that will not be received
 	 * until dmu_recv_stream(), so we set the value manually for now.
 	 */
 	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 		os->os_encrypted = B_TRUE;
 		drba->drba_cookie->drc_raw = B_TRUE;
 	}
 
 	/* A redacted stream makes the new dataset redacted as well. */
 	if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
 		uint64_t *redact_snaps;
 		uint_t numredactsnaps;
 		VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps));
 		dsl_dataset_activate_redaction(newds, redact_snaps,
 		    numredactsnaps, tx);
 	}
 
 	if (featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) {
 		/*
 		 * The source has seen a large microzap at least once in its
 		 * life, so we activate the feature here to match. It's not
 		 * strictly necessary since a large microzap is usable without
 		 * the feature active, but if that object is sent on from here,
 		 * we need this info to know to add the stream feature.
 		 *
 		 * There may be no large microzap in the incoming stream, or
 		 * ever again, but this is a very niche feature and its very
 		 * difficult to spot a large microzap in the stream, so its
 		 * not worth the effort of trying harder to activate the
 		 * feature at first use.
 		 */
 		dsl_dataset_activate_feature(dsobj, SPA_FEATURE_LARGE_MICROZAP,
 		    (void *)B_TRUE, tx);
 	}
 
 	/* Flag the dataset as inconsistent while the receive is underway. */
 	dmu_buf_will_dirty(newds->ds_dbuf, tx);
 	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	/*
 	 * Activate longname feature if received
 	 */
 	if (featureflags & DMU_BACKUP_FEATURE_LONGNAME &&
 	    !dsl_dataset_feature_is_active(newds, SPA_FEATURE_LONGNAME)) {
 		dsl_dataset_activate_feature(newds->ds_object,
 		    SPA_FEATURE_LONGNAME, (void *)B_TRUE, tx);
 		newds->ds_feature[SPA_FEATURE_LONGNAME] = (void *)B_TRUE;
 	}
 
 	/*
 	 * If we actually created a non-clone, we need to create the objset
 	 * in our new dataset. If this is a raw send we postpone this until
 	 * dmu_recv_stream() so that we can allocate the metadnode with the
 	 * properties from the DRR_BEGIN payload.
 	 */
 	rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
 	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
 	    (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
 	    !drc->drc_heal) {
 		(void) dmu_objset_create_impl(dp->dp_spa,
 		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
 	}
 	rrw_exit(&newds->ds_bp_rwlock, FTAG);
 
 	/* Hand the owned dataset and its objset back through the cookie. */
 	drba->drba_cookie->drc_ds = newds;
 	drba->drba_cookie->drc_os = os;
 
 	spa_history_log_internal_ds(newds, "receive", tx, " ");
 }
 
 /*
  * Validate a resuming receive before any state is changed.  arg is the
  * dmu_recv_begin_arg_t describing the receive and tx identifies the pool.
  *
  * The partially received dataset is looked up first as "<tofs>/<recv clone>"
  * and, failing that, as tofs itself.  It must be inconsistent, zapified,
  * unowned, free of snapshots of its own, and its saved resume state (toguid,
  * fromguid, redaction list, large-block use) must match the stream.
  *
  * Returns 0 if the resume may proceed, otherwise an error code.  On success
  * drc_fromsnapobj may be set in the cookie.  The dataset hold taken here is
  * always released before returning.
  */
 static int
 dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drc->drc_drrb;
 	int error;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	dsl_dataset_t *ds;
 	const char *tofs = drc->drc_tofs;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 	ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING);
 
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM ||
 	    drrb->drr_type >= DMU_OST_NUMTYPES)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * This is mostly a sanity check since we should have already done these
 	 * checks during a previous attempt to receive the data.
 	 */
 	error = recv_begin_check_feature_flags_impl(drc->drc_featureflags,
 	    dp->dp_spa);
 	if (error != 0)
 		return (error);
 
 	/* 6 extra bytes for /%recv */
 	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
 
 	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
 	    tofs, recv_clone_name);
 
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
 		/* raw receives require spill block allocation flag */
 		if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK))
 			return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
 	} else {
 		dsflags |= DS_HOLD_FLAG_DECRYPT;
 	}
 
 	boolean_t recvexist = B_TRUE;
 	if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
 		/* %recv does not exist; continue in tofs */
 		recvexist = B_FALSE;
 		error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Resume of full/newfs recv on existing dataset should be done with
 	 * force flag
 	 */
 	if (recvexist && drrb->drr_fromguid == 0 && !drc->drc_force) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(ZFS_ERR_RESUME_EXISTS));
 	}
 
 	/* check that ds is marked inconsistent */
 	if (!DS_IS_INCONSISTENT(ds)) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* check that there is resuming data, and that the toguid matches */
 	if (!dsl_dataset_is_zapified(ds)) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 	uint64_t val;
 	error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
 	if (error != 0 || drrb->drr_toguid != val) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Check if the receive is still running.  If so, it will be owned.
 	 * Note that nothing else can own the dataset (e.g. after the receive
 	 * fails) because it will be marked inconsistent.
 	 */
 	if (dsl_dataset_has_owner(ds)) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/* There should not be any snapshots of this fs yet. */
 	if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Note: resume point will be checked when we process the first WRITE
 	 * record.
 	 */
 
 	/*
 	 * check that the origin matches.  The lookup result is deliberately
 	 * ignored: when no fromguid was saved, val stays 0 and must then
 	 * match a stream whose drr_fromguid is 0.
 	 */
 	val = 0;
 	(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
 	if (drrb->drr_fromguid != val) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (ds->ds_prev != NULL && drrb->drr_fromguid != 0)
 		drc->drc_fromsnapobj = ds->ds_prev->ds_object;
 
 	/*
 	 * If we're resuming, and the send is redacted, then the original send
 	 * must have been redacted, and must have been redacted with respect to
 	 * the same snapshots.
 	 */
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) {
 		uint64_t num_ds_redact_snaps;
 		uint64_t *ds_redact_snaps;
 
 		uint_t num_stream_redact_snaps;
 		uint64_t *stream_redact_snaps;
 
 		if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_SNAPS, &stream_redact_snaps,
 		    &num_stream_redact_snaps) != 0) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if (!dsl_dataset_get_uint64_array_feature(ds,
 		    SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps,
 		    &ds_redact_snaps)) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		/*
 		 * The two lists must have the same length.  Checking this
 		 * first also guarantees that the loop below stays within the
 		 * bounds of the stream's array, which has
 		 * num_stream_redact_snaps entries.
 		 */
 		if (num_stream_redact_snaps != num_ds_redact_snaps) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		/* Every snapshot named by the stream must be known to ds. */
 		for (uint_t i = 0; i < num_stream_redact_snaps; i++) {
 			if (!redact_snaps_contains(ds_redact_snaps,
 			    num_ds_redact_snaps, stream_redact_snaps[i])) {
 				dsl_dataset_rele_flags(ds, dsflags, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 		}
 	}
 
 	error = recv_check_large_blocks(ds, drc->drc_featureflags);
 	if (error != 0) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (error);
 	}
 
 	dsl_dataset_rele_flags(ds, dsflags, FTAG);
 	return (0);
 }
 
 /*
  * Take ownership of the dataset that a resumed receive continues into and
  * publish it through the receive cookie.  The temporary "<tofs>/<recv clone>"
  * dataset is tried first; if it cannot be owned, tofs itself is used and the
  * receive is flagged as a new filesystem (drc_newfs).
  */
 static void
 dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	const char *tofs = drba->drba_cookie->drc_tofs;
 	uint64_t featureflags = drba->drba_cookie->drc_featureflags;
 	boolean_t raw_stream = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	dsl_dataset_t *ds;
 	int error;
 	/* 6 extra bytes for /%recv */
 	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
 
 	(void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs,
 	    recv_clone_name);
 
 	/* Only non-raw receives ask for the decrypt hold flag. */
 	if (raw_stream)
 		drba->drba_cookie->drc_raw = B_TRUE;
 	else
 		dsflags |= DS_HOLD_FLAG_DECRYPT;
 
 	error = dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag,
 	    &ds);
 	if (error != 0) {
 		/* %recv does not exist; continue in tofs */
 		VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag,
 		    &ds));
 		drba->drba_cookie->drc_newfs = B_TRUE;
 	}
 
 	/*
 	 * The dataset is expected to be inconsistent, and to have a block
 	 * pointer that is not a hole unless this is a raw receive.
 	 */
 	ASSERT(DS_IS_INCONSISTENT(ds));
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) ||
 	    drba->drba_cookie->drc_raw);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	/* Publish the owned dataset and its objset through the cookie. */
 	drba->drba_cookie->drc_ds = ds;
 	VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os));
 	drba->drba_cookie->drc_should_save = B_TRUE;
 
 	spa_history_log_internal_ds(ds, "resume receive", tx, " ");
 }
 
 /*
  * Begin receiving a send stream into `tofs`.
  *
  * Resets and fills in the caller-supplied cookie `drc` from the BEGIN
  * record (detecting a byteswapped stream from the record's magic and
  * seeding the running fletcher-4 checksum), reads and unpacks the optional
  * BEGIN nvlist payload, and then runs the begin sync task: the resume
  * variant when the stream carries DMU_BACKUP_FEATURE_RESUMING, the normal
  * one otherwise.
  *
  * Returns 0 on success.  Otherwise returns EINVAL for an unrecognized
  * magic number, E2BIG for an oversized payload, or the error from
  * reading/unpacking the payload, creating the crypto params, or the sync
  * task.  On failures past the payload read, drc->drc_next_rrd and (once
  * unpacked) drc->drc_begin_nvl are freed before returning.
  *
  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
 dmu_recv_begin(const char *tofs, const char *tosnap,
     dmu_replay_record_t *drr_begin, boolean_t force, boolean_t heal,
     boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args,
     const char *origin, dmu_recv_cookie_t *drc, zfs_file_t *fp,
     offset_t *voffp)
 {
 	dmu_recv_begin_arg_t drba = { 0 };
 	int err = 0;
 
 	memset(drc, 0, sizeof (dmu_recv_cookie_t));
 	drc->drc_drr_begin = drr_begin;
 	drc->drc_drrb = &drr_begin->drr_u.drr_begin;
 	drc->drc_tosnap = tosnap;
 	drc->drc_tofs = tofs;
 	drc->drc_force = force;
 	drc->drc_heal = heal;
 	drc->drc_resumable = resumable;
 	drc->drc_cred = CRED();
 	drc->drc_proc = curproc;
 	drc->drc_clone = (origin != NULL);
 
 	/*
 	 * The magic number tells us the stream's byte order.  The checksum
 	 * is accumulated over the record as it appeared in the stream, so
 	 * it is computed before the record is byteswapped in place.
 	 */
 	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
 		drc->drc_byteswap = B_TRUE;
 		(void) fletcher_4_incremental_byteswap(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 		byteswap_record(drr_begin);
 	} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
 		(void) fletcher_4_incremental_native(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 	} else {
 		return (SET_ERROR(EINVAL));
 	}
 
 	drc->drc_fp = fp;
 	drc->drc_voff = *voffp;
 	drc->drc_featureflags =
 	    DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
 	uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
 
 	/*
 	 * Since OpenZFS 2.0.0, we have enforced a 64MB limit in userspace
 	 * configurable via ZFS_SENDRECV_MAX_NVLIST. We enforce 256MB as a hard
 	 * upper limit. Systems with less than 1GB of RAM will see a lower
 	 * limit from `arc_all_memory() / 4`.
 	 */
 	if (payloadlen > (MIN((1U << 28), arc_all_memory() / 4)))
 		return (SET_ERROR(E2BIG));
 
 	if (payloadlen != 0) {
 		void *payload = vmem_alloc(payloadlen, KM_SLEEP);
 		/*
 		 * For compatibility with recursive send streams, we don't do
 		 * this here if the stream could be part of a package. Instead,
 		 * we'll do it in dmu_recv_stream. If we pull the next header
 		 * too early, and it's the END record, we break the `recv_skip`
 		 * logic.
 		 */
 
 		err = receive_read_payload_and_next_header(drc, payloadlen,
 		    payload);
 		if (err != 0) {
 			vmem_free(payload, payloadlen);
 			return (err);
 		}
 		err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl,
 		    KM_SLEEP);
 		/* The packed buffer is no longer needed either way. */
 		vmem_free(payload, payloadlen);
 		if (err != 0) {
 			kmem_free(drc->drc_next_rrd,
 			    sizeof (*drc->drc_next_rrd));
 			return (err);
 		}
 	}
 
 	if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
 		drc->drc_spill = B_TRUE;
 
 	drba.drba_origin = origin;
 	drba.drba_cookie = drc;
 	drba.drba_cred = CRED();
 	drba.drba_proc = curproc;
 
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
 		err = dsl_sync_task(tofs,
 		    dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
 		    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
 	} else {
 		/*
 		 * For non-raw, non-incremental, non-resuming receives the
 		 * user can specify encryption parameters on the command line
 		 * with "zfs recv -o". For these receives we create a dcp and
 		 * pass it to the sync task. Creating the dcp will implicitly
 		 * remove the encryption params from the localprops nvlist,
 		 * which avoids errors when trying to set these normally
 		 * read-only properties. Any other kind of receive that
 		 * attempts to set these properties will fail as a result.
 		 */
 		if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
 		    DMU_BACKUP_FEATURE_RAW) == 0 &&
 		    origin == NULL && drc->drc_drrb->drr_fromguid == 0) {
 			err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
 			    localprops, hidden_args, &drba.drba_dcp);
 		}
 
 		if (err == 0) {
 			err = dsl_sync_task(tofs,
 			    dmu_recv_begin_check, dmu_recv_begin_sync,
 			    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
 			dsl_crypto_params_free(drba.drba_dcp, !!err);
 		}
 	}
 
 	/* Undo the payload-related allocations if the sync task failed. */
 	if (err != 0) {
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 		nvlist_free(drc->drc_begin_nvl);
 	}
 	return (err);
 }
 
 /*
  * Holds the data needed by the corrective recv read-back callback
  * (corrective_read_done()).  Allocated in do_corrective_recv() and freed
  * by the callback.
  */
 typedef struct cr_cb_data {
 	/* logical size of the healed write record (drr_logical_size) */
 	uint64_t size;
 	/* bookmark of the healed block, handed to spa_remove_error() */
 	zbookmark_phys_t zb;
 	/* pool the healed block belongs to */
 	spa_t *spa;
 } cr_cb_data_t;
 
 /*
  * Completion callback for the verification read issued after a corrective
  * rewrite.  A clean read drops the block's entry via spa_remove_error().
  * The callback data and the read buffer are released in every case.
  */
 static void
 corrective_read_done(zio_t *zio)
 {
 	cr_cb_data_t *cb = zio->io_private;
 
 	if (zio->io_error == 0) {
 		/* Corruption corrected; update error log if needed */
 		spa_remove_error(cb->spa, &cb->zb,
 		    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 	}
 
 	kmem_free(cb, sizeof (*cb));
 	abd_free(zio->io_abd);
 }
 
 /*
  * zio_rewrite the data pointed to by bp with the data from the rrd's abd.
  *
  * For non-raw streams the record's data is first transformed to match the
  * on-disk representation of bp: decompressed if the stream data is
  * compressed, recompressed with bp's compression, and re-encrypted with
  * bp's salt/iv/mac if bp is encrypted.  The resulting buffer's checksum
  * must equal bp's checksum before anything is written.  After a successful
  * rewrite the block is read back to confirm the heal.
  *
  * Each transformation replaces the record's buffer; rrd->abd is updated at
  * every replacement so that it always refers to a live buffer, including
  * on the error returns.
  *
  * Returns 0 on success.  A checksum mismatch yields ECKSUM, and rewrite or
  * read-back failures yield their I/O error; these three are reported as 0
  * instead when zfs_recv_best_effort_corrective is non-zero.  A dnode hold
  * failure or a decompression/encryption error is returned as is, and
  * failure to hold the dataset or look up its key yields EACCES.
  */
 static int
 do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
     struct receive_record_arg *rrd, blkptr_t *bp)
 {
 	int err;
 	zio_t *io;
 	zbookmark_phys_t zb;
 	dnode_t *dn;
 	abd_t *abd = rrd->abd;
 	zio_cksum_t bp_cksum = bp->blk_cksum;
 	zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY |
 	    ZIO_FLAG_CANFAIL;
 
 	if (rwa->raw)
 		flags |= ZIO_FLAG_RAW;
 
 	/* The dnode is only needed to map the offset to a block id. */
 	err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 	SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0,
 	    dbuf_whichblock(dn, 0, drrw->drr_offset));
 	dnode_rele(dn, FTAG);
 
 	if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) {
 		/* Decompress the stream data */
 		abd_t *dabd = abd_alloc_linear(
 		    drrw->drr_logical_size, B_FALSE);
 		err = zio_decompress_data(drrw->drr_compressiontype,
 		    abd, dabd, abd_get_size(abd),
 		    abd_get_size(dabd), NULL);
 
 		if (err != 0) {
 			abd_free(dabd);
 			return (err);
 		}
 		/*
 		 * Swap in the newly decompressed data into the abd, and
 		 * keep the record pointing at the live buffer so a later
 		 * error return does not leave rrd->abd dangling.
 		 */
 		abd_free(abd);
 		abd = dabd;
 		rrd->abd = abd;
 	}
 
 	if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 		/* Recompress the data */
 		abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
 		    B_FALSE);
 		uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
 		    abd, &cabd, abd_get_size(abd), BP_GET_PSIZE(bp),
 		    rwa->os->os_complevel);
 		/* Zero the tail so the buffer is exactly PSIZE bytes. */
 		abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
 		/* Swap in newly compressed data into the abd */
 		abd_free(abd);
 		abd = cabd;
 		rrd->abd = abd;
 		flags |= ZIO_FLAG_RAW_COMPRESS;
 	}
 
 	/*
 	 * The stream is not encrypted but the data on-disk is.
 	 * We need to re-encrypt the buf using the same
 	 * encryption type, salt, iv, and mac that was used to encrypt
 	 * the block previously.
 	 */
 	if (!rwa->raw && BP_USES_CRYPT(bp)) {
 		dsl_dataset_t *ds;
 		dsl_crypto_key_t *dck = NULL;
 		uint8_t salt[ZIO_DATA_SALT_LEN];
 		uint8_t iv[ZIO_DATA_IV_LEN];
 		uint8_t mac[ZIO_DATA_MAC_LEN];
 		boolean_t no_crypt = B_FALSE;
 		dsl_pool_t *dp = dmu_objset_pool(rwa->os);
 		abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE);
 
 		zio_crypt_decode_params_bp(bp, salt, iv);
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		dsl_pool_config_enter(dp, FTAG);
 		err = dsl_dataset_hold_flags(dp, rwa->tofs,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 		if (err != 0) {
 			dsl_pool_config_exit(dp, FTAG);
 			abd_free(eabd);
 			return (SET_ERROR(EACCES));
 		}
 
 		/* Look up the key from the spa's keystore */
 		err = spa_keystore_lookup_key(rwa->os->os_spa,
 		    zb.zb_objset, FTAG, &dck);
 		if (err != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT,
 			    FTAG);
 			dsl_pool_config_exit(dp, FTAG);
 			abd_free(eabd);
 			return (SET_ERROR(EACCES));
 		}
 
 		err = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
 		    BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv,
 		    mac, abd_get_size(abd), abd, eabd, &no_crypt);
 
 		spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG);
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 
 		ASSERT0(no_crypt);
 		if (err != 0) {
 			abd_free(eabd);
 			return (err);
 		}
 		/* Swap in the newly encrypted data into the abd */
 		abd_free(abd);
 		abd = eabd;
 		rrd->abd = abd;
 
 		/*
 		 * We want to prevent zio_rewrite() from trying to
 		 * encrypt the data again
 		 */
 		flags |= ZIO_FLAG_RAW_ENCRYPT;
 	}
 
 	/* rrd->abd == abd here: it was updated at every swap above. */
 	io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
 	    abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
 	    &zb);
 
 	ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) ||
 	    abd_get_size(abd) == BP_GET_PSIZE(bp));
 
 	/* compute new bp checksum value and make sure it matches the old one */
 	zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd));
 	if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) {
 		/* The zio was never issued, so tear it down directly. */
 		zio_destroy(io);
 		if (zfs_recv_best_effort_corrective != 0)
 			return (0);
 		return (SET_ERROR(ECKSUM));
 	}
 
 	/* Correct the corruption in place */
 	err = zio_wait(io);
 	if (err == 0) {
 		cr_cb_data_t *cb_data =
 		    kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP);
 		cb_data->spa = rwa->os->os_spa;
 		cb_data->size = drrw->drr_logical_size;
 		cb_data->zb = zb;
 		/* Test if healing worked by re-reading the bp */
 		err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp,
 		    abd_alloc_for_io(drrw->drr_logical_size, B_FALSE),
 		    drrw->drr_logical_size, corrective_read_done,
 		    cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL));
 	}
 	if (err != 0 && zfs_recv_best_effort_corrective != 0)
 		err = 0;
 
 	return (err);
 }
 
 /*
  * Read exactly `len` bytes of the stream from drc->drc_fp into `buf`,
  * looping over short reads.  drc_voff advances by whatever each read
  * consumed; drc_bytes_read advances by `len` only on full success.
  *
  * Returns 0 on success, the error from zfs_file_read(), or
  * ZFS_ERR_STREAM_TRUNCATED when a read succeeds but makes no progress.
  */
 static int
 receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
 {
 	int done = 0;
 
 	/*
 	 * The code doesn't rely on this (lengths being multiples of 8).  See
 	 * comment in dump_bytes.
 	 */
 	ASSERT(len % 8 == 0 ||
 	    (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
 
 	while (done < len) {
 		int want = len - done;
 		ssize_t resid = want;
 		int err;
 
 		err = zfs_file_read(drc->drc_fp, (char *)buf + done, want,
 		    &resid);
 
 		/*
 		 * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates
 		 * that the receive was interrupted and can
 		 * potentially be resumed.
 		 */
 		if (err == 0 && resid == want)
 			err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED);
 
 		/* Account for the bytes this read actually delivered. */
 		drc->drc_voff += want - resid;
 		done = len - resid;
 
 		if (err != 0)
 			return (err);
 	}
 
 	drc->drc_bytes_read += len;
 
 	ASSERT3U(done, ==, len);
 	return (0);
 }
 
 /*
  * Derive the number of block pointers implied by an object's bonus type
  * and bonus size: always 1 for DMU_OT_SA, otherwise 1 plus however many
  * whole block pointers fit in the part of DN_OLD_MAX_BONUSLEN that the
  * bonus does not use.
  */
 static inline uint8_t
 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
 {
 	if (bonus_type == DMU_OT_SA)
 		return (1);
 
 	/* Clamp so the subtraction below cannot underflow. */
 	uint64_t used = MIN(DN_OLD_MAX_BONUSLEN, bonus_size);
 
 	return (1 + ((DN_OLD_MAX_BONUSLEN - used) >> SPA_BLKPTRSHIFT));
 }
 
 /*
  * Record (object, offset, bytes read so far) as the resume point in the
  * dataset's per-txg resume slots for tx's txg.  Does nothing unless the
  * receive is resumable.
  */
 static void
 save_resume_state(struct receive_writer_arg *rwa,
     uint64_t object, uint64_t offset, dmu_tx_t *tx)
 {
 	int slot = dmu_tx_get_txg(tx) & TXG_MASK;
 
 	if (!rwa->resumable)
 		return;
 
 	dsl_dataset_t *ds = rwa->os->os_dsl_dataset;
 
 	/*
 	 * We use ds_resume_bytes[] != 0 to indicate that we need to
 	 * update this on disk, so it must not be 0.
 	 */
 	ASSERT(rwa->bytes_read != 0);
 
 	/*
 	 * We only resume from write records, which have a valid
 	 * (non-meta-dnode) object number.
 	 */
 	ASSERT(object != 0);
 
 	/*
 	 * For resuming to work correctly, we must receive records in order,
 	 * sorted by object,offset.  This is checked by the callers, but
 	 * assert it here for good measure.
 	 */
 	ASSERT3U(object, >=, ds->ds_resume_object[slot]);
 	ASSERT(object != ds->ds_resume_object[slot] ||
 	    offset >= ds->ds_resume_offset[slot]);
 	ASSERT3U(rwa->bytes_read, >=, ds->ds_resume_bytes[slot]);
 
 	ds->ds_resume_object[slot] = object;
 	ds->ds_resume_offset[slot] = offset;
 	ds->ds_resume_bytes[slot] = rwa->bytes_read;
 }
 
 /*
  * Compare the generation number found in an existing object's bonus buffer
  * with the one found in the incoming bonus data.
  *
  * On success returns 0 and sets *samegenp to whether the two generations
  * are equal.  Returns the error from dmu_bonus_hold() or
  * dmu_get_file_info() otherwise, leaving *samegenp untouched.
  */
 static int
 receive_object_is_same_generation(objset_t *os, uint64_t object,
     dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type,
     const void *new_bonus, boolean_t *samegenp)
 {
 	dmu_buf_t *bonus_db;
 	zfs_file_info_t info;
 	int err;
 
 	/* Generation currently stored in the object's bonus buffer. */
 	err = dmu_bonus_hold(os, object, FTAG, &bonus_db);
 	if (err != 0)
 		return (err);
 
 	err = dmu_get_file_info(os, old_bonus_type, bonus_db->db_data,
 	    &info);
 	dmu_buf_rele(bonus_db, FTAG);
 	if (err != 0)
 		return (err);
 
 	uint64_t existing_gen = info.zfi_generation;
 
 	/* Generation carried by the incoming bonus data. */
 	err = dmu_get_file_info(os, new_bonus_type, new_bonus, &info);
 	if (err != 0)
 		return (err);
 
 	*samegenp = (existing_gen == info.zfi_generation);
 	return (0);
 }
 
 /*
  * Prepare an already-allocated object to be overwritten by an incoming
  * DRR_OBJECT record.
  *
  * Compares the record (drro) against the object's current info (doi) and
  * decides whether the existing contents must be freed, whether the object
  * must be freed outright and claimed anew, or whether the existing block
  * size should be retained.
  *
  * Outputs:
  *   *object_to_hold - drro->drr_object, or DMU_NEW_OBJECT when the object
  *                     was freed here and has to be claimed again.
  *   *new_blksz      - only written when the existing block size is kept
  *                     (same-generation file whose block size grew).
  *
  * Returns 0 on success; every failure is reported as EINVAL.
  */
 static int
 receive_handle_existing_object(const struct receive_writer_arg *rwa,
     const struct drr_object *drro, const dmu_object_info_t *doi,
     const void *bonus_data,
     uint64_t *object_to_hold, uint32_t *new_blksz)
 {
 	uint32_t indblksz = drro->drr_indblkshift ?
 	    1ULL << drro->drr_indblkshift : 0;
 	int nblkptr = deduce_nblkptr(drro->drr_bonustype,
 	    drro->drr_bonuslen);
 	/* A zero drr_dn_slots is treated as DNODE_MIN_SLOTS. */
 	uint8_t dn_slots = drro->drr_dn_slots != 0 ?
 	    drro->drr_dn_slots : DNODE_MIN_SLOTS;
 	boolean_t do_free_range = B_FALSE;
 	int err;
 
 	*object_to_hold = drro->drr_object;
 
 	/* nblkptr should be bounded by the bonus size and type */
 	if (rwa->raw && nblkptr != drro->drr_nblkptr)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * After the previous send stream, the sending system may
 	 * have freed this object, and then happened to re-allocate
 	 * this object number in a later txg. In this case, we are
 	 * receiving a different logical file, and the block size may
 	 * appear to be different.  i.e. we may have a different
 	 * block size for this object than what the send stream says.
 	 * In this case we need to remove the object's contents,
 	 * so that its structure can be changed and then its contents
 	 * entirely replaced by subsequent WRITE records.
 	 *
 	 * If this is a -L (--large-block) incremental stream, and
 	 * the previous stream was not -L, the block size may appear
 	 * to increase.  i.e. we may have a smaller block size for
 	 * this object than what the send stream says.  In this case
 	 * we need to keep the object's contents and block size
 	 * intact, so that we don't lose parts of the object's
 	 * contents that are not changed by this incremental send
 	 * stream.
 	 *
 	 * We can distinguish between the two above cases by using
 	 * the ZPL's generation number (see
 	 * receive_object_is_same_generation()).  However, we only
 	 * want to rely on the generation number when absolutely
 	 * necessary, because with raw receives, the generation is
 	 * encrypted.  We also want to minimize dependence on the
 	 * ZPL, so that other types of datasets can also be received
 	 * (e.g. ZVOLs, although note that ZVOLS currently do not
 	 * reallocate their objects or change their structure).
 	 * Therefore, we check a number of different cases where we
 	 * know it is safe to discard the object's contents, before
 	 * using the ZPL's generation number to make the above
 	 * distinction.
 	 */
 	if (drro->drr_blksz != doi->doi_data_block_size) {
 		if (rwa->raw) {
 			/*
 			 * RAW streams always have large blocks, so
 			 * we are sure that the data is not needed
 			 * due to changing --large-block to be on.
 			 * Which is fortunate since the bonus buffer
 			 * (which contains the ZPL generation) is
 			 * encrypted, and the key might not be
 			 * loaded.
 			 */
 			do_free_range = B_TRUE;
 		} else if (rwa->full) {
 			/*
 			 * This is a full send stream, so it always
 			 * replaces what we have.  Even if the
 			 * generation numbers happen to match, this
 			 * can not actually be the same logical file.
 			 * This is relevant when receiving a full
 			 * send as a clone.
 			 */
 			do_free_range = B_TRUE;
 		} else if (drro->drr_type !=
 		    DMU_OT_PLAIN_FILE_CONTENTS ||
 		    doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) {
 			/*
 			 * PLAIN_FILE_CONTENTS are the only type of
 			 * objects that have ever been stored with
 			 * large blocks, so we don't need the special
 			 * logic below.  ZAP blocks can shrink (when
 			 * there's only one block), so we don't want
 			 * to hit the error below about block size
 			 * only increasing.
 			 */
 			do_free_range = B_TRUE;
 		} else if (doi->doi_max_offset <=
 		    doi->doi_data_block_size) {
 			/*
 			 * There is only one block.  We can free it,
 			 * because its contents will be replaced by a
 			 * WRITE record.  This can not be the no-L ->
 			 * -L case, because the no-L case would have
 			 * resulted in multiple blocks.  If we
 			 * supported -L -> no-L, it would not be safe
 			 * to free the file's contents.  Fortunately,
 			 * that is not allowed (see
 			 * recv_check_large_blocks()).
 			 */
 			do_free_range = B_TRUE;
 		} else {
 			boolean_t is_same_gen;
 			err = receive_object_is_same_generation(rwa->os,
 			    drro->drr_object, doi->doi_bonus_type,
 			    drro->drr_bonustype, bonus_data, &is_same_gen);
 			if (err != 0)
 				return (SET_ERROR(EINVAL));
 
 			if (is_same_gen) {
 				/*
 				 * This is the same logical file, and
 				 * the block size must be increasing.
 				 * It could only decrease if
 				 * --large-block was changed to be
 				 * off, which is checked in
 				 * recv_check_large_blocks().
 				 */
 				if (drro->drr_blksz <=
 				    doi->doi_data_block_size)
 					return (SET_ERROR(EINVAL));
 				/*
 				 * We keep the existing blocksize and
 				 * contents.
 				 */
 				*new_blksz =
 				    doi->doi_data_block_size;
 			} else {
 				do_free_range = B_TRUE;
 			}
 		}
 	}
 
 	/* nblkptr can only decrease if the object was reallocated */
 	if (nblkptr < doi->doi_nblkptr)
 		do_free_range = B_TRUE;
 
 	/* number of slots can only change on reallocation */
 	if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT)
 		do_free_range = B_TRUE;
 
 	/*
 	 * For raw sends we also check a few other fields to
 	 * ensure we are preserving the objset structure exactly
 	 * as it was on the receive side:
 	 *     - A changed indirect block size
 	 *     - A smaller nlevels
 	 */
 	if (rwa->raw) {
 		if (indblksz != doi->doi_metadata_block_size)
 			do_free_range = B_TRUE;
 		if (drro->drr_nlevels < doi->doi_indirection)
 			do_free_range = B_TRUE;
 	}
 
 	/* Discard the whole object's contents if any check above asked to. */
 	if (do_free_range) {
 		err = dmu_free_long_range(rwa->os, drro->drr_object,
 		    0, DMU_OBJECT_END);
 		if (err != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The dmu does not currently support decreasing nlevels or changing
 	 * indirect block size if there is already one, same as changing the
 	 * number of dnode slots on an object.  For non-raw sends this
 	 * does not matter and the new object can just use the previous one's
 	 * parameters.  For raw sends, however, the structure of the received
 	 * dnode (including indirects and dnode slots) must match that of the
 	 * send side.  Therefore, instead of using dmu_object_reclaim(), we
 	 * must free the object completely and call dmu_object_claim_dnsize()
 	 * instead.
 	 */
 	if ((rwa->raw && ((doi->doi_indirection > 1 &&
 	    indblksz != doi->doi_metadata_block_size) ||
 	    drro->drr_nlevels < doi->doi_indirection)) ||
 	    dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
 		err = dmu_free_long_object(rwa->os, drro->drr_object);
 		if (err != 0)
 			return (SET_ERROR(EINVAL));
 
 		txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 		*object_to_hold = DMU_NEW_OBJECT;
 	}
 
 	/*
 	 * For raw receives, free everything beyond the new incoming
 	 * maxblkid. Normally this would be done with a DRR_FREE
 	 * record that would come after this DRR_OBJECT record is
 	 * processed. However, for raw receives we manually set the
 	 * maxblkid from the drr_maxblkid and so we must first free
 	 * everything above that blkid to ensure the DMU is always
 	 * consistent with itself. We will never free the first block
 	 * of the object here because a maxblkid of 0 could indicate
 	 * an object with a single block or one with no blocks. This
 	 * free may be skipped when dmu_free_long_range() was called
 	 * above since it covers the entire object's contents.
 	 */
 	if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) {
 		err = dmu_free_long_range(rwa->os, drro->drr_object,
 		    (drro->drr_maxblkid + 1) * doi->doi_data_block_size,
 		    DMU_OBJECT_END);
 		if (err != 0)
 			return (SET_ERROR(EINVAL));
 	}
 	return (0);
 }
 
 /*
  * Apply a DRR_OBJECT record: validate it, then allocate (claim) or
  * reconfigure (reclaim) the object it describes, set its checksum and
  * compression, and store the bonus payload.
  *
  * `data` is the record's bonus payload; when it is NULL no bonus data is
  * written.
  *
  * Returns 0 on success.  A malformed record, an unexpected
  * dmu_object_info() error, or a failed claim/reclaim/spill removal yields
  * EINVAL.  Errors from freeing objects in overlapped slots and from
  * dmu_tx_assign() are returned unchanged.
  */
 noinline static int
 receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
     void *data)
 {
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
 	int err;
 	uint32_t new_blksz = drro->drr_blksz;
 	/* A zero drr_dn_slots is treated as DNODE_MIN_SLOTS. */
 	uint8_t dn_slots = drro->drr_dn_slots != 0 ?
 	    drro->drr_dn_slots : DNODE_MIN_SLOTS;
 
 	/* Reject records whose fields are out of range for this pool. */
 	if (drro->drr_type == DMU_OT_NONE ||
 	    !DMU_OT_IS_VALID(drro->drr_type) ||
 	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
 	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
 	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
 	    drro->drr_bonuslen >
 	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
 	    dn_slots >
 	    (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (rwa->raw) {
 		/*
 		 * We should have received a DRR_OBJECT_RANGE record
 		 * containing this block and stored it in rwa.
 		 */
 		if (drro->drr_object < rwa->or_firstobj ||
 		    drro->drr_object >= rwa->or_firstobj + rwa->or_numslots ||
 		    drro->drr_raw_bonuslen < drro->drr_bonuslen ||
 		    drro->drr_indblkshift > SPA_MAXBLOCKSHIFT ||
 		    drro->drr_nlevels > DN_MAX_LEVELS ||
 		    drro->drr_nblkptr > DN_MAX_NBLKPTR ||
 		    DN_SLOTS_TO_BONUSLEN(dn_slots) <
 		    drro->drr_raw_bonuslen)
 			return (SET_ERROR(EINVAL));
 	} else {
 		/*
 		 * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN
 		 * record indicates this by setting DRR_FLAG_SPILL_BLOCK.
 		 */
 		if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) ||
 		    (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) {
 			return (SET_ERROR(EINVAL));
 		}
 
 		/* These fields must be zero in a non-raw record. */
 		if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 ||
 		    drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) {
 			return (SET_ERROR(EINVAL));
 		}
 	}
 
 	err = dmu_object_info(rwa->os, drro->drr_object, &doi);
 
 	/* ENOENT and EEXIST are handled below; anything else is fatal. */
 	if (err != 0 && err != ENOENT && err != EEXIST)
 		return (SET_ERROR(EINVAL));
 
 	if (drro->drr_object > rwa->max_object)
 		rwa->max_object = drro->drr_object;
 
 	/*
 	 * If we are losing blkptrs or changing the block size this must
 	 * be a new file instance.  We must clear out the previous file
 	 * contents before we can change this type of metadata in the dnode.
 	 * Raw receives will also check that the indirect structure of the
 	 * dnode hasn't changed.
 	 */
 	uint64_t object_to_hold;
 	if (err == 0) {
 		err = receive_handle_existing_object(rwa, drro, &doi, data,
 		    &object_to_hold, &new_blksz);
 		if (err != 0)
 			return (err);
 	} else if (err == EEXIST) {
 		/*
 		 * The object requested is currently an interior slot of a
 		 * multi-slot dnode. This will be resolved when the next txg
 		 * is synced out, since the send stream will have told us
 		 * to free this slot when we freed the associated dnode
 		 * earlier in the stream.
 		 */
 		txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 
 		if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT)
 			return (SET_ERROR(EINVAL));
 
 		/* object was freed and we are about to allocate a new one */
 		object_to_hold = DMU_NEW_OBJECT;
 	} else {
 		/*
 		 * If the only record in this range so far was DRR_FREEOBJECTS
 		 * with at least one actually freed object, it's possible that
 		 * the block will now be converted to a hole. We need to wait
 		 * for the txg to sync to prevent races.
 		 */
 		if (rwa->or_need_sync == ORNS_YES)
 			txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 
 		/* object is free and we are about to allocate a new one */
 		object_to_hold = DMU_NEW_OBJECT;
 	}
 
 	/* Only relevant for the first object in the range */
 	rwa->or_need_sync = ORNS_NO;
 
 	/*
 	 * If this is a multi-slot dnode there is a chance that this
 	 * object will expand into a slot that is already used by
 	 * another object from the previous snapshot. We must free
 	 * these objects before we attempt to allocate the new dnode.
 	 */
 	if (dn_slots > 1) {
 		boolean_t need_sync = B_FALSE;
 
 		for (uint64_t slot = drro->drr_object + 1;
 		    slot < drro->drr_object + dn_slots;
 		    slot++) {
 			dmu_object_info_t slot_doi;
 
 			err = dmu_object_info(rwa->os, slot, &slot_doi);
 			if (err == ENOENT || err == EEXIST)
 				continue;
 			else if (err != 0)
 				return (err);
 
 			err = dmu_free_long_object(rwa->os, slot);
 			if (err != 0)
 				return (err);
 
 			need_sync = B_TRUE;
 		}
 
 		/* Wait for a txg sync only if something was actually freed. */
 		if (need_sync)
 			txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 	}
 
 	tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_bonus(tx, object_to_hold);
 	dmu_tx_hold_write(tx, object_to_hold, 0, 0);
 	err = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	if (object_to_hold == DMU_NEW_OBJECT) {
 		/* Currently free, wants to be allocated */
 		err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
 		    drro->drr_type, new_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen,
 		    dn_slots << DNODE_SHIFT, tx);
 	} else if (drro->drr_type != doi.doi_type ||
 	    new_blksz != doi.doi_data_block_size ||
 	    drro->drr_bonustype != doi.doi_bonus_type ||
 	    drro->drr_bonuslen != doi.doi_bonus_size) {
 		/* Currently allocated, but with different properties */
 		err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
 		    drro->drr_type, new_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen,
 		    dn_slots << DNODE_SHIFT, rwa->spill ?
 		    DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx);
 	} else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) {
 		/*
 		 * Currently allocated, the existing version of this object
 		 * may reference a spill block that is no longer allocated
 		 * at the source and needs to be freed.
 		 */
 		err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx);
 	}
 
 	/* The tx is already assigned, so it is committed even on failure. */
 	if (err != 0) {
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (rwa->or_crypt_params_present) {
 		/*
 		 * Set the crypt params for the buffer associated with this
 		 * range of dnodes.  This causes the blkptr_t to have the
 		 * same crypt params (byteorder, salt, iv, mac) as on the
 		 * sending side.
 		 *
 		 * Since we are committing this tx now, it is possible for
 		 * the dnode block to end up on-disk with the incorrect MAC,
 		 * if subsequent objects in this block are received in a
 		 * different txg.  However, since the dataset is marked as
 		 * inconsistent, no code paths will do a non-raw read (or
 		 * decrypt the block / verify the MAC). The receive code and
 		 * scrub code can safely do raw reads and verify the
 		 * checksum.  They don't need to verify the MAC.
 		 */
 		dmu_buf_t *db = NULL;
 		uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE;
 
 		err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os),
 		    offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
 		if (err != 0) {
 			dmu_tx_commit(tx);
 			return (SET_ERROR(EINVAL));
 		}
 
 		dmu_buf_set_crypt_params(db, rwa->or_byteorder,
 		    rwa->or_salt, rwa->or_iv, rwa->or_mac, tx);
 
 		dmu_buf_rele(db, FTAG);
 
 		/* Applied once; cleared so later objects skip this step. */
 		rwa->or_crypt_params_present = B_FALSE;
 	}
 
 	dmu_object_set_checksum(rwa->os, drro->drr_object,
 	    drro->drr_checksumtype, tx);
 	dmu_object_set_compress(rwa->os, drro->drr_object,
 	    drro->drr_compress, tx);
 
 	/* handle more restrictive dnode structuring for raw recvs */
 	if (rwa->raw) {
 		/*
 		 * Set the indirect block size, block shift, nlevels.
 		 * This will not fail because we ensured all of the
 		 * blocks were freed earlier if this is a new object.
 		 * For non-new objects block size and indirect block
 		 * shift cannot change and nlevels can only increase.
 		 */
 		ASSERT3U(new_blksz, ==, drro->drr_blksz);
 		VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
 		    drro->drr_blksz, drro->drr_indblkshift, tx));
 		VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
 		    drro->drr_nlevels, tx));
 
 		/*
 		 * Set the maxblkid. This will always succeed because
 		 * we freed all blocks beyond the new maxblkid above.
 		 */
 		VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
 		    drro->drr_maxblkid, tx));
 	}
 
 	/* Copy the record's bonus payload into the object's bonus buffer. */
 	if (data != NULL) {
 		dmu_buf_t *db;
 		dnode_t *dn;
 		uint32_t flags = DMU_READ_NO_PREFETCH;
 
 		if (rwa->raw)
 			flags |= DMU_READ_NO_DECRYPT;
 
 		VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn));
 		VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags));
 
 		dmu_buf_will_dirty(db, tx);
 
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		memcpy(db->db_data, data, DRR_OBJECT_PAYLOAD_SIZE(drro));
 
 		/*
 		 * Raw bonus buffers have their byteorder determined by the
 		 * DRR_OBJECT_RANGE record.
 		 */
 		if (rwa->byteswap && !rwa->raw) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drro->drr_bonustype);
 			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
 			    DRR_OBJECT_PAYLOAD_SIZE(drro));
 		}
 		dmu_buf_rele(db, FTAG);
 		dnode_rele(dn, FTAG);
 	}
 
 	/*
 	 * If the receive fails, we want the resume stream to start with the
 	 * same record that we last successfully received. There is no way to
 	 * request resume from the object record, but we can benefit from the
 	 * fact that sender always sends object record before anything else,
 	 * after which it will "resend" data at offset 0 and resume normally.
 	 */
 	save_resume_state(rwa, drro->drr_object, 0, tx);
 
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Apply a DRR_FREEOBJECTS record: free every allocated object whose number
  * lies in [drr_firstobj, drr_firstobj + drr_numobjs), never touching
  * object 0 and never going past DN_MAX_OBJECT.
  *
  * Returns 0 on success, EINVAL if the range wraps around, or the first
  * error from dmu_object_info() (other than ENOENT),
  * dmu_free_long_object(), or dmu_object_next() (other than ESRCH).
  */
 noinline static int
 receive_freeobjects(struct receive_writer_arg *rwa,
     struct drr_freeobjects *drrfo)
 {
 	const uint64_t first = drrfo->drr_firstobj;
 	const uint64_t end = first + drrfo->drr_numobjs;
 	int next_err = 0;
 
 	/* Reject a range whose end wraps past the top of the number space. */
 	if (end < first)
 		return (SET_ERROR(EINVAL));
 
 	uint64_t obj = (first == 0) ? 1 : first;
 
 	while (obj < end && obj < DN_MAX_OBJECT && next_err == 0) {
 		dmu_object_info_t doi;
 		int err = dmu_object_info(rwa->os, obj, &doi);
 
 		if (err == 0) {
 			err = dmu_free_long_object(rwa->os, obj);
 			if (err != 0)
 				return (err);
 
 			/* Something in this range really was freed. */
 			if (rwa->or_need_sync == ORNS_MAYBE)
 				rwa->or_need_sync = ORNS_YES;
 		} else if (err != ENOENT) {
 			return (err);
 		}
 
 		/* Advance to the next object, freed or not. */
 		next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
 	}
 
 	/* ESRCH from dmu_object_next() is not treated as a failure. */
 	if (next_err != ESRCH)
 		return (next_err);
 	return (0);
 }
 
 /*
  * Write out every record queued on rwa->write_batch in a single transaction.
  *
  * The dnode for rwa->last_object is held, one tx is created that covers the
  * span from the first record's offset to the end of the last record, and the
  * records are then written and removed from the list one at a time.
  *
  * Returns 0 on success, EINVAL if the dnode cannot be held, or the error
  * from dmu_tx_assign(), zio_decompress_data() or
  * dmu_lightweight_write_by_dnode().
  *
  * Note: if this fails, the caller will clean up any records left on the
  * rwa->write_batch list.
  */
 static int
 flush_write_batch_impl(struct receive_writer_arg *rwa)
 {
 	dnode_t *dn;
 	int err;
 
 	if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0)
 		return (SET_ERROR(EINVAL));
 
 	struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch);
 	struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write;
 
 	struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
 	struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;
 
 	ASSERT3U(rwa->last_object, ==, last_drrw->drr_object);
 	ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset);
 
 	/* One tx hold spanning the whole batch: [first offset, last end). */
 	dmu_tx_t *tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset,
 	    last_drrw->drr_offset - first_drrw->drr_offset +
 	    last_drrw->drr_logical_size);
 	err = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		dnode_rele(dn, FTAG);
 		return (err);
 	}
 
 	/* err is 0 from here on until a write or decompression fails. */
 	struct receive_record_arg *rrd;
 	while ((rrd = list_head(&rwa->write_batch)) != NULL) {
 		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
 		abd_t *abd = rrd->abd;
 
 		ASSERT3U(drrw->drr_object, ==, rwa->last_object);
 
 		if (drrw->drr_logical_size != dn->dn_datablksz) {
 			/*
 			 * The WRITE record is larger than the object's block
 			 * size.  We must be receiving an incremental
 			 * large-block stream into a dataset that previously did
 			 * a non-large-block receive.  Lightweight writes must
 			 * be exactly one block, so we need to decompress the
 			 * data (if compressed) and do a normal dmu_write().
 			 */
 			ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz);
 			if (DRR_WRITE_COMPRESSED(drrw)) {
 				abd_t *decomp_abd =
 				    abd_alloc_linear(drrw->drr_logical_size,
 				    B_FALSE);
 
 				err = zio_decompress_data(
 				    drrw->drr_compressiontype,
 				    abd, decomp_abd,
 				    abd_get_size(abd),
 				    abd_get_size(decomp_abd), NULL);
 
 				if (err == 0) {
 					dmu_write_by_dnode(dn,
 					    drrw->drr_offset,
 					    drrw->drr_logical_size,
 					    abd_to_buf(decomp_abd), tx);
 				}
 				abd_free(decomp_abd);
 			} else {
 				dmu_write_by_dnode(dn,
 				    drrw->drr_offset,
 				    drrw->drr_logical_size,
 				    abd_to_buf(abd), tx);
 			}
 			/*
 			 * The payload abd is released here only on success;
 			 * on failure it stays attached to the rrd, which is
 			 * left on the list for the caller to free.
 			 */
 			if (err == 0)
 				abd_free(abd);
 		} else {
 			zio_prop_t zp = {0};
 			dmu_write_policy(rwa->os, dn, 0, 0, &zp);
 
 			zio_flag_t zio_flags = 0;
 
 			if (rwa->raw) {
 				/*
 				 * Raw stream: carry the record's compression,
 				 * byteorder and salt/IV/MAC into the write
 				 * properties.
 				 */
 				zp.zp_encrypt = B_TRUE;
 				zp.zp_compress = drrw->drr_compressiontype;
 				zp.zp_byteorder = ZFS_HOST_BYTEORDER ^
 				    !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
 				    rwa->byteswap;
 				memcpy(zp.zp_salt, drrw->drr_salt,
 				    ZIO_DATA_SALT_LEN);
 				memcpy(zp.zp_iv, drrw->drr_iv,
 				    ZIO_DATA_IV_LEN);
 				memcpy(zp.zp_mac, drrw->drr_mac,
 				    ZIO_DATA_MAC_LEN);
 				if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) {
 					zp.zp_nopwrite = B_FALSE;
 					zp.zp_copies = MIN(zp.zp_copies,
 					    SPA_DVAS_PER_BP - 1);
+					zp.zp_gang_copies =
+					    MIN(zp.zp_gang_copies,
+					    SPA_DVAS_PER_BP - 1);
 				}
 				zio_flags |= ZIO_FLAG_RAW;
 			} else if (DRR_WRITE_COMPRESSED(drrw)) {
 				ASSERT3U(drrw->drr_compressed_size, >, 0);
 				ASSERT3U(drrw->drr_logical_size, >=,
 				    drrw->drr_compressed_size);
 				zp.zp_compress = drrw->drr_compressiontype;
 				zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 			} else if (rwa->byteswap) {
 				/*
 				 * Note: compressed blocks never need to be
 				 * byteswapped, because WRITE records for
 				 * metadata blocks are never compressed. The
 				 * exception is raw streams, which are written
 				 * in the original byteorder, and the byteorder
 				 * bit is preserved in the BP by setting
 				 * zp_byteorder above.
 				 */
 				dmu_object_byteswap_t byteswap =
 				    DMU_OT_BYTESWAP(drrw->drr_type);
 				dmu_ot_byteswap[byteswap].ob_func(
 				    abd_to_buf(abd),
 				    DRR_WRITE_PAYLOAD_SIZE(drrw));
 			}
 
 			/*
 			 * Since this data can't be read until the receive
 			 * completes, we can do a "lightweight" write for
 			 * improved performance.
 			 */
 			err = dmu_lightweight_write_by_dnode(dn,
 			    drrw->drr_offset, abd, &zp, zio_flags, tx);
 		}
 
 		if (err != 0) {
 			/*
 			 * This rrd is left on the list, so the caller will
 			 * free it (and the abd).
 			 */
 			break;
 		}
 
 		/*
 		 * Note: If the receive fails, we want the resume stream to
 		 * start with the same record that we last successfully
 		 * received (as opposed to the next record), so that we can
 		 * verify that we are resuming from the correct location.
 		 */
 		save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
 
 		list_remove(&rwa->write_batch, rrd);
 		kmem_free(rrd, sizeof (*rrd));
 	}
 
 	/* The tx is committed even when the loop stopped on an error. */
 	dmu_tx_commit(tx);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /*
  * Flush the pending WRITE batch, if any.
  *
  * If the writer has already recorded an error (rwa->err), nothing is
  * written and that error is returned.  On any failure every record still
  * on the list is released together with its abd, so the list is always
  * empty when this function returns.
  */
 noinline static int
 flush_write_batch(struct receive_writer_arg *rwa)
 {
 	struct receive_record_arg *stale;
 	int rc;
 
 	if (list_is_empty(&rwa->write_batch))
 		return (0);
 
 	rc = rwa->err;
 	if (rc == 0)
 		rc = flush_write_batch_impl(rwa);
 
 	if (rc != 0) {
 		/* Discard whatever could not be (or was never) written. */
 		for (stale = list_remove_head(&rwa->write_batch);
 		    stale != NULL;
 		    stale = list_remove_head(&rwa->write_batch)) {
 			abd_free(stale->abd);
 			kmem_free(stale, sizeof (*stale));
 		}
 	}
 
 	ASSERT(list_is_empty(&rwa->write_batch));
 	return (rc);
 }
 
 noinline static int
 receive_process_write_record(struct receive_writer_arg *rwa,
     struct receive_record_arg *rrd)
 {
 	int err = 0;
 
 	ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE);
 	struct drr_write *drrw = &rrd->header.drr_u.drr_write;
 
 	if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
 	    !DMU_OT_IS_VALID(drrw->drr_type))
 		return (SET_ERROR(EINVAL));
 
 	if (rwa->heal) {
 		blkptr_t *bp;
 		dmu_buf_t *dbp;
 		int flags = DB_RF_CANFAIL;
 
 		if (rwa->raw)
 			flags |= DB_RF_NO_DECRYPT;
 
 		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drrw->drr_type);
 			dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd),
 			    DRR_WRITE_PAYLOAD_SIZE(drrw));
 		}
 
 		err = dmu_buf_hold_noread(rwa->os, drrw->drr_object,
 		    drrw->drr_offset, FTAG, &dbp);
 		if (err != 0)
 			return (err);
 
 		/* Try to read the object to see if it needs healing */
 		err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags);
 		/*
 		 * We only try to heal when dbuf_read() returns a ECKSUMs.
 		 * Other errors (even EIO) get returned to caller.
 		 * EIO indicates that the device is not present/accessible,
 		 * so writing to it will likely fail.
 		 * If the block is healthy, we don't want to overwrite it
 		 * unnecessarily.
 		 */
 		if (err != ECKSUM) {
 			dmu_buf_rele(dbp, FTAG);
 			return (err);
 		}
 		/* Make sure the on-disk block and recv record sizes match */
 		if (drrw->drr_logical_size != dbp->db_size) {
 			err = ENOTSUP;
 			dmu_buf_rele(dbp, FTAG);
 			return (err);
 		}
 		/* Get the block pointer for the corrupted block */
 		bp = dmu_buf_get_blkptr(dbp);
 		err = do_corrective_recv(rwa, drrw, rrd, bp);
 		dmu_buf_rele(dbp, FTAG);
 		return (err);
 	}
 
 	/*
 	 * For resuming to work, records must be in increasing order
 	 * by (object, offset).
 	 */
 	if (drrw->drr_object < rwa->last_object ||
 	    (drrw->drr_object == rwa->last_object &&
 	    drrw->drr_offset < rwa->last_offset)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
 	struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;
 	uint64_t batch_size =
 	    MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2);
 	if (first_rrd != NULL &&
 	    (drrw->drr_object != first_drrw->drr_object ||
 	    drrw->drr_offset >= first_drrw->drr_offset + batch_size)) {
 		err = flush_write_batch(rwa);
 		if (err != 0)
 			return (err);
 	}
 
 	rwa->last_object = drrw->drr_object;
 	rwa->last_offset = drrw->drr_offset;
 
 	if (rwa->last_object > rwa->max_object)
 		rwa->max_object = rwa->last_object;
 
 	list_insert_tail(&rwa->write_batch, rrd);
 	/*
 	 * Return EAGAIN to indicate that we will use this rrd again,
 	 * so the caller should not free it
 	 */
 	return (EAGAIN);
 }
 
 /*
  * Apply a DRR_WRITE_EMBEDDED record by writing its payload with
  * dmu_write_embedded().
  *
  * The record is rejected with EINVAL when its offset/length wraps, its
  * physical size exceeds BPE_PAYLOAD_SIZE, its embedded type or compression
  * function is out of range, or the receive is raw.  Otherwise returns 0,
  * or the error from dmu_tx_assign().
  */
 static int
 receive_write_embedded(struct receive_writer_arg *rwa,
     struct drr_write_embedded *drrwe, void *data)
 {
 	if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset ||
 	    drrwe->drr_psize > BPE_PAYLOAD_SIZE ||
 	    drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES ||
 	    drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS ||
 	    rwa->raw)
 		return (SET_ERROR(EINVAL));
 
 	/* Track the highest object number touched by the stream. */
 	if (rwa->max_object < drrwe->drr_object)
 		rwa->max_object = drrwe->drr_object;
 
 	dmu_tx_t *txn = dmu_tx_create(rwa->os);
 	dmu_tx_hold_write(txn, drrwe->drr_object,
 	    drrwe->drr_offset, drrwe->drr_length);
 
 	int rc = dmu_tx_assign(txn, DMU_TX_WAIT);
 	if (rc != 0) {
 		dmu_tx_abort(txn);
 		return (rc);
 	}
 
 	dmu_write_embedded(rwa->os, drrwe->drr_object,
 	    drrwe->drr_offset, data, drrwe->drr_etype,
 	    drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
 	    rwa->byteswap ^ ZFS_HOST_BYTEORDER, txn);
 
 	/* Record the resume position in the same tx as the write. */
 	save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, txn);
 	dmu_tx_commit(txn);
 
 	return (0);
 }
 
 /*
  * Apply a DRR_SPILL record: replace the spill block of drrs->drr_object
  * with the payload carried in abd.
  *
  * Ownership of abd: it is freed here on both paths that return 0.  On every
  * error return it has not been freed; the caller in receive_process_record()
  * frees it when this function returns non-zero.
  *
  * Returns EINVAL for an out-of-range length, invalid raw-record fields, or
  * an object that cannot be looked up; otherwise the error from
  * dmu_spill_hold_by_bonus() or dmu_tx_assign(), or 0 on success.
  */
 static int
 receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
     abd_t *abd)
 {
 	dmu_buf_t *db, *db_spill;
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
 	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * This is an unmodified spill block which was added to the stream
 	 * to resolve an issue with incorrectly removing spill blocks.  It
 	 * should be ignored by current versions of the code which support
 	 * the DRR_FLAG_SPILL_BLOCK flag.
 	 */
 	if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
 		abd_free(abd);
 		return (0);
 	}
 
 	/* Raw records carry extra fields that must be sane. */
 	if (rwa->raw) {
 		if (!DMU_OT_IS_VALID(drrs->drr_type) ||
 		    drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
 		    drrs->drr_compressed_size == 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (drrs->drr_object > rwa->max_object)
 		rwa->max_object = drrs->drr_object;
 
 	/* Hold the bonus buffer first, then the spill buffer through it. */
 	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
 	if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG,
 	    &db_spill)) != 0) {
 		dmu_buf_rele(db, FTAG);
 		return (err);
 	}
 
 	dmu_tx_t *tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_spill(tx, db->db_object);
 
 	err = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (err != 0) {
 		dmu_buf_rele(db, FTAG);
 		dmu_buf_rele(db_spill, FTAG);
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	/*
 	 * Spill blocks may both grow and shrink.  When a change in size
 	 * occurs any existing dbuf must be updated to match the logical
 	 * size of the provided arc_buf_t.
 	 */
 	if (db_spill->db_size != drrs->drr_length) {
 		dmu_buf_will_fill(db_spill, tx, B_FALSE);
 		VERIFY0(dbuf_spill_set_blksz(db_spill,
 		    drrs->drr_length, tx));
 	}
 
 	/*
 	 * Borrow an ARC buffer of the right flavour (raw or plain), copy the
 	 * payload into it and hand it to the spill dbuf.
 	 */
 	arc_buf_t *abuf;
 	if (rwa->raw) {
 		boolean_t byteorder = ZFS_HOST_BYTEORDER ^
 		    !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
 		    rwa->byteswap;
 
 		abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os),
 		    drrs->drr_object, byteorder, drrs->drr_salt,
 		    drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
 		    drrs->drr_compressed_size, drrs->drr_length,
 		    drrs->drr_compressiontype, 0);
 	} else {
 		abuf = arc_loan_buf(dmu_objset_spa(rwa->os),
 		    DMU_OT_IS_METADATA(drrs->drr_type),
 		    drrs->drr_length);
 		/* Non-raw payloads are byteswapped in place before the copy. */
 		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drrs->drr_type);
 			dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd),
 			    DRR_SPILL_PAYLOAD_SIZE(drrs));
 		}
 	}
 
 	memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs));
 	abd_free(abd);
 	dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
 
 	dmu_buf_rele(db, FTAG);
 	dmu_buf_rele(db_spill, FTAG);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 noinline static int
 receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
 {
 	int err;
 
 	if (drrf->drr_length != -1ULL &&
 	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
 		return (SET_ERROR(EINVAL));
 
 	if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (drrf->drr_object > rwa->max_object)
 		rwa->max_object = drrf->drr_object;
 
 	err = dmu_free_long_range(rwa->os, drrf->drr_object,
 	    drrf->drr_offset, drrf->drr_length);
 
 	return (err);
 }
 
 /*
  * Record the encryption parameters of a DRR_OBJECT_RANGE record in rwa so
  * that they can be applied later.
  *
  * Only valid for raw receives, and only for a range that is exactly one
  * aligned block of dnodes; anything else yields EINVAL.
  */
 static int
 receive_object_range(struct receive_writer_arg *rwa,
     struct drr_object_range *drror)
 {
 	/*
 	 * Dnode block sizes are constant, so for now there is no need to
 	 * check that the sending and receiving sides agree on them.  Non-raw
 	 * sends never emit DRR_OBJECT_RANGE at all.  Raw sends need it
 	 * because the encryption parameters protect a whole block of bonus
 	 * buffers.  Should dnode block sizes ever become variable, handling
 	 * must be added here to make sure both sides match.
 	 */
 	if (!rwa->raw ||
 	    drror->drr_numslots != DNODES_PER_BLOCK ||
 	    P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Start from the native format (ZFS_HOST_BYTEORDER), account for a
 	 * byteswapped stream (rwa->byteswap), then swap once more if this
 	 * particular block was stored non-natively on the sending side.
 	 */
 	boolean_t block_byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^
 	    !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags);
 
 	if (rwa->max_object < drror->drr_firstobj)
 		rwa->max_object = drror->drr_firstobj;
 
 	/*
 	 * Actual handling is deferred to receive_object() so that an empty
 	 * block of dnodes is not written out and turned into a HOLE BP.
 	 */
 	rwa->or_crypt_params_present = B_TRUE;
 	rwa->or_firstobj = drror->drr_firstobj;
 	rwa->or_numslots = drror->drr_numslots;
 	memcpy(rwa->or_salt, drror->drr_salt, ZIO_DATA_SALT_LEN);
 	memcpy(rwa->or_iv, drror->drr_iv, ZIO_DATA_IV_LEN);
 	memcpy(rwa->or_mac, drror->drr_mac, ZIO_DATA_MAC_LEN);
 	rwa->or_byteorder = block_byteorder;
 	rwa->or_need_sync = ORNS_MAYBE;
 
 	return (0);
 }
 
 /*
  * Until we have the ability to redact large ranges of data efficiently, we
  * process these records as frees.
  */
 noinline static int
 receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr)
 {
 	struct drr_free drrf = {0};
 	drrf.drr_length = drrr->drr_length;
 	drrf.drr_object = drrr->drr_object;
 	drrf.drr_offset = drrr->drr_offset;
 	drrf.drr_toguid = drrr->drr_toguid;
 	return (receive_free(rwa, &drrf));
 }
 
 /*
  * Error-path cleanup for drc->drc_ds.
  *
  * The dataset is always disowned.  It is kept when the receive is
  * resumable, its state should be saved and its block pointer is not a
  * hole; otherwise it is destroyed, unless this was a healing receive.
  */
 static void
 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
 {
 	dsl_dataset_t *ds = drc->drc_ds;
 	ds_hold_flags_t hold_flags = DS_HOLD_FLAG_DECRYPT;
 	boolean_t keep_dataset;
 
 	if (drc->drc_raw)
 		hold_flags = DS_HOLD_FLAG_NONE;
 
 	/*
 	 * Let the txg sync before tearing anything down.  For resumable
 	 * receives this guarantees the resume state is on disk; for raw
 	 * receives it guarantees user accounting will not touch the dataset
 	 * after we stopped receiving into it.
 	 */
 	txg_wait_synced(ds->ds_dir->dd_pool, 0);
 	ds->ds_objset->os_raw_receive = B_FALSE;
 
 	/* The block pointer is examined under ds_bp_rwlock. */
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	keep_dataset = drc->drc_resumable && drc->drc_should_save &&
 	    !BP_IS_HOLE(dsl_dataset_get_blkptr(ds));
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	if (keep_dataset) {
 		dsl_dataset_disown(ds, hold_flags, dmu_recv_tag);
 		return;
 	}
 
 	/* Capture the name before disowning; it is needed to destroy. */
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	dsl_dataset_name(ds, name);
 	dsl_dataset_disown(ds, hold_flags, dmu_recv_tag);
 	if (!drc->drc_heal)
 		(void) dsl_destroy_head(name);
 }
 
 /*
  * Fold len bytes of buf into the running stream checksum
  * (drc->drc_cksum), using the byteswapping fletcher-4 variant when the
  * stream is byteswapped and the native one otherwise.
  */
 static void
 receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf)
 {
 	if (!drc->drc_byteswap) {
 		(void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum);
 		return;
 	}
 
 	(void) fletcher_4_incremental_byteswap(buf, len, &drc->drc_cksum);
 }
 
 /*
  * Read the payload into a buffer of size len, and update the current record's
  * payload field.
  * Allocate drc->drc_next_rrd and read the next record's header into
  * drc->drc_next_rrd->header.
  * Verify checksum of payload and next record.
  *
  * len may be 0, in which case buf must be NULL and no payload is read.
  *
  * Returns 0 on success.  On failure drc->drc_next_rrd is freed and reset to
  * NULL and the error is returned: the receive_read() error, EINVAL if the
  * next record is a DRR_BEGIN, or ECKSUM on a checksum mismatch.  buf is
  * never freed here.
  */
 static int
 receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf)
 {
 	int err;
 
 	if (len != 0) {
 		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
 		err = receive_read(drc, len, buf);
 		if (err != 0)
 			return (err);
 		receive_cksum(drc, len, buf);
 
 		/* note: rrd is NULL when reading the begin record's payload */
 		if (drc->drc_rrd != NULL) {
 			drc->drc_rrd->payload = buf;
 			drc->drc_rrd->payload_size = len;
 			drc->drc_rrd->bytes_read = drc->drc_bytes_read;
 		}
 	} else {
 		ASSERT3P(buf, ==, NULL);
 	}
 
 	/*
 	 * Snapshot the checksum before the next header is folded in;
 	 * receive_read_record() compares this value against a DRR_END
 	 * record's checksum.
 	 */
 	drc->drc_prev_cksum = drc->drc_cksum;
 
 	drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP);
 	err = receive_read(drc, sizeof (drc->drc_next_rrd->header),
 	    &drc->drc_next_rrd->header);
 	drc->drc_next_rrd->bytes_read = drc->drc_bytes_read;
 
 	if (err != 0) {
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 		drc->drc_next_rrd = NULL;
 		return (err);
 	}
 	/* A second BEGIN record at this point is treated as invalid. */
 	if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) {
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 		drc->drc_next_rrd = NULL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Note: checksum is of everything up to but not including the
 	 * checksum itself.
 	 */
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
 	receive_cksum(drc,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    &drc->drc_next_rrd->header);
 
 	/*
 	 * cksum_orig is a copy taken before any byteswapping of the header;
 	 * cksump points at the field in place and so sees the value after
 	 * byteswap_record() has run.
 	 */
 	zio_cksum_t cksum_orig =
 	    drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
 	zio_cksum_t *cksump =
 	    &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
 
 	if (drc->drc_byteswap)
 		byteswap_record(&drc->drc_next_rrd->header);
 
 	/* An all-zero checksum in the record is not verified. */
 	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
 	    !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) {
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 		drc->drc_next_rrd = NULL;
 		return (SET_ERROR(ECKSUM));
 	}
 
 	/* Finally fold the (un-swapped) checksum field into the running sum. */
 	receive_cksum(drc, sizeof (cksum_orig), &cksum_orig);
 
 	return (0);
 }
 
 /*
  * Kick off prefetch reads for the indirect blocks a record will need.
  *
  * Whether to prefetch for an object is decided by the object ignore list.
  * Consulting it matters for correctness (the object's blocksize may have
  * changed) and for performance (no point prefetching for an object that
  * does not exist).  The list is trimmed while the stream is processed so
  * that it cannot grow without bound.
  *
  * Object numbers in the list are sorted, and so are the write records we
  * see, but the two are not sorted relative to one another: several object
  * records may arrive before any of those objects' write records.  Hence,
  * once a given object number has been reached, every lower object number
  * can be dropped from the ignore list.  In practice up to 32 object
  * records precede the write records, so the list holds at most 32 nodes.
  */
 static void
 receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset,
     uint64_t length)
 {
 	/* Objects on the ignore list are never prefetched. */
 	if (objlist_exists(drc->drc_ignore_objlist, object))
 		return;
 
 	dmu_prefetch(drc->drc_os, object, 1, offset, length,
 	    ZIO_PRIORITY_SYNC_READ);
 }
 
 /*
  * Read records off the stream, issuing any necessary prefetches.
  *
  * Dispatches on the type of the current record (drc->drc_rrd): allocates a
  * payload buffer where the record type carries one, reads the payload and
  * the next record's header, and issues prefetches for WRITE and
  * WRITE_EMBEDDED records.  Payload buffers allocated here are freed again
  * if the read fails.
  *
  * Returns 0 on success, EINVAL for an unhandled record type, ECKSUM if a
  * DRR_END record's checksum does not match, or an error from the read.
  */
 static int
 receive_read_record(dmu_recv_cookie_t *drc)
 {
 	int err;
 
 	switch (drc->drc_rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
 		struct drr_object *drro =
 		    &drc->drc_rrd->header.drr_u.drr_object;
 		uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
 		void *buf = NULL;
 		dmu_object_info_t doi;
 
 		/* buf stays NULL for a record without payload. */
 		if (size != 0)
 			buf = kmem_zalloc(size, KM_SLEEP);
 
 		err = receive_read_payload_and_next_header(drc, size, buf);
 		if (err != 0) {
 			kmem_free(buf, size);
 			return (err);
 		}
 		err = dmu_object_info(drc->drc_os, drro->drr_object, &doi);
 		/*
 		 * See receive_read_prefetch for an explanation why we're
 		 * storing this object in the ignore_obj_list.
 		 */
 		if (err == ENOENT || err == EEXIST ||
 		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
 			objlist_insert(drc->drc_ignore_objlist,
 			    drro->drr_object);
 			err = 0;
 		}
 		return (err);
 	}
 	case DRR_FREEOBJECTS:
 	{
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		return (err);
 	}
 	case DRR_WRITE:
 	{
 		struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
 		int size = DRR_WRITE_PAYLOAD_SIZE(drrw);
 		abd_t *abd = abd_alloc_linear(size, B_FALSE);
 		err = receive_read_payload_and_next_header(drc, size,
 		    abd_to_buf(abd));
 		if (err != 0) {
 			abd_free(abd);
 			return (err);
 		}
 		/* The abd is attached to the record only after a good read. */
 		drc->drc_rrd->abd = abd;
 		receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset,
 		    drrw->drr_logical_size);
 		return (err);
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *drrwe =
 		    &drc->drc_rrd->header.drr_u.drr_write_embedded;
 		/* The payload length is drr_psize rounded up to 8 bytes. */
 		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
 		void *buf = kmem_zalloc(size, KM_SLEEP);
 
 		err = receive_read_payload_and_next_header(drc, size, buf);
 		if (err != 0) {
 			kmem_free(buf, size);
 			return (err);
 		}
 
 		receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset,
 		    drrwe->drr_length);
 		return (err);
 	}
 	case DRR_FREE:
 	case DRR_REDACT:
 	{
 		/*
 		 * It might be beneficial to prefetch indirect blocks here, but
 		 * we don't really have the data to decide for sure.
 		 */
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		return (err);
 	}
 	case DRR_END:
 	{
 		/*
 		 * Nothing further is read for an END record; its checksum is
 		 * compared with the one saved before this header was read.
 		 */
 		struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end;
 		if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum,
 		    drre->drr_checksum))
 			return (SET_ERROR(ECKSUM));
 		return (0);
 	}
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
 		int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
 		abd_t *abd = abd_alloc_linear(size, B_FALSE);
 		err = receive_read_payload_and_next_header(drc, size,
 		    abd_to_buf(abd));
 		if (err != 0)
 			abd_free(abd);
 		else
 			drc->drc_rrd->abd = abd;
 		return (err);
 	}
 	case DRR_OBJECT_RANGE:
 	{
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		return (err);
 
 	}
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 }
 
 
 
 /*
  * Emit a dprintf() line describing the given record together with err.
  *
  * The body is compiled only when ZFS_DEBUG is defined; otherwise this
  * function does nothing.  Record types without a case below produce no
  * output.
  */
 static void
 dprintf_drr(struct receive_record_arg *rrd, int err)
 {
 #ifdef ZFS_DEBUG
 	switch (rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
 		struct drr_object *drro = &rrd->header.drr_u.drr_object;
 		dprintf("drr_type = OBJECT obj = %llu type = %u "
 		    "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u "
 		    "compress = %u dn_slots = %u err = %d\n",
 		    (u_longlong_t)drro->drr_object, drro->drr_type,
 		    drro->drr_bonustype, drro->drr_blksz, drro->drr_bonuslen,
 		    drro->drr_checksumtype, drro->drr_compress,
 		    drro->drr_dn_slots, err);
 		break;
 	}
 	case DRR_FREEOBJECTS:
 	{
 		struct drr_freeobjects *drrfo =
 		    &rrd->header.drr_u.drr_freeobjects;
 		dprintf("drr_type = FREEOBJECTS firstobj = %llu "
 		    "numobjs = %llu err = %d\n",
 		    (u_longlong_t)drrfo->drr_firstobj,
 		    (u_longlong_t)drrfo->drr_numobjs, err);
 		break;
 	}
 	case DRR_WRITE:
 	{
 		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
 		dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu "
 		    "lsize = %llu cksumtype = %u flags = %u "
 		    "compress = %u psize = %llu err = %d\n",
 		    (u_longlong_t)drrw->drr_object, drrw->drr_type,
 		    (u_longlong_t)drrw->drr_offset,
 		    (u_longlong_t)drrw->drr_logical_size,
 		    drrw->drr_checksumtype, drrw->drr_flags,
 		    drrw->drr_compressiontype,
 		    (u_longlong_t)drrw->drr_compressed_size, err);
 		break;
 	}
 	case DRR_WRITE_BYREF:
 	{
 		struct drr_write_byref *drrwbr =
 		    &rrd->header.drr_u.drr_write_byref;
 		dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu "
 		    "length = %llu toguid = %llx refguid = %llx "
 		    "refobject = %llu refoffset = %llu cksumtype = %u "
 		    "flags = %u err = %d\n",
 		    (u_longlong_t)drrwbr->drr_object,
 		    (u_longlong_t)drrwbr->drr_offset,
 		    (u_longlong_t)drrwbr->drr_length,
 		    (u_longlong_t)drrwbr->drr_toguid,
 		    (u_longlong_t)drrwbr->drr_refguid,
 		    (u_longlong_t)drrwbr->drr_refobject,
 		    (u_longlong_t)drrwbr->drr_refoffset,
 		    drrwbr->drr_checksumtype, drrwbr->drr_flags, err);
 		break;
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *drrwe =
 		    &rrd->header.drr_u.drr_write_embedded;
 		dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu "
 		    "length = %llu compress = %u etype = %u lsize = %u "
 		    "psize = %u err = %d\n",
 		    (u_longlong_t)drrwe->drr_object,
 		    (u_longlong_t)drrwe->drr_offset,
 		    (u_longlong_t)drrwe->drr_length,
 		    drrwe->drr_compression, drrwe->drr_etype,
 		    drrwe->drr_lsize, drrwe->drr_psize, err);
 		break;
 	}
 	case DRR_FREE:
 	{
 		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
 		/* length is printed signed (%lld), unlike the other fields. */
 		dprintf("drr_type = FREE obj = %llu offset = %llu "
 		    "length = %lld err = %d\n",
 		    (u_longlong_t)drrf->drr_object,
 		    (u_longlong_t)drrf->drr_offset,
 		    (longlong_t)drrf->drr_length,
 		    err);
 		break;
 	}
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
 		dprintf("drr_type = SPILL obj = %llu length = %llu "
 		    "err = %d\n", (u_longlong_t)drrs->drr_object,
 		    (u_longlong_t)drrs->drr_length, err);
 		break;
 	}
 	case DRR_OBJECT_RANGE:
 	{
 		struct drr_object_range *drror =
 		    &rrd->header.drr_u.drr_object_range;
 		dprintf("drr_type = OBJECT_RANGE firstobj = %llu "
 		    "numslots = %llu flags = %u err = %d\n",
 		    (u_longlong_t)drror->drr_firstobj,
 		    (u_longlong_t)drror->drr_numslots,
 		    drror->drr_flags, err);
 		break;
 	}
 	default:
 		return;
 	}
 #endif
 }
 
 /*
  * Commit the records to the pool.
  *
  * Dispatches one record to its type-specific handler.  Before any
  * non-WRITE record is handled in a normal (non-healing) receive, the
  * pending WRITE batch is flushed.  In a healing receive every non-WRITE
  * record is dropped and 0 is returned.
  *
  * The record's payload or abd is released here for every record type,
  * except for a WRITE record whose handler returned EAGAIN in a non-healing
  * receive: that record has been queued on the write batch.  The rrd itself
  * is never freed here.
  *
  * Returns the handler's result (EAGAIN included), the flush error, or
  * EINVAL for an unknown record type.
  */
 static int
 receive_process_record(struct receive_writer_arg *rwa,
     struct receive_record_arg *rrd)
 {
 	int err;
 
 	/* Processing in order, therefore bytes_read should be increasing. */
 	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
 	rwa->bytes_read = rrd->bytes_read;
 
 	/* We can only heal write records; other ones get ignored */
 	if (rwa->heal && rrd->header.drr_type != DRR_WRITE) {
 		if (rrd->abd != NULL) {
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 		} else if (rrd->payload != NULL) {
 			kmem_free(rrd->payload, rrd->payload_size);
 			rrd->payload = NULL;
 		}
 		return (0);
 	}
 
 	/*
 	 * Any non-WRITE record ends the current batch of writes.  If the
 	 * flush fails, this record's buffers are released before returning.
 	 */
 	if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) {
 		err = flush_write_batch(rwa);
 		if (err != 0) {
 			if (rrd->abd != NULL) {
 				abd_free(rrd->abd);
 				rrd->abd = NULL;
 				rrd->payload = NULL;
 			} else if (rrd->payload != NULL) {
 				kmem_free(rrd->payload, rrd->payload_size);
 				rrd->payload = NULL;
 			}
 
 			return (err);
 		}
 	}
 
 	switch (rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
 		struct drr_object *drro = &rrd->header.drr_u.drr_object;
 		err = receive_object(rwa, drro, rrd->payload);
 		kmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	}
 	case DRR_FREEOBJECTS:
 	{
 		struct drr_freeobjects *drrfo =
 		    &rrd->header.drr_u.drr_freeobjects;
 		err = receive_freeobjects(rwa, drrfo);
 		break;
 	}
 	case DRR_WRITE:
 	{
 		err = receive_process_write_record(rwa, rrd);
 		if (rwa->heal) {
 			/*
 			 * If healing - always free the abd after processing
 			 */
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 		} else if (err != EAGAIN) {
 			/*
 			 * On success, a non-healing
 			 * receive_process_write_record() returns
 			 * EAGAIN to indicate that we do not want to free
 			 * the rrd or arc_buf.
 			 */
 			ASSERT(err != 0);
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 		}
 		break;
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *drrwe =
 		    &rrd->header.drr_u.drr_write_embedded;
 		err = receive_write_embedded(rwa, drrwe, rrd->payload);
 		kmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	}
 	case DRR_FREE:
 	{
 		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
 		err = receive_free(rwa, drrf);
 		break;
 	}
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
 		err = receive_spill(rwa, drrs, rrd->abd);
 		/* The abd is freed here only when receive_spill() failed. */
 		if (err != 0)
 			abd_free(rrd->abd);
 		rrd->abd = NULL;
 		rrd->payload = NULL;
 		break;
 	}
 	case DRR_OBJECT_RANGE:
 	{
 		struct drr_object_range *drror =
 		    &rrd->header.drr_u.drr_object_range;
 		err = receive_object_range(rwa, drror);
 		break;
 	}
 	case DRR_REDACT:
 	{
 		struct drr_redact *drrr = &rrd->header.drr_u.drr_redact;
 		err = receive_redact(rwa, drrr);
 		break;
 	}
 	default:
 		err = (SET_ERROR(EINVAL));
 	}
 
 	/* Any non-zero result (EAGAIN included) is traced in debug builds. */
 	if (err != 0)
 		dprintf_drr(rrd, err);
 
 	return (err);
 }
 
 /*
  * dmu_recv_stream's worker thread; pull records off the queue, and then call
  * receive_process_record().  When we're done, signal the main thread and exit.
  *
  * arg is the struct receive_writer_arg shared with the main thread.  The
  * first error seen is latched in rwa->err; after that, remaining records
  * are only drained and freed.  The loop ends at the record flagged as the
  * end-of-stream marker.
  */
 static __attribute__((noreturn)) void
 receive_writer_thread(void *arg)
 {
 	struct receive_writer_arg *rwa = arg;
 	struct receive_record_arg *rrd;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
 	    rrd = bqueue_dequeue(&rwa->q)) {
 		/*
 		 * If there's an error, the main thread will stop putting things
 		 * on the queue, but we need to clear everything in it before we
 		 * can exit.
 		 */
 		int err = 0;
 		if (rwa->err == 0) {
 			err = receive_process_record(rwa, rrd);
 		} else if (rrd->abd != NULL) {
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 			rrd->payload = NULL;
 		} else if (rrd->payload != NULL) {
 			kmem_free(rrd->payload, rrd->payload_size);
 			rrd->payload = NULL;
 		}
 		/*
 		 * EAGAIN indicates that this record has been saved (on
 		 * rwa->write_batch), and will be used again, so we don't
 		 * free it.
 		 * When healing data we always need to free the record.
 		 */
 		if (err != EAGAIN || rwa->heal) {
 			if (rwa->err == 0)
 				rwa->err = err;
 			kmem_free(rrd, sizeof (*rrd));
 		}
 	}
 	/* Free the end-of-stream marker record itself. */
 	kmem_free(rrd, sizeof (*rrd));
 
 	if (rwa->heal) {
 		zio_wait(rwa->heal_pio);
 	} else {
 		/* Write out whatever is still batched. */
 		int err = flush_write_batch(rwa);
 		if (rwa->err == 0)
 			rwa->err = err;
 	}
 	/* Tell the waiter on rwa->cv that the writer has finished. */
 	mutex_enter(&rwa->mutex);
 	rwa->done = B_TRUE;
 	cv_signal(&rwa->cv);
 	mutex_exit(&rwa->mutex);
 	spl_fstrans_unmark(cookie);
 	thread_exit();
 }
 
 /*
  * Check that the resume point named in the stream's begin nvlist
  * ("resume_object" / "resume_offset") matches the resume state stored for
  * the dataset in the MOS.
  *
  * Returns 0 when both values match, EINVAL when either key is missing from
  * begin_nvl or either value differs.
  */
 static int
 resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl)
 {
 	objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset;
 	uint64_t dsobj = dmu_objset_id(drc->drc_os);
 	uint64_t stream_obj, stream_off;
 	uint64_t saved;
 
 	if (nvlist_lookup_uint64(begin_nvl, "resume_object",
 	    &stream_obj) != 0)
 		return (SET_ERROR(EINVAL));
 	if (nvlist_lookup_uint64(begin_nvl, "resume_offset",
 	    &stream_off) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* Compare the object first, then the offset. */
 	VERIFY0(zap_lookup(mos, dsobj,
 	    DS_FIELD_RESUME_OBJECT, sizeof (saved), 1, &saved));
 	if (saved != stream_obj)
 		return (SET_ERROR(EINVAL));
 
 	VERIFY0(zap_lookup(mos, dsobj,
 	    DS_FIELD_RESUME_OFFSET, sizeof (saved), 1, &saved));
 	if (saved != stream_off)
 		return (SET_ERROR(EINVAL));
 
 	return (0);
 }
 
 /*
  * Read in the stream's records, one by one, and apply them to the pool.  There
  * are two threads involved; the thread that calls this function will spin up a
  * worker thread, read the records off the stream one by one, and issue
  * prefetches for any necessary indirect blocks.  It will then push the records
  * onto an internal blocking queue.  The worker thread will pull the records off
  * the queue, and actually write the data into the DMU.  This way, the worker
  * thread doesn't have to wait for reads to complete, since everything it needs
  * (the indirect blocks) will be prefetched.
  *
  * On return *voffp is set to drc->drc_voff.  Returns 0 on success or an
  * error number.  On every exit path the writer argument and
  * drc->drc_begin_nvl are freed and drc->drc_os is cleared; on failure
  * dmu_recv_cleanup_ds() is also called and drc->drc_keynvl is freed.
  *
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
 dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
 {
 	int err = 0;
 	struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);
 
 	/*
 	 * If the dataset carries resume state, add the saved byte count to
 	 * drc_bytes_read.  The lookup result is deliberately ignored; on
 	 * failure "bytes" stays 0.
 	 */
 	if (dsl_dataset_has_resume_receive_state(drc->drc_ds)) {
 		uint64_t bytes = 0;
 		(void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
 		    drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
 		    sizeof (bytes), 1, &bytes);
 		drc->drc_bytes_read += bytes;
 	}
 
 	drc->drc_ignore_objlist = objlist_create();
 
 	/* these were verified in dmu_recv_begin */
 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
 	    DMU_SUBSTREAM);
 	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
 
 	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
 	ASSERT0(drc->drc_os->os_encrypted &&
 	    (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));
 
 	/* handle DSL encryption key payload */
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
 		nvlist_t *keynvl = NULL;
 
 		ASSERT(drc->drc_os->os_encrypted);
 		ASSERT(drc->drc_raw);
 
 		err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata",
 		    &keynvl);
 		if (err != 0)
 			goto out;
 
 		if (!drc->drc_heal) {
 			/*
 			 * If this is a new dataset we set the key immediately.
 			 * Otherwise we don't want to change the key until we
 			 * are sure the rest of the receive succeeded so we
 			 * stash the keynvl away until then.
 			 */
 			err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
 			    drc->drc_ds->ds_object, drc->drc_fromsnapobj,
 			    drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
 			if (err != 0)
 				goto out;
 		}
 
 		/* see comment in dmu_recv_end_sync() */
 		drc->drc_ivset_guid = 0;
 		(void) nvlist_lookup_uint64(keynvl, "to_ivset_guid",
 		    &drc->drc_ivset_guid);
 
 		/*
 		 * keynvl points into drc_begin_nvl, which is freed at "out",
 		 * so an existing dataset keeps its own duplicate.
 		 */
 		if (!drc->drc_newfs)
 			drc->drc_keynvl = fnvlist_dup(keynvl);
 	}
 
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
 		err = resume_check(drc, drc->drc_begin_nvl);
 		if (err != 0)
 			goto out;
 	}
 
 	/*
 	 * For compatibility with recursive send streams, we do this here,
 	 * rather than in dmu_recv_begin. If we pull the next header too
 	 * early, and it's the END record, we break the `recv_skip` logic.
 	 */
 	if (drc->drc_drr_begin->drr_payloadlen == 0) {
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		if (err != 0)
 			goto out;
 	}
 
 	/*
 	 * If we failed before this point we will clean up any new resume
 	 * state that was created. Now that we've gotten past the initial
 	 * checks we are ok to retain that resume state.
 	 */
 	drc->drc_should_save = B_TRUE;
 
 	/* Set up the queue and the state shared with the writer thread. */
 	(void) bqueue_init(&rwa->q, zfs_recv_queue_ff,
 	    MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize),
 	    offsetof(struct receive_record_arg, node));
 	cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
 	rwa->os = drc->drc_os;
 	rwa->byteswap = drc->drc_byteswap;
 	rwa->heal = drc->drc_heal;
 	rwa->tofs = drc->drc_tofs;
 	rwa->resumable = drc->drc_resumable;
 	rwa->raw = drc->drc_raw;
 	rwa->spill = drc->drc_spill;
 	rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0);
 	rwa->os->os_raw_receive = drc->drc_raw;
 	if (drc->drc_heal) {
 		rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL,
 		    ZIO_FLAG_GODFATHER);
 	}
 	list_create(&rwa->write_batch, sizeof (struct receive_record_arg),
 	    offsetof(struct receive_record_arg, node.bqn_node));
 
 	(void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
 	    TS_RUN, minclsyspri);
 	/*
 	 * We're reading rwa->err without locks, which is safe since we are the
 	 * only reader, and the worker thread is the only writer.  It's ok if we
 	 * miss a write for an iteration or two of the loop, since the writer
 	 * thread will keep freeing records we send it until we send it an eos
 	 * marker.
 	 *
 	 * We can leave this loop in 3 ways:  First, if rwa->err is
 	 * non-zero.  In that case, the writer thread will free the rrd we just
 	 * pushed.  Second, if we're interrupted; in that case, either it's the
 	 * first loop and drc->drc_rrd was never allocated, or it's later, and
 	 * drc->drc_rrd has been handed off to the writer thread who will free
 	 * it.  Finally, if receive_read_record fails or we're at the end of the
 	 * stream, then we free drc->drc_rrd and exit.
 	 */
 	while (rwa->err == 0) {
 		if (issig()) {
 			err = SET_ERROR(EINTR);
 			break;
 		}
 
 		ASSERT3P(drc->drc_rrd, ==, NULL);
 		drc->drc_rrd = drc->drc_next_rrd;
 		drc->drc_next_rrd = NULL;
 		/* Allocates and loads header into drc->drc_next_rrd */
 		err = receive_read_record(drc);
 
 		if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) {
 			kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd));
 			drc->drc_rrd = NULL;
 			break;
 		}
 
 		bqueue_enqueue(&rwa->q, drc->drc_rrd,
 		    sizeof (struct receive_record_arg) +
 		    drc->drc_rrd->payload_size);
 		drc->drc_rrd = NULL;
 	}
 
 	/*
 	 * Whatever ended the loop, queue an end-of-stream marker for the
 	 * writer thread and then wait until it reports rwa->done.
 	 */
 	ASSERT3P(drc->drc_rrd, ==, NULL);
 	drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP);
 	drc->drc_rrd->eos_marker = B_TRUE;
 	bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1);
 
 	mutex_enter(&rwa->mutex);
 	while (!rwa->done) {
 		/*
 		 * We need to use cv_wait_sig() so that any process that may
 		 * be sleeping here can still fork.
 		 */
 		(void) cv_wait_sig(&rwa->cv, &rwa->mutex);
 	}
 	mutex_exit(&rwa->mutex);
 
 	/*
 	 * If we are receiving a full stream as a clone, all object IDs which
 	 * are greater than the maximum ID referenced in the stream are
 	 * by definition unused and must be freed.
 	 */
 	if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
 		uint64_t obj = rwa->max_object + 1;
 		int free_err = 0;
 		int next_err = 0;
 
 		/* ENOENT from the free is tolerated; the walk continues. */
 		while (next_err == 0) {
 			free_err = dmu_free_long_object(rwa->os, obj);
 			if (free_err != 0 && free_err != ENOENT)
 				break;
 
 			next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
 		}
 
 		/*
 		 * Only report a failure from this pass if nothing failed
 		 * earlier.  ESRCH from dmu_object_next() is the expected way
 		 * for the walk to end and is not treated as an error.
 		 */
 		if (err == 0) {
 			if (free_err != 0 && free_err != ENOENT)
 				err = free_err;
 			else if (next_err != ESRCH)
 				err = next_err;
 		}
 	}
 
 	cv_destroy(&rwa->cv);
 	mutex_destroy(&rwa->mutex);
 	bqueue_destroy(&rwa->q);
 	list_destroy(&rwa->write_batch);
 	/* A reader-side error takes precedence over the writer's error. */
 	if (err == 0)
 		err = rwa->err;
 
 out:
 	/*
 	 * If we hit an error before we started the receive_writer_thread
 	 * we need to clean up the next_rrd we create by processing the
 	 * DRR_BEGIN record.
 	 */
 	if (drc->drc_next_rrd != NULL)
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 
 	/*
 	 * The objset will be invalidated by dmu_recv_end() when we do
 	 * dsl_dataset_clone_swap_sync_impl().
 	 */
 	drc->drc_os = NULL;
 
 	kmem_free(rwa, sizeof (*rwa));
 	nvlist_free(drc->drc_begin_nvl);
 
 	if (err != 0) {
 		/*
 		 * Clean up references. If receive is not resumable,
 		 * destroy what we created, so we don't leave it in
 		 * the inconsistent state.
 		 */
 		dmu_recv_cleanup_ds(drc);
 		nvlist_free(drc->drc_keynvl);
 	}
 
 	objlist_destroy(drc->drc_ignore_objlist);
 	drc->drc_ignore_objlist = NULL;
 	*voffp = drc->drc_voff;
 	return (err);
 }
 
 /*
  * Check callback of the sync task issued by dmu_recv_existing_end() and
  * dmu_recv_new_end(); arg is the dmu_recv_cookie_t.  Returns 0 when the
  * receive may be finalized, otherwise the first error encountered.
  *
  * Three cases are distinguished:
  *  - healing receive (drc_heal): nothing is checked;
  *  - receive into an existing filesystem (!drc_newfs): the forced snapshot
  *    destroys, the raw key, the clone swap, the new snapshot and the
  *    destruction of the temporary head (drc_ds) are all checked;
  *  - receive into a new filesystem: only the new snapshot is checked.
  */
 static int
 dmu_recv_end_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	int error;
 
 	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
 
 	if (drc->drc_heal) {
 		error = 0;
 	} else if (!drc->drc_newfs) {
 		dsl_dataset_t *origin_head;
 
 		/* Held until just after the snapshot check below. */
 		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
 		if (error != 0)
 			return (error);
 		if (drc->drc_force) {
 			/*
 			 * We will destroy any snapshots in tofs (i.e. before
 			 * origin_head) that are after the origin (which is
 			 * the snap before drc_ds, because drc_ds can not
 			 * have any snaps of its own).
 			 */
 			uint64_t obj;
 
 			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 			while (obj !=
 			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 				dsl_dataset_t *snap;
 				error = dsl_dataset_hold_obj(dp, obj, FTAG,
 				    &snap);
 				if (error != 0)
 					break;
 				/* Every snapshot walked must belong to tofs. */
 				if (snap->ds_dir != origin_head->ds_dir)
 					error = SET_ERROR(EINVAL);
 				if (error == 0)  {
 					error = dsl_destroy_snapshot_check_impl(
 					    snap, B_FALSE);
 				}
 				/* Read the next object before dropping the hold. */
 				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 				dsl_dataset_rele(snap, FTAG);
 				if (error != 0)
 					break;
 			}
 			if (error != 0) {
 				dsl_dataset_rele(origin_head, FTAG);
 				return (error);
 			}
 		}
 		if (drc->drc_keynvl != NULL) {
 			error = dsl_crypto_recv_raw_key_check(drc->drc_ds,
 			    drc->drc_keynvl, tx);
 			if (error != 0) {
 				dsl_dataset_rele(origin_head, FTAG);
 				return (error);
 			}
 		}
 
 		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
 		    origin_head, drc->drc_force, drc->drc_owner, tx);
 		if (error != 0) {
 			dsl_dataset_rele(origin_head, FTAG);
 			return (error);
 		}
 		error = dsl_dataset_snapshot_check_impl(origin_head,
 		    drc->drc_tosnap, tx, B_TRUE, 1,
 		    drc->drc_cred, drc->drc_proc);
 		dsl_dataset_rele(origin_head, FTAG);
 		if (error != 0)
 			return (error);
 
 		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
 	} else {
 		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
 		    drc->drc_tosnap, tx, B_TRUE, 1,
 		    drc->drc_cred, drc->drc_proc);
 	}
 	return (error);
 }
 
 /*
  * Sync callback of the sync task issued by dmu_recv_existing_end() and
  * dmu_recv_new_end(); arg is the dmu_recv_cookie_t.  It mirrors the three
  * cases of dmu_recv_end_check():
  *  - healing receive: only the stashed key nvlist is released;
  *  - existing filesystem: forced snapshot destroys, raw key update, clone
  *    swap of drc_ds with the head of drc_tofs, creation of the drc_tosnap
  *    snapshot, and destruction of drc_ds;
  *  - new filesystem: creation of the drc_tosnap snapshot on drc_ds and
  *    removal of any resume state.
  * In every case drc_ds is disowned and set to NULL before returning.
  */
 static void
 dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	/* Sampled up front; drc_ds may be destroyed further down. */
 	boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
 	uint64_t newsnapobj = 0;
 
 	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
 	    tx, "snap=%s", drc->drc_tosnap);
 	drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;
 
 	if (drc->drc_heal) {
 		if (drc->drc_keynvl != NULL) {
 			nvlist_free(drc->drc_keynvl);
 			drc->drc_keynvl = NULL;
 		}
 	} else if (!drc->drc_newfs) {
 		dsl_dataset_t *origin_head;
 
 		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
 		    &origin_head));
 
 		if (drc->drc_force) {
 			/*
 			 * Destroy any snapshots of drc_tofs (origin_head)
 			 * after the origin (the snap before drc_ds).
 			 */
 			uint64_t obj;
 
 			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 			while (obj !=
 			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 				dsl_dataset_t *snap;
 				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
 				    &snap));
 				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
 				/* Fetch the next object before destroying. */
 				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 				dsl_destroy_snapshot_sync_impl(snap,
 				    B_FALSE, tx);
 				dsl_dataset_rele(snap, FTAG);
 			}
 		}
 		if (drc->drc_keynvl != NULL) {
 			dsl_crypto_recv_raw_key_sync(drc->drc_ds,
 			    drc->drc_keynvl, tx);
 			nvlist_free(drc->drc_keynvl);
 			drc->drc_keynvl = NULL;
 		}
 
 		VERIFY3P(drc->drc_ds->ds_prev, ==,
 		    origin_head->ds_prev);
 
 		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
 		    origin_head, tx);
 		/*
 		 * The objset was evicted by dsl_dataset_clone_swap_sync_impl,
 		 * so drc_os is no longer valid.
 		 */
 		drc->drc_os = NULL;
 
 		dsl_dataset_snapshot_sync_impl(origin_head,
 		    drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
 		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
 		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
 		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 		dsl_dataset_phys(origin_head)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		/* The snapshot just taken is now the head's previous snap. */
 		newsnapobj =
 		    dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 
 		dsl_dataset_rele(origin_head, FTAG);
 		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
 
 		/*
 		 * NOTE(review): origin_head is dereferenced after the
 		 * dsl_dataset_rele() above; presumably it stays valid because
 		 * drc_owner still owns it -- confirm.
 		 */
 		if (drc->drc_owner != NULL)
 			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
 	} else {
 		dsl_dataset_t *ds = drc->drc_ds;
 
 		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
 		dsl_dataset_phys(ds->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
 		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
 		/*
 		 * The receive is complete, so drop every resume-state entry.
 		 * Results are ignored, so a missing entry is not an error.
 		 */
 		if (dsl_dataset_has_resume_receive_state(ds)) {
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_FROMGUID, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_OBJECT, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_OFFSET, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_BYTES, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_TOGUID, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_TONAME, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx);
 		}
 		newsnapobj =
 		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
 	}
 
 	/*
 	 * If this is a raw receive, the crypt_keydata nvlist will include
 	 * a to_ivset_guid for us to set on the new snapshot. This value
 	 * will override the value generated by the snapshot code. However,
 	 * this value may not be present, because older implementations of
 	 * the raw send code did not include this value, and we are still
 	 * allowed to receive them if the zfs_disable_ivset_guid_check
 	 * tunable is set, in which case we will leave the newly-generated
 	 * value.
 	 */
 	if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) {
 		dmu_object_zapify(dp->dp_meta_objset, newsnapobj,
 		    DMU_OT_DSL_DATASET, tx);
 		VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj,
 		    DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
 		    &drc->drc_ivset_guid, tx));
 	}
 
 	/*
 	 * Release the hold from dmu_recv_begin.  This must be done before
 	 * we return to open context, so that when we free the dataset's dnode
 	 * we can evict its bonus buffer. Since the dataset may be destroyed
 	 * at this point (and therefore won't have a valid pointer to the spa)
 	 * we release the key mapping manually here while we do have a valid
 	 * pointer, if it exists.
 	 */
 	if (!drc->drc_raw && encrypted) {
 		(void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
 		    drc->drc_ds->ds_object, drc->drc_ds);
 	}
 	dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
 	drc->drc_ds = NULL;
 }
 
 /*
  * Passed to dsl_sync_task() by dmu_recv_existing_end() and
  * dmu_recv_new_end(); presumably the estimate of blocks the sync task
  * modifies -- confirm against the dsl_sync_task() prototype.
  */
 static int dmu_recv_end_modified_blocks = 3;
 
 /*
  * Finish a receive into an already existing filesystem by running the
  * dmu_recv_end_check()/dmu_recv_end_sync() sync task against drc_tofs.
  * Returns the result of dsl_sync_task().
  */
 static int
 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
 	int error;
 
 #ifdef _KERNEL
 	{
 		/*
 		 * The temporary dataset is about to be destroyed, so its
 		 * origin has to be unmounted first if that is required.
 		 */
 		char dsname[ZFS_MAX_DATASET_NAME_LEN];
 
 		dsl_dataset_name(drc->drc_ds, dsname);
 		zfs_destroy_unmount_origin(dsname);
 	}
 #endif
 
 	error = dsl_sync_task(drc->drc_tofs, dmu_recv_end_check,
 	    dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks,
 	    ZFS_SPACE_CHECK_NORMAL);
 	return (error);
 }
 
 /*
  * Finish a receive into a newly created filesystem by running the
  * dmu_recv_end_check()/dmu_recv_end_sync() sync task against drc_tofs.
  * Returns the result of dsl_sync_task().
  */
 static int
 dmu_recv_new_end(dmu_recv_cookie_t *drc)
 {
 	int error;
 
 	error = dsl_sync_task(drc->drc_tofs, dmu_recv_end_check,
 	    dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks,
 	    ZFS_SPACE_CHECK_NORMAL);
 	return (error);
 }
 
 /*
  * Complete a receive that dmu_recv_stream() read successfully.  "owner" is
  * recorded in drc_owner before the end sync task runs.  On failure the
  * dataset is cleaned up and the stashed key nvlist is freed.  On success
  * of a non-healing receive, zvol minors are created for the new filesystem
  * (new receives only) and for the "<tofs>@<tosnap>" snapshot.
  * Returns 0 or the error from the sync task.
  */
 int
 dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 {
 	char *snapname;
 	int error;
 
 	drc->drc_owner = owner;
 
 	error = drc->drc_newfs ?
 	    dmu_recv_new_end(drc) : dmu_recv_existing_end(drc);
 
 	if (error != 0) {
 		dmu_recv_cleanup_ds(drc);
 		nvlist_free(drc->drc_keynvl);
 		return (error);
 	}
 
 	/* A healing receive gets no minors. */
 	if (drc->drc_heal)
 		return (error);
 
 	if (drc->drc_newfs)
 		zvol_create_minor(drc->drc_tofs);
 
 	snapname = kmem_asprintf("%s@%s", drc->drc_tofs, drc->drc_tosnap);
 	zvol_create_minor(snapname);
 	kmem_strfree(snapname);
 
 	return (error);
 }
 
 /*
  * Return TRUE if this objset is currently being received into.
  */
 boolean_t
 dmu_objset_is_receiving(objset_t *os)
 {
 	/* An objset without a dataset cannot be a receive target. */
 	if (os->os_dsl_dataset == NULL)
 		return (0);
 
 	/* A receive in progress owns the dataset with dmu_recv_tag. */
 	return (os->os_dsl_dataset->ds_owner == dmu_recv_tag);
 }
 
 /*
  * Receive tunables, all registered read-write (ZMOD_RW) under the
  * zfs_recv_ prefix.  zfs_recv_queue_length and zfs_recv_queue_ff size the
  * record queue set up in dmu_recv_stream().
  */
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW,
 	"Maximum receive queue length");
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, UINT, ZMOD_RW,
 	"Receive queue fill fraction");
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW,
 	"Maximum amount of writes to batch into one transaction");
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW,
 	"Ignore errors during corrective receive");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 50dbafa09172..63f57cf26301 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1,5837 +1,5838 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2022 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, 2023, 2024, 2025, Klara, Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2021, Datto, Inc.
  * Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
  */
 
 #include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_trim.h>
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_scan.h>
 #include <sys/metaslab_impl.h>
 #include <sys/time.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
 #include <sys/dsl_crypt.h>
 #include <cityhash.h>
 
 /*
  * ==========================================================================
  * I/O type descriptions
  * ==========================================================================
  */
 const char *const zio_type_name[ZIO_TYPES] = {
 	/*
 	 * Note: Linux kernel thread name length is limited
 	 * so these names will differ from upstream open zfs.
 	 */
 	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim"
 };
 
 int zio_dva_throttle_enabled = B_TRUE;
 static int zio_deadman_log_all = B_FALSE;
 
 /*
  * ==========================================================================
  * I/O kmem caches
  * ==========================================================================
  */
 static kmem_cache_t *zio_cache;
 static kmem_cache_t *zio_link_cache;
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #endif
 
 /* Mark IOs as "slow" if they take longer than 30 seconds */
 static uint_t zio_slow_io_ms = (30 * MILLISEC);
 
 #define	BP_SPANB(indblkshift, level) \
 	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
 #define	COMPARE_META_LEVEL	0x80000000ul
 /*
  * The following actions directly affect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
  * Care should be taken when changing these values as they directly impact
  * spa_sync() performance. Tuning these values may introduce subtle performance
  * pathologies and should only be done in the context of performance analysis.
  * These tunables will eventually be removed and replaced with #defines once
  * enough analysis has been done to determine optimal values.
  *
  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  * regular blocks are not deferred.
  *
  * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable
  * compression (including of metadata).  In practice, we don't have this
  * many sync passes, so this has no effect.
  *
  * The original intent was that disabling compression would help the sync
  * passes to converge. However, in practice disabling compression increases
  * the average number of sync passes, because when we turn compression off, a
  * lot of blocks' sizes will change and thus we have to re-allocate (not
  * overwrite) them. It also increases the number of 128KB allocations (e.g.
  * for indirect blocks and spacemaps) because these will not be compressed.
  * The 128K allocations are especially detrimental to performance on highly
  * fragmented systems, which may have very few free segments of this size,
  * and may need to load new metaslabs to satisfy 128K allocations.
  */
 
 /* defer frees starting in this pass */
 uint_t zfs_sync_pass_deferred_free = 2;
 
 /* don't compress starting in this pass */
 static uint_t zfs_sync_pass_dont_compress = 8;
 
 /* rewrite new bps starting in this pass */
 static uint_t zfs_sync_pass_rewrite = 2;
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
  */
 #define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 
 /*
  * Enable smaller cores by excluding metadata
  * allocations as well.
  */
 int zio_exclude_metadata = 0;
 static int zio_requeue_io_start_cut_in_line = 1;
 
 #ifdef ZFS_DEBUG
 static const int zio_buf_debug_limit = 16384;
 #else
 static const int zio_buf_debug_limit = 0;
 #endif
 
 /*
  * Layout of the named kstat that zio_init() publishes as "zio_stats".
  * One entry per counter in ziostat_sums below.
  */
 typedef struct zio_stats {
 	kstat_named_t ziostat_total_allocations;
 	kstat_named_t ziostat_alloc_class_fallbacks;
 	kstat_named_t ziostat_gang_writes;
 	kstat_named_t ziostat_gang_multilevel;
 } zio_stats_t;
 
 /* kstat data; the values are filled in by zio_kstats_update(). */
 static zio_stats_t zio_stats = {
 	{ "total_allocations",	KSTAT_DATA_UINT64 },
 	{ "alloc_class_fallbacks",	KSTAT_DATA_UINT64 },
 	{ "gang_writes",	KSTAT_DATA_UINT64 },
 	{ "gang_multilevel",	KSTAT_DATA_UINT64 },
 };
 
 /*
  * wmsum counters behind the kstat; set up in zio_init(), torn down in
  * zio_fini(), and read by zio_kstats_update().
  */
 struct {
 	wmsum_t ziostat_total_allocations;
 	wmsum_t ziostat_alloc_class_fallbacks;
 	wmsum_t ziostat_gang_writes;
 	wmsum_t ziostat_gang_multilevel;
 } ziostat_sums;
 
 /*
  * Add one to the named ziostat_sums counter.
  * NOTE(review): the expansion already ends in ';', so "ZIOSTAT_BUMP(x);"
  * yields an extra empty statement -- avoid in an unbraced if/else.
  */
 #define	ZIOSTAT_BUMP(stat)	wmsum_add(&ziostat_sums.stat, 1);
 
 /* Handle of the installed kstat, or NULL if kstat_create() failed. */
 static kstat_t *zio_ksp;
 
 static inline void __zio_execute(zio_t *zio);
 
 static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
 
 /*
  * Update callback for the "zio_stats" kstat: copies the current value of
  * every ziostat_sums counter into the kstat data.  Writes are refused with
  * EACCES; otherwise returns 0.
  */
 static int
 zio_kstats_update(kstat_t *ksp, int rw)
 {
 	zio_stats_t *stats = ksp->ks_data;
 
 	/* The counters cannot be set through the kstat. */
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 
 	stats->ziostat_total_allocations.value.ui64 = wmsum_value(
 	    &ziostat_sums.ziostat_total_allocations);
 	stats->ziostat_alloc_class_fallbacks.value.ui64 = wmsum_value(
 	    &ziostat_sums.ziostat_alloc_class_fallbacks);
 	stats->ziostat_gang_writes.value.ui64 = wmsum_value(
 	    &ziostat_sums.ziostat_gang_writes);
 	stats->ziostat_gang_multilevel.value.ui64 = wmsum_value(
 	    &ziostat_sums.ziostat_gang_multilevel);
 
 	return (0);
 }
 
 /*
  * Set up the zio subsystem's global state: the zio_t and zio_link_t kmem
  * caches, the "zio_stats" kstat and its wmsum counters, and the per-size
  * buffer caches in zio_buf_cache[] / zio_data_buf_cache[].  Finishes by
  * calling zio_inject_init() and lz4_init().
  */
 void
 zio_init(void)
 {
 	size_t c;
 
 	zio_cache = kmem_cache_create("zio_cache",
 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	wmsum_init(&ziostat_sums.ziostat_total_allocations, 0);
 	wmsum_init(&ziostat_sums.ziostat_alloc_class_fallbacks, 0);
 	wmsum_init(&ziostat_sums.ziostat_gang_writes, 0);
 	wmsum_init(&ziostat_sums.ziostat_gang_multilevel, 0);
 	zio_ksp = kstat_create("zfs", 0, "zio_stats",
 	    "misc", KSTAT_TYPE_NAMED, sizeof (zio_stats) /
 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	/* A failed kstat_create() is tolerated; the kstat is just absent. */
 	if (zio_ksp != NULL) {
 		zio_ksp->ks_data = &zio_stats;
 		zio_ksp->ks_update = zio_kstats_update;
 		kstat_install(zio_ksp);
 	}
 
 	/* Slot c serves buffers of up to (c + 1) << SPA_MINBLOCKSHIFT bytes. */
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 		size_t align, cflags, data_cflags;
 		char name[32];
 
 		/*
 		 * Create cache for each half-power of 2 size, starting from
 		 * SPA_MINBLOCKSIZE.  It should give us memory space efficiency
 		 * of ~7/8, sufficient for transient allocations mostly using
 		 * these caches.
 		 */
 		size_t p2 = size;
 		/* Clear low set bits until only the highest one is left. */
 		while (!ISP2(p2))
 			p2 &= p2 - 1;
 		if (!IS_P2ALIGNED(size, p2 / 2))
 			continue;
 
 #ifndef _KERNEL
 		/*
 		 * If we are using watchpoints, put each buffer on its own page,
 		 * to eliminate the performance overhead of trapping to the
 		 * kernel when modifying a non-watched buffer that shares the
 		 * page with a watched buffer.
 		 */
 		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 			continue;
 #endif
 
 		if (IS_P2ALIGNED(size, PAGESIZE))
 			align = PAGESIZE;
 		else
 			align = 1 << (highbit64(size ^ (size - 1)) - 1);
 
 		cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
 		    KMC_NODEBUG : 0;
 		data_cflags = KMC_NODEBUG;
 		if (abd_size_alloc_linear(size)) {
 			cflags |= KMC_RECLAIMABLE;
 			data_cflags |= KMC_RECLAIMABLE;
 		}
 		if (cflags == data_cflags) {
 			/*
 			 * Resulting kmem caches would be identical.
 			 * Save memory by creating only one.
 			 */
 			(void) snprintf(name, sizeof (name),
 			    "zio_buf_comb_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size, align,
 			    NULL, NULL, NULL, NULL, NULL, cflags);
 			zio_data_buf_cache[c] = zio_buf_cache[c];
 			continue;
 		}
 		(void) snprintf(name, sizeof (name), "zio_buf_%lu",
 		    (ulong_t)size);
 		zio_buf_cache[c] = kmem_cache_create(name, size, align,
 		    NULL, NULL, NULL, NULL, NULL, cflags);
 
 		(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
 		    (ulong_t)size);
 		zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
 		    NULL, NULL, NULL, NULL, NULL, data_cflags);
 	}
 
 	/*
 	 * Walk back down from the largest slot: every slot that was skipped
 	 * above inherits the cache of the next larger slot, so each index
 	 * ends up with a usable cache.
 	 */
 	while (--c != 0) {
 		ASSERT(zio_buf_cache[c] != NULL);
 		if (zio_buf_cache[c - 1] == NULL)
 			zio_buf_cache[c - 1] = zio_buf_cache[c];
 
 		ASSERT(zio_data_buf_cache[c] != NULL);
 		if (zio_data_buf_cache[c - 1] == NULL)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
 	zio_inject_init();
 
 	lz4_init();
 }
 
 /*
  * Tear down what zio_init() set up: destroy every distinct buffer cache
  * exactly once, delete the kstat, finalize the wmsum counters, destroy the
  * zio and zio_link caches, and call zio_inject_fini() and lz4_fini().
  */
 void
 zio_fini(void)
 {
 	size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
 
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	/* Report any slot whose alloc and free counts disagree. */
 	for (size_t i = 0; i < n; i++) {
 		if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i])
 			(void) printf("zio_fini: [%d] %llu != %llu\n",
 			    (int)((i + 1) << SPA_MINBLOCKSHIFT),
 			    (long long unsigned)zio_buf_cache_allocs[i],
 			    (long long unsigned)zio_buf_cache_frees[i]);
 	}
 #endif
 
 	/*
 	 * The same kmem cache can show up multiple times in both zio_buf_cache
 	 * and zio_data_buf_cache. Do a wasteful but trivially correct scan to
 	 * sort it out.
 	 */
 	for (size_t i = 0; i < n; i++) {
 		kmem_cache_t *cache = zio_buf_cache[i];
 		if (cache == NULL)
 			continue;
 		/* Clear every later alias in both arrays before destroying. */
 		for (size_t j = i; j < n; j++) {
 			if (cache == zio_buf_cache[j])
 				zio_buf_cache[j] = NULL;
 			if (cache == zio_data_buf_cache[j])
 				zio_data_buf_cache[j] = NULL;
 		}
 		kmem_cache_destroy(cache);
 	}
 
 	/* What is left in zio_data_buf_cache[] was never shared above. */
 	for (size_t i = 0; i < n; i++) {
 		kmem_cache_t *cache = zio_data_buf_cache[i];
 		if (cache == NULL)
 			continue;
 		for (size_t j = i; j < n; j++) {
 			if (cache == zio_data_buf_cache[j])
 				zio_data_buf_cache[j] = NULL;
 		}
 		kmem_cache_destroy(cache);
 	}
 
 	/* Both arrays must be completely cleared by now. */
 	for (size_t i = 0; i < n; i++) {
 		VERIFY3P(zio_buf_cache[i], ==, NULL);
 		VERIFY3P(zio_data_buf_cache[i], ==, NULL);
 	}
 
 	if (zio_ksp != NULL) {
 		kstat_delete(zio_ksp);
 		zio_ksp = NULL;
 	}
 
 	wmsum_fini(&ziostat_sums.ziostat_total_allocations);
 	wmsum_fini(&ziostat_sums.ziostat_alloc_class_fallbacks);
 	wmsum_fini(&ziostat_sums.ziostat_gang_writes);
 	wmsum_fini(&ziostat_sums.ziostat_gang_multilevel);
 
 	kmem_cache_destroy(zio_link_cache);
 	kmem_cache_destroy(zio_cache);
 
 	zio_inject_fini();
 
 	lz4_fini();
 }
 
 /*
  * ==========================================================================
  * Allocate and free I/O buffers
  * ==========================================================================
  */
 
 #if defined(ZFS_DEBUG) && defined(_KERNEL)
 #define	ZFS_ZIO_BUF_CANARY	1
 #endif
 
 #ifdef ZFS_ZIO_BUF_CANARY
 static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
 
 /*
  * Use empty space after the buffer to detect overflows.
  *
  * Since zio_init() creates kmem caches only for certain set of buffer sizes,
  * allocations of different sizes may have some unused space after the data.
  * Filling part of that space with a known pattern on allocation and checking
  * it on free should allow us to detect some buffer overflows.
  */
 /*
  * Fill the slack between the requested size (rounded up to a whole
  * ulong_t) and the end of the backing allocation with zio_buf_canary.
  * "cache" is the cache array the buffer came from and "c" its slot.
  */
 static void
 zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
 	const size_t word = sizeof (ulong_t);
 	size_t slots = c + 1;
 
 	/*
 	 * If the next slot is backed by the same cache, the allocation is
 	 * treated as one minimum block larger than this slot's nominal size.
 	 */
 	if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
 	    cache[c] == cache[c + 1])
 		slots = c + 2;
 
 	for (size_t pos = P2ROUNDUP(size, word);
 	    pos < (slots << SPA_MINBLOCKSHIFT); pos += word)
 		p[pos / word] = zio_buf_canary;
 }
 
 /*
  * Verify the pattern written by zio_buf_put_canary() for a buffer of the
  * given size from slot "c" of "cache".  Panics on the first word that no
  * longer holds zio_buf_canary.
  */
 static void
 zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
 	const size_t word = sizeof (ulong_t);
 	size_t slots = c + 1;
 
 	/* Same extent computation as when the canary was written. */
 	if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
 	    cache[c] == cache[c + 1])
 		slots = c + 2;
 
 	for (size_t pos = P2ROUNDUP(size, word);
 	    pos < (slots << SPA_MINBLOCKSHIFT); pos += word) {
 		ulong_t seen = p[pos / word];
 
 		if (unlikely(seen != zio_buf_canary)) {
 			PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx",
 			    p, size, pos, seen, zio_buf_canary);
 		}
 	}
 }
 #endif
 
 /*
  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
  * excess / transient data in-core during a crashdump.
  */
 void *
 zio_buf_alloc(size_t size)
 {
 	void *buf;
 	size_t c;
 
 	/* Slot c covers sizes up to (c + 1) << SPA_MINBLOCKSHIFT. */
 	c = (size - 1) >> SPA_MINBLOCKSHIFT;
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	/* Debug userland builds count allocations per slot. */
 	atomic_add_64(&zio_buf_cache_allocs[c], 1);
 #endif
 
 	buf = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
 
 #ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_put_canary(buf, size, zio_buf_cache, c);
 #endif
 
 	return (buf);
 }
 
 /*
  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
  * crashdump if the kernel panics.  This exists so that we will limit the amount
  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
  * of kernel heap dumped to disk when the kernel panics)
  */
 void *
 zio_data_buf_alloc(size_t size)
 {
 	void *buf;
 	size_t c;
 
 	/* Slot c covers sizes up to (c + 1) << SPA_MINBLOCKSHIFT. */
 	c = (size - 1) >> SPA_MINBLOCKSHIFT;
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	buf = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
 
 #ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_put_canary(buf, size, zio_data_buf_cache, c);
 #endif
 
 	return (buf);
 }
 
 /*
  * Return a buffer obtained from zio_buf_alloc() to its cache.  "size" is
  * used to find the cache slot again, the same way zio_buf_alloc() does.
  */
 void
 zio_buf_free(void *buf, size_t size)
 {
 	size_t c;
 
 	c = (size - 1) >> SPA_MINBLOCKSHIFT;
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	/* Debug userland builds count frees per slot. */
 	atomic_add_64(&zio_buf_cache_frees[c], 1);
 #endif
 
 #ifdef ZFS_ZIO_BUF_CANARY
 	/* Detect overflows before the memory goes back to the cache. */
 	zio_buf_check_canary(buf, size, zio_buf_cache, c);
 #endif
 
 	kmem_cache_free(zio_buf_cache[c], buf);
 }
 
 /*
  * Return a buffer obtained from zio_data_buf_alloc() to its cache.  "size"
  * is used to find the cache slot again, the same way
  * zio_data_buf_alloc() does.
  */
 void
 zio_data_buf_free(void *buf, size_t size)
 {
 	size_t c;
 
 	c = (size - 1) >> SPA_MINBLOCKSHIFT;
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 #ifdef ZFS_ZIO_BUF_CANARY
 	/* Detect overflows before the memory goes back to the cache. */
 	zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
 #endif
 
 	kmem_cache_free(zio_data_buf_cache[c], buf);
 }
 
 static void
 zio_abd_free(void *abd, size_t size)
 {
 	(void) size;
 	abd_free((abd_t *)abd);
 }
 
 /*
  * ==========================================================================
  * Push and pop I/O transform buffers
  * ==========================================================================
  */
 /*
  * Push a transform record onto the zio's transform stack and make 'data' /
  * 'size' the zio's current buffer.  The record remembers the buffer and size
  * that were current before the push, so zio_pop_transforms() can restore
  * them.
  */
 void
 zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
     zio_transform_func_t *transform)
 {
 	zio_transform_t *frame = kmem_alloc(sizeof (*frame), KM_SLEEP);
 
 	/* Snapshot the state being replaced. */
 	frame->zt_orig_abd = zio->io_abd;
 	frame->zt_orig_size = zio->io_size;
 	frame->zt_bufsize = bufsize;
 	frame->zt_transform = transform;
 
 	/* Link the new record in at the top of the stack. */
 	frame->zt_next = zio->io_transform_stack;
 	zio->io_transform_stack = frame;
 
 	/* Switch the zio over to the new buffer. */
 	zio->io_abd = data;
 	zio->io_size = size;
 }
 
 /*
  * Unwind the zio's whole transform stack, newest record first.  For each
  * record: run its callback (if set), free the current abd when the record
  * was pushed with a non-zero bufsize, restore the saved abd and size, and
  * free the record.
  */
 void
 zio_pop_transforms(zio_t *zio)
 {
 	for (;;) {
 		zio_transform_t *top = zio->io_transform_stack;
 
 		if (top == NULL)
 			break;
 
 		if (top->zt_transform != NULL) {
 			top->zt_transform(zio,
 			    top->zt_orig_abd, top->zt_orig_size);
 		}
 
 		if (top->zt_bufsize != 0)
 			abd_free(zio->io_abd);
 
 		zio->io_abd = top->zt_orig_abd;
 		zio->io_size = top->zt_orig_size;
 		zio->io_transform_stack = top->zt_next;
 
 		kmem_free(top, sizeof (zio_transform_t));
 	}
 }
 
 /*
  * ==========================================================================
  * I/O transform callbacks for subblocks, decompression, and decryption
  * ==========================================================================
  */
 /*
  * Transform callback: for reads, copy the first 'size' bytes of the zio's
  * current (larger) buffer into 'data'.  Other I/O types copy nothing.
  */
 static void
 zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
 {
 	ASSERT(zio->io_size > size);
 
 	if (zio->io_type != ZIO_TYPE_READ)
 		return;
 
 	abd_copy(data, zio->io_abd, size);
 }
 
 /*
  * Transform callback: decompress the zio's current buffer into 'data'.
  * Skipped entirely when the zio already carries an error.  Any failure
  * (real or injected) is reported on the zio as EIO.
  */
 static void
 zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
 {
 	int err;
 
 	if (zio->io_error != 0)
 		return;
 
 	err = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 	    zio->io_abd, data, zio->io_size, size,
 	    &zio->io_prop.zp_complevel);
 
 	/* Fault injection is only consulted after a successful decompress. */
 	if (zio_injection_enabled && err == 0)
 		err = zio_handle_fault_injection(zio, EINVAL);
 
 	if (err != 0)
 		zio->io_error = SET_ERROR(EIO);
 }
 
 /*
  * Transform callback for blocks whose bp uses crypt (asserted below).  It
  * fills 'data' from the zio's current buffer and does nothing when the zio
  * already carries an error.  Three cases are handled in order:
  *
  *   1. bp has an indirect MAC checksum: verify the checksum and copy the
  *      buffer to 'data' unchanged.
  *   2. bp is authenticated only: verify the MAC and copy the buffer to
  *      'data' unchanged.
  *   3. otherwise: decode salt, IV and MAC and call spa_do_crypt_abd().
  *
  * On failure io_error is set: ECKSUM is reported as EIO (plus an error log
  * entry and an ereport unless the zio is speculative); any other error code
  * is stored as-is.
  */
 static void
 zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
 {
 	int ret;
 	void *tmp;
 	blkptr_t *bp = zio->io_bp;
 	spa_t *spa = zio->io_spa;
 	uint64_t dsobj = zio->io_bookmark.zb_objset;
 	uint64_t lsize = BP_GET_LSIZE(bp);
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	ASSERT(BP_USES_CRYPT(bp));
 	ASSERT3U(size, !=, 0);
 
 	if (zio->io_error != 0)
 		return;
 
 	/*
 	 * Verify the cksum of MACs stored in an indirect bp. It will always
 	 * be possible to verify this since it does not require an encryption
 	 * key.
 	 */
 	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 			/*
 			 * We haven't decompressed the data yet, but
 			 * zio_crypt_do_indirect_mac_checksum() requires
 			 * decompressed data to be able to parse out the MACs
 			 * from the indirect block. We decompress it now and
 			 * throw away the result after we are finished.
 			 */
 			abd_t *abd = abd_alloc_linear(lsize, B_TRUE);
 			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
 			    zio->io_abd, abd, zio->io_size, lsize,
 			    &zio->io_prop.zp_complevel);
 			if (ret != 0) {
 				/* Decompression failures are reported as EIO. */
 				abd_free(abd);
 				ret = SET_ERROR(EIO);
 				goto error;
 			}
 			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
 			    abd, lsize, BP_SHOULD_BYTESWAP(bp), mac);
 			abd_free(abd);
 		} else {
 			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
 			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
 		}
 		/* The copy happens whether or not the checksum verified. */
 		abd_copy(data, zio->io_abd, size);
 
 		/* Injection is skipped for dnode blocks and after a failure. */
 		if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
 			ret = zio_handle_decrypt_injection(spa,
 			    &zio->io_bookmark, ot, ECKSUM);
 		}
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	/*
 	 * If this is an authenticated block, just check the MAC. It would be
 	 * nice to separate this out into its own flag, but when this was done,
 	 * we had run out of bits in what is now zio_flag_t. Future cleanup
 	 * could make this a flag bit.
 	 */
 	if (BP_IS_AUTHENTICATED(bp)) {
 		if (ot == DMU_OT_OBJSET) {
 			/* Objset blocks go through the objset MAC routine. */
 			ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
 			    dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
 		} else {
 			zio_crypt_decode_mac_bp(bp, mac);
 			ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
 			    zio->io_abd, size, mac);
 			if (zio_injection_enabled && ret == 0) {
 				ret = zio_handle_decrypt_injection(spa,
 				    &zio->io_bookmark, ot, ECKSUM);
 			}
 		}
 		/* The copy happens whether or not the MAC verified. */
 		abd_copy(data, zio->io_abd, size);
 
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	zio_crypt_decode_params_bp(bp, salt, iv);
 
 	/* Intent log blocks carry their MAC in the zil_chain_t, not the bp. */
 	if (ot == DMU_OT_INTENT_LOG) {
 		tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
 		zio_crypt_decode_mac_zil(tmp, mac);
 		abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
 	} else {
 		zio_crypt_decode_mac_bp(bp, mac);
 	}
 
 	ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
 	    BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
 	    zio->io_abd, &no_crypt);
 	/* When no_crypt is reported, copy the buffer through unchanged. */
 	if (no_crypt)
 		abd_copy(data, zio->io_abd, size);
 
 	if (ret != 0)
 		goto error;
 
 	return;
 
 error:
 	/* assert that the key was found unless this was speculative */
 	ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
 
 	/*
 	 * If there was a decryption / authentication error return EIO as
 	 * the io_error. If this was not a speculative zio, create an ereport.
 	 */
 	if (ret == ECKSUM) {
 		zio->io_error = SET_ERROR(EIO);
 		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 			spa_log_error(spa, &zio->io_bookmark,
 			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 			    spa, NULL, &zio->io_bookmark, zio, 0);
 		}
 	} else {
 		zio->io_error = ret;
 	}
 }
 
 /*
  * ==========================================================================
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
 /*
  * Iterate over the parents of cio.  *zl is the cursor: pass it in as NULL to
  * start at the head of the parent list; it is advanced on every call.
  * Returns the next parent zio, or NULL (with *zl == NULL) at the end of the
  * list.
  */
 zio_t *
 zio_walk_parents(zio_t *cio, zio_link_t **zl)
 {
 	list_t *parents = &cio->io_parent_list;
 	zio_link_t *link;
 
 	if (*zl == NULL)
 		link = list_head(parents);
 	else
 		link = list_next(parents, *zl);
 
 	*zl = link;
 	if (link == NULL)
 		return (NULL);
 
 	ASSERT(link->zl_child == cio);
 	return (link->zl_parent);
 }
 
 /*
  * Iterate over the children of pio; the caller must hold pio->io_lock
  * (asserted).  *zl is the cursor: pass it in as NULL to start at the head of
  * the child list; it is advanced on every call.  Returns the next child
  * zio, or NULL (with *zl == NULL) at the end of the list.
  */
 zio_t *
 zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
 	list_t *children = &pio->io_child_list;
 	zio_link_t *link;
 
 	ASSERT(MUTEX_HELD(&pio->io_lock));
 
 	if (*zl == NULL)
 		link = list_head(children);
 	else
 		link = list_next(children, *zl);
 
 	*zl = link;
 	if (link == NULL)
 		return (NULL);
 
 	ASSERT(link->zl_parent == pio);
 	return (link->zl_child);
 }
 
 /*
  * Return the first parent of cio (NULL if it has none), verifying that no
  * second parent exists.
  */
 zio_t *
 zio_unique_parent(zio_t *cio)
 {
 	zio_link_t *cursor = NULL;
 	zio_t *parent;
 	zio_t *second;
 
 	parent = zio_walk_parents(cio, &cursor);
 	second = zio_walk_parents(cio, &cursor);
 
 	VERIFY3P(second, ==, NULL);
 	return (parent);
 }
 
 /*
  * Make cio a child of pio.  A zio_link_t is allocated to tie the two
  * together.  With both io_locks held (parent's taken first, then child's),
  * the parent's outstanding-child counters for cio's child type are bumped
  * for every wait type cio has not yet reached, and the link is inserted at
  * the head of both the parent's child list and the child's parent list.
  */
 void
 zio_add_child(zio_t *pio, zio_t *cio)
 {
 	/*
 	 * Logical I/Os can have logical, gang, or vdev children.
 	 * Gang I/Os can have gang or vdev children.
 	 * Vdev I/Os can only have vdev children.
 	 * The following ASSERT captures all of these constraints.
 	 */
 	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
 	/* Parent should not have READY stage if child doesn't have it. */
 	IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
 	    (cio->io_child_type != ZIO_CHILD_VDEV),
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0);
 
 	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
 	mutex_enter(&pio->io_lock);
 	mutex_enter(&cio->io_lock);
 
 	/* Children may not be added once the parent is marked done. */
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	/* Add one per wait type whose io_state entry on the child is zero. */
 	uint64_t *countp = pio->io_children[cio->io_child_type];
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		countp[w] += !cio->io_state[w];
 
 	list_insert_head(&pio->io_child_list, zl);
 	list_insert_head(&cio->io_parent_list, zl);
 
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Make cio a child of pio when cio has no parents yet (asserted).  Unlike
  * zio_add_child(), the child's parent list is updated without taking
  * cio->io_lock; only pio->io_lock is held while the parent's counters and
  * child list are updated.
  * NOTE(review): presumably safe because the child is not yet visible to
  * other threads -- confirm against callers.
  */
 void
 zio_add_child_first(zio_t *pio, zio_t *cio)
 {
 	/*
 	 * Logical I/Os can have logical, gang, or vdev children.
 	 * Gang I/Os can have gang or vdev children.
 	 * Vdev I/Os can only have vdev children.
 	 * The following ASSERT captures all of these constraints.
 	 */
 	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
 	/* Parent should not have READY stage if child doesn't have it. */
 	IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
 	    (cio->io_child_type != ZIO_CHILD_VDEV),
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0);
 
 	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
 	/* Child side first, done without the child's lock. */
 	ASSERT(list_is_empty(&cio->io_parent_list));
 	list_insert_head(&cio->io_parent_list, zl);
 
 	mutex_enter(&pio->io_lock);
 
 	/* Children may not be added once the parent is marked done. */
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	/* Add one per wait type whose io_state entry on the child is zero. */
 	uint64_t *countp = pio->io_children[cio->io_child_type];
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		countp[w] += !cio->io_state[w];
 
 	list_insert_head(&pio->io_child_list, zl);
 
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Unlink cio from pio.  zl must be the link joining exactly this pair
  * (asserted).  The link is removed from both lists with both io_locks held
  * (parent's taken first, then child's) and is freed after the locks are
  * dropped.  The parent's child counters are not touched here.
  */
 static void
 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 {
 	ASSERT(zl->zl_parent == pio);
 	ASSERT(zl->zl_child == cio);
 
 	mutex_enter(&pio->io_lock);
 	mutex_enter(&cio->io_lock);
 
 	list_remove(&pio->io_child_list, zl);
 	list_remove(&cio->io_parent_list, zl);
 
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 	kmem_cache_free(zio_link_cache, zl);
 }
 
 /*
  * Check whether zio still has outstanding children, of any child type
  * selected in childbits, for the given wait type.  On the first non-zero
  * counter found, io_stage is shifted back by one stage, io_stall is pointed
  * at that counter, and B_TRUE is returned.  Returns B_FALSE, with the zio
  * untouched, when no selected counter is non-zero.
  * NOTE(review): the stage is presumably stepped back so the current stage
  * runs again once the zio is resumed -- confirm against zio_execute().
  */
 static boolean_t
 zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
 {
 	boolean_t waiting = B_FALSE;
 
 	mutex_enter(&zio->io_lock);
 	/* A zio can only be stalled on one counter at a time. */
 	ASSERT(zio->io_stall == NULL);
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
 		if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
 			continue;
 
 		uint64_t *countp = &zio->io_children[c][wait];
 		if (*countp != 0) {
 			zio->io_stage >>= 1;
 			ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
 			/* zio_notify_parent() compares io_stall to this counter. */
 			zio->io_stall = countp;
 			waiting = B_TRUE;
 			break;
 		}
 	}
 	mutex_exit(&zio->io_lock);
 	return (waiting);
 }
 
 /*
  * Tell parent pio that child zio has reached the given wait type.  Under
  * pio->io_lock this merges the child's error (unless DONT_PROPAGATE is
  * set), its reexecute bits and its Direct I/O checksum-error flag into the
  * parent, then decrements the parent's counter for this child type and wait
  * type.  If that counter reaches zero and is the one the parent is stalled
  * on, the parent is resumed: either handed back to the caller through
  * *next_to_executep or dispatched to a taskq.
  */
 __attribute__((always_inline))
 static inline void
 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
     zio_t **next_to_executep)
 {
 	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 	int *errorp = &pio->io_child_error[zio->io_child_type];
 
 	mutex_enter(&pio->io_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		*errorp = zio_worst_error(*errorp, zio->io_error);
 	pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
 
 	/*
 	 * Propagate the Direct I/O checksum verify failure to the parent.
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 		pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 
 	(*countp)--;
 
 	if (*countp == 0 && pio->io_stall == countp) {
 		/* Pick the taskq type from the parent's stage, while locked. */
 		zio_taskq_type_t type =
 		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
 		    ZIO_TASKQ_INTERRUPT;
 		pio->io_stall = NULL;
 		mutex_exit(&pio->io_lock);
 
 		/*
 		 * If we can tell the caller to execute this parent next, do
 		 * so. We do this if the parent's zio type matches the child's
 		 * type, or if it's a zio_null() with no done callback, and so
 		 * has no actual work to do. Otherwise dispatch the parent zio
 		 * in its own taskq.
 		 *
 		 * Having the caller execute the parent when possible reduces
 		 * locking on the zio taskq's, reduces context switch
 		 * overhead, and has no recursion penalty.  Note that one
 		 * read from disk typically causes at least 3 zio's: a
 		 * zio_null(), the logical zio_read(), and then a physical
 		 * zio.  When the physical ZIO completes, we are able to call
 		 * zio_done() on all 3 of these zio's from one invocation of
 		 * zio_execute() by returning the parent back to
 		 * zio_execute().  Since the parent isn't executed until this
 		 * thread returns back to zio_execute(), the caller should do
 		 * so promptly.
 		 *
 		 * In other cases, dispatching the parent prevents
 		 * overflowing the stack when we have deeply nested
 		 * parent-child relationships, as we do with the "mega zio"
 		 * of writes for spa_sync(), and the chain of ZIL blocks.
 		 */
 		if (next_to_executep != NULL && *next_to_executep == NULL &&
 		    (pio->io_type == zio->io_type ||
 		    (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) {
 			*next_to_executep = pio;
 		} else {
 			zio_taskq_dispatch(pio, type, B_FALSE);
 		}
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
 }
 
 /*
  * Adopt the error recorded for child type c as the zio's own error, but
  * only when the zio does not already have an error of its own.
  */
 static void
 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 {
 	if (zio->io_error != 0)
 		return;
 
 	if (zio->io_child_error[c] == 0)
 		return;
 
 	zio->io_error = zio->io_child_error[c];
 }
 
 int
 zio_bookmark_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = x1;
 	const zio_t *z2 = x2;
 
 	if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
 		return (-1);
 	if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
 		return (1);
 
 	if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
 		return (-1);
 	if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
 		return (1);
 
 	if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
 		return (-1);
 	if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
 		return (1);
 
 	if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
 		return (-1);
 	if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
 		return (1);
 
 	if (z1 < z2)
 		return (-1);
 	if (z1 > z2)
 		return (1);
 
 	return (0);
 }
 
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
  * ==========================================================================
  */
 /*
  * Common constructor behind the zio_*() creation functions.  Allocates and
  * zeroes a zio, initializes its lock, cv and lists, derives the child type
  * from vd and flags, records the block pointer (copied or referenced, see
  * below), fills in the I/O parameters, and, when pio is non-NULL, inherits
  * a few fields from it and links the new zio in as its child.  The zio is
  * returned without being executed.
  */
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
     void *private, zio_type_t type, zio_priority_t priority,
     zio_flag_t flags, vdev_t *vd, uint64_t offset,
     const zbookmark_phys_t *zb, enum zio_stage stage,
     enum zio_stage pipeline)
 {
 	zio_t *zio;
 
 	/* Only TRIM zios may exceed SPA_MAXBLOCKSIZE. */
 	IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 
 	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 	/* Only vdev zios may start at a stage other than OPEN. */
 	ASSERT(vd || stage == ZIO_STAGE_OPEN);
 
 	IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
 
 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 	memset(zio, 0, sizeof (zio_t));
 
 	mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 
 	list_create(&zio->io_parent_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_parent_node));
 	list_create(&zio->io_child_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_child_node));
 	metaslab_trace_init(&zio->io_alloc_list);
 
 	/* Child type: a vdev wins, then the GANG and DDT flags, else LOGICAL. */
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
 		zio->io_child_type = ZIO_CHILD_GANG;
 	else if (flags & ZIO_FLAG_DDT_CHILD)
 		zio->io_child_type = ZIO_CHILD_DDT;
 	else
 		zio->io_child_type = ZIO_CHILD_LOGICAL;
 
 	if (bp != NULL) {
 		/*
 		 * Non-writes and DDT children work on a private copy of the
 		 * bp; other writes reference the caller's bp directly.
 		 */
 		if (type != ZIO_TYPE_WRITE ||
 		    zio->io_child_type == ZIO_CHILD_DDT) {
 			zio->io_bp_copy = *bp;
 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
 		} else {
 			zio->io_bp = (blkptr_t *)bp;
 		}
 		zio->io_bp_orig = *bp;
 		/* A logical zio with a bp is its own io_logical. */
 		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio->io_logical = zio;
 		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 			pipeline |= ZIO_GANG_STAGES;
 	}
 
 	zio->io_spa = spa;
 	zio->io_txg = txg;
 	zio->io_done = done;
 	zio->io_private = private;
 	zio->io_type = type;
 	zio->io_priority = priority;
 	zio->io_vd = vd;
 	zio->io_offset = offset;
 	zio->io_orig_abd = zio->io_abd = data;
 	zio->io_orig_size = zio->io_size = psize;
 	zio->io_lsize = lsize;
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
 	zio->io_allocator = ZIO_ALLOCATOR_NONE;
 
 	/*
 	 * The zio counts as READY from the start if it begins at or past
 	 * the READY stage or has no READY stage in its pipeline; likewise
 	 * as DONE if it begins at or past the DONE stage.
 	 */
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
 	    (pipeline & ZIO_STAGE_READY) == 0;
 	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 
 	if (zb != NULL)
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
 		/* Inherit from the parent, then link in as its child. */
 		zio->io_metaslab_class = pio->io_metaslab_class;
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
 			zio->io_gang_leader = pio->io_gang_leader;
 		zio_add_child_first(pio, zio);
 	}
 
 	taskq_init_ent(&zio->io_tqent);
 
 	return (zio);
 }
 
 /*
  * Tear down a zio: finalize its allocation trace list, destroy its parent
  * and child lists, its lock and its cv, then return the zio itself to
  * zio_cache.
  */
 void
 zio_destroy(zio_t *zio)
 {
 	metaslab_trace_fini(&zio->io_alloc_list);
 	list_destroy(&zio->io_parent_list);
 	list_destroy(&zio->io_child_list);
 	mutex_destroy(&zio->io_lock);
 	cv_destroy(&zio->io_cv);
 	kmem_cache_free(zio_cache, zio);
 }
 
 /*
  * ZIO intended to be between others.  Provides synchronization at READY
  * and DONE pipeline stages and calls the respective callbacks.
  */
 zio_t *
 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
     void *private, zio_flag_t flags)
 {
 	/*
 	 * No bp, no data, zero sizes and txg 0: a NULL-type zio that runs
 	 * the interlock pipeline.
 	 */
 	return (zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE));
 }
 
 /*
  * ZIO intended to be a root of a tree.  Unlike a null ZIO, it does not have
  * a READY pipeline stage (it is ready on creation), so it should not be used
  * as a child of any ZIO that may need to wait for the READY stage of its
  * grandchildren (any other ZIO type).
  */
 zio_t *
 zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags)
 {
 	/*
 	 * No parent, no vdev, no bp and no data: a NULL-type zio that runs
 	 * the root pipeline.
 	 */
 	return (zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE));
 }
 
 /*
  * Report one problem found in a block pointer.  The caller's printf-style
  * message is formatted into a 256-byte stack buffer (longer messages are
  * truncated by vsnprintf).  The raw bp fields are always written to the
  * debug log; the formatted message is then handled according to
  * blk_verify:
  *   BLK_VERIFY_HALT: passed to zfs_panic_recover()
  *   BLK_VERIFY_LOG:  written to the debug log
  *   BLK_VERIFY_ONLY: not reported
  *
  * Always returns 1, so callers can add the result to an error count.
  */
 static int
 zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
     enum blk_verify_flag blk_verify, const char *fmt, ...)
 {
 	va_list adx;
 	char buf[256];
 
 	va_start(adx, fmt);
 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
 	va_end(adx);
 
 	/* Dump every raw field of the bp, whatever blk_verify says. */
 	zfs_dbgmsg("bad blkptr at %px: "
 	    "DVA[0]=%#llx/%#llx "
 	    "DVA[1]=%#llx/%#llx "
 	    "DVA[2]=%#llx/%#llx "
 	    "prop=%#llx "
 	    "pad=%#llx,%#llx "
 	    "phys_birth=%#llx "
 	    "birth=%#llx "
 	    "fill=%#llx "
 	    "cksum=%#llx/%#llx/%#llx/%#llx",
 	    bp,
 	    (long long)bp->blk_dva[0].dva_word[0],
 	    (long long)bp->blk_dva[0].dva_word[1],
 	    (long long)bp->blk_dva[1].dva_word[0],
 	    (long long)bp->blk_dva[1].dva_word[1],
 	    (long long)bp->blk_dva[2].dva_word[0],
 	    (long long)bp->blk_dva[2].dva_word[1],
 	    (long long)bp->blk_prop,
 	    (long long)bp->blk_pad[0],
 	    (long long)bp->blk_pad[1],
 	    (long long)BP_GET_PHYSICAL_BIRTH(bp),
 	    (long long)BP_GET_LOGICAL_BIRTH(bp),
 	    (long long)bp->blk_fill,
 	    (long long)bp->blk_cksum.zc_word[0],
 	    (long long)bp->blk_cksum.zc_word[1],
 	    (long long)bp->blk_cksum.zc_word[2],
 	    (long long)bp->blk_cksum.zc_word[3]);
 	switch (blk_verify) {
 	case BLK_VERIFY_HALT:
 		zfs_panic_recover("%s: %s", spa_name(spa), buf);
 		break;
 	case BLK_VERIFY_LOG:
 		zfs_dbgmsg("%s: %s", spa_name(spa), buf);
 		break;
 	case BLK_VERIFY_ONLY:
 		break;
 	}
 
 	return (1);
 }
 
 /*
  * Verify the block pointer fields contain reasonable values.  This means
  * it only contains known object types, checksum/compression identifiers,
  * block sizes within the maximum allowed limits, valid DVAs, etc.
  *
  * If everything checks out 0 is returned; if any check fails ECKSUM is
  * returned.  The blk_verify argument controls the behavior when an
  * invalid field is detected.
  *
  * Values for blk_verify_flag:
  *   BLK_VERIFY_ONLY: evaluate the block
  *   BLK_VERIFY_LOG: evaluate the block and log problems
  *   BLK_VERIFY_HALT: call zfs_panic_recover on error
  *
  * Values for blk_config_flag:
  *   BLK_CONFIG_HELD: caller holds SCL_VDEV for writer
  *   BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be
  *   obtained for reader
  *   BLK_CONFIG_NEEDED_TRY: like BLK_CONFIG_NEEDED, but SCL_VDEV is only
  *   tried for; if it cannot be obtained EBUSY is returned and the
  *   checks which require it are not performed
  *   BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
  *   performance
  */
 int
 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
     enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
 {
 	/* Number of problems found; any non-zero count yields ECKSUM. */
 	int errors = 0;
 
 	/* Checks that apply to every kind of bp. */
 	if (unlikely(!DMU_OT_IS_VALID(BP_GET_TYPE(bp)))) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid TYPE %llu",
 		    bp, (longlong_t)BP_GET_TYPE(bp));
 	}
 	if (unlikely(BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS)) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid COMPRESS %llu",
 		    bp, (longlong_t)BP_GET_COMPRESS(bp));
 	}
 	if (unlikely(BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE)) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid LSIZE %llu",
 		    bp, (longlong_t)BP_GET_LSIZE(bp));
 	}
 	if (BP_IS_EMBEDDED(bp)) {
 		/* Embedded bps get their own checks, then we are done. */
 		if (unlikely(BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px has invalid ETYPE %llu",
 			    bp, (longlong_t)BPE_GET_ETYPE(bp));
 		}
 		if (unlikely(BPE_GET_PSIZE(bp) > BPE_PAYLOAD_SIZE)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px has invalid PSIZE %llu",
 			    bp, (longlong_t)BPE_GET_PSIZE(bp));
 		}
 		return (errors ? ECKSUM : 0);
 	} else if (BP_IS_HOLE(bp)) {
 		/*
 		 * Holes are allowed (expected, even) to have no DVAs, no
 		 * checksum, and no psize.
 		 */
 		return (errors ? ECKSUM : 0);
 	} else if (unlikely(!DVA_IS_VALID(&bp->blk_dva[0]))) {
 		/* Non-hole, non-embedded BPs _must_ have at least one DVA */
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has no valid DVAs", bp);
 	}
 	if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid CHECKSUM %llu",
 		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
 	}
 	if (unlikely(BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE)) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid PSIZE %llu",
 		    bp, (longlong_t)BP_GET_PSIZE(bp));
 	}
 
 	/*
 	 * Do not verify individual DVAs if the config is not trusted. This
 	 * will be done once the zio is executed in vdev_mirror_map_alloc.
 	 */
 	if (unlikely(!spa->spa_trust_config))
 		return (errors ? ECKSUM : 0);
 
 	/*
 	 * Arrange for SCL_VDEV to be held across the per-DVA checks.  The
 	 * NEEDED and NEEDED_TRY cases take the lock here and drop it after
 	 * the loop below.
 	 */
 	switch (blk_config) {
 	case BLK_CONFIG_HELD:
 		ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
 		break;
 	case BLK_CONFIG_NEEDED:
 		spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
 		break;
 	case BLK_CONFIG_NEEDED_TRY:
 		/* EBUSY here discards any errors counted so far. */
 		if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER))
 			return (EBUSY);
 		break;
 	case BLK_CONFIG_SKIP:
 		return (errors ? ECKSUM : 0);
 	default:
 		panic("invalid blk_config %u", blk_config);
 	}
 
 	/*
 	 * Pool-specific checks.
 	 *
 	 * Note: it would be nice to verify that the logical birth
 	 * and physical birth are not too large.  However,
 	 * spa_freeze() allows the birth time of log blocks (and
 	 * dmu_sync()-ed blocks that are in the log) to be arbitrarily
 	 * large.
 	 */
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		const dva_t *dva = &bp->blk_dva[i];
 		uint64_t vdevid = DVA_GET_VDEV(dva);
 
 		/* The vdev id must index an existing top-level vdev. */
 		if (unlikely(vdevid >= spa->spa_root_vdev->vdev_children)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 		if (unlikely(vd == NULL)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (unlikely(vd->vdev_ops == &vdev_hole_ops)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has hole VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_missing_ops) {
 			/*
 			 * "missing" vdevs are valid during import, but we
 			 * don't have their detailed info (e.g. asize), so
 			 * we can't perform any more checks on them.
 			 */
 			continue;
 		}
 		uint64_t offset = DVA_GET_OFFSET(dva);
 		uint64_t asize = DVA_GET_ASIZE(dva);
 		/* For gang DVAs, bound-check the gang header size instead. */
 		if (DVA_GET_GANG(dva))
 			asize = vdev_gang_header_asize(vd);
 		if (unlikely(offset + asize > vd->vdev_asize)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid OFFSET %llu",
 			    bp, i, (longlong_t)offset);
 		}
 	}
 	/* Drop the config lock only if it was taken above. */
 	if (blk_config == BLK_CONFIG_NEEDED || blk_config ==
 	    BLK_CONFIG_NEEDED_TRY)
 		spa_config_exit(spa, SCL_VDEV, bp);
 
 	return (errors ? ECKSUM : 0);
 }
 
 /*
  * Check a single DVA against the pool's vdev tree.  Returns B_FALSE when
  * the vdev id is out of range, the vdev slot is empty, the vdev is a hole
  * or missing vdev, or the DVA's extent runs past the vdev's asize;
  * otherwise B_TRUE.  For gang DVAs the extent checked is the gang header
  * size.  The bp argument is unused.
  */
 boolean_t
 zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
 {
 	(void) bp;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t vdevid = DVA_GET_VDEV(dva);
 
 	if (vdevid >= rvd->vdev_children)
 		return (B_FALSE);
 
 	vdev_t *vd = rvd->vdev_child[vdevid];
 	if (vd == NULL ||
 	    vd->vdev_ops == &vdev_hole_ops ||
 	    vd->vdev_ops == &vdev_missing_ops)
 		return (B_FALSE);
 
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t asize = DVA_GET_GANG(dva) ?
 	    vdev_gang_header_asize(vd) : DVA_GET_ASIZE(dva);
 
 	return ((offset + asize > vd->vdev_asize) ? B_FALSE : B_TRUE);
 }
 
 /*
  * Create a logical read zio for bp.  The same size is used as both lsize
  * and psize, and the zio's txg is the bp's birth txg.  DDT children get the
  * DDT child read pipeline; everything else gets the normal read pipeline.
  */
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb)
 {
 	return (zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
 	    data, size, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE));
 }
 
 /*
  * Create a logical write zio.  The pipeline is chosen from the write
  * properties and flags: direct writes first, then DDT children, otherwise
  * the normal write pipeline.  The ready / children_ready callbacks and a
  * copy of *zp are stored on the new zio.
  */
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     zio_flag_t flags, const zbookmark_phys_t *zb)
 {
 	enum zio_stage pipeline;
 	zio_prop_t *prop;
 	zio_t *zio;
 
 	if (zp->zp_direct_write == B_TRUE)
 		pipeline = ZIO_DIRECT_WRITE_PIPELINE;
 	else if (flags & ZIO_FLAG_DDT_CHILD)
 		pipeline = ZIO_DDT_CHILD_WRITE_PIPELINE;
 	else
 		pipeline = ZIO_WRITE_PIPELINE;
 
 	zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, pipeline);
 
 	zio->io_ready = ready;
 	zio->io_children_ready = children_ready;
 	zio->io_prop = *zp;
 	prop = &zio->io_prop;
 
 	/*
 	 * Data can be NULL if we are going to call zio_write_override() to
 	 * provide the already-allocated BP.  But we may need the data to
 	 * verify a dedup hit (if requested).  In this case, don't try to
 	 * dedup (just take the already-allocated BP verbatim). Encrypted
 	 * dedup blocks need data as well so we also disable dedup in this
 	 * case.
 	 */
 	if (data == NULL && (prop->zp_dedup_verify || prop->zp_encrypt)) {
 		prop->zp_dedup_verify = B_FALSE;
 		prop->zp_dedup = B_FALSE;
 	}
 
 	return (zio);
 }
 
 /*
  * Create a write zio flagged ZIO_FLAG_IO_REWRITE that runs the rewrite
  * pipeline.  The same size is used as both lsize and psize.
  */
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
     uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb)
 {
 	return (zio_create(pio, spa, txg, bp, data, size, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE));
 }
 
 /*
  * Attach an already-allocated block pointer to a logical write zio that is
  * still in the OPEN stage and belongs to the currently syncing txg (all
  * asserted), and reset the zio's write properties to go with it.  nopwrite
  * and brtwrite may not both be set.
  */
 void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
-    boolean_t brtwrite)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies,
+    boolean_t nopwrite, boolean_t brtwrite)
 {
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 	ASSERT(!brtwrite || !nopwrite);
 
 	/*
 	 * We must reset the io_prop to match the values that existed
 	 * when the bp was first written by dmu_sync() keeping in mind
 	 * that nopwrite and dedup are mutually exclusive.
 	 */
 	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
 	zio->io_prop.zp_nopwrite = nopwrite;
 	zio->io_prop.zp_brtwrite = brtwrite;
 	zio->io_prop.zp_copies = copies;
+	zio->io_prop.zp_gang_copies = gang_copies;
 	/* The bp itself is only referenced here, not copied. */
 	zio->io_bp_override = bp;
 }
 
 /*
  * Free the block described by bp in the given txg.  The bp is verified
  * first (halting on a bad bp).  Embedded bps are ignored.  The free is then
  * either carried out immediately or queued on the pool's per-txg free
  * bplist.
  */
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
 	boolean_t defer;
 
 	(void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
 
 	/*
 	 * The check for EMBEDDED is a performance optimization.  We
 	 * process the free here (by ignoring it) rather than
 	 * putting it on the list and then processing it in zio_free_sync().
 	 */
 	if (BP_IS_EMBEDDED(bp))
 		return;
 
 	/*
 	 * Frees that are for the currently-syncing txg, are not going to be
 	 * deferred, and which will not need to do a read (i.e. not GANG or
 	 * DEDUP), can be processed immediately.  Otherwise, put them on the
 	 * in-memory list for later processing.
 	 *
 	 * Note that we only defer frees after zfs_sync_pass_deferred_free
 	 * when the log space map feature is disabled. [see relevant comment
 	 * in spa_sync_iterate_to_convergence()]
 	 */
 	defer = (BP_IS_GANG(bp) ||
 	    BP_GET_DEDUP(bp) ||
 	    txg != spa->spa_syncing_txg ||
 	    (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
 	    !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) ||
 	    brt_maybe_exists(spa, bp)) ? B_TRUE : B_FALSE;
 
 	if (!defer) {
 		/* The immediate path must not hand back a zio. */
 		VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL);
 		return;
 	}
 
 	metaslab_check_free(spa, bp);
 	bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 }
 
 /*
  * To improve performance, this function may return NULL if we were able
  * to do the free immediately.  This avoids the cost of creating a zio
  * (and linking it to the parent, etc).
  */
 zio_t *
 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_flag_t flags)
 {
 	boolean_t needs_read;
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
 
 	/* Embedded bps need no work at all. */
 	if (BP_IS_EMBEDDED(bp))
 		return (NULL);
 
 	metaslab_check_free(spa, bp);
 	arc_freed(spa, bp);
 	dsl_scan_freed(spa, bp);
 
 	needs_read = (BP_IS_GANG(bp) ||
 	    BP_GET_DEDUP(bp) ||
 	    brt_maybe_exists(spa, bp)) ? B_TRUE : B_FALSE;
 
 	if (!needs_read) {
 		/* Nothing to read: free the block right here. */
 		metaslab_free(spa, bp, txg, B_FALSE);
 		return (NULL);
 	}
 
 	/*
 	 * GANG, DEDUP and BRT blocks can induce a read (for the gang
 	 * block header, the DDT or the BRT), so issue them
 	 * asynchronously so that this thread is not tied up.
 	 */
 	enum zio_stage pipeline = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
 
 	return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    BP_GET_PSIZE(bp), NULL, NULL,
 	    ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
 	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, pipeline));
 }
 
 /*
  * Create a CLAIM zio for bp.  The bp is verified first (halting on a bad
  * bp); the config lock is treated as already held when the caller passes
  * ZIO_FLAG_CONFIG_WRITER and is taken by the verifier otherwise.  For an
  * embedded bp a null zio is returned instead; note that done, private and
  * flags are not passed on to that null zio.
  */
 zio_t *
 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_done_func_t *done, void *private, zio_flag_t flags)
 {
 	zio_t *zio;
 
 	(void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ?
 	    BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 
 	/*
 	 * A claim is an allocation of a specific block.  Claims are needed
 	 * to support immediate writes in the intent log.  The issue is that
 	 * immediate writes contain committed data, but in a txg that was
 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
 	 * the intent log claims all blocks that contain immediate write data
 	 * so that the SPA knows they're in use.
 	 *
 	 * All claims *must* be resolved in the first txg -- before the SPA
 	 * starts allocating blocks -- so that nothing is allocated twice.
 	 * If txg == 0 we just verify that the block is claimable.
 	 */
 	ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <,
 	    spa_min_claim_txg(spa));
 	ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
 	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(8) */
 
 	/* The bp's psize is used as both lsize and psize. */
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
 	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 	ASSERT0(zio->io_queued_timestamp);
 
 	return (zio);
 }
 
 zio_t *
 zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     zio_flag_t flags, enum trim_flag trim_flags)
 {
 	/*
 	 * The target vdev must have no children, and the extent must be
 	 * non-empty with both ends aligned to the vdev's ashift.
 	 */
 	ASSERT0(vd->vdev_children);
 	ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	ASSERT3U(size, !=, 0);
 
 	zio_t *tzio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size,
 	    size, done, private, ZIO_TYPE_TRIM, priority,
 	    flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN,
 	    ZIO_TRIM_PIPELINE);
 
 	/* Record the caller's trim flags on the new zio. */
 	tzio->io_trim_flags = trim_flags;
 
 	return (tzio);
 }
 
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, boolean_t labels)
 {
 	/*
 	 * The vdev must have no children and the range must fit inside
 	 * vdev_psize.  When 'labels' is set, the range must additionally
 	 * end at or before VDEV_LABEL_START_SIZE, or begin at or after
 	 * vdev_psize - VDEV_LABEL_END_SIZE.
 	 */
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio_t *rzio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size,
 	    size, done, private, ZIO_TYPE_READ, priority,
 	    flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN,
 	    ZIO_READ_PHYS_PIPELINE);
 
 	/* The checksum algorithm is whatever the caller asked for. */
 	rzio->io_prop.zp_checksum = checksum;
 
 	return (rzio);
 }
 
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, boolean_t labels)
 {
 	/*
 	 * The vdev must have no children and the range must fit inside
 	 * vdev_psize.  When 'labels' is set, the range must additionally
 	 * end at or before VDEV_LABEL_START_SIZE, or begin at or after
 	 * vdev_psize - VDEV_LABEL_END_SIZE.
 	 */
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio_t *wzio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size,
 	    size, done, private, ZIO_TYPE_WRITE, priority,
 	    flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN,
 	    ZIO_WRITE_PHYS_PIPELINE);
 
 	wzio->io_prop.zp_checksum = checksum;
 
 	/* Only embedded (zec) checksums need the private copy below. */
 	if (!(zio_checksum_table[checksum].ci_flags &
 	    ZCHECKSUM_FLAG_EMBEDDED))
 		return (wzio);
 
 	/*
 	 * zec checksums are necessarily destructive -- the verifier/checksum
 	 * is stored in the tail of the write buffer.  Since the same data
 	 * may be written to multiple places in parallel, work on a local
 	 * copy instead of the caller's buffer.
 	 */
 	abd_t *copy = abd_alloc_sametype(data, size);
 	abd_copy(copy, data, size);
 	zio_push_transform(wzio, copy, size, size, NULL);
 
 	return (wzio);
 }
 
 /*
  * Create a child I/O to do some work for us.
  *
  * The new zio is created on vdev 'vd' as a child of 'pio' and takes its
  * spa, txg and bookmark from 'pio'.  Its pipeline starts out as
  * ZIO_VDEV_CHILD_PIPELINE and may gain a checksum-verify stage depending
  * on 'type', 'bp' and pio's properties (see below).  'flags' is augmented
  * with ZIO_VDEV_CHILD_FLAGS(pio) before being adjusted for repair and
  * allocating I/Os.  For leaf vdevs 'offset' is advanced by
  * VDEV_LABEL_START_SIZE.
  *
  * Note that this function may also modify 'pio': for reads with a bp it
  * removes ZIO_STAGE_CHECKSUM_VERIFY from pio's pipeline.
  *
  * Returns the zio produced by zio_create().
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
     abd_t *data, uint64_t size, int type, zio_priority_t priority,
     zio_flag_t flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *zio;
 
 	/*
 	 * vdev child I/Os do not propagate their error to the parent.
 	 * Therefore, for correct operation the caller *must* check for
 	 * and handle the error in the child i/o's done callback.
 	 * The only exceptions are i/os that we don't care about
 	 * (OPTIONAL or REPAIR).
 	 */
 	ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
 	    done != NULL);
 
 	if (type == ZIO_TYPE_READ && bp != NULL) {
 		/*
 		 * If we have the bp, then the child should perform the
 		 * checksum and the parent need not.  This pushes error
 		 * detection as close to the leaves as possible and
 		 * eliminates redundant checksums in the interior nodes.
 		 */
 		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 		/*
 		 * We never allow the mirror VDEV to attempt reading from any
 		 * additional data copies after the first Direct I/O checksum
 		 * verify failure. This is to avoid bad data being written out
 		 * through the mirror during self healing. See comment in
 		 * vdev_mirror_io_done() for more details.
 		 */
 		ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
 	} else if (type == ZIO_TYPE_WRITE &&
 	    pio->io_prop.zp_direct_write == B_TRUE) {
 		/*
 		 * By default we only will verify checksums for Direct I/O
 		 * writes for Linux. FreeBSD is able to place user pages under
 		 * write protection before issuing them to the ZIO pipeline.
 		 *
 		 * Checksum validation errors will only be reported through
 		 * the top-level VDEV, which is set by this child ZIO.
 		 */
 		ASSERT3P(bp, !=, NULL);
 		ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
 		pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY;
 	}
 
 	/* Leaf vdev offsets are shifted past VDEV_LABEL_START_SIZE. */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		ASSERT0(vd->vdev_children);
 		offset += VDEV_LABEL_START_SIZE;
 	}
 
 	/* Merge in the flags derived from the parent. */
 	flags |= ZIO_VDEV_CHILD_FLAGS(pio);
 
 	/*
 	 * If we've decided to do a repair, the write is not speculative --
 	 * even if the original read was.
 	 */
 	if (flags & ZIO_FLAG_IO_REPAIR)
 		flags &= ~ZIO_FLAG_SPECULATIVE;
 
 	/*
 	 * If we're creating a child I/O that is not associated with a
 	 * top-level vdev, then the child zio is not an allocating I/O.
 	 * If this is a retried I/O then we ignore it since we will
 	 * have already processed the original allocating I/O.
 	 */
 	if (flags & ZIO_FLAG_IO_ALLOCATING &&
 	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
 		ASSERT(pio->io_metaslab_class != NULL);
 		ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
 		ASSERT(type == ZIO_TYPE_WRITE);
 		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
 		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
 		    pio->io_child_type == ZIO_CHILD_GANG);
 
 		flags &= ~ZIO_FLAG_IO_ALLOCATING;
 	}
 
 	/*
 	 * The initial stage is the bit just below VDEV_IO_START; presumably
 	 * this makes VDEV_IO_START the first stage the pipeline executes.
 	 */
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 
 	return (zio);
 }
 
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
     zio_type_t type, zio_priority_t priority, zio_flag_t flags,
     zio_done_func_t *done, void *private)
 {
 	/* Delegated I/Os are only created on leaf vdevs. */
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/* These three flags are always set, whatever the caller passed. */
 	flags |= ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED;
 
 	/* No parent, no bp, no bookmark; uses the vdev child pipeline. */
 	return (zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size,
 	    done, private, type, priority, flags, vd, offset, NULL,
 	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE));
 }
 
 
 /*
  * Issue a flush to 'vd', descending through its children until vdevs
  * with no children are reached.  Unlike most zio creation functions, the
  * flush zios are handed to zio_nowait() right away rather than returned.
  * Callers can wait on 'pio' to pause until the flushes complete.  A vdev
  * with vdev_nowritecache set is skipped (together with everything
  * below it).
  */
 void
 zio_flush(zio_t *pio, vdev_t *vd)
 {
 	const zio_flag_t flush_flags = ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY;
 
 	if (vd->vdev_nowritecache)
 		return;
 
 	/* Interior vdev: recurse into each child. */
 	if (vd->vdev_children != 0) {
 		for (uint64_t i = 0; i < vd->vdev_children; i++)
 			zio_flush(pio, vd->vdev_child[i]);
 		return;
 	}
 
 	/* Childless vdev: create the flush zio and issue it. */
 	zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
 	    NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flush_flags, vd,
 	    0, NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE));
 }
 
 void
 zio_shrink(zio_t *zio, uint64_t size)
 {
 	/* Only a not-yet-executed zio at its original size may shrink. */
 	ASSERT3P(zio->io_executor, ==, NULL);
 	ASSERT3U(zio->io_orig_size, ==, zio->io_size);
 	ASSERT3U(size, <=, zio->io_size);
 
 	/* Note, BP_IS_RAIDZ() assumes no compression. */
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 
 	/*
 	 * raidz blocks are left untouched because of problems with the
 	 * reconstruction when reading back less than the block size.
 	 */
 	if (BP_IS_RAIDZ(zio->io_bp))
 		return;
 
 	/* we are not doing a raw write */
 	ASSERT3U(zio->io_size, ==, zio->io_lsize);
 	zio->io_orig_size = zio->io_size = zio->io_lsize = size;
 }
 
 /*
  * Round the requested allocation size up to one that at least some
  * vdev(s) in the pool can allocate with minimal or no extra padding, and
  * without extra space usage on the others.  Sizes at or below
  * spa_min_alloc become spa_min_alloc; larger sizes are rounded up to a
  * multiple of spa_gcd_alloc.
  */
 static uint64_t
 zio_roundup_alloc_size(spa_t *spa, uint64_t size)
 {
 	if (size <= spa->spa_min_alloc)
 		return (spa->spa_min_alloc);
 
 	return (roundup(size, spa->spa_gcd_alloc));
 }
 
 size_t
 zio_get_compression_max_size(enum zio_compress compress, uint64_t gcd_alloc,
     uint64_t min_alloc, size_t s_len)
 {
 	/* minimum 12.5% must be saved (legacy value, may be changed later) */
 	size_t limit = s_len - (s_len >> 3);
 
 	/* ZLE can't use exactly that many bytes, it needs more; skip it */
 	if (compress == ZIO_COMPRESS_ZLE)
 		return (limit);
 
 	/* Trim the limit down to a multiple of gcd_alloc. */
 	limit -= limit % gcd_alloc;
 
 	/* Below min_alloc the limit falls back to BPE_PAYLOAD_SIZE. */
 	if (limit < min_alloc)
 		return (BPE_PAYLOAD_SIZE);
 
 	return (limit);
 }
 
 /*
  * ==========================================================================
  * Prepare to read and write logical blocks
  * ==========================================================================
  */
 
 static zio_t *
 zio_read_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize;
 
 	/* Embedded bps keep their physical size in a different field. */
 	if (BP_IS_EMBEDDED(bp))
 		psize = BPE_GET_PSIZE(bp);
 	else
 		psize = BP_GET_PSIZE(bp);
 
 	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 
 	/*
 	 * Compressed logical reads get a decompress transform unless the
 	 * caller asked for the raw compressed data.
 	 */
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decompress);
 	}
 
 	/*
 	 * Logical reads of protected blocks (unless raw-encrypted data was
 	 * requested) and of blocks with an indirect MAC checksum get a
 	 * decrypt transform.
 	 */
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    ((BP_IS_PROTECTED(bp) &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
 	    BP_HAS_INDIRECT_MAC_CKSUM(bp))) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decrypt);
 	}
 
 	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
 		/*
 		 * Embedded data: decode the payload out of the bp directly
 		 * into the zio's buffer and switch to the interlock pipeline.
 		 */
 		int epsize = BPE_GET_PSIZE(bp);
 		void *buf = abd_borrow_buf(zio->io_abd, epsize);
 
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		decode_embedded_bp_compressed(bp, buf);
 		abd_return_buf_copy(zio->io_abd, buf, epsize);
 	} else {
 		/* Any other embedded type is not expected on this path. */
 		ASSERT(!BP_IS_EMBEDDED(bp));
 	}
 
 	/* Deduplicated logical reads use the DDT read pipeline. */
 	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * Pipeline stage: handle a write that carries an override bp.
  *
  * Non-allocating zios and zios without io_bp_override pass through
  * untouched.  Otherwise the override bp is copied into the zio's bp and
  * the pipeline is reduced to ZIO_INTERLOCK_PIPELINE; depending on the
  * write properties this stage then either returns immediately (BRT write,
  * embedded bp, nopwrite, hole, or no dedup), adds the DDT write stage, or
  * abandons the override and restores the original bp and pipeline so the
  * zio proceeds as a regular write.
  *
  * Always returns 'zio'.
  */
 static zio_t *
 zio_write_bp_init(zio_t *zio)
 {
 	if (!IO_IS_ALLOCATING(zio))
 		return (zio);
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 
 	if (zio->io_bp_override) {
 		blkptr_t *bp = zio->io_bp;
 		zio_prop_t *zp = &zio->io_prop;
 
 		ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
 
 		/* Adopt the override bp and skip the normal write stages. */
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		/* BRT writes need nothing beyond the override itself. */
 		if (zp->zp_brtwrite)
 			return (zio);
 
 		ASSERT(!BP_GET_DEDUP(zio->io_bp_override));
 
 		if (BP_IS_EMBEDDED(bp))
 			return (zio);
 
 		/*
 		 * If we've been overridden and nopwrite is set then
 		 * set the flag accordingly to indicate that a nopwrite
 		 * has already occurred.
 		 */
 		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
 			ASSERT(!zp->zp_dedup);
 			ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
 			zio->io_flags |= ZIO_FLAG_NOPWRITE;
 			return (zio);
 		}
 
 		ASSERT(!zp->zp_nopwrite);
 
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (zio);
 
 		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
 
 		/*
 		 * The override can be deduplicated as-is only when its
 		 * checksum matches the requested one and the write is not
 		 * encrypted.
 		 */
 		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
 		    !zp->zp_encrypt) {
 			BP_SET_DEDUP(bp, 1);
 			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 			return (zio);
 		}
 
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 	}
 
 	return (zio);
 }
 
 /*
  * Pipeline stage: compress the data of an allocating write, decide between
  * the rewrite and the regular write pipeline, and fill in the bp.
  *
  * Returns NULL when the zio has to wait for its logical/gang children to
  * become ready (the stage is repeated later), otherwise returns 'zio'.
  * Non-allocating zios pass through untouched.
  *
  * On return the zio's pipeline is one of: the interlock pipeline (data
  * became a hole or was embedded in the bp), the rewrite pipeline, the
  * regular write pipeline, or the DDT write pipeline, optionally with the
  * nopwrite stage added.
  */
 static zio_t *
 zio_write_compress(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zio_prop_t *zp = &zio->io_prop;
 	enum zio_compress compress = zp->zp_compress;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t lsize = zio->io_lsize;
 	uint64_t psize = zio->io_size;
 	uint32_t pass = 1;
 
 	/*
 	 * If our children haven't all reached the ready stage,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
 	    ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
 		return (NULL);
 	}
 
 	if (!IO_IS_ALLOCATING(zio))
 		return (zio);
 
 	if (zio->io_children_ready != NULL) {
 		/*
 		 * Now that all our children are ready, run the callback
 		 * associated with this zio in case it wants to modify the
 		 * data to be written.
 		 */
 		ASSERT3U(zp->zp_level, >, 0);
 		zio->io_children_ready(zio);
 	}
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
 		 * converge, it must eventually be the case that we don't
 		 * have to allocate new blocks.  But compression changes
 		 * the blocksize, which forces a reallocate, and makes
 		 * convergence take longer.  Therefore, after the first
 		 * few passes, stop compressing to ensure convergence.
 		 */
 		pass = spa_sync_pass(spa);
 
 		ASSERT(zio->io_txg == spa_syncing_txg(spa));
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(!BP_GET_DEDUP(bp));
 
 		if (pass >= zfs_sync_pass_dont_compress)
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
 		ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) ||
 		    MIN(zp->zp_copies, spa_max_replication(spa))
 		    == BP_GET_NDVAS(bp));
 	}
 
 	/* If it's a compressed write that is not raw, compress the buffer. */
 	if (compress != ZIO_COMPRESS_OFF &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		abd_t *cabd = NULL;
 		/*
 		 * psize == 0 means the buffer is all zeroes (a hole);
 		 * ZIO_COMPRESS_EMPTY only detects holes and otherwise
 		 * leaves the data uncompressed.
 		 */
 		if (abd_cmp_zero(zio->io_abd, lsize) == 0)
 			psize = 0;
 		else if (compress == ZIO_COMPRESS_EMPTY)
 			psize = lsize;
 		else
 			psize = zio_compress_data(compress, zio->io_abd, &cabd,
 			    lsize,
 			    zio_get_compression_max_size(compress,
 			    spa->spa_gcd_alloc, spa->spa_min_alloc, lsize),
 			    zp->zp_complevel);
 		if (psize == 0) {
 			compress = ZIO_COMPRESS_OFF;
 		} else if (psize >= lsize) {
 			/* Compression did not help; write uncompressed. */
 			compress = ZIO_COMPRESS_OFF;
 			if (cabd != NULL)
 				abd_free(cabd);
 		} else if (psize <= BPE_PAYLOAD_SIZE && !zp->zp_encrypt &&
 		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
 		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
 			/*
 			 * The compressed data is small enough to be stored
 			 * inside the bp itself; encode it there and finish
 			 * via the interlock pipeline.
 			 */
 			void *cbuf = abd_borrow_buf_copy(cabd, lsize);
 			encode_embedded_bp_compressed(bp,
 			    cbuf, compress, lsize, psize);
 			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
 			BP_SET_TYPE(bp, zio->io_prop.zp_type);
 			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
 			abd_return_buf(cabd, cbuf, lsize);
 			abd_free(cabd);
 			BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
 			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 			ASSERT(spa_feature_is_active(spa,
 			    SPA_FEATURE_EMBEDDED_DATA));
 			return (zio);
 		} else {
 			/*
 			 * Round compressed size up to the minimum allocation
 			 * size of the smallest-ashift device, and zero the
 			 * tail. This ensures that the compressed size of the
 			 * BP (and thus compressratio property) are correct,
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
 			size_t rounded = (size_t)zio_roundup_alloc_size(spa,
 			    psize);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
 				abd_free(cabd);
 				psize = lsize;
 			} else {
 				abd_zero_off(cabd, psize, rounded - psize);
 				psize = rounded;
 				zio_push_transform(zio, cabd,
 				    psize, lsize, NULL);
 			}
 		}
 
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 *
 		 * NOTE(review): this comment duplicates the one in
 		 * zio_write_bp_init().  io_bp_override was asserted to be
 		 * NULL earlier in this function, so here the effective work
 		 * is restoring the bp from io_bp_orig and the pipeline from
 		 * io_orig_pipeline -- confirm the intent.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 
 	} else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 &&
 	    zp->zp_type == DMU_OT_DNODE) {
 		/*
 		 * The DMU actually relies on the zio layer's compression
 		 * to free metadnode blocks that have had all contained
 		 * dnodes freed. As a result, even when doing a raw
 		 * receive, we must check whether the block can be compressed
 		 * to a hole.
 		 */
 		if (abd_cmp_zero(zio->io_abd, lsize) == 0) {
 			psize = 0;
 			compress = ZIO_COMPRESS_OFF;
 		} else {
 			psize = lsize;
 		}
 	} else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) {
 		/*
 		 * If we are raw receiving an encrypted dataset we should not
 		 * take this codepath because it will change the on-disk block
 		 * and decryption will fail.
 		 */
 		size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize),
 		    lsize);
 
 		if (rounded != psize) {
 			/*
 			 * Copy the already-compressed data into a new buffer
 			 * of the rounded size with a zeroed tail.
 			 */
 			abd_t *cdata = abd_alloc_linear(rounded, B_TRUE);
 			abd_zero_off(cdata, psize, rounded - psize);
 			abd_copy_off(cdata, zio->io_abd, 0, 0, psize);
 			psize = rounded;
 			zio_push_transform(zio, cdata,
 			    psize, rounded, NULL);
 		}
 	} else {
 		ASSERT3U(psize, !=, 0);
 	}
 
 	/*
 	 * The final pass of spa_sync() must be all rewrites, but the first
 	 * few passes offer a trade-off: allocating blocks defers convergence,
 	 * but newly allocated blocks are sequential, so they can be written
 	 * to disk faster.  Therefore, we allow the first few passes of
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
 	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
 	    BP_GET_PSIZE(bp) == psize &&
 	    pass >= zfs_sync_pass_rewrite) {
 		VERIFY3U(psize, !=, 0);
 		/* Keep whatever gang stages the current pipeline has. */
 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
 
 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
 		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
 	} else {
 		BP_ZERO(bp);
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 	}
 
 	if (psize == 0) {
 		/*
 		 * The block is a hole.  Record its birth only when the
 		 * original bp had a birth txg and the hole_birth feature
 		 * is active; nothing is written either way.
 		 */
 		if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 &&
 		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 			BP_SET_LSIZE(bp, lsize);
 			BP_SET_TYPE(bp, zp->zp_type);
 			BP_SET_LEVEL(bp, zp->zp_level);
 			BP_SET_BIRTH(bp, zio->io_txg, 0);
 		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	} else {
 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_LSIZE(bp, lsize);
 		BP_SET_TYPE(bp, zp->zp_type);
 		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_PSIZE(bp, psize);
 		BP_SET_COMPRESS(bp, compress);
 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
 		BP_SET_DEDUP(bp, zp->zp_dedup);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 		if (zp->zp_dedup) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			ASSERT(!zp->zp_encrypt ||
 			    DMU_OT_IS_ENCRYPTED(zp->zp_type));
 			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
 		}
 		if (zp->zp_nopwrite) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
 		}
 	}
 	return (zio);
 }
 
 static zio_t *
 zio_free_bp_init(zio_t *zio)
 {
 	/* Logical frees of deduplicated blocks use the DDT free pipeline. */
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    BP_GET_DEDUP(zio->io_bp)) {
 		zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
 	}
 
 	/* The zio is expected to be operating on its private bp copy. */
 	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Execute the I/O pipeline
  * ==========================================================================
  */
 
 static void
 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t type = zio->io_type;
 
 	/*
 	 * Config writers and probes may find all the normal issue and
 	 * interrupt threads blocked on the config lock, so send them to
 	 * the otherwise-unused ZIO_TYPE_NULL taskq instead.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
 		type = ZIO_TYPE_NULL;
 
 	/*
 	 * Writes to aux vdevs get the same treatment: a similar issue
 	 * exists for the L2ARC write thread until L2ARC 2.0.
 	 */
 	if (type == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
 		type = ZIO_TYPE_NULL;
 
 	/*
 	 * High priority I/O goes to the next (high priority) taskq when
 	 * that one has any queues; otherwise it cuts the line in the
 	 * regular one.
 	 */
 	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) {
 		if (spa->spa_zio_taskq[type][q + 1].stqs_count == 0)
 			cutinline = B_TRUE;
 		else
 			q++;
 	}
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 
 	spa_taskq_dispatch(spa, type, q, zio_execute, zio, cutinline);
 }
 
 /*
  * Return B_TRUE when the current thread's taskq is one of the pool's
  * zio taskqs of kind 'q', for any zio type; B_FALSE otherwise.
  */
 static boolean_t
 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 {
 	taskq_t *self_tq = taskq_of_curthread();
 
 	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
 		spa_taskqs_t *tqs = &zio->io_spa->spa_zio_taskq[t][q];
 
 		for (uint_t n = 0; n < tqs->stqs_count; n++) {
 			if (tqs->stqs_taskq[n] == self_tq)
 				return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Pipeline stage: hand the zio to an issue taskq.  Returning NULL stops
  * the calling thread from processing this zio any further.
  */
 static zio_t *
 zio_issue_async(zio_t *zio)
 {
 	/* Writes must already have an allocator by this point. */
 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || ZIO_HAS_ALLOCATOR(zio));
 
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 
 	return (NULL);
 }
 
 void
 zio_interrupt(void *zio)
 {
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
 /*
  * Complete a zio via zio_interrupt(), optionally holding it back until
  * zio->io_target_timestamp.
  *
  * In kernel builds, a non-zero io_target_timestamp that is still in the
  * future causes zio_interrupt() to be scheduled on system_taskq for the
  * corresponding tick; if the target has already passed, or the delayed
  * dispatch cannot be set up, the zio is interrupted immediately.  With no
  * target timestamp (and always in userspace builds) the zio is
  * interrupted immediately.
  */
 void
 zio_delay_interrupt(zio_t *zio)
 {
 	/*
 	 * The timeout_generic() function isn't defined in userspace, so
 	 * rather than trying to implement the function, the zio delay
 	 * functionality has been disabled for userspace builds.
 	 */
 
 #ifdef _KERNEL
 	/*
 	 * If io_target_timestamp is zero, then no delay has been registered
 	 * for this IO, thus jump to the end of this function and "skip" the
 	 * delay; issuing it directly to the zio layer.
 	 */
 	if (zio->io_target_timestamp != 0) {
 		hrtime_t now = gethrtime();
 
 		if (now >= zio->io_target_timestamp) {
 			/*
 			 * This IO has already taken longer than the target
 			 * delay to complete, so we don't want to delay it
 			 * any longer; we "miss" the delay and issue it
 			 * directly to the zio layer. This is likely due to
 			 * the target latency being set to a value less than
 			 * the underlying hardware can satisfy (e.g. delay
 			 * set to 1ms, but the disks take 10ms to complete an
 			 * IO request).
 			 */
 
 			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
 			    hrtime_t, now);
 
 			zio_interrupt(zio);
 		} else {
 			taskqid_t tid;
 			hrtime_t diff = zio->io_target_timestamp - now;
 			/* Always wait at least one tick. */
 			int ticks = MAX(1, NSEC_TO_TICK(diff));
 			clock_t expire_at_tick = ddi_get_lbolt() + ticks;
 
 			DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
 			    hrtime_t, now, hrtime_t, diff);
 
 			tid = taskq_dispatch_delay(system_taskq, zio_interrupt,
 			    zio, TQ_NOSLEEP, expire_at_tick);
 			if (tid == TASKQID_INVALID) {
 				/*
 				 * Couldn't allocate a task.  Just finish the
 				 * zio without a delay.
 				 */
 				zio_interrupt(zio);
 			}
 		}
 		/* Every delay-registered path has handled the zio. */
 		return;
 	}
 #endif
 	/* No delay registered (or userspace build): interrupt right away. */
 	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
 	zio_interrupt(zio);
 }
 
 /*
  * Recursive worker for zio_deadman().
  *
  * Logs 'pio' through zfs_dbgmsg() and posts a deadman ereport when either
  * zio_deadman_log_all is set or the zio is on a leaf vdev.  In that case,
  * if the pool's deadman failmode is CONTINUE and the zio's taskq entry is
  * empty, the zio is re-dispatched through zio_interrupt().  Afterwards
  * every child of 'pio' is visited the same way, with 'ziodepth'
  * incremented for each level; pio->io_lock is held across the walk.
  */
 static void
 zio_deadman_impl(zio_t *pio, int ziodepth)
 {
 	zio_t *cio, *cio_next;
 	zio_link_t *zl = NULL;
 	vdev_t *vd = pio->io_vd;
 
 	if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
 		/* vd may be NULL here when zio_deadman_log_all is set. */
 		vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
 		zbookmark_phys_t *zb = &pio->io_bookmark;
 		uint64_t delta = gethrtime() - pio->io_timestamp;
 		uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
 
 		/*
 		 * The arguments below are positional: keep them in step
 		 * with the conversions in the format string.
 		 */
 		zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
 		    "delta=%llu queued=%llu io=%llu "
 		    "path=%s "
 		    "last=%llu type=%d "
 		    "priority=%d flags=0x%llx stage=0x%x "
 		    "pipeline=0x%x pipeline-trace=0x%x "
 		    "objset=%llu object=%llu "
 		    "level=%llu blkid=%llu "
 		    "offset=%llu size=%llu "
 		    "error=%d",
 		    ziodepth, pio, pio->io_timestamp,
 		    (u_longlong_t)delta, pio->io_delta, pio->io_delay,
 		    vd ? vd->vdev_path : "NULL",
 		    vq ? vq->vq_io_complete_ts : 0, pio->io_type,
 		    pio->io_priority, (u_longlong_t)pio->io_flags,
 		    pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
 		    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
 		    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid,
 		    (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size,
 		    pio->io_error);
 		(void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
 		    pio->io_spa, vd, zb, pio, 0);
 
 		/* CONTINUE failmode: kick the zio if it is not queued. */
 		if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
 		    taskq_empty_ent(&pio->io_tqent)) {
 			zio_interrupt(pio);
 		}
 	}
 
 	/*
 	 * Visit the children.  The next child is fetched before recursing
 	 * into the current one.
 	 */
 	mutex_enter(&pio->io_lock);
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		cio_next = zio_walk_children(pio, &zl);
 		zio_deadman_impl(cio, ziodepth + 1);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Report a hung I/O: log the critical information describing this zio and
  * all of its children through zfs_dbgmsg(), post the deadman event for the
  * ZED, then act on the pool's deadman failmode (log a message for WAIT and
  * CONTINUE, panic for PANIC).  Does nothing when the deadman is disabled or
  * the pool is suspended.  'tag' identifies the caller in the messages.
  */
 void
 zio_deadman(zio_t *pio, const char *tag)
 {
 	spa_t *spa = pio->io_spa;
 	char *pool = spa_name(spa);
 
 	if (!zfs_deadman_enabled || spa_suspended(spa))
 		return;
 
 	zio_deadman_impl(pio, 0);
 
 	uint64_t failmode = spa_get_deadman_failmode(spa);
 
 	if (failmode == ZIO_FAILURE_MODE_WAIT) {
 		zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, pool);
 	} else if (failmode == ZIO_FAILURE_MODE_CONTINUE) {
 		zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, pool);
 	} else if (failmode == ZIO_FAILURE_MODE_PANIC) {
 		fm_panic("%s determined I/O to pool '%s' is hung.", tag, pool);
 	}
 }
 
 /*
  * Execute the I/O pipeline until one of the following occurs:
  * (1) the I/O completes; (2) the pipeline stalls waiting for
  * dependent child I/Os; (3) the I/O issues, so we're waiting
  * for an I/O completion interrupt; (4) the I/O is delegated by
  * vdev-level caching or aggregation; (5) the I/O is deferred
  * due to vdev-level queueing; (6) the I/O is handed off to
  * another thread.  In all cases, the pipeline stops whenever
  * there's no CPU work; it never burns a thread in cv_wait_io().
  *
  * There's no locking on io_stage because there's no legitimate way
  * for multiple threads to be attempting to process the same I/O.
  *
  * zio_pipeline[] is the table of stage functions, declared here ahead
  * of its definition; __zio_execute() indexes it with
  * highbit64(stage) - 1.
  */
 static zio_pipe_stage_t *zio_pipeline[];
 
 /*
  * Externally visible entry point for running a zio's pipeline.
  *
  * This is a thin wrapper around the static __zio_execute(), which exists
  * so that __zio_execute() can be forcibly inlined.  Inlining keeps stack
  * usage down, which matters because __zio_execute() is called recursively
  * in several zio code paths.  zio_execute() itself cannot be inlined
  * because it is externally visible.  The call is bracketed by
  * spl_fstrans_mark()/spl_fstrans_unmark().
  */
 void
 zio_execute(void *zio)
 {
 	fstrans_cookie_t fstrans = spl_fstrans_mark();
 
 	__zio_execute(zio);
 
 	spl_fstrans_unmark(fstrans);
 }
 
 /*
  * Decide whether the current context's stack is too small for
  * zio_execute() to be called recursively, in which case the caller must
  * re-dispatch the zio instead.  A minimum stack size of 16K is required to
  * avoid the re-dispatch.
  *
  * Returns B_TRUE when the zio must be re-dispatched, B_FALSE when it may
  * be executed in place.  Always B_FALSE when HAVE_LARGE_STACKS is defined.
  */
 static boolean_t
 zio_execute_stack_check(zio_t *zio)
 {
 #if defined(HAVE_LARGE_STACKS)
 	(void) zio;
 
 	return (B_FALSE);
 #else
 	dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
 
 	/* Neither check below applies without a dsl pool. */
 	if (dp == NULL)
 		return (B_FALSE);
 
 	/* Executing in txg_sync_thread() context. */
 	if (curthread == dp->dp_tx.tx_sync_thread)
 		return (B_TRUE);
 
 	/* Pool initialization outside of zio_taskq context. */
 	if (spa_is_initializing(dp->dp_spa) &&
 	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
 	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
 		return (B_TRUE);
 
 	return (B_FALSE);
 #endif /* HAVE_LARGE_STACKS */
 }
 
 /*
  * Core pipeline loop: repeatedly advance 'zio' to the next stage that is
  * set in its pipeline and run that stage's function, until the zio
  * reaches ZIO_STAGE_DONE, a stage function returns NULL, or the zio is
  * re-dispatched to an issue taskq.  A stage function may return a
  * different zio, in which case the loop continues with that one.
  */
 __attribute__((always_inline))
 static inline void
 __zio_execute(zio_t *zio)
 {
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
 
 		zio->io_executor = curthread;
 
 		ASSERT(!MUTEX_HELD(&zio->io_lock));
 		ASSERT(ISP2(stage));
 		ASSERT(zio->io_stall == NULL);
 
 		/* Find the next stage bit that is present in the pipeline. */
 		do {
 			stage <<= 1;
 		} while ((stage & pipeline) == 0);
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
 
 		/*
 		 * If we are in interrupt context and this pipeline stage
 		 * will grab a config lock that is held across I/O,
 		 * or may wait for an I/O that needs an interrupt thread
 		 * to complete, issue async to avoid deadlock.
 		 *
 		 * For VDEV_IO_START, we cut in line so that the io will
 		 * be sent to disk promptly.
 		 */
 		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
 		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		/*
 		 * If the current context doesn't have large enough stacks
 		 * the zio must be issued asynchronously to prevent overflow.
 		 */
 		if (zio_execute_stack_check(zio)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		/*
 		 * Commit to the stage only now: both re-dispatch paths above
 		 * return with io_stage unchanged, so the same stage is
 		 * selected again when the zio is next executed.
 		 */
 		zio->io_stage = stage;
 		zio->io_pipeline_trace |= zio->io_stage;
 
 		/*
 		 * The zio pipeline stage returns the next zio to execute
 		 * (typically the same as this one), or NULL if we should
 		 * stop.
 		 */
 		zio = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (zio == NULL)
 			return;
 	}
 }
 
 
 /*
  * ==========================================================================
  * Initiate I/O, either sync or async
  * ==========================================================================
  */
 int
 zio_wait(zio_t *zio)
 {
 	/*
 	 * A NULL zio is accepted and treated as success.  Some routines,
 	 * like zio_free_sync(), return NULL rather than pay for creating
 	 * and then destroying a zio that has nothing to do; tolerating it
 	 * here keeps the callers simple.
 	 */
 	if (zio == NULL)
 		return (0);
 
 	long wait_ticks = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
 	int rc;
 
 	ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
 	ASSERT3P(zio->io_executor, ==, NULL);
 
 	zio->io_waiter = curthread;
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 
 	if (zio->io_type == ZIO_TYPE_WRITE)
 		spa_select_allocator(zio);
 
 	__zio_execute(zio);
 
 	/* Sleep until the zio no longer has an executor. */
 	mutex_enter(&zio->io_lock);
 	while (zio->io_executor != NULL) {
 		rc = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
 		    ddi_get_lbolt() + wait_ticks);
 
 		/* Only a timed-out wait with the deadman on is of interest. */
 		if (!zfs_deadman_enabled || rc != -1)
 			continue;
 
 		/* The zio has not been outstanding long enough yet. */
 		if (gethrtime() - zio->io_queued_timestamp <=
 		    spa_deadman_ziotime(zio->io_spa))
 			continue;
 
 		/*
 		 * Report the hung zio with io_lock dropped, and switch to
 		 * the check interval for subsequent waits.
 		 */
 		mutex_exit(&zio->io_lock);
 		wait_ticks = MSEC_TO_TICK(zfs_deadman_checktime_ms);
 		zio_deadman(zio, FTAG);
 		mutex_enter(&zio->io_lock);
 	}
 	mutex_exit(&zio->io_lock);
 
 	/* Capture the result before the zio is destroyed. */
 	rc = zio->io_error;
 	zio_destroy(zio);
 
 	return (rc);
 }
 
 void
 zio_nowait(zio_t *zio)
 {
 	/* A NULL zio is accepted and ignored; see comment in zio_wait(). */
 	if (zio == NULL)
 		return;
 
 	ASSERT3P(zio->io_executor, ==, NULL);
 
 	/*
 	 * A logical async I/O with no parent has nobody to wait for it.
 	 * Attach it to the spa_async_root_zio "Godfather" I/O, which will
 	 * ensure such zios complete prior to unloading the pool.
 	 */
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    list_is_empty(&zio->io_parent_list)) {
 		zio_add_child(
 		    zio->io_spa->spa_async_zio_root[CPU_SEQID_UNSTABLE], zio);
 	}
 
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 
 	if (zio->io_type == ZIO_TYPE_WRITE)
 		spa_select_allocator(zio);
 
 	__zio_execute(zio);
 }
 
 /*
  * ==========================================================================
  * Reexecute, cancel, or suspend/resume failed I/O
  * ==========================================================================
  */
 
 /*
  * Reset a previously executed logical zio to its original state and run it
  * again, together with every child found on its child list.
  *
  * 'arg' is the zio_t to reexecute (passed as a void pointer).  The asserts
  * require a ZIO_CHILD_LOGICAL zio whose original stage is ZIO_STAGE_OPEN and
  * which has no gang leader or gang tree attached.  A zio carrying
  * ZIO_FLAG_GODFATHER is reset but not restarted here.
  */
 static void
 zio_reexecute(void *arg)
 {
 	zio_t *pio = arg;
 	zio_t *cio, *cio_next, *gio;
 
 	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
 	ASSERT(pio->io_gang_leader == NULL);
 	ASSERT(pio->io_gang_tree == NULL);
 
 	mutex_enter(&pio->io_lock);
 	/* Restore the values saved in the io_orig_* fields. */
 	pio->io_flags = pio->io_orig_flags;
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_flags |= ZIO_FLAG_REEXECUTED;
 	pio->io_pipeline_trace = 0;
 	pio->io_error = 0;
 	/*
 	 * A wait state is marked as already reached when the restored stage
 	 * is at or past it; READY is also marked reached when the restored
 	 * pipeline contains no READY stage.
 	 */
 	pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) ||
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0;
 	pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE);
 
 	/*
 	 * It's possible for a failed ZIO to be a descendant of more than one
 	 * ZIO tree. When reexecuting it, we have to be sure to add its wait
 	 * states to all parent wait counts.
 	 *
 	 * Those parents, in turn, may have other children that are currently
 	 * active, usually because they've already been reexecuted after
 	 * resuming. Those children may be executing and may call
 	 * zio_notify_parent() at the same time as we're updating our parent's
 	 * counts. To avoid races while updating the counts, we take
 	 * gio->io_lock before each update.
 	 */
 	zio_link_t *zl = NULL;
 	while ((gio = zio_walk_parents(pio, &zl)) != NULL) {
 		mutex_enter(&gio->io_lock);
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++) {
 			/* Adds 1 for each wait state pio has not reached. */
 			gio->io_children[pio->io_child_type][w] +=
 			    !pio->io_state[w];
 		}
 		mutex_exit(&gio->io_lock);
 	}
 
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		pio->io_child_error[c] = 0;
 
 	if (IO_IS_ALLOCATING(pio))
 		BP_ZERO(pio->io_bp);
 
 	/*
 	 * As we reexecute pio's children, new children could be created.
 	 * New children go to the head of pio's io_child_list, however,
 	 * so we will (correctly) not reexecute them.  The key is that
 	 * the remainder of pio's io_child_list, from 'cio_next' onward,
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
 	zl = NULL;
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		/* Fetch the successor first, then recurse without the lock. */
 		cio_next = zio_walk_children(pio, &zl);
 		mutex_exit(&pio->io_lock);
 		zio_reexecute(cio);
 		mutex_enter(&pio->io_lock);
 	}
 	mutex_exit(&pio->io_lock);
 
 	/*
 	 * Now that all children have been reexecuted, execute the parent.
 	 * We don't reexecute "The Godfather" I/O here as it's the
 	 * responsibility of the caller to wait on it.
 	 */
 	if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
 		pio->io_queued_timestamp = gethrtime();
 		__zio_execute(pio);
 	}
 }
 
 /*
  * Record that the pool is suspended for 'reason' and, when a failed zio is
  * supplied, attach it to the pool's suspend root zio (created on demand) so
  * that zio_resume() can reexecute it later.
  *
  * Panics instead when the pool's failure mode is ZIO_FAILURE_MODE_PANIC.
  */
 void
 zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
 {
 	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
 		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
 		    "failure and the failure mode property for this pool "
 		    "is set to panic.", spa_name(spa));
 	}
 
 	/* Suspensions with reason ZIO_SUSPEND_MMP skip this warning. */
 	if (reason != ZIO_SUSPEND_MMP) {
 		cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
 		    "I/O failure and has been suspended.", spa_name(spa));
 	}
 
 	(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
 	    NULL, NULL, 0);
 
 	mutex_enter(&spa->spa_suspend_lock);
 
 	if (spa->spa_suspend_zio_root == NULL) {
 		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	spa->spa_suspended = reason;
 
 	/* With no zio to park, only the pool state needed updating. */
 	if (zio == NULL) {
 		mutex_exit(&spa->spa_suspend_lock);
 		return;
 	}
 
 	ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 	ASSERT(zio != spa->spa_suspend_zio_root);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio_unique_parent(zio) == NULL);
 	ASSERT(zio->io_stage == ZIO_STAGE_DONE);
 	zio_add_child(spa->spa_suspend_zio_root, zio);
 
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 /*
  * Clear the pool's suspended state, wake everyone blocked in
  * zio_resume_wait(), and reexecute whatever I/O was parked under the
  * suspend root.
  *
  * Returns 0 when nothing was parked, otherwise the result of zio_wait() on
  * the detached suspend root.
  */
 int
 zio_resume(spa_t *spa)
 {
 	zio_t *suspend_root;
 
 	/*
 	 * Reexecute all previously suspended i/o.
 	 */
 	mutex_enter(&spa->spa_suspend_lock);
 	if (spa->spa_suspended != ZIO_SUSPEND_NONE) {
 		cmn_err(CE_WARN, "Pool '%s' was suspended and is being "
 		    "resumed. Failed I/O will be retried.",
 		    spa_name(spa));
 	}
 	spa->spa_suspended = ZIO_SUSPEND_NONE;
 	cv_broadcast(&spa->spa_suspend_cv);
 
 	/* Detach the root while still holding the suspend lock. */
 	suspend_root = spa->spa_suspend_zio_root;
 	spa->spa_suspend_zio_root = NULL;
 	mutex_exit(&spa->spa_suspend_lock);
 
 	if (suspend_root != NULL) {
 		zio_reexecute(suspend_root);
 		return (zio_wait(suspend_root));
 	}
 
 	return (0);
 }
 
 /*
  * Block the caller on spa_suspend_cv until spa_suspended() reports that the
  * pool is no longer suspended.
  */
 void
 zio_resume_wait(spa_t *spa)
 {
 	mutex_enter(&spa->spa_suspend_lock);
 	for (;;) {
 		if (!spa_suspended(spa))
 			break;
 		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
 	}
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 /*
  * ==========================================================================
  * Gang blocks.
  *
  * A gang block is a collection of small blocks that looks to the DMU
  * like one large block.  When zio_dva_allocate() cannot find a block
  * of the requested size, due to either severe fragmentation or the pool
  * being nearly full, it calls zio_write_gang_block() to construct the
  * block from smaller fragments.
  *
  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
  * an indirect block: it's an array of block pointers.  It consumes
  * only one sector and hence is allocatable regardless of fragmentation.
  * The gang header's bps point to its gang members, which hold the data.
  *
  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
  * as the verifier to ensure uniqueness of the SHA256 checksum.
  * Critically, the gang block bp's blk_cksum is the checksum of the data,
  * not the gang header.  This ensures that data block signatures (needed for
  * deduplication) are independent of how the block is physically stored.
  *
  * Gang blocks can be nested: a gang member may itself be a gang block.
  * Thus every gang block is a tree in which root and all interior nodes are
  * gang headers, and the leaves are normal blocks that contain user data.
  * The root of the gang tree is called the gang leader.
  *
  * To perform any operation (read, rewrite, free, claim) on a gang block,
  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
  * in the io_gang_tree field of the original logical i/o by recursively
  * reading the gang leader and all gang headers below it.  This yields
  * an in-core tree containing the contents of every gang header and the
  * bps for every constituent of the gang block.
  *
  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
  * of the gang header plus zio_checksum_compute() of the data to update the
  * gang header's blk_cksum as described above.
  *
  * The two-phase assemble/issue model solves the problem of partial failure --
  * what if you'd freed part of a gang block but then couldn't read the
  * gang header for another part?  Assembling the entire gang tree first
  * ensures that all the necessary gang header I/O has succeeded before
  * starting the actual work of free, claim, or write.  Once the gang tree
  * is assembled, free and claim are in-memory operations that cannot fail.
  *
  * In the event that a gang write fails, zio_dva_unallocate() walks the
  * gang tree to immediately free (i.e. insert back into the space map)
  * everything we've allocated.  This ensures that we don't get ENOSPC
  * errors during repeated suspend/resume cycles due to a flaky device.
  *
  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
  * the gang tree, we won't modify the block, so we can safely defer the free
  * (knowing that the block is still intact).  If we *can* assemble the gang
  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
  * each constituent bp and we can allocate a new block on the next sync pass.
  *
  * In all cases, the gang tree allows complete recovery from partial failure.
  * ==========================================================================
  */
 
 /*
  * Done callback passed by zio_read_gang() and zio_rewrite_gang(): releases
  * the ABD attached to the finished zio.
  */
 static void
 zio_gang_issue_func_done(zio_t *zio)
 {
 	abd_t *child_abd = zio->io_abd;
 
 	abd_free(child_abd);
 }
 
 /*
  * Gang issue callback for reads.
  *
  * For a gang header (gn != NULL) nothing is read and pio itself is
  * returned.  For a gang member, a child read of bp into the slice of
  * 'data' starting at 'offset' is created and returned; it is not issued
  * here.
  */
 static zio_t *
 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	abd_t *member_abd;
 
 	if (gn != NULL)
 		return (pio);
 
 	/* Released by zio_gang_issue_func_done() when the read completes. */
 	member_abd = abd_get_offset(data, offset);
 
 	return (zio_read(pio, pio->io_spa, bp, member_abd, BP_GET_PSIZE(bp),
 	    zio_gang_issue_func_done, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark));
 }
 
 /*
  * Gang issue callback for rewrites.
  *
  * For a gang header (gn != NULL) the in-core header gn->gn_gbh is rewritten
  * to bp; for a gang member (gn == NULL) the slice of 'data' starting at
  * 'offset' is rewritten.  The child zio is created and returned, not issued.
  * In both cases the ABD handed to zio_rewrite() is released by
  * zio_gang_issue_func_done().
  */
 static zio_t *
 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	zio_t *zio;
 
 	if (gn != NULL) {
 		abd_t *gbh_abd =
 		    abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
 		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 		    &pio->io_bookmark);
 		/*
 		 * As we rewrite each gang header, the pipeline will compute
 		 * a new gang block header checksum for it; but no one will
 		 * compute a new data checksum, so we do that here.  The one
 		 * exception is the gang leader: the pipeline already computed
 		 * its data checksum because that stage precedes gang assembly.
 		 * (Presently, nothing actually uses interior data checksums;
 		 * this is just good hygiene.)
 		 */
 		if (gn != pio->io_gang_leader->io_gang_tree) {
 			/* Temporary view of the data; freed right after use. */
 			abd_t *buf = abd_get_offset(data, offset);
 
 			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
 			    buf, BP_GET_PSIZE(bp));
 
 			abd_free(buf);
 		}
 		/*
 		 * If we are here to damage data for testing purposes,
 		 * leave the GBH alone so that we can detect the damage.
 		 */
 		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 	} else {
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    abd_get_offset(data, offset), BP_GET_PSIZE(bp),
 		    zio_gang_issue_func_done, NULL, pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 	}
 
 	return (zio);
 }
 
 /*
  * Gang issue callback for frees.  The gn, data and offset arguments are
  * unused.
  *
  * Returns the zio produced by zio_free_sync() for bp, or a null zio under
  * pio when zio_free_sync() returns NULL, so the caller always gets a zio.
  */
 static zio_t *
 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	(void) gn;
 	(void) data;
 	(void) offset;
 
 	zio_t *fio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
 	    ZIO_GANG_CHILD_FLAGS(pio));
 	if (fio != NULL)
 		return (fio);
 
 	return (zio_null(pio, pio->io_spa,
 	    NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 /*
  * Gang issue callback for claims: a thin wrapper around zio_claim() for bp.
  * The gn, data and offset arguments are unused.
  */
 static zio_t *
 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	zio_t *claim_zio;
 
 	(void) gn;
 	(void) data;
 	(void) offset;
 
 	claim_zio = zio_claim(pio, pio->io_spa, pio->io_txg, bp,
 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio));
 
 	return (claim_zio);
 }
 
 /*
  * Per-operation callbacks used by zio_gang_tree_issue(), which indexes this
  * table with the gang leader's io_type.  A NULL slot means no gang callback
  * is provided for that type.
  * NOTE(review): slot order presumably mirrors the zio_type_t enum -- confirm
  * against its declaration before adding or reordering entries.
  */
 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
 	NULL,
 	zio_read_gang,
 	zio_rewrite_gang,
 	zio_free_gang,
 	zio_claim_gang,
 	NULL
 };
 
 static void zio_gang_tree_assemble_done(zio_t *zio);
 
 /*
  * Allocate a zeroed gang node together with a SPA_GANGBLOCKSIZE header
  * buffer, store it in the (required to be empty) slot *gnpp, and return it.
  */
 static zio_gang_node_t *
 zio_gang_node_alloc(zio_gang_node_t **gnpp)
 {
 	ASSERT(*gnpp == NULL);
 
 	zio_gang_node_t *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
 
 	node->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
 
 	*gnpp = node;
 	return (node);
 }
 
 /*
  * Release a single gang node and its header buffer, then clear the slot
  * *gnpp.  Every child slot of the node must already be NULL.
  */
 static void
 zio_gang_node_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *node = *gnpp;
 
 	for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++)
 		ASSERT(node->gn_child[slot] == NULL);
 
 	zio_buf_free(node->gn_gbh, SPA_GANGBLOCKSIZE);
 	kmem_free(node, sizeof (*node));
 
 	*gnpp = NULL;
 }
 
 /*
  * Recursively free the gang tree rooted at *gnpp, children first, leaving
  * *gnpp NULL.  An empty slot is a no-op.
  */
 static void
 zio_gang_tree_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *node = *gnpp;
 
 	if (node != NULL) {
 		for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++)
 			zio_gang_tree_free(&node->gn_child[slot]);
 
 		zio_gang_node_free(gnpp);
 	}
 }
 
 /*
  * Allocate a gang node in *gnpp and issue an asynchronous read of the gang
  * header that bp points at into the node's header buffer.  The read is a
  * child of the gang leader gio and completes in
  * zio_gang_tree_assemble_done().
  */
 static void
 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *node = zio_gang_node_alloc(gnpp);
 	abd_t *hdr_abd = abd_get_from_buf(node->gn_gbh, SPA_GANGBLOCKSIZE);
 	zio_t *hdr_read;
 
 	ASSERT(gio->io_gang_leader == gio);
 	ASSERT(BP_IS_GANG(bp));
 
 	hdr_read = zio_read(gio, gio->io_spa, bp, hdr_abd, SPA_GANGBLOCKSIZE,
 	    zio_gang_tree_assemble_done, node, gio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark);
 
 	zio_nowait(hdr_read);
 }
 
 /*
  * Done callback for the gang header read issued by zio_gang_tree_assemble().
  *
  * zio->io_private is the gang node whose header buffer was the read target.
  * On success the header is byteswapped if needed, and a further header read
  * is started for every block pointer in it that is itself a gang block.
  * On failure nothing more is assembled.
  *
  * The ABD attached to the zio is only a wrapper created with
  * abd_get_from_buf() around gn->gn_gbh.  It is released here on both the
  * success and the error path; the buffer itself stays owned by the gang
  * node.
  */
 static void
 zio_gang_tree_assemble_done(zio_t *zio)
 {
 	zio_t *gio = zio->io_gang_leader;
 	zio_gang_node_t *gn = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(gio == zio_unique_parent(zio));
 	ASSERT(list_is_empty(&zio->io_child_list));
 
 	if (zio->io_error) {
 		/*
 		 * Nothing else holds a reference to the ABD wrapper, and
 		 * zio_gang_tree_free() only releases the node and its
 		 * buffer, so free the wrapper here to avoid leaking it.
 		 */
 		abd_free(zio->io_abd);
 		return;
 	}
 
 	/* this ABD was created from a linear buf in zio_gang_tree_assemble */
 	if (BP_SHOULD_BYTESWAP(bp))
 		byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
 
 	ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
 	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 	abd_free(zio->io_abd);
 
 	/* Descend into every header slot that points at another gang block. */
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 		if (!BP_IS_GANG(gbp))
 			continue;
 		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
 	}
 }
 
 /*
  * Apply the gang leader's operation to bp and, when bp is a gang header,
  * recursively to every non-hole block pointer in that header.
  *
  * pio    - parent for the zio created at this level
  * gn     - gang node for bp, or NULL when bp is a gang member (leaf)
  * bp     - block pointer being processed
  * data   - ABD handed through to the per-operation callback
  * offset - position within 'data' that corresponds to bp
  *
  * The callback is picked from zio_gang_issue_func[] by the leader's io_type.
  * The zio it returns becomes the parent for the next level and is issued
  * with zio_nowait() unless the callback simply returned pio.
  */
 static void
 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
     uint64_t offset)
 {
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 
 	ASSERT(BP_IS_GANG(bp) == !!gn);
 	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
 	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
 
 	/*
 	 * If you're a gang header, your data is in gn->gn_gbh.
 	 * If you're a gang member, your data is in 'data' and gn == NULL.
 	 */
 	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
 
 	if (gn != NULL) {
 		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 			if (BP_IS_HOLE(gbp))
 				continue;
 			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
 			    offset);
 			/* Next child's data follows this one's in 'data'. */
 			offset += BP_GET_PSIZE(gbp);
 		}
 	}
 
 	/* At the root, the children must have covered the whole I/O. */
 	if (gn == gio->io_gang_tree)
 		ASSERT3U(gio->io_size, ==, offset);
 
 	if (zio != pio)
 		zio_nowait(zio);
 }
 
 /*
  * Pipeline stage: make zio its own gang leader and start assembling the
  * gang tree for its block pointer into zio->io_gang_tree.  Returns zio.
  */
 static zio_t *
 zio_gang_assemble(zio_t *zio)
 {
 	blkptr_t *gang_bp = zio->io_bp;
 
 	ASSERT(BP_IS_GANG(gang_bp) && zio->io_gang_leader == NULL);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	zio->io_gang_leader = zio;
 	zio_gang_tree_assemble(zio, gang_bp, &zio->io_gang_tree);
 
 	return (zio);
 }
 
 /*
  * Pipeline stage that runs once all gang children are done.
  *
  * Returns NULL while gang children are still outstanding.  Otherwise, if no
  * gang child failed, the operation is issued over the assembled gang tree;
  * if one did fail, the tree is freed.  Either way the pipeline is then
  * reduced to ZIO_INTERLOCK_PIPELINE and zio is returned.
  */
 static zio_t *
 zio_gang_issue(zio_t *zio)
 {
 	blkptr_t *gang_bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE))
 		return (NULL);
 
 	ASSERT(BP_IS_GANG(gang_bp) && zio->io_gang_leader == zio);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	if (zio->io_child_error[ZIO_CHILD_GANG] != 0) {
 		zio_gang_tree_free(&zio->io_gang_tree);
 	} else {
 		zio_gang_tree_issue(zio, zio->io_gang_tree, gang_bp,
 		    zio->io_abd, 0);
 	}
 
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * Copy the allocator selection (io_allocator) from parent zio pio to child
  * zio cio.  zio_write_gang_block() calls this for the gang header zio and
  * for each gang member write it creates.
  */
 static void
 zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
 {
 	cio->io_allocator = pio->io_allocator;
 }
 
 /*
  * Ready callback for a gang member write.
  *
  * Once the member's block pointer has been filled in, the allocated size of
  * each of its DVAs is added to the corresponding DVA of the parent's block
  * pointer, under the parent's io_lock.  A member whose bp is still a hole
  * contributes nothing.
  */
 static void
 zio_write_gang_member_ready(zio_t *zio)
 {
 	zio_t *parent = zio_unique_parent(zio);
 	dva_t *child_dva = zio->io_bp->blk_dva;
 	dva_t *parent_dva = parent->io_bp->blk_dva;
 	zio_t *gio __maybe_unused = zio->io_gang_leader;
 
 	if (BP_IS_HOLE(zio->io_bp))
 		return;
 
 	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
 
 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
 	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
 	ASSERT3U(parent->io_prop.zp_copies, <=, BP_GET_NDVAS(parent->io_bp));
 	VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(parent->io_bp));
 
 	mutex_enter(&parent->io_lock);
 	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
 		uint64_t total;
 
 		ASSERT(DVA_GET_GANG(&parent_dva[d]));
 
 		/* Accumulate this member's size into the parent's DVA. */
 		total = DVA_GET_ASIZE(&parent_dva[d]);
 		total += DVA_GET_ASIZE(&child_dva[d]);
 		DVA_SET_ASIZE(&parent_dva[d], total);
 	}
 	mutex_exit(&parent->io_lock);
 }
 
 /*
  * Done callback for the gang header zio and the gang member writes created
  * by zio_write_gang_block(): frees the zio's ABD if it has one.
  */
 static void
 zio_write_gang_done(zio_t *zio)
 {
 	/*
 	 * A zio with no data has a NULL io_abd.  Such a zio starts out with
 	 * ZIO_FLAG_NODATA set in io_flags, but that flag is cleared in
 	 * zio_ready, so the pointer is tested here instead of the flag.
 	 */
 	if (zio->io_abd == NULL)
 		return;
 
 	abd_free(zio->io_abd);
 }
 
 /*
  * Write pio's data as a gang block: allocate a gang header from metaslab
  * class 'mc', create a rewrite zio for that header, and issue one child
  * write per fragment of the data, each filling a block pointer slot in the
  * header.
  *
  * Always returns pio.  If the header allocation fails, pio->io_error is set
  * and no children are created.  Otherwise pio's pipeline is reduced to
  * ZIO_INTERLOCK_PIPELINE so that it just waits for the header zio.
  */
 static zio_t *
 zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 {
 	spa_t *spa = pio->io_spa;
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 	zio_gang_node_t *gn, **gnpp;
 	zio_gbh_phys_t *gbh;
 	abd_t *gbh_abd;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
 	int copies = gio->io_prop.zp_copies;
 	zio_prop_t zp;
 	int error;
 	boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
 
 	/*
-	 * If one copy was requested, store 2 copies of the GBH, so that we
-	 * can still traverse all the data (e.g. to free or scrub) even if a
-	 * block is damaged.  Note that we can't store 3 copies of the GBH in
-	 * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
+	 * Store multiple copies of the GBH, so that we can still traverse
+	 * all the data (e.g. to free or scrub) even if a block is damaged.
+	 * This value respects the redundant_metadata property.
 	 */
-	int gbh_copies = copies;
-	if (gbh_copies == 1) {
-		gbh_copies = MIN(2, spa_max_replication(spa));
-	}
+	int gbh_copies = gio->io_prop.zp_gang_copies;
+	ASSERT3S(gbh_copies, >, 0);
+	ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
 
 	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(has_data);
 
 		flags |= METASLAB_ASYNC_ALLOC;
 		VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
 		    mca_alloc_slots, pio));
 
 		/*
 		 * The logical zio has already placed a reservation for
 		 * 'copies' allocation slots but gang blocks may require
 		 * additional copies. These additional copies
 		 * (i.e. gbh_copies - copies) are guaranteed to succeed
 		 * since metaslab_class_throttle_reserve() always allows
 		 * additional reservations for gang blocks.
 		 */
+		ASSERT3U(gbh_copies, >=, copies);
 		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
 		    pio->io_allocator, pio, flags));
 	}
 
 	/* Allocate the gang header; the leader passes no hint bp. */
 	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
 	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
 	    &pio->io_alloc_list, pio, pio->io_allocator);
 	if (error) {
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(has_data);
 
 			/*
 			 * If we failed to allocate the gang block header then
 			 * we remove any additional allocation reservations that
 			 * we placed here. The original reservation will
 			 * be removed when the logical I/O goes to the ready
 			 * stage.
 			 */
 			metaslab_class_throttle_unreserve(mc,
 			    gbh_copies - copies, pio->io_allocator, pio);
 		}
 
 		pio->io_error = error;
 		return (pio);
 	}
 
 	/*
 	 * The leader hangs the new node off its own io_gang_tree; a nested
 	 * gang member uses the slot passed to it through io_private.
 	 */
 	if (pio == gio) {
 		gnpp = &gio->io_gang_tree;
 	} else {
 		gnpp = pio->io_private;
 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
 	}
 
 	gn = zio_gang_node_alloc(gnpp);
 	gbh = gn->gn_gbh;
 	memset(gbh, 0, SPA_GANGBLOCKSIZE);
 	gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
 
 	/*
 	 * Create the gang header.
 	 */
 	zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
 	    zio_write_gang_done, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 	zio_gang_inherit_allocator(pio, zio);
 
 	/*
 	 * Create and nowait the gang children.
 	 */
 	for (int g = 0; resid != 0; resid -= lsize, g++) {
 		/*
 		 * Divide what is left evenly over the header slots that
 		 * remain, rounded up to a multiple of SPA_MINBLOCKSIZE.
 		 */
 		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
 		    SPA_MINBLOCKSIZE);
 		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
 
 		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_complevel = gio->io_prop.zp_complevel;
 		zp.zp_type = zp.zp_storage_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
+		zp.zp_gang_copies = gio->io_prop.zp_gang_copies;
 		zp.zp_dedup = B_FALSE;
 		zp.zp_dedup_verify = B_FALSE;
 		zp.zp_nopwrite = B_FALSE;
 		zp.zp_encrypt = gio->io_prop.zp_encrypt;
 		zp.zp_byteorder = gio->io_prop.zp_byteorder;
 		zp.zp_direct_write = B_FALSE;
 		memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
 		memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
 		memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
 
 		/* The child's data starts where the previous ones ended. */
 		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    has_data ? abd_get_offset(pio->io_abd, pio->io_size -
 		    resid) : NULL, lsize, lsize, &zp,
 		    zio_write_gang_member_ready, NULL,
 		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 		zio_gang_inherit_allocator(zio, cio);
 
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(has_data);
 
 			/*
 			 * Gang children won't throttle but we should
 			 * account for their work, so reserve an allocation
 			 * slot for them here.
 			 */
 			VERIFY(metaslab_class_throttle_reserve(mc,
 			    zp.zp_copies, cio->io_allocator, cio, flags));
 		}
 		zio_nowait(cio);
 	}
 
 	/*
 	 * Set pio's pipeline to just wait for zio to finish.
 	 */
 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	zio_nowait(zio);
 
 	return (pio);
 }
 
 /*
  * The zio_nop_write stage in the pipeline determines if allocating a
  * new bp is necessary.  The nopwrite feature can handle writes in
  * either syncing or open context (i.e. zil writes) and as a result is
  * mutually exclusive with dedup.
  *
  * By leveraging a cryptographically secure checksum, such as SHA256, we
  * can compare the checksums of the new data and the old to determine if
  * allocating a new block is required.  Note that our requirements for
  * cryptographic strength are fairly weak: there can't be any accidental
  * hash collisions, but we don't need to be secure against intentional
  * (malicious) collisions.  To trigger a nopwrite, you have to be able
  * to write the file to begin with, and triggering an incorrect (hash
  * collision) nopwrite is no worse than simply writing to the file.
  * That said, there are no known attacks against the checksum algorithms
  * used for nopwrite, assuming that the salt and the checksums
  * themselves remain secret.
  */
 static zio_t *
 zio_nop_write(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	zio_prop_t *zp = &zio->io_prop;
 	boolean_t on_indirect = B_FALSE;
 	int cksum_match;
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT(BP_GET_LEVEL(bp) == 0);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(zp->zp_nopwrite);
 	ASSERT(!zp->zp_dedup);
 	ASSERT(zio->io_bp_override == NULL);
 	ASSERT(IO_IS_ALLOCATING(zio));
 
 	/*
 	 * The old and new bps must agree on checksum, compression, dedup
 	 * and copy count, use a nopwrite-capable checksum, and be
 	 * unencrypted.  Anything else carries on with the normal pipeline,
 	 * which allocates a new bp.
 	 */
 	if (BP_IS_HOLE(bp_orig) ||
 	    !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
 	    ZCHECKSUM_FLAG_NOPWRITE) ||
 	    BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
 	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
 	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
 	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
 	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
 		return (zio);
 
 	/* Different checksums mean different data: write it normally. */
 	cksum_match = ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum);
 	if (!cksum_match)
 		return (zio);
 
 	ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
 	    ZCHECKSUM_FLAG_NOPWRITE);
 	ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
 	ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
 	ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
 	ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop);
 
 	/*
 	 * A block that currently lives on an indirect vdev is not kept:
 	 * the nopwrite request is ignored so that a new block gets
 	 * allocated on a concrete vdev.
 	 */
 	spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER);
 	for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) {
 		vdev_t *tvd = vdev_lookup_top(zio->io_spa,
 		    DVA_GET_VDEV(&bp_orig->blk_dva[d]));
 		if (tvd->vdev_ops == &vdev_indirect_ops) {
 			on_indirect = B_TRUE;
 			break;
 		}
 	}
 	spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
 
 	if (on_indirect)
 		return (zio);
 
 	/*
 	 * Reuse the original bp and cut the pipeline short so that no new
 	 * bp is allocated and no I/O is issued.
 	 */
 	*bp = *bp_orig;
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	zio->io_flags |= ZIO_FLAG_NOPWRITE;
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Block Reference Table
  * ==========================================================================
  */
 /*
  * Pipeline stage that drops a Block Reference Table reference for the
  * block being freed.
  *
  * Indirect and metadata blocks, and blocks for which brt_maybe_exists()
  * reports no entry, pass straight through.  When brt_entry_decref()
  * returns false, the pipeline is reduced to ZIO_INTERLOCK_PIPELINE so the
  * data is not freed.  Always returns zio.
  */
 static zio_t *
 zio_brt_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (BP_GET_LEVEL(bp) > 0 || BP_IS_METADATA(bp) ||
 	    !brt_maybe_exists(zio->io_spa, bp))
 		return (zio);
 
 	if (brt_entry_decref(zio->io_spa, bp))
 		return (zio);
 
 	/*
 	 * This isn't the last reference, so we cannot free
 	 * the data yet.
 	 */
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Dedup
  * ==========================================================================
  */
 /*
  * Done callback for the reads of alternate dedup copies issued by
  * zio_ddt_read_start().  zio->io_private is the ddt entry under repair.
  *
  * Under the parent's io_lock: a successful read clears the phys variant
  * selected for bp in the entry, and the first successfully read ABD is kept
  * as dde_repair_abd.  Every other ABD (failed reads, or later successes) is
  * freed.
  */
 static void
 zio_ddt_child_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt;
 	ddt_entry_t *dde = zio->io_private;
 	zio_t *pio = zio_unique_parent(zio);
 
 	mutex_enter(&pio->io_lock);
 	ddt = ddt_select(zio->io_spa, bp);
 
 	if (zio->io_error == 0) {
 		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
 		/* this phys variant doesn't need repair */
 		ddt_phys_clear(dde->dde_phys, v);
 	}
 
 	/* Ownership of the ABD moves to the entry only for the first hit. */
 	if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
 		dde->dde_io->dde_repair_abd = zio->io_abd;
 	else
 		abd_free(zio->io_abd);
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Pipeline stage that starts the read of a dedup block.  Always returns zio.
  *
  * With no DDT child error recorded, a single DDT child read of bp into
  * zio->io_abd is issued.  With a DDT child error already recorded, the
  * repair path runs instead: the ddt entry is obtained via
  * ddt_repair_start() and stored in io_vsd, and a read is issued for every
  * other phys variant that has a nonzero birth.
  */
 static zio_t *
 zio_ddt_read_start(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
 		ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp);
 		ddt_univ_phys_t *ddp = dde->dde_phys;
 		blkptr_t blk;
 
 		/* Hand the entry to zio_ddt_read_done() through io_vsd. */
 		ASSERT(zio->io_vsd == NULL);
 		zio->io_vsd = dde;
 
 		if (v_self == DDT_PHYS_NONE)
 			return (zio);
 
 		/* issue I/O for the other copies */
 		for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 			ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
 			if (ddt_phys_birth(ddp, v) == 0 || v == v_self)
 				continue;
 
 			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
 			    ddp, v, &blk);
 			/* Each read gets its own buffer; see the done cb. */
 			zio_nowait(zio_read(zio, zio->io_spa, &blk,
 			    abd_alloc_for_io(zio->io_size, B_TRUE),
 			    zio->io_size, zio_ddt_child_read_done, dde,
 			    zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
 			    ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
 		}
 		return (zio);
 	}
 
 	zio_nowait(zio_read(zio, zio->io_spa, bp,
 	    zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
 	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
 
 	return (zio);
 }
 
 /*
  * Pipeline stage that finishes a dedup read once all DDT children are done.
  *
  * Returns NULL while DDT children are outstanding, and also after
  * re-dispatching the zio when a DDT child failed but no repair entry has
  * been set up yet.  Otherwise returns zio.  When a repair buffer was
  * obtained, its contents are copied into zio->io_abd and the DDT child
  * error is cleared.
  */
 static zio_t *
 zio_ddt_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = zio->io_vsd;
 		if (ddt == NULL) {
 			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
 			return (zio);
 		}
 		if (dde == NULL) {
 			/*
 			 * No repair attempt yet.  NOTE(review): the stage is
 			 * presumably rewound so that the dispatched zio
 			 * re-enters zio_ddt_read_start() and takes its repair
 			 * path -- confirm against the pipeline executor.
 			 */
 			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 			return (NULL);
 		}
 		if (dde->dde_io->dde_repair_abd != NULL) {
 			abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd,
 			    zio->io_size);
 			zio->io_child_error[ZIO_CHILD_DDT] = 0;
 		}
 		ddt_repair_done(ddt, dde);
 		zio->io_vsd = NULL;
 	}
 
 	ASSERT(zio->io_vsd == NULL);
 
 	return (zio);
 }
 
 /*
  * Decide whether the data carried by zio differs from the data already
  * associated with ddt entry dde.
  *
  * Returns B_TRUE when sizes or contents differ, or when the on-disk copy
  * could not be read for comparison.  Returns B_FALSE when the data matches
  * or when there is nothing to compare against.
  *
  * The comparison is made against the first in-flight lead zio if there is
  * one; otherwise against the first phys variant with a nonzero birth, which
  * is read back from the pool.  ddt_exit()/ddt_enter() bracket those reads.
  * NOTE(review): the caller presumably holds the ddt lock on entry -- confirm.
  */
 static boolean_t
 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 {
 	spa_t *spa = zio->io_spa;
 	boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
 
 	ASSERT(!(zio->io_bp_override && do_raw));
 
 	/*
 	 * Note: we compare the original data, not the transformed data,
 	 * because when zio->io_bp is an override bp, we will not have
 	 * pushed the I/O transforms.  That's an important optimization
 	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
 	 * However, we should never get a raw, override zio so in these
 	 * cases we can compare the io_abd directly. This is useful because
 	 * it allows us to do dedup verification even if we don't have access
 	 * to the original data (for instance, if the encryption keys aren't
 	 * loaded).
 	 */
 
 	/* First choice: compare in memory against an in-flight lead zio. */
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		if (DDT_PHYS_IS_DITTO(ddt, p))
 			continue;
 
 		if (dde->dde_io == NULL)
 			continue;
 
 		zio_t *lio = dde->dde_io->dde_lead_zio[p];
 		if (lio == NULL)
 			continue;
 
 		if (do_raw)
 			return (lio->io_size != zio->io_size ||
 			    abd_cmp(zio->io_abd, lio->io_abd) != 0);
 
 		return (lio->io_orig_size != zio->io_orig_size ||
 		    abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
 	}
 
 	/* Otherwise compare against the first phys variant already born. */
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 		uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v);
 
 		if (phys_birth != 0 && do_raw) {
 			blkptr_t blk = *zio->io_bp;
 			uint64_t psize;
 			abd_t *tmpabd;
 			int error;
 
 			ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
 			psize = BP_GET_PSIZE(&blk);
 
 			if (psize != zio->io_size)
 				return (B_TRUE);
 
 			ddt_exit(ddt);
 
 			tmpabd = abd_alloc_for_io(psize, B_TRUE);
 
 			/* Raw read: compared as stored, without transforms. */
 			error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
 			    psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_RAW, &zio->io_bookmark));
 
 			if (error == 0) {
 				if (abd_cmp(tmpabd, zio->io_abd) != 0)
 					error = SET_ERROR(ENOENT);
 			}
 
 			abd_free(tmpabd);
 			ddt_enter(ddt);
 			return (error != 0);
 		} else if (phys_birth != 0) {
 			arc_buf_t *abuf = NULL;
 			arc_flags_t aflags = ARC_FLAG_WAIT;
 			blkptr_t blk = *zio->io_bp;
 			int error;
 
 			ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
 
 			if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
 				return (B_TRUE);
 
 			ddt_exit(ddt);
 
 			/* Non-raw: read through the ARC, compare originals. */
 			error = arc_read(NULL, spa, &blk,
 			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zio->io_bookmark);
 
 			if (error == 0) {
 				if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
 				    zio->io_orig_size) != 0)
 					error = SET_ERROR(ENOENT);
 				arc_buf_destroy(abuf, &abuf);
 			}
 
 			ddt_enter(ddt);
 			return (error != 0);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Done callback for the physical write behind a dedup entry.
  * zio->io_private is the ddt entry; the phys slot is derived from the zio's
  * zp_copies.
  *
  * Under the ddt lock: the lead-zio slot is cleared if this zio holds it.
  * On error the phys variant is restored from the saved dde_orig_phys copy.
  * On success the saved copy is cleared or refreshed, and one reference is
  * added for every parent that is not itself a DDT child.
  */
 static void
 zio_ddt_child_write_done(zio_t *zio)
 {
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 
 	/* The zio must have at least one parent. */
 	zio_link_t *zl = NULL;
 	ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
 
 	int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
 	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 	ddt_univ_phys_t *ddp = dde->dde_phys;
 
 	ddt_enter(ddt);
 
 	/* we're the lead, so once we're done there's no one else outstanding */
 	if (dde->dde_io->dde_lead_zio[p] == zio)
 		dde->dde_io->dde_lead_zio[p] = NULL;
 
 	ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys;
 
 	if (zio->io_error != 0) {
 		/*
 		 * The write failed, so we're about to abort the entire IO
 		 * chain. We need to revert the entry back to what it was at
 		 * the last time it was successfully extended.
 		 */
 		ddt_phys_copy(ddp, orig, v);
 		ddt_phys_clear(orig, v);
 
 		ddt_exit(ddt);
 		return;
 	}
 
 	/*
 	 * We've successfully added new DVAs to the entry. Clear the saved
 	 * state or, if there's still outstanding IO, remember it so we can
 	 * revert to a known good state if that IO fails.
 	 */
 	if (dde->dde_io->dde_lead_zio[p] == NULL)
 		ddt_phys_clear(orig, v);
 	else
 		ddt_phys_copy(orig, ddp, v);
 
 	/*
 	 * Add references for all dedup writes that were waiting on the
 	 * physical one, skipping any other physical writes that are waiting.
 	 */
 	zio_t *pio;
 	zl = NULL;
 	while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
 		if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
 			ddt_phys_addref(ddp, v);
 	}
 
 	ddt_exit(ddt);
 }
 
 /*
  * Ready callback for a DDT child write whose io_private is the DDT entry.
  *
  * If the write has not failed, extend the entry's phys slot with the DVAs
  * from zio->io_bp and fill the block pointer of every parent that is not
  * itself a DDT child, all under the DDT lock.
  */
 static void
 zio_ddt_child_write_ready(zio_t *zio)
 {
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 
 	/* We expect to have at least one parent. */
 	zio_link_t *zl = NULL;
 	ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
 
 	int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
 	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
 	if (zio->io_error == 0) {
 		ddt_enter(ddt);
 
 		ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
 
 		zl = NULL;
 		for (zio_t *pio = zio_walk_parents(zio, &zl); pio != NULL;
 		    pio = zio_walk_parents(zio, &zl)) {
 			if (pio->io_flags & ZIO_FLAG_DDT_CHILD)
 				continue;
 			ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
 		}
 
 		ddt_exit(ddt);
 	}
 }
 
 /*
  * Handle a write whose block pointer has the dedup bit set.
  *
  * Looks up the DDT entry for the block under the DDT lock and then does one
  * of the following before returning:
  *
  * - no entry could be obtained: turn dedup off for this write;
  * - a verify collision was detected: either restart the zio with the
  *   dedup checksum or convert it to an ordinary write;
  * - the entry already has enough DVAs: fill the bp from the entry, taking
  *   a reference directly or attaching to the in-flight lead write;
  * - otherwise: issue a child write for the missing DVAs and make it the
  *   lead zio for the entry's phys slot.
  *
  * Every path releases the DDT lock and returns the zio that was passed in.
  */
 static zio_t *
 zio_ddt_write(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t txg = zio->io_txg;
 	zio_prop_t *zp = &zio->io_prop;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
 	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
 	ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
 	/*
 	 * Deduplication will not take place for Direct I/O writes. The
 	 * ddt_tree will be emptied in syncing context. Direct I/O writes take
 	 * place in the open-context. Direct I/O write can not attempt to
 	 * modify the ddt_tree while issuing out a write.
 	 */
 	ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE);
 
 	ddt_enter(ddt);
 	/*
 	 * Search DDT for matching entry.  Skip DVAs verification here, since
 	 * they can go only from override, and once we get here the override
 	 * pointer can't have "D" flag to be confused with pruned DDT entries.
 	 */
 	IMPLY(zio->io_bp_override, !BP_GET_DEDUP(zio->io_bp_override));
 	dde = ddt_lookup(ddt, bp, B_FALSE);
 	if (dde == NULL) {
 		/* DDT size is over its quota so no new entries */
 		zp->zp_dedup = B_FALSE;
 		BP_SET_DEDUP(bp, B_FALSE);
 		if (zio->io_bp_override == NULL)
 			zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (zio);
 	}
 
 	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
 		/*
 		 * If we're using a weak checksum, upgrade to a strong checksum
 		 * and try again.  If we're already using a strong checksum,
 		 * we can't resolve it, so just convert to an ordinary write.
 		 * (And automatically e-mail a paper to Nature?)
 		 */
 		if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP)) {
 			zp->zp_checksum = spa_dedup_checksum(spa);
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
 			BP_ZERO(bp);
 		} else {
 			zp->zp_dedup = B_FALSE;
 			BP_SET_DEDUP(bp, B_FALSE);
 		}
 		ASSERT(!BP_GET_DEDUP(bp));
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (zio);
 	}
 
 	int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
 	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 	ddt_univ_phys_t *ddp = dde->dde_phys;
 
 	/*
 	 * In the common cases, at this point we have a regular BP with no
 	 * allocated DVAs, and the corresponding DDT entry for its checksum.
 	 * Our goal is to fill the BP with enough DVAs to satisfy its copies=
 	 * requirement.
 	 *
 	 * One of three things needs to happen to fulfill this:
 	 *
 	 * - if the DDT entry has enough DVAs to satisfy the BP, we just copy
 	 *   them out of the entry and return;
 	 *
 	 * - if the DDT entry has no DVAs (i.e. it's brand new), then we have
 	 *   to issue the write as normal so that DVAs can be allocated and the
 	 *   data land on disk. We then copy the DVAs into the DDT entry on
 	 *   return.
 	 *
 	 * - if the DDT entry has some DVAs, but too few, we have to issue the
 	 *   write, adjusted to allocate fewer copies. When it returns, we
 	 *   add the new DVAs to the DDT entry, and update the BP to have the
 	 *   full amount it originally requested.
 	 *
 	 * In all cases, if there's already a writing IO in flight, we need to
 	 * defer the action until after the write is done. If our action is to
 	 * write, we need to adjust our request for additional DVAs to match
 	 * what will be in the DDT entry after it completes. In this way every
 	 * IO can be guaranteed to receive enough DVAs simply by joining the
 	 * end of the chain and letting the sequence play out.
 	 */
 
 	/*
 	 * Number of DVAs in the DDT entry. If the BP is encrypted we ignore
 	 * the third one as normal.
 	 */
 	int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
 	IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
 
 	/* Number of DVAs requested by the IO. */
 	uint8_t need_dvas = zp->zp_copies;
 
 	/*
 	 * What we do next depends on whether or not there's IO outstanding that
 	 * will update this entry.
 	 */
 	if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
 		/*
 		 * No IO outstanding, so we only need to worry about ourselves.
 		 */
 
 		/*
 		 * Override BPs bring their own DVAs and their own problems.
 		 */
 		if (zio->io_bp_override) {
 			/*
 			 * For a brand-new entry, all the work has been done
 			 * for us, and we can just fill it out from the provided
 			 * block and leave.
 			 */
 			if (have_dvas == 0) {
 				ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
 				ASSERT(BP_EQUAL(bp, zio->io_bp_override));
 				ddt_phys_extend(ddp, v, bp);
 				ddt_phys_addref(ddp, v);
 				ddt_exit(ddt);
 				return (zio);
 			}
 
 			/*
 			 * If we already have this entry, then we want to treat
 			 * it like a regular write. To do this we just wipe
 			 * them out and proceed like a regular write.
 			 *
 			 * Even if there are some DVAs in the entry, we still
 			 * have to clear them out. We can't use them to fill
 			 * out the dedup entry, as they are all referenced
 			 * together by a bp already on disk, and will be freed
 			 * as a group.
 			 */
 			BP_ZERO_DVAS(bp);
 			BP_SET_BIRTH(bp, 0, 0);
 		}
 
 		/*
 		 * If there are enough DVAs in the entry to service our request,
 		 * then we can just use them as-is.
 		 */
 		if (have_dvas >= need_dvas) {
 			ddt_bp_fill(ddp, v, bp, txg);
 			ddt_phys_addref(ddp, v);
 			ddt_exit(ddt);
 			return (zio);
 		}
 
 		/*
 		 * Otherwise, we have to issue IO to fill the entry up to the
 		 * amount we need.
 		 */
 		need_dvas -= have_dvas;
 	} else {
 		/*
 		 * There's a write in-flight. If there's already enough DVAs on
 		 * the entry, then either there were already enough to start
 		 * with, or the in-flight IO is between READY and DONE, and so
 		 * has extended the entry with new DVAs. Either way, we don't
 		 * need to do anything, we can just slot in behind it.
 		 */
 
 		if (zio->io_bp_override) {
 			/*
 			 * If there's a write out, then we're soon going to
 			 * have our own copies of this block, so clear out the
 			 * override block and treat it as a regular dedup
 			 * write. See comment above.
 			 */
 			BP_ZERO_DVAS(bp);
 			BP_SET_BIRTH(bp, 0, 0);
 		}
 
 		if (have_dvas >= need_dvas) {
 			/*
 			 * A minor point: there might already be enough
 			 * committed DVAs in the entry to service our request,
 			 * but we don't know which are completed and which are
 			 * allocated but not yet written. In this case, should
 			 * the IO for the new DVAs fail, we will be on the end
 			 * of the IO chain and will also receive an error, even
 			 * though our request could have been serviced.
 			 *
 			 * This is an extremely rare case, as it requires the
 			 * original block to be copied with a request for a
 			 * larger number of DVAs, then copied again requesting
 			 * the same (or already fulfilled) number of DVAs while
 			 * the first request is active, and then that first
 			 * request errors. In return, the logic required to
 			 * catch and handle it is complex. For now, I'm just
 			 * not going to bother with it.
 			 */
 
 			/*
 			 * We always fill the bp here as we may have arrived
 			 * after the in-flight write has passed READY, and so
 			 * missed out.
 			 */
 			ddt_bp_fill(ddp, v, bp, txg);
 			zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
 			ddt_exit(ddt);
 			return (zio);
 		}
 
 		/*
 		 * There's not enough in the entry yet, so we need to look at
 		 * the write in-flight and see how many DVAs it will have once
 		 * it completes.
 		 *
 		 * The in-flight write has potentially had its copies request
 		 * reduced (if we're filling out an existing entry), so we need
 		 * to reach in and get the original write to find out what it is
 		 * expecting.
 		 *
 		 * Note that the parent of the lead zio will always have the
 		 * highest zp_copies of any zio in the chain, because ones that
 		 * can be serviced without additional IO are always added to
 		 * the back of the chain.
 		 */
 		zio_link_t *zl = NULL;
 		zio_t *pio =
 		    zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
 		ASSERT(pio);
 		uint8_t parent_dvas = pio->io_prop.zp_copies;
 
 		if (parent_dvas >= need_dvas) {
 			zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
 			ddt_exit(ddt);
 			return (zio);
 		}
 
 		/*
 		 * Still not enough, so we will need to issue to get the
 		 * shortfall.
 		 */
 		need_dvas -= parent_dvas;
 	}
 
 	/*
 	 * We need to write. We will create a new write with the copies
 	 * property adjusted to match the number of DVAs we need to
 	 * grow the DDT entry by to satisfy the request.
 	 */
 	zio_prop_t czp = *zp;
-	czp.zp_copies = need_dvas;
+	czp.zp_copies = czp.zp_gang_copies = need_dvas;
 	zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 	    zio->io_orig_size, zio->io_orig_size, &czp,
 	    zio_ddt_child_write_ready, NULL,
 	    zio_ddt_child_write_done, dde, zio->io_priority,
 	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 	zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
 
 	/*
 	 * We are the new lead zio, because our parent has the highest
 	 * zp_copies that has been requested for this entry so far.
 	 */
 	ddt_alloc_entry_io(dde);
 	if (dde->dde_io->dde_lead_zio[p] == NULL) {
 		/*
 		 * First time out, take a copy of the stable entry to revert
 		 * to if there's an error (see zio_ddt_child_write_done())
 		 */
 		ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
 	} else {
 		/*
 		 * Make the existing chain our child, because it cannot
 		 * complete until we have.
 		 */
 		zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
 	}
 	dde->dde_io->dde_lead_zio[p] = cio;
 
 	ddt_exit(ddt);
 
 	/* Issue the child write only after the DDT lock has been dropped. */
 	zio_nowait(cio);
 
 	return (zio);
 }
 
 static ddt_entry_t *freedde; /* for debugging */
 
 /*
  * Free a dedup block: drop one reference on the matching DDT phys variant
  * if the entry exists; if no entry is found, clear the dedup bit and add
  * the DVA_FREE stage so the block is freed directly.  Always returns zio.
  */
 static zio_t *
 zio_ddt_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(zio->io_spa, bp);
 	ddt_entry_t *entry;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	ddt_enter(ddt);
 	entry = ddt_lookup(ddt, bp, B_TRUE);
 	freedde = entry;
 	if (entry != NULL) {
 		ddt_phys_variant_t v = ddt_phys_select(ddt, entry, bp);
 		if (v != DDT_PHYS_NONE)
 			ddt_phys_decref(entry->dde_phys, v);
 	}
 	ddt_exit(ddt);
 
 	if (entry != NULL)
 		return (zio);
 
 	/*
 	 * No entry was found, so it must have been pruned.  Free the block
 	 * now rather than decrementing a refcount in the DDT.
 	 */
 	BP_SET_DEDUP(bp, 0);
 	zio->io_pipeline |= ZIO_STAGE_DVA_FREE;
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Allocate and free blocks
  * ==========================================================================
  */
 
 /*
  * Pick the first zio queued on the given allocator's tree and try to place
  * a throttle reservation for it.  On success the zio is removed from the
  * tree and returned; NULL is returned if the tree is empty or the
  * reservation could not be made.  The caller must hold the allocator's
  * spaa_lock.
  */
 static zio_t *
 zio_io_to_allocate(spa_t *spa, int allocator)
 {
 	ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock));
 
 	zio_t *head = avl_first(&spa->spa_allocs[allocator].spaa_tree);
 	if (head == NULL)
 		return (NULL);
 
 	ASSERT(IO_IS_ALLOCATING(head));
 	ASSERT(ZIO_HAS_ALLOCATOR(head));
 	ASSERT3U(head->io_allocator, ==, allocator);
 
 	/*
 	 * Only hand the zio out if a reservation can be placed for it;
 	 * otherwise it stays queued and we throttle.
 	 */
 	if (metaslab_class_throttle_reserve(head->io_metaslab_class,
 	    head->io_prop.zp_copies, allocator, head, 0)) {
 		avl_remove(&spa->spa_allocs[allocator].spaa_tree, head);
 		ASSERT3U(head->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
 		return (head);
 	}
 
 	return (NULL);
 }
 
 /*
  * Throttle stage for allocating writes.  Writes that are exempt are
  * returned unchanged; all others are queued on their allocator's tree, and
  * whichever zio zio_io_to_allocate() hands back (possibly NULL) is returned.
  */
 static zio_t *
 zio_dva_throttle(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	/* locate an appropriate allocation class */
 	metaslab_class_t *mc = spa_preferred_class(spa, zio);
 
 	/* The following kinds of I/O are never throttled. */
 	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
 		return (zio);
 	if (!mc->mc_alloc_throttle_enabled)
 		return (zio);
 	if (zio->io_child_type == ZIO_CHILD_GANG)
 		return (zio);
 	if (zio->io_flags & ZIO_FLAG_NODATA)
 		return (zio);
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
 	int allocator = zio->io_allocator;
 	zio->io_metaslab_class = mc;
 
 	/* Queue ourselves, then see which queued zio may proceed. */
 	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
 	zio_t *next = zio_io_to_allocate(spa, allocator);
 	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 
 	return (next);
 }
 
 /*
  * If a queued zio on the given allocator can now obtain a reservation,
  * dispatch it to the issue taskq.
  */
 static void
 zio_allocate_dispatch(spa_t *spa, int allocator)
 {
 	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	zio_t *next = zio_io_to_allocate(spa, allocator);
 	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 
 	if (next != NULL) {
 		ASSERT3U(next->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
 		ASSERT0(next->io_error);
 		zio_taskq_dispatch(next, ZIO_TASKQ_ISSUE, B_TRUE);
 	}
 }
 
 /*
  * Allocate DVAs for zio->io_bp from the zio's metaslab class.
  *
  * On ENOSPC from a class other than the normal class, the allocation is
  * retried in the normal class.  If the result is still ENOSPC and the I/O
  * is larger than SPA_MINBLOCKSIZE, the write is handed to
  * zio_write_gang_block() and its result is returned.  Any other failure is
  * recorded in zio->io_error and zio is returned.
  */
 static zio_t *
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	metaslab_class_t *mc;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
 
 	/* No gang leader recorded yet, so this zio becomes its own. */
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 		zio->io_gang_leader = zio;
 	}
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT0(BP_GET_NDVAS(bp));
 	ASSERT3U(zio->io_prop.zp_copies, >, 0);
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
 	/* Translate zio flags and priority into metaslab allocation flags. */
 	if (zio->io_flags & ZIO_FLAG_NODATA)
 		flags |= METASLAB_DONT_THROTTLE;
 	if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
 		flags |= METASLAB_GANG_CHILD;
 	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
 		flags |= METASLAB_ASYNC_ALLOC;
 
 	/*
 	 * if not already chosen, locate an appropriate allocation class
 	 */
 	mc = zio->io_metaslab_class;
 	if (mc == NULL) {
 		mc = spa_preferred_class(spa, zio);
 		zio->io_metaslab_class = mc;
 	}
 	ZIOSTAT_BUMP(ziostat_total_allocations);
 
 	/*
 	 * Try allocating the block in the usual metaslab class.
 	 * If that's full, allocate it in the normal class.
 	 * If that's full, allocate as a gang block,
 	 * and if all are full, the allocation fails (which shouldn't happen).
 	 *
 	 * Note that we do not fall back on embedded slog (ZIL) space, to
 	 * preserve unfragmented slog space, which is critical for decent
 	 * sync write performance.  If a log allocation fails, we will fall
 	 * back to spa_sync() which is abysmal for performance.
 	 */
 	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 	    &zio->io_alloc_list, zio, zio->io_allocator);
 
 	/*
 	 * Fallback to normal class when an alloc class is full
 	 */
 	if (error == ENOSPC && mc != spa_normal_class(spa)) {
 		/*
 		 * When the dedup or special class is spilling into the  normal
 		 * class, there can still be significant space available due
 		 * to deferred frees that are in-flight.  We track the txg when
 		 * this occurred and back off adding new DDT entries for a few
 		 * txgs to allow the free blocks to be processed.
 		 */
 		if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) &&
 		    mc == spa_special_class(spa))) &&
 		    spa->spa_dedup_class_full_txg != zio->io_txg) {
 			spa->spa_dedup_class_full_txg = zio->io_txg;
 			zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, "
 			    "%llu allocated of %llu",
 			    spa_name(spa), (int)zio->io_txg,
 			    mc == spa_dedup_class(spa) ? "dedup" : "special",
 			    (int)zio->io_size,
 			    (u_longlong_t)metaslab_class_get_alloc(mc),
 			    (u_longlong_t)metaslab_class_get_space(mc));
 		}
 
 		/*
 		 * If throttling, transfer reservation over to normal class.
 		 * The io_allocator slot can remain the same even though we
 		 * are switching classes.
 		 */
 		if (mc->mc_alloc_throttle_enabled &&
 		    (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
 			metaslab_class_throttle_unreserve(mc,
 			    zio->io_prop.zp_copies, zio->io_allocator, zio);
 			zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
 
 			VERIFY(metaslab_class_throttle_reserve(
 			    spa_normal_class(spa),
 			    zio->io_prop.zp_copies, zio->io_allocator, zio,
 			    flags | METASLAB_MUST_RESERVE));
 		}
 		/* From here on the zio allocates from the normal class. */
 		zio->io_metaslab_class = mc = spa_normal_class(spa);
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
 			    "trying normal class: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
 		error = metaslab_alloc(spa, mc, zio->io_size, bp,
 		    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 		    &zio->io_alloc_list, zio, zio->io_allocator);
 	}
 
 	/*
 	 * Still out of space: fall back to a gang block, unless the I/O is
 	 * already no larger than SPA_MINBLOCKSIZE.
 	 */
 	if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
 			    "trying ganging: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 		ZIOSTAT_BUMP(ziostat_gang_writes);
 		if (flags & METASLAB_GANG_CHILD)
 			ZIOSTAT_BUMP(ziostat_gang_multilevel);
 		return (zio_write_gang_block(zio, mc));
 	}
 	if (error != 0) {
 		/* ENOSPC is only logged when allocation debugging is on. */
 		if (error != ENOSPC ||
 		    (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) {
 			zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
 			    "size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 		zio->io_error = error;
 	}
 
 	return (zio);
 }
 
 /*
  * Free the block described by zio->io_bp via metaslab_free() and return
  * zio.
  */
 static zio_t *
 zio_dva_free(zio_t *zio)
 {
 	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
 	return (zio);
 }
 
 /*
  * Claim the block described by zio->io_bp via metaslab_claim().  A
  * non-zero result is recorded in zio->io_error.  Always returns zio.
  */
 static zio_t *
 zio_dva_claim(zio_t *zio)
 {
 	int error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
 
 	if (error != 0)
 		zio->io_error = error;
 
 	return (zio);
 }
 
 /*
  * Give back a block that was just allocated.  zio_done() uses this when an
  * I/O fails.  Both normal blocks and gang blocks are handled: for a gang
  * node, every child block pointer in its gang header is unallocated
  * recursively.
  */
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
 	ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
 	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp)) {
 		metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
 		    B_TRUE);
 	}
 
 	/* Not a gang block: nothing further to release. */
 	if (gn == NULL)
 		return;
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 		zio_gang_node_t *child = gn->gn_child[g];
 		blkptr_t *child_bp = &gn->gn_gbh->zg_blkptr[g];
 
 		zio_dva_unallocate(zio, child, child_bp);
 	}
 }
 
 /*
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  *
  * The log class is tried first, then the embedded log class, then the
  * normal class.  *slog is set to B_TRUE only if the first (log class)
  * attempt succeeded.  On success new_bp is filled in as an uncompressed,
  * non-dedup intent-log block, including crypt parameters when the objset
  * is encrypted.
  */
 int
 zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
     uint64_t size, boolean_t *slog)
 {
 	zio_alloc_list_t io_alloc_list;
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
 	metaslab_trace_init(&io_alloc_list);
 
 	/*
 	 * Metaslabs use block pointer fields for stats and debugging, so
 	 * fill in the obvious ones before calling into metaslab_alloc().
 	 */
 	BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 	BP_SET_PSIZE(new_bp, size);
 	BP_SET_LEVEL(new_bp, 0);
 
 	/*
 	 * All we know about the final destination of a zil block is the
 	 * objset it belongs to, so hash the objset ID to pick the allocator
 	 * and get some parallelism.
 	 */
 	int flags = METASLAB_ZIL;
 	int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object)
 	    % spa->spa_alloc_count;
 	ZIOSTAT_BUMP(ziostat_total_allocations);
 
 	int error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
 	    txg, NULL, flags, &io_alloc_list, NULL, allocator);
 	*slog = (error == 0);
 	if (error != 0) {
 		error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
 		    new_bp, 1, txg, NULL, flags,
 		    &io_alloc_list, NULL, allocator);
 		if (error != 0) {
 			ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
 			error = metaslab_alloc(spa, spa_normal_class(spa),
 			    size, new_bp, 1, txg, NULL, flags,
 			    &io_alloc_list, NULL, allocator);
 		}
 	}
 	metaslab_trace_fini(&io_alloc_list);
 
 	if (error != 0) {
 		zfs_dbgmsg("%s: zil block allocation failure: "
 		    "size %llu, error %d", spa_name(spa), (u_longlong_t)size,
 		    error);
 		return (error);
 	}
 
 	BP_SET_LSIZE(new_bp, size);
 	BP_SET_PSIZE(new_bp, size);
 	BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
 	if (spa_version(spa) >= SPA_VERSION_SLIM_ZIL) {
 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG2);
 	} else {
 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
 	}
 	BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 	BP_SET_LEVEL(new_bp, 0);
 	BP_SET_DEDUP(new_bp, 0);
 	BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
 
 	/*
 	 * Encrypted blocks need an IV and salt.  Generate them now, since
 	 * the bp will not be rewritten at rewrite time.
 	 */
 	if (os->os_encrypted) {
 		uint8_t iv[ZIO_DATA_IV_LEN];
 		uint8_t salt[ZIO_DATA_SALT_LEN];
 
 		BP_SET_CRYPT(new_bp, B_TRUE);
 		VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt));
 		VERIFY0(zio_crypt_generate_iv(iv));
 
 		zio_crypt_encode_params_bp(new_bp, salt, iv);
 	}
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * Read and write to physical devices
  * ==========================================================================
  */
 
 /*
  * Issue an I/O to the underlying vdev. Typically the issue pipeline
  * stops after this stage and will resume upon I/O completion.
  * However, there are instances where the vdev layer may need to
  * continue the pipeline when an I/O was not issued. Since the I/O
  * that was sent to the vdev layer might be different than the one
  * currently active in the pipeline (see vdev_queue_io()), we explicitly
  * force the underlying vdev layers to call either zio_execute() or
  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
  *
  * Returns NULL on every path except the repair-bypass path, which returns
  * zio after calling zio_vdev_io_bypass().
  */
 static zio_t *
 zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	uint64_t align;
 	spa_t *spa = zio->io_spa;
 
 	/* Reset here; set to a start timestamp below for queued leaf I/O. */
 	zio->io_delay = 0;
 
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
 
 	if (vd == NULL) {
 		/* Config writers skip taking SCL_ZIO as reader here. */
 		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
 
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
 		 */
 		vdev_mirror_ops.vdev_op_io_start(zio);
 		return (NULL);
 	}
 
 	ASSERT3P(zio->io_logical, !=, zio);
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		ASSERT(spa->spa_trust_config);
 
 		/*
 		 * Note: the code can handle other kinds of writes,
 		 * but we don't expect them.
 		 */
 		if (zio->io_vd->vdev_noalloc) {
 			ASSERT(zio->io_flags &
 			    (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
 			    ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
 		}
 	}
 
 	/* Alignment is derived from the top-level vdev's ashift. */
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
 	    P2PHASE(zio->io_size, align) != 0) {
 		/* Transform logical writes to be a full physical block size. */
 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
 		abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
 		ASSERT(vd == vd->vdev_top);
 		/* For writes, copy the payload and zero-fill the padding. */
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			abd_copy(abuf, zio->io_abd, zio->io_size);
 			abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
 		}
 		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
 	}
 
 	/*
 	 * If this is not a physical io, make sure that it is properly aligned
 	 * before proceeding.
 	 */
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
 		ASSERT0(P2PHASE(zio->io_offset, align));
 		ASSERT0(P2PHASE(zio->io_size, align));
 	} else {
 		/*
 		 * For physical writes, we allow 512b aligned writes and assume
 		 * the device will perform a read-modify-write as necessary.
 		 */
 		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
 		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
 	}
 
 	/* Writes are only permitted when the pool is writeable. */
 	VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
 
 	/*
 	 * If this is a repair I/O, and there's no self-healing involved --
 	 * that is, we're just resilvering what we expect to resilver --
 	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
 	 * This prevents spurious resilvering.
 	 *
 	 * There are a few ways that we can end up creating these spurious
 	 * resilver i/os:
 	 *
 	 * 1. A resilver i/o will be issued if any DVA in the BP has a
 	 * dirty DTL.  The mirror code will issue resilver writes to
 	 * each DVA, including the one(s) that are not on vdevs with dirty
 	 * DTLs.
 	 *
 	 * 2. With nested replication, which happens when we have a
 	 * "replacing" or "spare" vdev that's a child of a mirror or raidz.
 	 * For example, given mirror(replacing(A+B), C), it's likely that
 	 * only A is out of date (it's the new device). In this case, we'll
 	 * read from C, then use the data to resilver A+B -- but we don't
 	 * actually want to resilver B, just A. The top-level mirror has no
 	 * way to know this, so instead we just discard unnecessary repairs
 	 * as we work our way down the vdev tree.
 	 *
 	 * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
 	 * The same logic applies to any form of nested replication: ditto
 	 * + mirror, RAID-Z + replacing, etc.
 	 *
 	 * However, indirect vdevs point off to other vdevs which may have
 	 * DTL's, so we never bypass them.  The child i/os on concrete vdevs
 	 * will be properly bypassed instead.
 	 *
 	 * Leaf DTL_PARTIAL can be empty when a legitimate write comes from
 	 * a dRAID spare vdev. For example, when a dRAID spare is first
 	 * used, its spare blocks need to be written to but the leaf vdev's
 	 * of such blocks can have empty DTL_PARTIAL.
 	 *
 	 * There seemed no clean way to allow such writes while bypassing
 	 * spurious ones. At this point, just avoid all bypassing for dRAID
 	 * for correctness.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
 	    zio->io_txg != 0 &&	/* not a delegated i/o */
 	    vd->vdev_ops != &vdev_indirect_ops &&
 	    vd->vdev_top->vdev_ops != &vdev_draid_ops &&
 	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		zio_vdev_io_bypass(zio);
 		return (zio);
 	}
 
 	/*
 	 * Select the next best leaf I/O to process.  Distributed spares are
 	 * excluded since they dispatch the I/O directly to a leaf vdev after
 	 * applying the dRAID mapping.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops &&
 	    (zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM)) {
 
 		/*
 		 * vdev_queue_io() may hand back a different zio than the one
 		 * passed in, or NULL, in which case there is nothing to
 		 * issue from here.
 		 */
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (NULL);
 
 		if (!vdev_accessible(vd, zio)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return (NULL);
 		}
 		/* Start timestamp; zio_vdev_io_done() turns it into a delta. */
 		zio->io_delay = gethrtime();
 
 		if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) {
 			/*
 			 * "no-op" injections return success, but do no actual
 			 * work. Just return it.
 			 */
 			zio_delay_interrupt(zio);
 			return (NULL);
 		}
 	}
 
 	vd->vdev_ops->vdev_op_io_start(zio);
 	return (NULL);
 }
 
 /*
  * Pipeline stage run after the vdev I/O has been issued.  Returns NULL
  * while vdev children are still outstanding.  Otherwise it finalizes
  * io_delay, does leaf-level completion work (queue accounting, fault
  * injection, error classification), calls the vdev's io_done op, probes
  * the vdev after an unexpected error, and returns zio.
  */
 static zio_t *
 zio_vdev_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_ops_t *ops;
 	boolean_t need_probe = B_FALSE;
 
 	/* Without a vdev, the mirror ops are used. */
 	if (vd)
 		ops = vd->vdev_ops;
 	else
 		ops = &vdev_mirror_ops;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE))
 		return (NULL);
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_FLUSH ||
 	    zio->io_type == ZIO_TYPE_TRIM);
 
 	/* Turn a recorded start timestamp into an elapsed time. */
 	if (zio->io_delay)
 		zio->io_delay = gethrtime() - zio->io_delay;
 
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops) {
 		if (zio->io_type != ZIO_TYPE_FLUSH)
 			vdev_queue_io_done(zio);
 
 		if (zio_injection_enabled && zio->io_error == 0) {
 			zio->io_error = zio_handle_device_injections(vd, zio,
 			    EIO, EILSEQ);
 		}
 
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_label_injection(zio, EIO);
 
 		/*
 		 * For failed reads and writes: an inaccessible vdev becomes
 		 * ENXIO, anything else is unexpected and triggers a probe
 		 * below.
 		 */
 		if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH &&
 		    zio->io_type != ZIO_TYPE_TRIM) {
 			if (vdev_accessible(vd, zio))
 				need_probe = B_TRUE;
 			else
 				zio->io_error = SET_ERROR(ENXIO);
 		}
 	}
 
 	ops->vdev_op_io_done(zio);
 
 	/* need_probe is only ever set on the leaf path, where vd != NULL. */
 	if (need_probe && vd->vdev_remove_wanted == B_FALSE)
 		VERIFY(vdev_probe(vd, zio) == NULL);
 
 	return (zio);
 }
 
 /*
  * Change the priority of a zio that is already in flight, along with all
  * of its children.  The arc uses this to upgrade priority when a demand
  * read arrives for a block currently queued as a scrub or async read IO;
  * without it the high priority request would have to wait for the lower
  * priority IO.
  */
 void
 zio_change_priority(zio_t *pio, zio_priority_t priority)
 {
 	zio_link_t *zl = NULL;
 
 	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 
 	/* Leaf-vdev I/O goes through the vdev queue; others are set here. */
 	if (pio->io_vd == NULL || !pio->io_vd->vdev_ops->vdev_op_leaf)
 		pio->io_priority = priority;
 	else
 		vdev_queue_change_io_priority(pio, priority);
 
 	mutex_enter(&pio->io_lock);
 	zio_t *child = zio_walk_children(pio, &zl);
 	while (child != NULL) {
 		/* Fetch the successor before recursing into this child. */
 		zio_t *next = zio_walk_children(pio, &zl);
 
 		zio_change_priority(child, priority);
 		child = next;
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Default checksum-report handling for non-raidz ZIOs: the bad data read
  * from disk is simply copied aside, and that copy is used here to finish
  * the checksum ereport later.
  */
 static void
 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf)
 {
 	/* The saved copy in zcr_cbdata is passed through unprocessed. */
 	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
 }
 
 /*
  * Set up a checksum report for zio: copy the zio's current data into a
  * freshly allocated abd of the same type, and register the default finish
  * and free callbacks on the report.
  */
 void
 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr)
 {
 	void *copy = abd_alloc_sametype(zio->io_abd, zio->io_size);
 
 	abd_copy(copy, zio->io_abd, zio->io_size);
 
 	zcr->zcr_finish = zio_vsd_default_cksum_finish;
 	zcr->zcr_free = zio_abd_free;
 	zcr->zcr_cbdata = copy;
 	zcr->zcr_cbinfo = zio->io_size;
 }
 
 /*
  * Pipeline stage that evaluates the outcome of the vdev I/O for this zio.
  *
  * After all ZIO_CHILD_VDEV children are done it releases the per-I/O vdev
  * specific data and then decides what to do with any error: requeue the
  * zio for one retry, convert or clear the error, record state on the vdev,
  * or switch the zio to ZIO_INTERLOCK_PIPELINE.
  *
  * Returns NULL when the zio must stop executing for now (it is still
  * waiting for children, or it was redispatched for a retry); otherwise
  * returns the zio so that the pipeline continues.
  */
 static zio_t *
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	/* Some vdev children are still outstanding; come back later. */
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	/*
 	 * A zio without a vdev that is not a config writer drops SCL_ZIO
 	 * here (presumably acquired when the vdev I/O was started; the
 	 * matching enter is outside this function).
 	 */
 	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
 
 	/* Release any vdev-specific data via its registered free routine. */
 	if (zio->io_vsd != NULL) {
 		zio->io_vsd_ops->vsd_free(zio);
 		zio->io_vsd = NULL;
 	}
 
 	/*
 	 * If a Direct I/O operation has a checksum verify error then this I/O
 	 * should not attempt to be issued again.
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
 			ASSERT3U(zio->io_error, ==, EIO);
 		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		return (zio);
 	}
 
 	/* Fault injection may only turn a successful I/O into a failure. */
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
 	/*
 	 * If the I/O failed, determine whether we should attempt to retry it.
 	 *
 	 * On retry, we cut in line in the issue queue, since we don't want
 	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
 	 */
 	if (zio->io_error && vd == NULL &&
 	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
 		zio->io_error = 0;
 		/*
 		 * ZIO_FLAG_IO_RETRY is set here and tested above, so this
 		 * path is taken at most once per zio.
 		 */
 		zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE;
 		/* Rewind to the stage just before VDEV_IO_START. */
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
 		    zio_requeue_io_start_cut_in_line);
 		return (NULL);
 	}
 
 	/*
 	 * If we got an error on a leaf device, convert it to ENXIO
 	 * if the device is not accessible at all.
 	 */
 	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    !vdev_accessible(vd, zio))
 		zio->io_error = SET_ERROR(ENXIO);
 
 	/*
 	 * If we can't write to an interior vdev (mirror or RAID-Z),
 	 * set vdev_cant_write so that we stop trying to allocate from it.
 	 */
 	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
 	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
 		vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting "
 		    "cant_write=TRUE due to write failure with ENXIO",
 		    zio);
 		vd->vdev_cant_write = B_TRUE;
 	}
 
 	/*
 	 * If a cache flush returns ENOTSUP we know that no future
 	 * attempts will ever succeed. In this case we set a persistent
 	 * boolean flag so that we don't bother with it in the future, and
 	 * then we act like the flush succeeded.
 	 */
 	if (zio->io_error == ENOTSUP && zio->io_type == ZIO_TYPE_FLUSH &&
 	    vd != NULL) {
 		vd->vdev_nowritecache = B_TRUE;
 		zio->io_error = 0;
 	}
 
 	/* Any error still standing switches the zio to the interlock pipeline. */
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * Step the zio's stage back by one bit position from VDEV_IO_START.  The
  * zio must currently be in VDEV_IO_START and must not carry an error.
  * (Presumably this makes the pipeline run VDEV_IO_START again; the code
  * that advances io_stage is outside this function.)
  */
 void
 zio_vdev_io_reissue(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage = zio->io_stage >> 1;
 }
 
 /*
  * Step the zio's stage back by one bit position from VDEV_IO_DONE.  The
  * zio must currently be in VDEV_IO_DONE.  (Presumably this makes the
  * pipeline run VDEV_IO_DONE again; the code that advances io_stage is
  * outside this function.)
  */
 void
 zio_vdev_io_redone(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
 
 	zio->io_stage = zio->io_stage >> 1;
 }
 
 /*
  * Mark the zio as bypassing the vdev I/O: set ZIO_FLAG_IO_BYPASS and move
  * io_stage to the bit position just below VDEV_IO_ASSESS.  The zio must
  * currently be in VDEV_IO_START and must not carry an error.
  */
 void
 zio_vdev_io_bypass(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
 }
 
 /*
  * ==========================================================================
  * Encrypt and store encryption parameters
  * ==========================================================================
  */
 
 
 /*
  * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
  * managing the storage of encryption parameters and passing them to the
  * lower-level encryption functions.
  *
  * Depending on the block it either does nothing, only records a MAC in the
  * block pointer, or encrypts the data into a freshly allocated buffer that
  * is pushed as a transform on the zio.  Every path returns the zio so the
  * pipeline continues; the crypto helpers are wrapped in VERIFY0() and are
  * therefore not expected to fail.
  */
 static zio_t *
 zio_encrypt(zio_t *zio)
 {
 	zio_prop_t *zp = &zio->io_prop;
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize = BP_GET_PSIZE(bp);
 	uint64_t dsobj = zio->io_bookmark.zb_objset;
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	void *enc_buf = NULL;
 	abd_t *eabd = NULL;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	/* the root zio already encrypted the data */
 	if (zio->io_child_type == ZIO_CHILD_GANG)
 		return (zio);
 
 	/* only ZIL blocks are re-encrypted on rewrite */
 	if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
 		return (zio);
 
 	/* neither requested nor already encrypted: clear the crypt bit */
 	if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
 		BP_SET_CRYPT(bp, B_FALSE);
 		return (zio);
 	}
 
 	/* if we are doing raw encryption set the provided encryption params */
 	if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
 		ASSERT0(BP_GET_LEVEL(bp));
 		BP_SET_CRYPT(bp, B_TRUE);
 		BP_SET_BYTEORDER(bp, zp->zp_byteorder);
 		if (ot != DMU_OT_OBJSET)
 			zio_crypt_encode_mac_bp(bp, zp->zp_mac);
 
 		/* dnode blocks must be written out in the provided byteorder */
 		if (zp->zp_byteorder != ZFS_HOST_BYTEORDER &&
 		    ot == DMU_OT_DNODE) {
 			void *bswap_buf = zio_buf_alloc(psize);
 			abd_t *babd = abd_get_from_buf(bswap_buf, psize);
 
 			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 			/* byteswap a private copy, not the caller's data */
 			abd_copy_to_buf(bswap_buf, zio->io_abd, psize);
 			dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf,
 			    psize);
 
 			/*
 			 * NOTE(review): presumably this hands bswap_buf over
 			 * to babd so it is released with the ABD -- confirm
 			 * against the ABD API.
 			 */
 			abd_take_ownership_of_buf(babd, B_TRUE);
 			zio_push_transform(zio, babd, psize, psize, NULL);
 		}
 
 		if (DMU_OT_IS_ENCRYPTED(ot))
 			zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
 		return (zio);
 	}
 
 	/* indirect blocks only maintain a cksum of the lower level MACs */
 	if (BP_GET_LEVEL(bp) > 0) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
 		    zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
 		    mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (zio);
 	}
 
 	/*
 	 * Objset blocks are a special case since they have 2 256-bit MACs
 	 * embedded within them.
 	 */
 	if (ot == DMU_OT_OBJSET) {
 		ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
 		ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj,
 		    zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp)));
 		return (zio);
 	}
 
 	/* unencrypted object types are only authenticated with a MAC */
 	if (!DMU_OT_IS_ENCRYPTED(ot)) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj,
 		    zio->io_abd, psize, mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (zio);
 	}
 
 	/*
 	 * Later passes of sync-to-convergence may decide to rewrite data
 	 * in place to avoid more disk reallocations. This presents a problem
 	 * for encryption because this constitutes rewriting the new data with
 	 * the same encryption key and IV. However, this only applies to blocks
 	 * in the MOS (particularly the spacemaps) and we do not encrypt the
 	 * MOS. We assert that the zio is allocating or an intent log write
 	 * to enforce this.
 	 */
 	ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
 	ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
 	ASSERT3U(psize, !=, 0);
 
 	/* destination buffer for the ciphertext, wrapped in an ABD */
 	enc_buf = zio_buf_alloc(psize);
 	eabd = abd_get_from_buf(enc_buf, psize);
 	abd_take_ownership_of_buf(eabd, B_TRUE);
 
 	/*
 	 * For an explanation of what encryption parameters are stored
 	 * where, see the block comment in zio_crypt.c.
 	 */
 	if (ot == DMU_OT_INTENT_LOG) {
 		zio_crypt_decode_params_bp(bp, salt, iv);
 	} else {
 		BP_SET_CRYPT(bp, B_TRUE);
 	}
 
 	/* Perform the encryption. This should not fail */
 	VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark,
 	    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
 	    salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt));
 
 	/* encode encryption metadata into the bp */
 	if (ot == DMU_OT_INTENT_LOG) {
 		/*
 		 * ZIL blocks store the MAC in the embedded checksum, so the
 		 * transform must always be applied.
 		 */
 		zio_crypt_encode_mac_zil(enc_buf, mac);
 		zio_push_transform(zio, eabd, psize, psize, NULL);
 	} else {
 		BP_SET_CRYPT(bp, B_TRUE);
 		zio_crypt_encode_params_bp(bp, salt, iv);
 		zio_crypt_encode_mac_bp(bp, mac);
 
 		/*
 		 * When spa_do_crypt_abd() set no_crypt (asserted to happen
 		 * only for dnode blocks) the scratch ABD is discarded and
 		 * no transform is pushed, so the zio keeps its current data.
 		 */
 		if (no_crypt) {
 			ASSERT3U(ot, ==, DMU_OT_DNODE);
 			abd_free(eabd);
 		} else {
 			zio_push_transform(zio, eabd, psize, psize, NULL);
 		}
 	}
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Generate and verify checksums
  * ==========================================================================
  */
 
 /*
  * Pipeline stage that computes the checksum for the zio's data.
  *
  * The checksum algorithm is chosen from the block pointer when there is
  * one, and from the zio's properties otherwise.  Always returns the zio.
  */
 static zio_t *
 zio_checksum_generate(zio_t *zio)
 {
 	enum zio_checksum checksum;
 	blkptr_t *bp = zio->io_bp;
 
 	if (bp != NULL) {
 		/* Gang children checksum the gang header itself. */
 		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
 			ASSERT(!IO_IS_ALLOCATING(zio));
 			checksum = ZIO_CHECKSUM_GANG_HEADER;
 		} else {
 			checksum = BP_GET_CHECKSUM(bp);
 		}
 	} else {
 		/*
 		 * No block pointer: this is zio_write_phys(), which either
 		 * wants a label checksum or no checksum at all.
 		 */
 		checksum = zio->io_prop.zp_checksum;
 
 		if (checksum == ZIO_CHECKSUM_OFF)
 			return (zio);
 
 		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
 	}
 
 	zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
 
 	return (zio);
 }
 
 /*
  * Pipeline stage that verifies the checksum of data read for this zio.
  *
  * Any verification error is stored in zio->io_error.  A genuine checksum
  * mismatch (ECKSUM) on a non-speculative I/O is additionally reported:
  * Direct I/O reads go through the Direct I/O verify reporting path, all
  * other reads bump the vdev's checksum error counter and start a checksum
  * ereport.  Always returns the zio.
  */
 static zio_t *
 zio_checksum_verify(zio_t *zio)
 {
 	zio_bad_cksum_t info;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 
 	ASSERT(zio->io_vd != NULL);
 
 	if (bp == NULL) {
 		/*
 		 * No block pointer: this is zio_read_phys(), which either
 		 * verifies a label checksum or nothing at all.
 		 */
 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
 			return (zio);
 
 		ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
 	}
 
 	ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
 	IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ,
 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE));
 
 	error = zio_checksum_error(zio, &info);
 	if (error == 0)
 		return (zio);
 
 	zio->io_error = error;
 
 	/* Only real checksum mismatches on non-speculative I/O are reported. */
 	if (error != ECKSUM || (zio->io_flags & ZIO_FLAG_SPECULATIVE))
 		return (zio);
 
 	if (zio->io_flags & ZIO_FLAG_DIO_READ) {
 		zio_t *pio;
 
 		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 		pio = zio_unique_parent(zio);
 
 		/*
 		 * Any Direct I/O read that has a checksum error must be
 		 * treated as suspicious, as the contents of the buffer could
 		 * be getting manipulated while the I/O is taking place.
 		 *
 		 * The checksum verify error will only be reported here for
 		 * disk and file VDEVs, and will be reported on those that
 		 * the failure occurred on.  Other types of VDEVs report the
 		 * verify failure in their own code paths.
 		 */
 		if (pio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio_dio_chksum_verify_error_report(zio);
 	} else {
 		vdev_t *vd = zio->io_vd;
 
 		mutex_enter(&vd->vdev_stat_lock);
 		vd->vdev_stat.vs_checksum_errors++;
 		mutex_exit(&vd->vdev_stat_lock);
 		(void) zfs_ereport_start_checksum(zio->io_spa, vd,
 		    &zio->io_bookmark, zio, zio->io_offset, zio->io_size,
 		    &info);
 	}
 
 	return (zio);
 }
 
 /*
  * Pipeline stage that verifies the checksum of a Direct I/O write.
  *
  * Runs on the vdev child of a logical Direct I/O write.  Verification is
  * skipped when zfs_vdev_direct_write_verify is 0 or the zio already failed.
  * Any verification error is stored in zio->io_error; a checksum mismatch
  * also flags the zio with ZIO_FLAG_DIO_CHKSUM_ERR and is reported.
  * Always returns the zio.
  */
 static zio_t *
 zio_dio_checksum_verify(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
 	int error;
 
 	ASSERT3P(zio->io_vd, !=, NULL);
 	ASSERT3P(zio->io_bp, !=, NULL);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE);
 	ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
 
 	if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0)
 		return (zio);
 
 	error = zio_checksum_error(zio, NULL);
 	if (error != 0) {
 		zio->io_error = error;
 		if (error == ECKSUM) {
 			zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 			zio_dio_chksum_verify_error_report(zio);
 		}
 	}
 
 	return (zio);
 }
 
 
 /*
  * Called by RAID-Z to ensure we don't compute the checksum twice: removes
  * the CHECKSUM_VERIFY stage from this zio's pipeline.
  */
 void
 zio_checksum_verified(zio_t *zio)
 {
 	zio->io_pipeline = zio->io_pipeline & ~ZIO_STAGE_CHECKSUM_VERIFY;
 }
 
 /*
  * Report a Direct I/O checksum verify error and create a ZED event.
  *
  * The zio must already be flagged with ZIO_FLAG_DIO_CHKSUM_ERR.  Logical
  * zios are ignored.  For any other zio the vdev's dio_verify error counter
  * is incremented and a dio_verify_rd or dio_verify_wr ereport is posted;
  * for writes the zio's error is first converted to EIO.
  */
 void
 zio_dio_chksum_verify_error_report(zio_t *zio)
 {
 	vdev_t *vd;
 
 	ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 		return;
 
 	vd = zio->io_vd;
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_dio_verify_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	if (zio->io_type != ZIO_TYPE_WRITE) {
 		/* Reads keep their error; post the dio_verify_rd event. */
 		(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_RD,
 		    zio->io_spa, vd, &zio->io_bookmark, zio, 0);
 	} else {
 		/*
 		 * Writes turn the checksum error into EIO before the
 		 * dio_verify_wr event is posted.
 		 */
 		zio->io_error = SET_ERROR(EIO);
 		(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_WR,
 		    zio->io_spa, vd, &zio->io_bookmark, zio, 0);
 	}
 }
 
 /*
  * ==========================================================================
  * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
  * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
  * indicate errors that are specific to one I/O, and most likely permanent.
  * Any other error is presumed to be worse because we weren't expecting it.
  *
  * Returns whichever of e1 and e2 ranks higher; when both have the same
  * rank, e2 is returned.
  * ==========================================================================
  */
 int
 zio_worst_error(int e1, int e2)
 {
 	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
 	const int nranks = sizeof (zio_error_rank) / sizeof (int);
 	int rank1 = 0;
 	int rank2 = 0;
 
 	/* An error that is not in the table ends up with rank == nranks. */
 	while (rank1 < nranks && zio_error_rank[rank1] != e1)
 		rank1++;
 
 	while (rank2 < nranks && zio_error_rank[rank2] != e2)
 		rank2++;
 
 	if (rank1 > rank2)
 		return (e1);
 
 	return (e2);
 }
 
 /*
  * ==========================================================================
  * I/O completion
  * ==========================================================================
  */
 
 /*
  * Pipeline stage that marks this zio as ready.
  *
  * Once the logical, gang and ddt children are ready it runs the io_ready
  * callback (if any), handles an error accumulated so far by switching to
  * ZIO_INTERLOCK_PIPELINE, records the ready state and notifies the parents
  * that were on the parent list at that moment.
  *
  * Returns NULL while children are still not ready, otherwise the zio.
  */
 static zio_t *
 zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
 	    ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) {
 		return (NULL);
 	}
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
 		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
 		    BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
 	}
 
 #ifdef ZFS_DEBUG
 	/* Debug builds snapshot the bp; zio_done() asserts against it. */
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
 #endif
 
 	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(IO_IS_ALLOCATING(zio));
 			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(zio->io_metaslab_class != NULL);
 			ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 			/*
 			 * We were unable to allocate anything, unreserve and
 			 * issue the next I/O to allocate.
 			 */
 			metaslab_class_throttle_unreserve(
 			    zio->io_metaslab_class, zio->io_prop.zp_copies,
 			    zio->io_allocator, zio);
 			zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
 		}
 	}
 
 	/*
 	 * The ready state is set and the first parent is fetched under the
 	 * same hold of io_lock.
 	 */
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_READY] = 1;
 	pio = zio_walk_parents(zio, &zl);
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * As we notify zio's parents, new parents could be added.
 	 * New parents go to the head of zio's io_parent_list, however,
 	 * so we will (correctly) not notify them.  The remainder of zio's
 	 * io_parent_list, from 'pio_next' onward, cannot change because
 	 * all parents must wait for us to be done before they can be done.
 	 */
 	for (; pio != NULL; pio = pio_next) {
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
 	}
 
 	/*
 	 * A NODATA zio either loses the flag (gang block pointer) or has
 	 * the vdev I/O stages removed from its pipeline.
 	 */
 	if (zio->io_flags & ZIO_FLAG_NODATA) {
 		if (bp != NULL && BP_IS_GANG(bp)) {
 			zio->io_flags &= ~ZIO_FLAG_NODATA;
 		} else {
 			ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 		}
 	}
 
 	/* Ignored-write injection only applies to the currently syncing txg. */
 	if (zio_injection_enabled &&
 	    zio->io_spa->spa_syncing_txg == zio->io_txg)
 		zio_handle_ignored_writes(zio);
 
 	return (zio);
 }
 
 /*
  * Update the allocation throttle accounting.
  *
  * Called for a completed vdev child of an allocating async write.  Finds
  * the parent zio that performed the allocation, decrements the metaslab
  * group's allocation count on its behalf, releases one throttle
  * reservation and then lets the allocator dispatch more queued work.
  */
 static void
 zio_dva_throttle_done(zio_t *zio)
 {
 	/* lio is only referenced by ASSERTs, hence __maybe_unused. */
 	zio_t *lio __maybe_unused = zio->io_logical;
 	zio_t *pio = zio_unique_parent(zio);
 	vdev_t *vd = zio->io_vd;
 	int flags = METASLAB_ASYNC_ALLOC;
 
 	ASSERT3P(zio->io_bp, !=, NULL);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 	ASSERT(vd != NULL);
 	ASSERT3P(vd, ==, vd->vdev_top);
 	ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
 	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
 	ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
 
 	/*
 	 * Parents of gang children can have two flavors -- ones that
 	 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
 	 * and ones that allocated the constituent blocks. The allocation
 	 * throttle needs to know the allocating parent zio so we must find
 	 * it here.
 	 */
 	if (pio->io_child_type == ZIO_CHILD_GANG) {
 		/*
 		 * If our parent is a rewrite gang child then our grandparent
 		 * would have been the one that performed the allocation.
 		 */
 		if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
 			pio = zio_unique_parent(pio);
 		flags |= METASLAB_GANG_CHILD;
 	}
 
 	/* From here on pio is the zio that performed the allocation. */
 	ASSERT(IO_IS_ALLOCATING(pio));
 	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	ASSERT3P(zio, !=, zio->io_logical);
 	ASSERT(zio->io_logical != NULL);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
 	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
 	ASSERT(zio->io_metaslab_class != NULL);
 
 	/* The decrement is made with the allocating parent's io_lock held. */
 	mutex_enter(&pio->io_lock);
 	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
 	    pio->io_allocator, B_TRUE);
 	mutex_exit(&pio->io_lock);
 
 	metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
 	    pio->io_allocator, pio);
 
 	/*
 	 * Call into the pipeline to see if there is more work that
 	 * needs to be done. If there is work to be done it will be
 	 * dispatched to another taskq thread.
 	 */
 	zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
 }
 
 /*
  * Final pipeline stage of every zio.
  *
  * Once all children of every type are done this stage updates the
  * allocation throttle accounting, inherits child errors, finishes pending
  * checksum reports, pops transforms, updates vdev statistics, posts
  * ereports and decides whether a failed logical I/O has to be reexecuted
  * or suspended.  When no reexecution is required it runs the io_done
  * callback, detaches from and notifies all parents, and finally either
  * wakes the waiter or destroys the zio.
  *
  * Returns NULL when the zio still has to wait for children or has been
  * handed off for reexecution/suspension.  Otherwise returns
  * next_to_execute as filled in by zio_notify_parent(), which may be NULL.
  */
 static zio_t *
 zio_done(zio_t *zio)
 {
 	/*
 	 * Always attempt to keep stack usage minimal here since
 	 * we can be called recursively up to 19 levels deep.
 	 */
 	const uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	/*
 	 * If our children haven't all completed,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	/*
 	 * If the allocation throttle is enabled, then update the accounting.
 	 * We only track child I/Os that are part of an allocating async
 	 * write. We must do this since the allocation is performed
 	 * by the logical I/O but the actual write is done by child I/Os.
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
 	    zio->io_child_type == ZIO_CHILD_VDEV) {
 		ASSERT(zio->io_metaslab_class != NULL);
 		ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
 		zio_dva_throttle_done(zio);
 	}
 
 	/*
 	 * If the allocation throttle is enabled, verify that
 	 * we have decremented the refcounts for every I/O that was throttled.
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(zio->io_bp != NULL);
 		ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 		metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
 		    zio->io_allocator);
 		VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class->
 		    mc_allocator[zio->io_allocator].mca_alloc_slots, zio));
 	}
 
 
 	/* No child of any type may still be outstanding at this point. */
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
 
 	/* Sanity checks on the block pointer (assertions only). */
 	if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
 		ASSERT(zio->io_bp->blk_pad[0] == 0);
 		ASSERT(zio->io_bp->blk_pad[1] == 0);
 		ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy,
 		    sizeof (blkptr_t)) == 0 ||
 		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
 		    zio->io_bp_override == NULL &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT3U(zio->io_prop.zp_copies, <=,
 			    BP_GET_NDVAS(zio->io_bp));
 			ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
 			    (BP_COUNT_GANG(zio->io_bp) ==
 			    BP_GET_NDVAS(zio->io_bp)));
 		}
 		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
 			VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
 	}
 
 	/*
 	 * If there were child vdev/gang/ddt errors, they apply to us now.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
 	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
 	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
 
 	/*
 	 * If the I/O on the transformed data was successful, generate any
 	 * checksum reports now while we still have the transformed data.
 	 */
 	if (zio->io_error == 0) {
 		while (zio->io_cksum_report != NULL) {
 			zio_cksum_report_t *zcr = zio->io_cksum_report;
 			uint64_t align = zcr->zcr_align;
 			uint64_t asize = P2ROUNDUP(psize, align);
 			abd_t *adata = zio->io_abd;
 
 			/*
 			 * When the report's alignment requires a larger size,
 			 * hand the callback a zero-padded temporary copy.
 			 */
 			if (adata != NULL && asize != psize) {
 				adata = abd_alloc(asize, B_TRUE);
 				abd_copy(adata, zio->io_abd, psize);
 				abd_zero_off(adata, psize, asize - psize);
 			}
 
 			/* Unlink the report before finishing and freeing it. */
 			zio->io_cksum_report = zcr->zcr_next;
 			zcr->zcr_next = NULL;
 			zcr->zcr_finish(zcr, adata);
 			zfs_ereport_free_checksum(zcr);
 
 			if (adata != NULL && asize != psize)
 				abd_free(adata);
 		}
 	}
 
 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
 
 	vdev_stat_update(zio, psize);
 
 	/*
 	 * If this I/O is attached to a particular vdev and is slow (its
 	 * io_delay has reached the zio_slow_io_ms threshold), post an
 	 * ereport describing the I/O delay.  We ignore these errors if the
 	 * device is currently unavailable.
 	 */
 	if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
 		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
 			/*
 			 * We want to only increment our slow IO counters if
 			 * the IO is valid (i.e. not if the drive is removed).
 			 *
 			 * zfs_ereport_post() will also do these checks, but
 			 * it can also ratelimit and have other failures, so we
 			 * need to increment the slow_io counters independent
 			 * of it.
 			 */
 			if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
 			    zio->io_spa, zio->io_vd, zio)) {
 				mutex_enter(&zio->io_vd->vdev_stat_lock);
 				zio->io_vd->vdev_stat.vs_slow_ios++;
 				mutex_exit(&zio->io_vd->vdev_stat_lock);
 
 				(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
 				    zio->io_spa, zio->io_vd, &zio->io_bookmark,
 				    zio, 0);
 			}
 		}
 	}
 
 	if (zio->io_error) {
 		/*
 		 * If this I/O is attached to a particular vdev,
 		 * generate an error message describing the I/O failure
 		 * at the block level.  We ignore these errors if the
 		 * device is currently unavailable.
 		 */
 		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
 		    !vdev_is_dead(zio->io_vd) &&
 		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
 			int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
 			    zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
 			/* The error counters are skipped on EALREADY. */
 			if (ret != EALREADY) {
 				mutex_enter(&zio->io_vd->vdev_stat_lock);
 				if (zio->io_type == ZIO_TYPE_READ)
 					zio->io_vd->vdev_stat.vs_read_errors++;
 				else if (zio->io_type == ZIO_TYPE_WRITE)
 					zio->io_vd->vdev_stat.vs_write_errors++;
 				mutex_exit(&zio->io_vd->vdev_stat_lock);
 			}
 		}
 
 		if ((zio->io_error == EIO || !(zio->io_flags &
 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
 		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
 		    zio == zio->io_logical) {
 			/*
 			 * For logical I/O requests, tell the SPA to log the
 			 * error and generate a logical data ereport.
 			 */
 			spa_log_error(zio->io_spa, &zio->io_bookmark,
 			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
 			    zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
 		}
 	}
 
 	if (zio->io_error && zio == zio->io_logical) {
 		/*
 		 * Determine whether zio should be reexecuted.  This will
 		 * propagate all the way to the root via zio_notify_parent().
 		 */
 		ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		/*
 		 * Allocating writes that may not fail: ENOSPC suspends,
 		 * any other error reexecutes immediately.
 		 */
 		if (IO_IS_ALLOCATING(zio) &&
 		    !(zio->io_flags & ZIO_FLAG_CANFAIL) &&
 		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
 			if (zio->io_error != ENOSPC)
 				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
 			else
 				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 		}
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
 		    zio->io_error == ENXIO &&
 		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		/* A must-succeed zio with no decision yet is suspended. */
 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		/*
 		 * Here is a possibly good place to attempt to do
 		 * either combinatorial reconstruction or error correction
 		 * based on checksums.  It also might be a good place
 		 * to send out preliminary ereports before we suspend
 		 * processing.
 		 */
 	}
 
 	/*
 	 * If there were logical child errors, they apply to us now.
 	 * We defer this until now to avoid conflating logical child
 	 * errors with errors that happened to the zio itself when
 	 * updating vdev stats and reporting FMA events above.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
 	/*
 	 * A failed or to-be-reexecuted allocating gang leader gives back
 	 * what it allocated, unless it was a rewrite or a nopwrite.
 	 */
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
 	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
 	/*
 	 * Godfather I/Os should never suspend.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
 	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
 		zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
 
 	if (zio->io_reexecute) {
 		/*
 		 * A Direct I/O operation that has a checksum verify error
 		 * should not attempt to reexecute. Instead, the error should
 		 * just be propagated back.
 		 */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
 
 		/*
 		 * This is a logical I/O that wants to reexecute.
 		 *
 		 * Reexecute is top-down.  When an i/o fails, if it's not
 		 * the root, it simply notifies its parent and sticks around.
 		 * The parent, seeing that it still has children in zio_done(),
 		 * does the same.  This percolates all the way up to the root.
 		 * The root i/o will reexecute or suspend the entire tree.
 		 *
 		 * This approach ensures that zio_reexecute() honors
 		 * all the original i/o dependency relationships, e.g.
 		 * parents not executing until children are ready.
 		 */
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		zio->io_gang_leader = NULL;
 
 		mutex_enter(&zio->io_lock);
 		zio->io_state[ZIO_WAIT_DONE] = 1;
 		mutex_exit(&zio->io_lock);
 
 		/*
 		 * "The Godfather" I/O monitors its children but is
 		 * not a true parent to them. It will track them through
 		 * the pipeline but severs its ties whenever they get into
 		 * trouble (e.g. suspended). This allows "The Godfather"
 		 * I/O to return status without blocking.
 		 */
 		zl = NULL;
 		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
 		    pio = pio_next) {
 			/* Remember pio's link before the walk advances zl. */
 			zio_link_t *remove_zl = zl;
 			pio_next = zio_walk_parents(zio, &zl);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
 				zio_remove_child(pio, zio, remove_zl);
 				/*
 				 * This is a rare code path, so we don't
 				 * bother with "next_to_execute".
 				 */
 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
 				    NULL);
 			}
 		}
 
 		if ((pio = zio_unique_parent(zio)) != NULL) {
 			/*
 			 * We're not a root i/o, so there's nothing to do
 			 * but notify our parent.  Don't propagate errors
 			 * upward since we haven't permanently failed yet.
 			 */
 			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
 			/*
 			 * This is a rare code path, so we don't bother with
 			 * "next_to_execute".
 			 */
 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
 		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
 			/*
 			 * We'd fail again if we reexecuted now, so suspend
 			 * until conditions improve (e.g. device comes online).
 			 */
 			zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
 		} else {
 			/*
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
 			 */
 			spa_taskq_dispatch(zio->io_spa,
 			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
 			    zio_reexecute, zio, B_FALSE);
 		}
 		return (NULL);
 	}
 
 	ASSERT(list_is_empty(&zio->io_child_list));
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
 	/*
 	 * Report any checksum errors, since the I/O is complete.
 	 */
 	while (zio->io_cksum_report != NULL) {
 		zio_cksum_report_t *zcr = zio->io_cksum_report;
 		zio->io_cksum_report = zcr->zcr_next;
 		zcr->zcr_next = NULL;
 		/* No good data is available on this path, so pass NULL. */
 		zcr->zcr_finish(zcr, NULL);
 		zfs_ereport_free_checksum(zcr);
 	}
 
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as
 	 * such, cannot acquire any new parents.
 	 */
 	if (zio->io_done)
 		zio->io_done(zio);
 
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_DONE] = 1;
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * We are done executing this zio.  We may want to execute a parent
 	 * next.  See the comment in zio_notify_parent().
 	 */
 	zio_t *next_to_execute = NULL;
 	zl = NULL;
 	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
 		/* Remember pio's link before the walk advances zl. */
 		zio_link_t *remove_zl = zl;
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_remove_child(pio, zio, remove_zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
 	}
 
 	/*
 	 * With a waiter registered the zio is left alive and the waiter is
 	 * woken through io_cv; otherwise the zio is destroyed here.
 	 */
 	if (zio->io_waiter != NULL) {
 		mutex_enter(&zio->io_lock);
 		zio->io_executor = NULL;
 		cv_broadcast(&zio->io_cv);
 		mutex_exit(&zio->io_lock);
 	} else {
 		zio_destroy(zio);
 	}
 
 	return (next_to_execute);
 }
 
 /*
  * ==========================================================================
  * I/O pipeline definition
  * ==========================================================================
  *
  * Table of the functions that implement the pipeline stages.  The first
  * slot is NULL and the last one is zio_done.
  *
  * NOTE(review): the entries are presumably indexed by the bit position of
  * the matching ZIO_STAGE_* value (stages are stepped with one-bit shifts
  * elsewhere in this file) -- confirm against the ZIO_STAGE_* definitions
  * before adding, removing or reordering entries.
  */
 static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,
 	zio_read_bp_init,
 	zio_write_bp_init,
 	zio_free_bp_init,
 	zio_issue_async,
 	zio_write_compress,
 	zio_encrypt,
 	zio_checksum_generate,
 	zio_nop_write,
 	zio_brt_free,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,
 	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
 	zio_dva_throttle,
 	zio_dva_allocate,
 	zio_dva_free,
 	zio_dva_claim,
 	zio_ready,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
 	zio_checksum_verify,
 	zio_dio_checksum_verify,
 	zio_done
 };
 
 
 
 
 /*
  * Compare two zbookmark_phys_t's to see which we would reach first in a
  * pre-order traversal of the object tree.  Returns -1 if zb1 comes first,
  * 1 if zb2 comes first, and 0 if they compare equal.
  *
  * This is simple in every case aside from the meta-dnode object. For all
  * other objects, we traverse them in order (object 1 before object 2, and
  * so on).  However, all of these objects are traversed while traversing
  * object 0, since the data it points to is the list of objects.  Thus, we
  * need to convert to a canonical representation so we can compare
  * meta-dnode bookmarks to non-meta-dnode bookmarks.
  *
  * We do this by calculating "equivalents" for each field of the zbookmark.
  * zbookmarks outside of the meta-dnode use their own object and level, and
  * calculate the level 0 equivalent (the first L0 blkid that is contained in
  * the blocks this bookmark refers to) by multiplying their blkid by their
  * span (the number of L0 blocks contained within one block at their level).
  * zbookmarks inside the meta-dnode calculate their object equivalent
  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and
  * use level + 1<<31 (any value larger than a level could ever be) for their
  * level.  This causes them to always compare before a bookmark in their
  * object equivalent, compare appropriately to bookmarks in other objects,
  * and to compare appropriately to other bookmarks in the meta-dnode.
  */
 int
 zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
 {
 	/*
 	 * Canonical ("equivalent") object, level-0 block id and level of
 	 * each bookmark, once meta-dnode bookmarks have been converted to
 	 * their normal-object equivalents.
 	 */
 	uint64_t obj1, obj2;
 	uint64_t l0blk1, l0blk2;
 	uint64_t lvl1, lvl2;
 
 	/* Identical object, level and blkid: nothing to convert. */
 	if (zb1->zb_object == zb2->zb_object &&
 	    zb1->zb_level == zb2->zb_level &&
 	    zb1->zb_blkid == zb2->zb_blkid)
 		return (0);
 
 	IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT);
 	IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT);
 
 	/* BP_SPANB yields the span, in blocks, of a block at that level. */
 	l0blk1 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
 	l0blk2 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
 
 	if (zb1->zb_object != DMU_META_DNODE_OBJECT) {
 		obj1 = zb1->zb_object;
 		lvl1 = zb1->zb_level;
 	} else {
 		obj1 = l0blk1 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		l0blk1 = 0;
 		lvl1 = zb1->zb_level + COMPARE_META_LEVEL;
 	}
 
 	if (zb2->zb_object != DMU_META_DNODE_OBJECT) {
 		obj2 = zb2->zb_object;
 		lvl2 = zb2->zb_level;
 	} else {
 		obj2 = l0blk2 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		l0blk2 = 0;
 		lvl2 = zb2->zb_level + COMPARE_META_LEVEL;
 	}
 
 	/*
 	 * Compare the canonical forms: lower object first, then lower
 	 * level-0 block, then HIGHER level first.
 	 */
 	if (obj1 != obj2)
 		return (obj1 < obj2 ? -1 : 1);
 
 	if (l0blk1 != l0blk2)
 		return (l0blk1 < l0blk2 ? -1 : 1);
 
 	if (lvl1 != lvl2)
 		return (lvl1 > lvl2 ? -1 : 1);
 
 	/*
 	 * This can (theoretically) happen if the bookmarks have the same
 	 * object and level, but different blkids, if the block sizes are not
 	 * the same.  There is presently no way to change the indirect block
 	 * sizes.
 	 */
 	return (0);
 }
 
 /*
  *  Answers the question: given that last_block is where our traversal
  *  stopped last time, is every node under subtree_root guaranteed to have
  *  been visited already?
  *
  *  The raw result of zbookmark_compare() on subtree_root cannot answer that
  *  by itself.  Instead a copy of subtree_root with its block id incremented
  *  is compared against last_block; if that modified bookmark is before or
  *  equal to last_block, then having visited last_block implies that all of
  *  subtree_root's children have been visited.
  *
  *  Returns B_FALSE when dnp is NULL.
  */
 boolean_t
 zbookmark_subtree_completed(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	zbookmark_phys_t next_zb = *subtree_root;
 	int order;
 
 	next_zb.zb_blkid++;
 	ASSERT0(last_block->zb_level);
 
 	/* The objset_phys_t isn't before anything. */
 	if (dnp == NULL)
 		return (B_FALSE);
 
 	/*
 	 * For last_block's data block size in sectors we always pass
 	 * 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT).  That argument only
 	 * matters when the bookmark refers to a block in the meta-dnode; we
 	 * cannot tell which object it refers to without examining it, and
 	 * passing the value is harmless in every other case.
 	 *
 	 * For last_block's indirect block size shift we pass 0, because that
 	 * bookmark must be level 0.  The shift is only used to compute the
 	 * bookmark's span, and the span of a level 0 bookmark is always 1, so
 	 * the math works out.
 	 *
 	 * Anyone changing how zbookmark_compare() works must re-check that
 	 * this code still works afterwards.
 	 */
 	order = zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &next_zb,
 	    last_block);
 
 	return (order <= 0);
 }
 
 /*
  * Counterpart of zbookmark_subtree_completed(): reports whether subtree_root
  * is equal to or ahead of last_block, i.e. whether it is still to be done.
  * Unlike that function, subtree_root is compared as-is, without bumping its
  * block id.  Returns B_FALSE when dnp is NULL.
  */
 boolean_t
 zbookmark_subtree_tbd(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	int order;
 
 	ASSERT0(last_block->zb_level);
 
 	if (dnp == NULL)
 		return (B_FALSE);
 
 	/*
 	 * last_block is level 0 (asserted above), so it is compared with a
 	 * fixed data block size in sectors and an indirect shift of 0.
 	 */
 	order = zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root,
 	    last_block);
 
 	return (order >= 0);
 }
 
 /*
  * Symbols published with EXPORT_SYMBOL.
  * NOTE(review): presumably this makes them callable from other kernel
  * modules -- confirm against the macro's definition for each platform.
  */
 EXPORT_SYMBOL(zio_type_name);
 EXPORT_SYMBOL(zio_buf_alloc);
 EXPORT_SYMBOL(zio_data_buf_alloc);
 EXPORT_SYMBOL(zio_buf_free);
 EXPORT_SYMBOL(zio_data_buf_free);
 
 /*
  * Tunables declared with ZFS_MODULE_PARAM.  The string that ends each entry
  * is its human-readable description.
  * NOTE(review): the first two arguments look like a scope and a variable-name
  * prefix, and ZMOD_RW like a read-write permission flag -- confirm against
  * the ZFS_MODULE_PARAM definition.
  */
 ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW,
 	"Max I/O completion time (milliseconds) before marking it as slow");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW,
 	"Prioritize requeued I/O");
 
 /* The three sync_pass_* tunables below are declared UINT rather than INT. */
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free,  UINT, ZMOD_RW,
 	"Defer frees starting in this pass");
 
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW,
 	"Don't compress starting in this pass");
 
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW,
 	"Rewrite new bps starting in this pass");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW,
 	"Throttle block allocations in the ZIO pipeline");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW,
 	"Log all slow ZIOs, not just those with vdevs");
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 70b35e30ecbd..1f8aca0d9e1b 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -1,1077 +1,1081 @@
 # SPDX-License-Identifier: CDDL-1.0
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 # This run file contains all of the common functional tests.  When
 # adding a new test consider also adding it to the sanity.run file
 # if the new test runs to completion in only a few seconds.
 #
 # Approximate run time: 4-5 hours
 #
 
 [DEFAULT]
 pre = setup
 quiet = False
 pre_user = root
 user = root
 timeout = 600
 post_user = root
 post = cleanup
 failsafe_user = root
 failsafe = callbacks/zfs_failsafe
 tags = ['functional']
 
 [tests/functional/acl/off]
 tests = ['dosmode', 'posixmode']
 tags = ['functional', 'acl']
 
 [tests/functional/alloc_class]
 tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos',
     'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos',
     'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos',
     'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos',
     'alloc_class_013_pos', 'alloc_class_014_neg', 'alloc_class_015_pos']
 tags = ['functional', 'alloc_class']
 
 [tests/functional/append]
 tests = ['file_append', 'threadsappend_001_pos']
 tags = ['functional', 'append']
 
 [tests/functional/arc]
 tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'dbufstats_003_pos',
     'arcstats_runtime_tuning']
 tags = ['functional', 'arc']
 
 [tests/functional/atime]
 tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on']
 tags = ['functional', 'atime']
 
 [tests/functional/bclone]
 tests = ['bclone_crossfs_corner_cases_limited',
     'bclone_crossfs_data',
     'bclone_crossfs_embedded',
     'bclone_crossfs_hole',
     'bclone_diffprops_all',
     'bclone_diffprops_checksum',
     'bclone_diffprops_compress',
     'bclone_diffprops_copies',
     'bclone_diffprops_recordsize',
     'bclone_prop_sync',
     'bclone_samefs_corner_cases_limited',
     'bclone_samefs_data',
     'bclone_samefs_embedded',
     'bclone_samefs_hole']
 tags = ['functional', 'bclone']
 timeout = 7200
 
 [tests/functional/block_cloning]
 tests = ['block_cloning_clone_mmap_cached',
     'block_cloning_copyfilerange',
     'block_cloning_copyfilerange_partial',
     'block_cloning_copyfilerange_fallback',
     'block_cloning_disabled_copyfilerange',
     'block_cloning_copyfilerange_cross_dataset',
     'block_cloning_cross_enc_dataset',
     'block_cloning_copyfilerange_fallback_same_txg',
     'block_cloning_replay', 'block_cloning_replay_encrypted',
     'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write',
     'block_cloning_rlimit_fsize', 'block_cloning_large_offset']
 tags = ['functional', 'block_cloning']
 
 [tests/functional/bootfs]
 tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos',
     'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos',
     'bootfs_008_pos']
 tags = ['functional', 'bootfs']
 
 [tests/functional/btree]
 tests = ['btree_positive', 'btree_negative']
 tags = ['functional', 'btree']
 pre =
 post =
 
 [tests/functional/cache]
 tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg',
     'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg',
     'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos']
 tags = ['functional', 'cache']
 
 [tests/functional/cachefile]
 tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos',
     'cachefile_004_pos']
 tags = ['functional', 'cachefile']
 
 [tests/functional/casenorm]
 tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure',
     'sensitive_none_lookup', 'sensitive_none_delete',
     'sensitive_formd_lookup', 'sensitive_formd_delete',
     'insensitive_none_lookup', 'insensitive_none_delete',
     'insensitive_formd_lookup', 'insensitive_formd_delete',
     'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete',
     'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete']
 tags = ['functional', 'casenorm']
 
 [tests/functional/channel_program/lua_core]
 tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists',
     'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg',
     'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries',
     'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua',
     'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large',
     'tst.return_nvlist_neg', 'tst.return_nvlist_pos',
     'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout']
 tags = ['functional', 'channel_program', 'lua_core']
 
 [tests/functional/channel_program/synctask_core]
 tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit',
     'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg',
     'tst.get_number_props', 'tst.get_string_props', 'tst.get_type',
     'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks',
     'tst.list_children', 'tst.list_clones', 'tst.list_holds',
     'tst.list_snapshots', 'tst.list_system_props',
     'tst.list_user_props', 'tst.parse_args_neg', 'tst.promote_conflict',
     'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult',
     'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg',
     'tst.snapshot_recursive', 'tst.snapshot_rename', 'tst.snapshot_simple',
     'tst.bookmark.create', 'tst.bookmark.copy',
     'tst.terminate_by_signal'
     ]
 tags = ['functional', 'channel_program', 'synctask_core']
 
 [tests/functional/checksum]
 tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'run_blake3_test',
     'filetest_001_pos', 'filetest_002_pos']
 tags = ['functional', 'checksum']
 
 [tests/functional/clean_mirror]
 tests = ['clean_mirror_001_pos', 'clean_mirror_002_pos',
     'clean_mirror_003_pos', 'clean_mirror_004_pos']
 tags = ['functional', 'clean_mirror']
 
 [tests/functional/cli_root/json]
 tests = ['json_sanity']
 tags = ['functional', 'cli_root', 'json']
 
 [tests/functional/cli_root/zinject]
 tests = ['zinject_args', 'zinject_counts', 'zinject_probe']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zinject']
 
 [tests/functional/cli_root/zdb]
 tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
     'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos',
     'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress',
     'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum',
     'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id',
     'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zdb']
 timeout = 1200
 
 [tests/functional/cli_root/zfs]
 tests = ['zfs_001_neg', 'zfs_002_pos']
 tags = ['functional', 'cli_root', 'zfs']
 
 [tests/functional/cli_root/zfs_bookmark]
 tests = ['zfs_bookmark_cliargs']
 tags = ['functional', 'cli_root', 'zfs_bookmark']
 
 [tests/functional/cli_root/zfs_change-key]
 tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format',
     'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location',
     'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones']
 tags = ['functional', 'cli_root', 'zfs_change-key']
 
 [tests/functional/cli_root/zfs_clone]
 tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos',
     'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos',
     'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg',
     'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested',
     'zfs_clone_rm_nested']
 tags = ['functional', 'cli_root', 'zfs_clone']
 
 [tests/functional/cli_root/zfs_copies]
 tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos',
     'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos']
 tags = ['functional', 'cli_root', 'zfs_copies']
 
 [tests/functional/cli_root/zfs_create]
 tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos',
     'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos',
     'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg',
     'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos',
     'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted',
     'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_nomount',
     'zfs_create_verbose']
 tags = ['functional', 'cli_root', 'zfs_create']
 
 [tests/functional/cli_root/zpool_prefetch]
 tests = ['zpool_prefetch_001_pos']
 tags = ['functional', 'cli_root', 'zpool_prefetch']
 
 [tests/functional/cli_root/zfs_destroy]
 tests = ['zfs_clone_livelist_condense_and_disable',
     'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup',
     'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
     'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
     'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
     'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
     'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos',
     'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist',
     'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense']
 tags = ['functional', 'cli_root', 'zfs_destroy']
 
 [tests/functional/cli_root/zfs_diff]
 tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp',
     'zfs_diff_types', 'zfs_diff_encrypted', 'zfs_diff_mangle']
 tags = ['functional', 'cli_root', 'zfs_diff']
 
 [tests/functional/cli_root/zfs_get]
 tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos',
     'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg',
     'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg']
 tags = ['functional', 'cli_root', 'zfs_get']
 
 [tests/functional/cli_root/zfs_ids_to_path]
 tests = ['zfs_ids_to_path_001_pos']
 tags = ['functional', 'cli_root', 'zfs_ids_to_path']
 
 [tests/functional/cli_root/zfs_inherit]
 tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos',
     'zfs_inherit_mountpoint']
 tags = ['functional', 'cli_root', 'zfs_inherit']
 
 [tests/functional/cli_root/zfs_load-key]
 tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file',
     'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop',
     'zfs_load-key_recursive']
 tags = ['functional', 'cli_root', 'zfs_load-key']
 
 [tests/functional/cli_root/zfs_mount]
 tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos',
     'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
     'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted',
     'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
     'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
 [tests/functional/cli_root/zfs_program]
 tests = ['zfs_program_json']
 tags = ['functional', 'cli_root', 'zfs_program']
 
 [tests/functional/cli_root/zfs_promote]
 tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos',
     'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg',
     'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot']
 tags = ['functional', 'cli_root', 'zfs_promote']
 
 [tests/functional/cli_root/zfs_property]
 tests = ['zfs_written_property_001_pos']
 tags = ['functional', 'cli_root', 'zfs_property']
 
 [tests/functional/cli_root/zfs_receive]
 tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
     'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos',
     'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg',
     'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos',
     'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos',
     'zfs_receive_016_pos', 'receive-o-x_props_override',
     'receive-o-x_props_aliases',
     'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted',
     'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e',
     'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props',
     'zfs_receive_-wR-encrypted-mix', 'zfs_receive_corrective',
     'zfs_receive_compressed_corrective', 'zfs_receive_large_block_corrective']
 tags = ['functional', 'cli_root', 'zfs_receive']
 
 [tests/functional/cli_root/zfs_rename]
 tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos',
     'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos',
     'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg',
     'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg',
     'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child',
     'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount']
 tags = ['functional', 'cli_root', 'zfs_rename']
 
 [tests/functional/cli_root/zfs_reservation]
 tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
 tags = ['functional', 'cli_root', 'zfs_reservation']
 
 [tests/functional/cli_root/zfs_rollback]
 tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos',
     'zfs_rollback_003_neg', 'zfs_rollback_004_neg']
 tags = ['functional', 'cli_root', 'zfs_rollback']
 
 [tests/functional/cli_root/zfs_send]
 tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos',
     'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos',
     'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_encrypted_unloaded',
     'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing']
 tags = ['functional', 'cli_root', 'zfs_send']
 
 [tests/functional/cli_root/zfs_set]
 tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos',
     'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos',
     'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos',
     'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos',
     'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos',
     'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos',
     'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg',
     'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos',
     'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation',
     'zfs_set_feature_activation', 'zfs_set_nomount']
 tags = ['functional', 'cli_root', 'zfs_set']
 
 [tests/functional/cli_root/zfs_share]
 tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos',
     'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg',
     'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares',
     'zfs_share_after_mount']
 tags = ['functional', 'cli_root', 'zfs_share']
 
 [tests/functional/cli_root/zfs_snapshot]
 tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg',
     'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg',
     'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg',
     'zfs_snapshot_009_pos']
 tags = ['functional', 'cli_root', 'zfs_snapshot']
 
 [tests/functional/cli_root/zfs_unload-key]
 tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive']
 tags = ['functional', 'cli_root', 'zfs_unload-key']
 
 [tests/functional/cli_root/zfs_unmount]
 tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos',
     'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos',
     'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos',
     'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys']
 tags = ['functional', 'cli_root', 'zfs_unmount']
 
 [tests/functional/cli_root/zfs_unshare]
 tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos',
     'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos',
     'zfs_unshare_007_pos']
 tags = ['functional', 'cli_root', 'zfs_unshare']
 
 [tests/functional/cli_root/zfs_upgrade]
 tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos',
     'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg',
     'zfs_upgrade_007_neg']
 tags = ['functional', 'cli_root', 'zfs_upgrade']
 
 [tests/functional/cli_root/zfs_wait]
 tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt']
 tags = ['functional', 'cli_root', 'zfs_wait']
 
 [tests/functional/cli_root/zhack]
 tests = ['zhack_label_repair_001', 'zhack_label_repair_002',
     'zhack_label_repair_003', 'zhack_label_repair_004']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zhack']
 
 [tests/functional/cli_root/zpool]
 tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors']
 tags = ['functional', 'cli_root', 'zpool']
 
 [tests/functional/cli_root/zpool_add]
 tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos',
     'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg',
     'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos',
     'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output']
 tags = ['functional', 'cli_root', 'zpool_add']
 
 [tests/functional/cli_root/zpool_attach]
 tests = ['zpool_attach_001_neg', 'attach-o_ashift']
 tags = ['functional', 'cli_root', 'zpool_attach']
 
 [tests/functional/cli_root/zpool_clear]
 tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg',
     'zpool_clear_readonly']
 tags = ['functional', 'cli_root', 'zpool_clear']
 
 [tests/functional/cli_root/zpool_create]
 tests = ['zpool_create_001_pos', 'zpool_create_002_pos',
     'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos',
     'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos',
     'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg',
     'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg',
     'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos',
     'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos',
     'zpool_create_023_neg', 'zpool_create_024_pos',
     'zpool_create_encrypted', 'zpool_create_crypt_combos',
     'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos',
     'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos',
     'zpool_create_features_001_pos', 'zpool_create_features_002_pos',
     'zpool_create_features_003_pos', 'zpool_create_features_004_neg',
     'zpool_create_features_005_pos', 'zpool_create_features_006_pos',
     'zpool_create_features_007_pos', 'zpool_create_features_008_pos',
     'zpool_create_features_009_pos', 'create-o_ashift',
     'zpool_create_tempname', 'zpool_create_dryrun_output']
 tags = ['functional', 'cli_root', 'zpool_create']
 
 [tests/functional/cli_root/zpool_destroy]
 tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos',
     'zpool_destroy_003_neg']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zpool_destroy']
 
 [tests/functional/cli_root/zpool_detach]
 tests = ['zpool_detach_001_neg']
 tags = ['functional', 'cli_root', 'zpool_detach']
 
 [tests/functional/cli_root/zpool_events]
 tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow',
     'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates',
     'zpool_events_clear_retained']
 tags = ['functional', 'cli_root', 'zpool_events']
 
 [tests/functional/cli_root/zpool_export]
 tests = ['zpool_export_001_pos', 'zpool_export_002_pos',
     'zpool_export_003_neg', 'zpool_export_004_pos',
     'zpool_export_parallel_pos', 'zpool_export_parallel_admin']
 tags = ['functional', 'cli_root', 'zpool_export']
 
 [tests/functional/cli_root/zpool_get]
 tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos',
     'zpool_get_004_neg', 'zpool_get_005_pos', 'vdev_get_001_pos']
 tags = ['functional', 'cli_root', 'zpool_get']
 
 [tests/functional/cli_root/zpool_history]
 tests = ['zpool_history_001_neg', 'zpool_history_002_pos']
 tags = ['functional', 'cli_root', 'zpool_history']
 
 [tests/functional/cli_root/zpool_import]
 tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
     'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos',
     'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos',
     'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg',
     'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos',
     'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos',
     'zpool_import_features_001_pos', 'zpool_import_features_002_neg',
     'zpool_import_features_003_pos', 'zpool_import_missing_001_pos',
     'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos',
     'zpool_import_rename_001_pos', 'zpool_import_all_001_pos',
     'zpool_import_encrypted', 'zpool_import_encrypted_load',
     'zpool_import_errata3', 'zpool_import_errata4',
     'import_cachefile_device_added',
     'import_cachefile_device_removed',
     'import_cachefile_device_replaced',
     'import_cachefile_mirror_attached',
     'import_cachefile_mirror_detached',
     'import_cachefile_paths_changed',
     'import_cachefile_shared_device',
     'import_devices_missing', 'import_log_missing',
     'import_paths_changed',
     'import_rewind_config_changed',
     'import_rewind_device_replaced',
     'zpool_import_status', 'zpool_import_parallel_pos',
     'zpool_import_parallel_neg', 'zpool_import_parallel_admin']
 tags = ['functional', 'cli_root', 'zpool_import']
 timeout = 1200
 
 [tests/functional/cli_root/zpool_labelclear]
 tests = ['zpool_labelclear_active', 'zpool_labelclear_exported',
     'zpool_labelclear_removed', 'zpool_labelclear_valid']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zpool_labelclear']
 
 [tests/functional/cli_root/zpool_initialize]
 tests = ['zpool_initialize_attach_detach_add_remove',
     'zpool_initialize_fault_export_import_online',
     'zpool_initialize_import_export',
     'zpool_initialize_offline_export_import_online',
     'zpool_initialize_online_offline',
     'zpool_initialize_split',
     'zpool_initialize_start_and_cancel_neg',
     'zpool_initialize_start_and_cancel_pos',
     'zpool_initialize_suspend_resume',
     'zpool_initialize_uninit',
     'zpool_initialize_unsupported_vdevs',
     'zpool_initialize_verify_checksums',
     'zpool_initialize_verify_initialized']
 pre =
 tags = ['functional', 'cli_root', 'zpool_initialize']
 
 [tests/functional/cli_root/zpool_offline]
 tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg',
     'zpool_offline_003_pos']
 tags = ['functional', 'cli_root', 'zpool_offline']
 
 [tests/functional/cli_root/zpool_online]
 tests = ['zpool_online_001_pos', 'zpool_online_002_neg']
 tags = ['functional', 'cli_root', 'zpool_online']
 
 [tests/functional/cli_root/zpool_reguid]
 tests = ['zpool_reguid_001_pos', 'zpool_reguid_002_neg']
 tags = ['functional', 'cli_root', 'zpool_reguid']
 
 [tests/functional/cli_root/zpool_remove]
 tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos',
     'zpool_remove_003_pos']
 tags = ['functional', 'cli_root', 'zpool_remove']
 
 [tests/functional/cli_root/zpool_replace]
 tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift']
 tags = ['functional', 'cli_root', 'zpool_replace']
 
 [tests/functional/cli_root/zpool_resilver]
 tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart',
     'zpool_resilver_concurrent']
 tags = ['functional', 'cli_root', 'zpool_resilver']
 
 [tests/functional/cli_root/zpool_scrub]
 tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
     'zpool_scrub_004_pos', 'zpool_scrub_005_pos',
     'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing',
     'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies',
     'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos',
     'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos']
 tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
 tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
     'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
     'user_property_001_pos', 'user_property_002_neg',
     'zpool_set_clear_userprop']
 tags = ['functional', 'cli_root', 'zpool_set']
 
 [tests/functional/cli_root/zpool_split]
 tests = ['zpool_split_cliargs', 'zpool_split_devices',
     'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs',
     'zpool_split_resilver', 'zpool_split_indirect',
     'zpool_split_dryrun_output']
 tags = ['functional', 'cli_root', 'zpool_split']
 
 [tests/functional/cli_root/zpool_status]
 tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
     'zpool_status_003_pos', 'zpool_status_004_pos',
     'zpool_status_005_pos', 'zpool_status_006_pos',
     'zpool_status_007_pos', 'zpool_status_008_pos',
     'zpool_status_features_001_pos']
 tags = ['functional', 'cli_root', 'zpool_status']
 
 [tests/functional/cli_root/zpool_sync]
 tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg']
 tags = ['functional', 'cli_root', 'zpool_sync']
 
 [tests/functional/cli_root/zpool_trim]
 tests = ['zpool_trim_attach_detach_add_remove',
     'zpool_trim_fault_export_import_online',
     'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg',
     'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline',
     'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg',
     'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg',
     'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume',
     'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums',
     'zpool_trim_verify_trimmed']
 tags = ['functional', 'cli_root', 'zpool_trim']
 
 [tests/functional/cli_root/zpool_upgrade]
 tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos',
     'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos',
     'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg',
     'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos',
     'zpool_upgrade_009_neg', 'zpool_upgrade_features_001_pos']
 tags = ['functional', 'cli_root', 'zpool_upgrade']
 
 [tests/functional/cli_root/zpool_wait]
 tests = ['zpool_wait_discard', 'zpool_wait_freeing',
     'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel',
     'zpool_wait_initialize_flag', 'zpool_wait_multiple',
     'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel',
     'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag',
     'zpool_wait_usage']
 tags = ['functional', 'cli_root', 'zpool_wait']
 
 [tests/functional/cli_root/zpool_wait/scan]
 tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild',
     'zpool_wait_resilver', 'zpool_wait_scrub_cancel',
     'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag']
 tags = ['functional', 'cli_root', 'zpool_wait']
 
 [tests/functional/cli_user/misc]
 tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg',
     'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg',
     'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg',
     'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg',
     'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg',
     'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg',
     'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg',
     'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg',
     'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg',
     'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg',
     'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg',
     'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg',
     'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg',
     'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos',
     'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege',
     'zilstat_001_pos']
 user =
 tags = ['functional', 'cli_user', 'misc']
 
 [tests/functional/cli_user/zfs_list]
 tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos',
     'zfs_list_004_neg', 'zfs_list_005_neg', 'zfs_list_007_pos',
     'zfs_list_008_neg']
 user =
 tags = ['functional', 'cli_user', 'zfs_list']
 
 [tests/functional/cli_user/zpool_iostat]
 tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos',
     'zpool_iostat_003_neg', 'zpool_iostat_004_pos',
     'zpool_iostat_005_pos', 'zpool_iostat_-c_disable',
     'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath']
 user =
 tags = ['functional', 'cli_user', 'zpool_iostat']
 
 [tests/functional/cli_user/zpool_list]
 tests = ['zpool_list_001_pos', 'zpool_list_002_neg']
 user =
 tags = ['functional', 'cli_user', 'zpool_list']
 
 [tests/functional/cli_user/zpool_status]
 tests = ['zpool_status_003_pos', 'zpool_status_-c_disable',
     'zpool_status_-c_homedir', 'zpool_status_-c_searchpath']
 user =
 tags = ['functional', 'cli_user', 'zpool_status']
 
 [tests/functional/compression]
 tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
     'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled',
     'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc']
 tags = ['functional', 'compression']
 
 [tests/functional/cp_files]
 tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
 tags = ['functional', 'cp_files']
 
 [tests/functional/zap_shrink]
 tests = ['zap_shrink_001_pos']
 tags = ['functional', 'zap_shrink']
 
 [tests/functional/crtime]
 tests = ['crtime_001_pos' ]
 tags = ['functional', 'crtime']
 
 [tests/functional/crypto]
 tests = ['icp_aes_ccm', 'icp_aes_gcm']
 pre =
 post =
 tags = ['functional', 'crypto']
 
 [tests/functional/ctime]
 tests = ['ctime_001_pos' ]
 tags = ['functional', 'ctime']
 
 [tests/functional/deadman]
 tests = ['deadman_ratelimit', 'deadman_sync', 'deadman_zio']
 pre =
 post =
 tags = ['functional', 'deadman']
 
 [tests/functional/dedup]
 tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing',
     'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade',
     'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_zap_shrink']
 pre =
 post =
 tags = ['functional', 'dedup']
 
 [tests/functional/delegate]
 tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos',
     'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos',
     'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg',
     'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg',
     'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos',
     'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos',
     'zfs_unallow_007_neg', 'zfs_unallow_008_neg']
 tags = ['functional', 'delegate']
 
 [tests/functional/direct]
 tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines',
     'dio_compression', 'dio_dedup', 'dio_encryption', 'dio_grow_block',
     'dio_max_recordsize', 'dio_mixed', 'dio_mmap', 'dio_overwrites',
     'dio_property', 'dio_random', 'dio_read_verify', 'dio_recordsize',
     'dio_unaligned_block', 'dio_unaligned_filesize']
 tags = ['functional', 'direct']
 
 [tests/functional/exec]
 tests = ['exec_001_pos', 'exec_002_neg']
 tags = ['functional', 'exec']
 
 [tests/functional/fallocate]
 tests = ['fallocate_punch-hole']
 tags = ['functional', 'fallocate']
 
 [tests/functional/features/async_destroy]
 tests = ['async_destroy_001_pos']
 tags = ['functional', 'features', 'async_destroy']
 
 [tests/functional/features/large_dnode]
 tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
     'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos']
 tags = ['functional', 'features', 'large_dnode']
 
+[tests/functional/gang_blocks]
+tests = ['gang_blocks_redundant']
+tags = ['functional', 'gang_blocks']
+
 [tests/functional/grow]
 pre =
 post =
 tests = ['grow_pool_001_pos', 'grow_replicas_001_pos']
 tags = ['functional', 'grow']
 
 [tests/functional/history]
 tests = ['history_001_pos', 'history_002_pos', 'history_003_pos',
     'history_004_pos', 'history_005_neg', 'history_006_neg',
     'history_007_pos', 'history_008_pos', 'history_009_pos',
     'history_010_pos']
 tags = ['functional', 'history']
 
 [tests/functional/hkdf]
 pre =
 post =
 tests = ['hkdf_test']
 tags = ['functional', 'hkdf']
 
 [tests/functional/inheritance]
 tests = ['inherit_001_pos']
 pre =
 tags = ['functional', 'inheritance']
 
 [tests/functional/io]
 tests = ['mmap', 'posixaio', 'psync', 'sync']
 tags = ['functional', 'io']
 
 [tests/functional/inuse]
 tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos']
 post =
 tags = ['functional', 'inuse']
 
 [tests/functional/large_files]
 tests = ['large_files_001_pos', 'large_files_002_pos']
 tags = ['functional', 'large_files']
 
 [tests/functional/limits]
 tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count',
     'snapshot_limit']
 tags = ['functional', 'limits']
 
 [tests/functional/link_count]
 tests = ['link_count_001', 'link_count_root_inode']
 tags = ['functional', 'link_count']
 
 [tests/functional/migration]
 tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
     'migration_004_pos', 'migration_005_pos', 'migration_006_pos',
     'migration_007_pos', 'migration_008_pos', 'migration_009_pos',
     'migration_010_pos', 'migration_011_pos', 'migration_012_pos']
 tags = ['functional', 'migration']
 
 [tests/functional/mmap]
 tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos',
     'mmap_sync_001_pos', 'mmap_write_001_pos']
 tags = ['functional', 'mmap']
 
 [tests/functional/mount]
 tests = ['umount_001', 'umountall_001']
 tags = ['functional', 'mount']
 
 [tests/functional/mv_files]
 tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation']
 tags = ['functional', 'mv_files']
 
 [tests/functional/nestedfs]
 tests = ['nestedfs_001_pos']
 tags = ['functional', 'nestedfs']
 
 [tests/functional/no_space]
 tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos',
     'enospc_df', 'enospc_ganging', 'enospc_rm']
 tags = ['functional', 'no_space']
 
 [tests/functional/nopwrite]
 tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative',
     'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync',
     'nopwrite_varying_compression', 'nopwrite_volume']
 tags = ['functional', 'nopwrite']
 
 [tests/functional/online_offline]
 tests = ['online_offline_001_pos', 'online_offline_002_neg',
     'online_offline_003_neg']
 tags = ['functional', 'online_offline']
 
 [tests/functional/pool_checkpoint]
 tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind',
     'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard',
     'checkpoint_discard_busy', 'checkpoint_discard_many',
     'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz',
     'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind',
     'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice',
     'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat']
 tags = ['functional', 'pool_checkpoint']
 timeout = 1800
 
 [tests/functional/pool_names]
 tests = ['pool_names_001_pos', 'pool_names_002_neg']
 pre =
 post =
 tags = ['functional', 'pool_names']
 
 [tests/functional/poolversion]
 tests = ['poolversion_001_pos', 'poolversion_002_pos']
 tags = ['functional', 'poolversion']
 
 [tests/functional/pyzfs]
 tests = ['pyzfs_unittest']
 pre =
 post =
 tags = ['functional', 'pyzfs']
 
 [tests/functional/quota]
 tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos',
          'quota_004_pos', 'quota_005_pos', 'quota_006_neg']
 tags = ['functional', 'quota']
 
 [tests/functional/redacted_send]
 tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted',
     'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes',
     'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones',
     'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative',
     'redacted_origin', 'redacted_panic', 'redacted_props', 'redacted_resume',
     'redacted_size', 'redacted_volume']
 tags = ['functional', 'redacted_send']
 
 [tests/functional/raidz]
 tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_expand_001_pos',
     'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos',
     'raidz_expand_004_pos', 'raidz_expand_005_pos', 'raidz_expand_006_neg',
     'raidz_expand_007_neg']
 tags = ['functional', 'raidz']
 timeout = 1200
 
 [tests/functional/redundancy]
 tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
     'redundancy_draid3', 'redundancy_draid_damaged1',
     'redundancy_draid_damaged2', 'redundancy_draid_spare1',
     'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
     'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
     'redundancy_raidz3', 'redundancy_stripe']
 tags = ['functional', 'redundancy']
 timeout = 1200
 
 [tests/functional/refquota]
 tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',
     'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg',
     'refquota_007_neg', 'refquota_008_neg']
 tags = ['functional', 'refquota']
 
 [tests/functional/refreserv]
 tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
     'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz',
     'refreserv_raidz']
 tags = ['functional', 'refreserv']
 
 [tests/functional/removal]
 pre =
 tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space',
     'removal_condense_export', 'removal_multiple_indirection',
     'removal_nopwrite', 'removal_remap_deadlists',
     'removal_resume_export', 'removal_sanity', 'removal_with_add',
     'removal_with_create_fs', 'removal_with_dedup',
     'removal_with_errors', 'removal_with_export', 'removal_with_indirect',
     'removal_with_ganging', 'removal_with_faulted',
     'removal_with_remove', 'removal_with_scrub', 'removal_with_send',
     'removal_with_send_recv', 'removal_with_snapshot',
     'removal_with_write', 'removal_with_zdb', 'remove_expanded',
     'remove_mirror', 'remove_mirror_sanity', 'remove_raidz',
     'remove_indirect', 'remove_attach_mirror', 'removal_reservation',
     'removal_with_hole']
 tags = ['functional', 'removal']
 
 [tests/functional/rename_dirs]
 tests = ['rename_dirs_001_pos']
 tags = ['functional', 'rename_dirs']
 
 [tests/functional/replacement]
 tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
     'attach_resilver', 'detach', 'rebuild_disabled_feature',
     'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
     'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
     'scrub_cancel']
 tags = ['functional', 'replacement']
 
 [tests/functional/reservation]
 tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
     'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos',
     'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos',
     'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos',
     'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos',
     'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos',
     'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg',
     'reservation_022_pos']
 tags = ['functional', 'reservation']
 
 [tests/functional/rootpool]
 tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos']
 tags = ['functional', 'rootpool']
 
 [tests/functional/rsend]
 tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos',
     'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos',
     'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos',
     'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos',
     'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos',
     'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'rsend_025_pos',
     'rsend_026_neg', 'rsend_027_pos', 'rsend_028_neg', 'rsend_029_neg',
     'rsend_030_pos', 'rsend_031_pos', 'send-c_verify_ratio',
     'send-c_verify_contents', 'send-c_props', 'send-c_incremental',
     'send-c_volume', 'send-c_zstream_recompress', 'send-c_zstreamdump',
     'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
     'send-c_mixed_compression', 'send-c_stream_size_estimate',
     'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
     'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_incremental',
     'send_encrypted_freeobjects', 'send_encrypted_hierarchy',
     'send_encrypted_props', 'send_encrypted_truncated_files',
     'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files',
     'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw',
     'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid',
     'send_doall', 'send_raw_spill_block', 'send_raw_ashift',
     'send_raw_large_blocks']
 tags = ['functional', 'rsend']
 
 [tests/functional/scrub_mirror]
 tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
     'scrub_mirror_003_pos', 'scrub_mirror_004_pos']
 tags = ['functional', 'scrub_mirror']
 
 [tests/functional/slog]
 tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
     'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg',
     'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg',
     'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001',
     'slog_replay_fs_002', 'slog_replay_volume', 'slog_016_pos']
 tags = ['functional', 'slog']
 
 [tests/functional/snapshot]
 tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos',
     'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos',
     'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos',
     'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos',
     'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos',
     'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos',
     'snapshot_017_pos', 'snapshot_018_pos']
 tags = ['functional', 'snapshot']
 
 [tests/functional/snapused]
 tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos',
     'snapused_004_pos', 'snapused_005_pos']
 tags = ['functional', 'snapused']
 
 [tests/functional/sparse]
 tests = ['sparse_001_pos']
 tags = ['functional', 'sparse']
 
 [tests/functional/stat]
 tests = ['stat_001_pos', 'statx_dioalign']
 tags = ['functional', 'stat']
 
 [tests/functional/suid]
 tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid',
     'suid_write_to_none', 'suid_write_zil_replay']
 tags = ['functional', 'suid']
 
 [tests/functional/trim]
 tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity',
     'trim_integrity', 'trim_config', 'trim_l2arc']
 tags = ['functional', 'trim']
 
 [tests/functional/truncate]
 tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps']
 tags = ['functional', 'truncate']
 
 [tests/functional/upgrade]
 tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool']
 tags = ['functional', 'upgrade']
 
 [tests/functional/userquota]
 tests = [
     'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos',
     'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos',
     'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos',
     'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg',
     'userspace_001_pos', 'userspace_002_pos', 'userspace_encrypted',
     'userspace_send_encrypted', 'userspace_encrypted_13709']
 tags = ['functional', 'userquota']
 
 [tests/functional/vdev_disk:Linux]
 pre =
 post =
 tests = ['page_alignment']
 tags = ['functional', 'vdev_disk']
 
 [tests/functional/vdev_zaps]
 tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos',
     'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos',
     'vdev_zaps_007_pos']
 tags = ['functional', 'vdev_zaps']
 
 [tests/functional/write_dirs]
 tests = ['write_dirs_001_pos', 'write_dirs_002_pos']
 tags = ['functional', 'write_dirs']
 
 [tests/functional/xattr]
 tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos',
     'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg',
     'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos', 'xattr_compat']
 tags = ['functional', 'xattr']
 
 [tests/functional/zvol/zvol_ENOSPC]
 tests = ['zvol_ENOSPC_001_pos']
 tags = ['functional', 'zvol', 'zvol_ENOSPC']
 
 [tests/functional/zvol/zvol_cli]
 tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg']
 tags = ['functional', 'zvol', 'zvol_cli']
 
 [tests/functional/zvol/zvol_misc]
 tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse',
     'zvol_misc_snapdev', 'zvol_misc_trim', 'zvol_misc_volmode', 'zvol_misc_zil']
 tags = ['functional', 'zvol', 'zvol_misc']
 
 [tests/functional/zvol/zvol_stress]
 tests = ['zvol_stress']
 tags = ['functional', 'zvol', 'zvol_stress']
 
 [tests/functional/zvol/zvol_swap]
 tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos']
 tags = ['functional', 'zvol', 'zvol_swap']
 
 [tests/functional/libzfs]
 tests = ['many_fds', 'libzfs_input']
 tags = ['functional', 'libzfs']
 
 [tests/functional/log_spacemap]
 tests = ['log_spacemap_import_logs']
 pre =
 post =
 tags = ['functional', 'log_spacemap']
 
 [tests/functional/l2arc]
 tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos',
     'persist_l2arc_001_pos', 'persist_l2arc_002_pos',
     'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos']
 tags = ['functional', 'l2arc']
 
 [tests/functional/zpool_influxdb]
 tests = ['zpool_influxdb']
 tags = ['functional', 'zpool_influxdb']
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 0a546dd44553..79dc64ad9350 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -1,116 +1,117 @@
 # This file exports variables for each tunable used in the test suite.
 #
 # Different platforms use different names for most tunables. To avoid littering
 # the tests with conditional logic for deciding how to set each tunable, the
 # logic is instead consolidated to this one file.
 #
 # Any use of tunables in tests must use a name defined here. New entries
 # should be added to the table as needed. Please keep the table sorted
 # alphabetically for ease of maintenance.
 #
 # Platform-specific tunables should still use a NAME from this table for
 # consistency. Enter UNSUPPORTED in the column for platforms on which the
 # tunable is not implemented.
 
 UNAME=$(uname)
 
 # NAME				FreeBSD tunable			Linux tunable
 cat <<%%%% |
 ADMIN_SNAPSHOT			UNSUPPORTED			zfs_admin_snapshot
 ALLOW_REDACTED_DATASET_MOUNT	allow_redacted_dataset_mount	zfs_allow_redacted_dataset_mount
 ARC_MAX				arc.max				zfs_arc_max
 ARC_MIN				arc.min				zfs_arc_min
 ASYNC_BLOCK_MAX_BLOCKS		async_block_max_blocks		zfs_async_block_max_blocks
 CHECKSUM_EVENTS_PER_SECOND	checksum_events_per_second	zfs_checksum_events_per_second
 COMMIT_TIMEOUT_PCT		commit_timeout_pct		zfs_commit_timeout_pct
 COMPRESSED_ARC_ENABLED		compressed_arc_enabled		zfs_compressed_arc_enabled
 CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS	condense.indirect_commit_entry_delay_ms	zfs_condense_indirect_commit_entry_delay_ms
 CONDENSE_INDIRECT_OBSOLETE_PCT	condense.indirect_obsolete_pct	zfs_condense_indirect_obsolete_pct
 CONDENSE_MIN_MAPPING_BYTES	condense.min_mapping_bytes	zfs_condense_min_mapping_bytes
 DBUF_CACHE_SHIFT		dbuf.cache_shift		dbuf_cache_shift
 DDT_ZAP_DEFAULT_BS		dedup.ddt_zap_default_bs	ddt_zap_default_bs
 DDT_ZAP_DEFAULT_IBS		dedup.ddt_zap_default_ibs	ddt_zap_default_ibs
 DDT_DATA_IS_SPECIAL		ddt_data_is_special		zfs_ddt_data_is_special
 DEDUP_LOG_TXG_MAX		dedup.log_txg_max		zfs_dedup_log_txg_max
 DEDUP_LOG_FLUSH_ENTRIES_MAX	dedup.log_flush_entries_max	zfs_dedup_log_flush_entries_max
 DEDUP_LOG_FLUSH_ENTRIES_MIN	dedup.log_flush_entries_min	zfs_dedup_log_flush_entries_min
 DEADMAN_CHECKTIME_MS		deadman.checktime_ms		zfs_deadman_checktime_ms
 DEADMAN_EVENTS_PER_SECOND	deadman_events_per_second	zfs_deadman_events_per_second
 DEADMAN_FAILMODE		deadman.failmode		zfs_deadman_failmode
 DEADMAN_SYNCTIME_MS		deadman.synctime_ms		zfs_deadman_synctime_ms
 DEADMAN_ZIOTIME_MS		deadman.ziotime_ms		zfs_deadman_ziotime_ms
 DISABLE_IVSET_GUID_CHECK	disable_ivset_guid_check	zfs_disable_ivset_guid_check
 DMU_OFFSET_NEXT_SYNC		dmu_offset_next_sync		zfs_dmu_offset_next_sync
 EMBEDDED_SLOG_MIN_MS		embedded_slog_min_ms		zfs_embedded_slog_min_ms
 INITIALIZE_CHUNK_SIZE		initialize_chunk_size		zfs_initialize_chunk_size
 INITIALIZE_VALUE		initialize_value		zfs_initialize_value
 KEEP_LOG_SPACEMAPS_AT_EXPORT	keep_log_spacemaps_at_export	zfs_keep_log_spacemaps_at_export
 LUA_MAX_MEMLIMIT		lua.max_memlimit		zfs_lua_max_memlimit
 L2ARC_MFUONLY			l2arc.mfuonly			l2arc_mfuonly
 L2ARC_NOPREFETCH		l2arc.noprefetch		l2arc_noprefetch
 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE	l2arc.rebuild_blocks_min_l2size	l2arc_rebuild_blocks_min_l2size
 L2ARC_REBUILD_ENABLED		l2arc.rebuild_enabled		l2arc_rebuild_enabled
 L2ARC_TRIM_AHEAD		l2arc.trim_ahead		l2arc_trim_ahead
 L2ARC_WRITE_BOOST		l2arc.write_boost		l2arc_write_boost
 L2ARC_WRITE_MAX			l2arc.write_max			l2arc_write_max
 LIVELIST_CONDENSE_NEW_ALLOC	livelist.condense.new_alloc	zfs_livelist_condense_new_alloc
 LIVELIST_CONDENSE_SYNC_CANCEL	livelist.condense.sync_cancel	zfs_livelist_condense_sync_cancel
 LIVELIST_CONDENSE_SYNC_PAUSE	livelist.condense.sync_pause	zfs_livelist_condense_sync_pause
 LIVELIST_CONDENSE_ZTHR_CANCEL	livelist.condense.zthr_cancel	zfs_livelist_condense_zthr_cancel
 LIVELIST_CONDENSE_ZTHR_PAUSE	livelist.condense.zthr_pause	zfs_livelist_condense_zthr_pause
 LIVELIST_MAX_ENTRIES		livelist.max_entries		zfs_livelist_max_entries
 LIVELIST_MIN_PERCENT_SHARED	livelist.min_percent_shared	zfs_livelist_min_percent_shared
 MAX_DATASET_NESTING		max_dataset_nesting		zfs_max_dataset_nesting
 MAX_MISSING_TVDS		max_missing_tvds		zfs_max_missing_tvds
 METASLAB_DEBUG_LOAD		metaslab.debug_load		metaslab_debug_load
 METASLAB_FORCE_GANGING		metaslab.force_ganging		metaslab_force_ganging
+METASLAB_FORCE_GANGING_PCT	metaslab.force_ganging_pct	metaslab_force_ganging_pct
 MULTIHOST_FAIL_INTERVALS	multihost.fail_intervals	zfs_multihost_fail_intervals
 MULTIHOST_HISTORY		multihost.history		zfs_multihost_history
 MULTIHOST_IMPORT_INTERVALS	multihost.import_intervals	zfs_multihost_import_intervals
 MULTIHOST_INTERVAL		multihost.interval		zfs_multihost_interval
 OVERRIDE_ESTIMATE_RECORDSIZE	send.override_estimate_recordsize	zfs_override_estimate_recordsize
 PREFETCH_DISABLE		prefetch.disable		zfs_prefetch_disable
 RAIDZ_EXPAND_MAX_REFLOW_BYTES	vdev.expand_max_reflow_bytes	raidz_expand_max_reflow_bytes
 REBUILD_SCRUB_ENABLED		rebuild_scrub_enabled		zfs_rebuild_scrub_enabled
 REMOVAL_SUSPEND_PROGRESS	removal_suspend_progress	zfs_removal_suspend_progress
 REMOVE_MAX_SEGMENT		remove_max_segment		zfs_remove_max_segment
 RESILVER_MIN_TIME_MS		resilver_min_time_ms		zfs_resilver_min_time_ms
 RESILVER_DEFER_PERCENT		resilver_defer_percent		zfs_resilver_defer_percent
 SCAN_LEGACY			scan_legacy			zfs_scan_legacy
 SCAN_SUSPEND_PROGRESS		scan_suspend_progress		zfs_scan_suspend_progress
 SCAN_VDEV_LIMIT			scan_vdev_limit			zfs_scan_vdev_limit
 SCRUB_AFTER_EXPAND		scrub_after_expand		zfs_scrub_after_expand
 SEND_HOLES_WITHOUT_BIRTH_TIME	send_holes_without_birth_time	send_holes_without_birth_time
 SLOW_IO_EVENTS_PER_SECOND	slow_io_events_per_second	zfs_slow_io_events_per_second
 SPA_ASIZE_INFLATION		spa.asize_inflation		spa_asize_inflation
 SPA_DISCARD_MEMORY_LIMIT	spa.discard_memory_limit	zfs_spa_discard_memory_limit
 SPA_LOAD_VERIFY_DATA		spa.load_verify_data		spa_load_verify_data
 SPA_LOAD_VERIFY_METADATA	spa.load_verify_metadata	spa_load_verify_metadata
 TRIM_EXTENT_BYTES_MIN		trim.extent_bytes_min		zfs_trim_extent_bytes_min
 TRIM_METASLAB_SKIP		trim.metaslab_skip		zfs_trim_metaslab_skip
 TRIM_TXG_BATCH			trim.txg_batch			zfs_trim_txg_batch
 TXG_HISTORY			txg.history			zfs_txg_history
 TXG_TIMEOUT			txg.timeout			zfs_txg_timeout
 UNLINK_SUSPEND_PROGRESS		UNSUPPORTED			zfs_unlink_suspend_progress
 VDEV_FILE_LOGICAL_ASHIFT	vdev.file.logical_ashift	vdev_file_logical_ashift
 VDEV_FILE_PHYSICAL_ASHIFT	vdev.file.physical_ashift	vdev_file_physical_ashift
 VDEV_MAX_AUTO_ASHIFT		vdev.max_auto_ashift		zfs_vdev_max_auto_ashift
 VDEV_MIN_MS_COUNT		vdev.min_ms_count		zfs_vdev_min_ms_count
 VDEV_DIRECT_WR_VERIFY		vdev.direct_write_verify	zfs_vdev_direct_write_verify
 VDEV_VALIDATE_SKIP		vdev.validate_skip		vdev_validate_skip
 VOL_INHIBIT_DEV			UNSUPPORTED			zvol_inhibit_dev
 VOL_MODE			vol.mode			zvol_volmode
 VOL_RECURSIVE			vol.recursive			UNSUPPORTED
 VOL_USE_BLK_MQ			UNSUPPORTED			zvol_use_blk_mq
 BCLONE_ENABLED			bclone_enabled			zfs_bclone_enabled
 BCLONE_WAIT_DIRTY		bclone_wait_dirty		zfs_bclone_wait_dirty
 DIO_ENABLED			dio_enabled			zfs_dio_enabled
 XATTR_COMPAT			xattr_compat			zfs_xattr_compat
 ZEVENT_LEN_MAX			zevent.len_max			zfs_zevent_len_max
 ZEVENT_RETAIN_MAX		zevent.retain_max		zfs_zevent_retain_max
 ZIO_SLOW_IO_MS			zio.slow_io_ms			zio_slow_io_ms
 ZIL_SAXATTR			zil_saxattr			zfs_zil_saxattr
 %%%%
 while read name FreeBSD Linux; do
 	eval "export ${name}=\$${UNAME}"
 done
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 0942082cf972..bce546d066f6 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1,2195 +1,2199 @@
 CLEANFILES =
 dist_noinst_DATA =
 include $(top_srcdir)/config/Substfiles.am
 
 
 datadir_zfs_tests_testsdir = $(datadir)/$(PACKAGE)/zfs-tests/tests
 nobase_dist_datadir_zfs_tests_tests_DATA = \
 	perf/nfs-sample.cfg \
 	perf/perf.shlib \
 	\
 	perf/fio/mkfiles.fio \
 	perf/fio/random_reads.fio \
 	perf/fio/random_readwrite.fio \
 	perf/fio/random_readwrite_fixed.fio \
 	perf/fio/random_writes.fio \
 	perf/fio/sequential_reads.fio \
 	perf/fio/sequential_readwrite.fio \
 	perf/fio/sequential_writes.fio
 
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS = \
 	perf/regression/random_reads.ksh \
 	perf/regression/random_readwrite.ksh \
 	perf/regression/random_readwrite_fixed.ksh \
 	perf/regression/random_writes.ksh \
 	perf/regression/random_writes_zil.ksh \
 	perf/regression/sequential_reads_arc_cached_clone.ksh \
 	perf/regression/sequential_reads_arc_cached.ksh \
 	perf/regression/sequential_reads_dbuf_cached.ksh \
 	perf/regression/sequential_reads.ksh \
 	perf/regression/sequential_writes.ksh \
 	perf/regression/setup.ksh \
 	\
 	perf/scripts/prefetch_io.sh
 
 # These lists can be regenerated by running make regen-tests at the root, or, on a *clean* source:
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable   -name '*.in'                                              | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'   -executable   -name '*.in'                                              | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'               ! -name '*.in' ! -name '*.c'  | grep  -Fe /simd -e /tmpfile | sort | sed           's/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'   -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$!s/$/ \\/'
 #
 # simd and tmpfile are Linux-only and not installed elsewhere
 #
 # C programs are specced in ../Makefile.am above as part of the main Makefile
 
 find_common := find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'
 regen:
 	@$(MAKE) -C $(top_builddir) clean
 	@$(MAKE) clean
 	$(SED) $(ac_inplace) '/^# -- >8 --/q' Makefile.am
 	echo >> Makefile.am
 	echo 'nobase_nodist_datadir_zfs_tests_tests_DATA = \' >> Makefile.am
 	$(find_common) ! -executable   -name '*.in'                                              | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo 'nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \' >> Makefile.am
 	$(find_common)   -executable   -name '*.in'                                              | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo >> Makefile.am
 	echo 'SUBSTFILES += $$(nobase_nodist_datadir_zfs_tests_tests_DATA) $$(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)' >> Makefile.am
 	echo >> Makefile.am
 	echo 'if BUILD_LINUX' >> Makefile.am
 	echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am
 	$(find_common)               ! -name '*.in' ! -name '*.c'  | grep  -Fe /simd -e /tmpfile | sort | sed           's/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo 'endif' >> Makefile.am
 	echo >> Makefile.am
 	echo 'nobase_dist_datadir_zfs_tests_tests_DATA += \' >> Makefile.am
 	$(find_common) ! -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo >> Makefile.am
 	echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am
 	$(find_common)   -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 
 # -- >8 --
 
 nobase_nodist_datadir_zfs_tests_tests_DATA = \
 	functional/pam/utilities.kshlib
 nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \
 	functional/pyzfs/pyzfs_unittest.ksh
 
 SUBSTFILES += $(nobase_nodist_datadir_zfs_tests_tests_DATA) $(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)
 
 if BUILD_LINUX
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/simd/simd_supported.ksh \
 	functional/tmpfile/cleanup.ksh \
 	functional/tmpfile/setup.ksh \
 	functional/luks/luks_sanity.ksh
 endif
 
 nobase_dist_datadir_zfs_tests_tests_DATA += \
 	functional/acl/acl.cfg \
 	functional/acl/acl_common.kshlib \
 	functional/alloc_class/alloc_class.cfg \
 	functional/alloc_class/alloc_class.kshlib \
 	functional/atime/atime.cfg \
 	functional/atime/atime_common.kshlib \
 	functional/bclone/bclone.cfg \
 	functional/bclone/bclone_common.kshlib \
 	functional/bclone/bclone_corner_cases.kshlib \
 	functional/block_cloning/block_cloning.kshlib \
 	functional/cache/cache.cfg \
 	functional/cache/cache.kshlib \
 	functional/cachefile/cachefile.cfg \
 	functional/cachefile/cachefile.kshlib \
 	functional/casenorm/casenorm.cfg \
 	functional/casenorm/casenorm.kshlib \
 	functional/channel_program/channel_common.kshlib \
 	functional/channel_program/lua_core/tst.args_to_lua.out \
 	functional/channel_program/lua_core/tst.args_to_lua.zcp \
 	functional/channel_program/lua_core/tst.divide_by_zero.err \
 	functional/channel_program/lua_core/tst.divide_by_zero.zcp \
 	functional/channel_program/lua_core/tst.exists.zcp \
 	functional/channel_program/lua_core/tst.large_prog.out \
 	functional/channel_program/lua_core/tst.large_prog.zcp \
 	functional/channel_program/lua_core/tst.lib_base.lua \
 	functional/channel_program/lua_core/tst.lib_coroutine.lua \
 	functional/channel_program/lua_core/tst.lib_strings.lua \
 	functional/channel_program/lua_core/tst.lib_table.lua \
 	functional/channel_program/lua_core/tst.nested_neg.zcp \
 	functional/channel_program/lua_core/tst.nested_pos.zcp \
 	functional/channel_program/lua_core/tst.recursive.zcp \
 	functional/channel_program/lua_core/tst.return_large.zcp \
 	functional/channel_program/lua_core/tst.return_recursive_table.zcp \
 	functional/channel_program/lua_core/tst.stack_gsub.err \
 	functional/channel_program/lua_core/tst.stack_gsub.zcp \
 	functional/channel_program/lua_core/tst.timeout.zcp \
 	functional/channel_program/synctask_core/tst.bookmark.copy.zcp \
 	functional/channel_program/synctask_core/tst.bookmark.create.zcp \
 	functional/channel_program/synctask_core/tst.get_index_props.out \
 	functional/channel_program/synctask_core/tst.get_index_props.zcp \
 	functional/channel_program/synctask_core/tst.get_number_props.out \
 	functional/channel_program/synctask_core/tst.get_number_props.zcp \
 	functional/channel_program/synctask_core/tst.get_string_props.out \
 	functional/channel_program/synctask_core/tst.get_string_props.zcp \
 	functional/channel_program/synctask_core/tst.promote_conflict.zcp \
 	functional/channel_program/synctask_core/tst.set_props.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_destroy.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_neg.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_recursive.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_rename.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_simple.zcp \
 	functional/checksum/default.cfg \
 	functional/clean_mirror/clean_mirror_common.kshlib \
 	functional/clean_mirror/default.cfg \
 	functional/crypto/aes_ccm_test.json \
 	functional/crypto/aes_ccm_test.txt \
 	functional/crypto/aes_gcm_test.json \
 	functional/crypto/aes_gcm_test.txt \
 	functional/cli_root/cli_common.kshlib \
 	functional/cli_root/zfs_copies/zfs_copies.cfg \
 	functional/cli_root/zfs_copies/zfs_copies.kshlib \
 	functional/cli_root/zfs_create/properties.kshlib \
 	functional/cli_root/zfs_create/zfs_create.cfg \
 	functional/cli_root/zfs_create/zfs_create_common.kshlib \
 	functional/cli_root/zfs_destroy/zfs_destroy.cfg \
 	functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib \
 	functional/cli_root/zfs_get/zfs_get_common.kshlib \
 	functional/cli_root/zfs_get/zfs_get_list_d.kshlib \
 	functional/cli_root/zfs_jail/jail.conf \
 	functional/cli_root/zfs_load-key/HEXKEY \
 	functional/cli_root/zfs_load-key/PASSPHRASE \
 	functional/cli_root/zfs_load-key/RAWKEY \
 	functional/cli_root/zfs_load-key/zfs_load-key.cfg \
 	functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib \
 	functional/cli_root/zfs_mount/zfs_mount.cfg \
 	functional/cli_root/zfs_mount/zfs_mount.kshlib \
 	functional/cli_root/zfs_promote/zfs_promote.cfg \
 	functional/cli_root/zfs_receive/zstd_test_data.txt \
 	functional/cli_root/zfs_rename/zfs_rename.cfg \
 	functional/cli_root/zfs_rename/zfs_rename.kshlib \
 	functional/cli_root/zfs_rollback/zfs_rollback.cfg \
 	functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib \
 	functional/cli_root/zfs_send/zfs_send.cfg \
 	functional/cli_root/zfs_set/zfs_set_common.kshlib \
 	functional/cli_root/zfs_share/zfs_share.cfg \
 	functional/cli_root/zfs_snapshot/zfs_snapshot.cfg \
 	functional/cli_root/zfs_unmount/zfs_unmount.cfg \
 	functional/cli_root/zfs_unmount/zfs_unmount.kshlib \
 	functional/cli_root/zfs_upgrade/zfs_upgrade.kshlib \
 	functional/cli_root/zfs_wait/zfs_wait.kshlib \
 	functional/cli_root/zpool_add/zpool_add.cfg \
 	functional/cli_root/zpool_add/zpool_add.kshlib \
 	functional/cli_root/zpool_clear/zpool_clear.cfg \
 	functional/cli_root/zpool_create/draidcfg.gz \
 	functional/cli_root/zpool_create/zpool_create.cfg \
 	functional/cli_root/zpool_create/zpool_create.shlib \
 	functional/cli_root/zpool_destroy/zpool_destroy.cfg \
 	functional/cli_root/zpool_events/zpool_events.cfg \
 	functional/cli_root/zpool_events/zpool_events.kshlib \
 	functional/cli_root/zpool_expand/zpool_expand.cfg \
 	functional/cli_root/zpool_export/zpool_export.cfg \
 	functional/cli_root/zpool_export/zpool_export.kshlib \
 	functional/cli_root/zpool_get/vdev_get.cfg \
 	functional/cli_root/zpool_get/zpool_get.cfg \
 	functional/cli_root/zpool_get/zpool_get_parsable.cfg \
 	functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 \
 	functional/cli_root/zpool_import/blockfiles/missing_ivset.dat.bz2 \
 	functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \
 	functional/cli_root/zpool_import/zpool_import.cfg \
 	functional/cli_root/zpool_import/zpool_import.kshlib \
 	functional/cli_root/zpool_initialize/zpool_initialize.kshlib \
 	functional/cli_root/zpool_labelclear/labelclear.cfg \
 	functional/cli_root/zpool_remove/zpool_remove.cfg \
 	functional/cli_root/zpool_reopen/zpool_reopen.cfg \
 	functional/cli_root/zpool_reopen/zpool_reopen.shlib \
 	functional/cli_root/zpool_resilver/zpool_resilver.cfg \
 	functional/cli_root/zpool_scrub/zpool_scrub.cfg \
 	functional/cli_root/zpool_split/zpool_split.cfg \
 	functional/cli_root/zpool_trim/zpool_trim.kshlib \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz21.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz22.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz23.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-vBROKEN.dat.bz2 \
 	functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \
 	functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \
 	functional/cli_root/zpool_wait/zpool_wait.kshlib \
 	functional/cli_root/zhack/library.kshlib \
 	functional/cli_user/misc/misc.cfg \
 	functional/cli_user/zfs_list/zfs_list.cfg \
 	functional/cli_user/zfs_list/zfs_list.kshlib \
 	functional/compression/compress.cfg \
 	functional/compression/testpool_zstd.tar.gz \
 	functional/deadman/deadman.cfg \
 	functional/delegate/delegate.cfg \
 	functional/delegate/delegate_common.kshlib \
 	functional/devices/devices.cfg \
 	functional/devices/devices_common.kshlib \
 	functional/direct/dio.cfg \
 	functional/direct/dio.kshlib \
 	functional/events/events.cfg \
 	functional/events/events_common.kshlib \
 	functional/fault/fault.cfg \
+	functional/gang_blocks/gang_blocks.kshlib \
 	functional/grow/grow.cfg \
 	functional/history/history.cfg \
 	functional/history/history_common.kshlib \
 	functional/history/i386.migratedpool.DAT.Z \
 	functional/history/i386.orig_history.txt \
 	functional/history/sparc.migratedpool.DAT.Z \
 	functional/history/sparc.orig_history.txt \
 	functional/history/zfs-pool-v4.dat.Z \
 	functional/inheritance/config001.cfg \
 	functional/inheritance/config002.cfg \
 	functional/inheritance/config003.cfg \
 	functional/inheritance/config004.cfg \
 	functional/inheritance/config005.cfg \
 	functional/inheritance/config006.cfg \
 	functional/inheritance/config007.cfg \
 	functional/inheritance/config008.cfg \
 	functional/inheritance/config009.cfg \
 	functional/inheritance/config010.cfg \
 	functional/inheritance/config011.cfg \
 	functional/inheritance/config012.cfg \
 	functional/inheritance/config013.cfg \
 	functional/inheritance/config014.cfg \
 	functional/inheritance/config015.cfg \
 	functional/inheritance/config016.cfg \
 	functional/inheritance/config017.cfg \
 	functional/inheritance/config018.cfg \
 	functional/inheritance/config019.cfg \
 	functional/inheritance/config020.cfg \
 	functional/inheritance/config021.cfg \
 	functional/inheritance/config022.cfg \
 	functional/inheritance/config023.cfg \
 	functional/inheritance/config024.cfg \
 	functional/inheritance/inherit.kshlib \
 	functional/inheritance/README.config \
 	functional/inheritance/README.state \
 	functional/inheritance/state001.cfg \
 	functional/inheritance/state002.cfg \
 	functional/inheritance/state003.cfg \
 	functional/inheritance/state004.cfg \
 	functional/inheritance/state005.cfg \
 	functional/inheritance/state006.cfg \
 	functional/inheritance/state007.cfg \
 	functional/inheritance/state008.cfg \
 	functional/inheritance/state009.cfg \
 	functional/inheritance/state010.cfg \
 	functional/inheritance/state011.cfg \
 	functional/inheritance/state012.cfg \
 	functional/inheritance/state013.cfg \
 	functional/inheritance/state014.cfg \
 	functional/inheritance/state015.cfg \
 	functional/inheritance/state016.cfg \
 	functional/inheritance/state017.cfg \
 	functional/inheritance/state018.cfg \
 	functional/inheritance/state019.cfg \
 	functional/inheritance/state020.cfg \
 	functional/inheritance/state021.cfg \
 	functional/inheritance/state022.cfg \
 	functional/inheritance/state023.cfg \
 	functional/inheritance/state024.cfg \
 	functional/inuse/inuse.cfg \
 	functional/io/io.cfg \
 	functional/l2arc/l2arc.cfg \
 	functional/largest_pool/largest_pool.cfg \
 	functional/migration/migration.cfg \
 	functional/migration/migration.kshlib \
 	functional/mmap/mmap.cfg \
 	functional/mmp/mmp.cfg \
 	functional/mmp/mmp.kshlib \
 	functional/mv_files/mv_files.cfg \
 	functional/mv_files/mv_files_common.kshlib \
 	functional/nopwrite/nopwrite.shlib \
 	functional/no_space/enospc.cfg \
 	functional/online_offline/online_offline.cfg \
 	functional/pool_checkpoint/pool_checkpoint.kshlib \
 	functional/projectquota/projectquota.cfg \
 	functional/projectquota/projectquota_common.kshlib \
 	functional/quota/quota.cfg \
 	functional/quota/quota.kshlib \
 	functional/redacted_send/redacted.cfg \
 	functional/redacted_send/redacted.kshlib \
 	functional/redundancy/redundancy.cfg \
 	functional/redundancy/redundancy.kshlib \
 	functional/refreserv/refreserv.cfg \
 	functional/removal/removal.kshlib \
 	functional/replacement/replacement.cfg \
 	functional/reservation/reservation.cfg \
 	functional/reservation/reservation.shlib \
 	functional/rsend/dedup_encrypted_zvol.bz2 \
 	functional/rsend/dedup_encrypted_zvol.zsend.bz2 \
 	functional/rsend/dedup.zsend.bz2 \
 	functional/rsend/fs.tar.gz \
 	functional/rsend/rsend.cfg \
 	functional/rsend/rsend.kshlib \
 	functional/scrub_mirror/default.cfg \
 	functional/scrub_mirror/scrub_mirror_common.kshlib \
 	functional/slog/slog.cfg \
 	functional/slog/slog.kshlib \
 	functional/snapshot/snapshot.cfg \
 	functional/snapused/snapused.kshlib \
 	functional/sparse/sparse.cfg \
 	functional/trim/trim.cfg \
 	functional/trim/trim.kshlib \
 	functional/truncate/truncate.cfg \
 	functional/upgrade/upgrade_common.kshlib \
 	functional/user_namespace/user_namespace.cfg \
 	functional/user_namespace/user_namespace_common.kshlib \
 	functional/userquota/13709_reproducer.bz2 \
 	functional/userquota/userquota.cfg \
 	functional/userquota/userquota_common.kshlib \
 	functional/vdev_zaps/vdev_zaps.kshlib \
 	functional/xattr/xattr.cfg \
 	functional/xattr/xattr_common.kshlib \
 	functional/zvol/zvol.cfg \
 	functional/zvol/zvol_cli/zvol_cli.cfg \
 	functional/zvol/zvol_common.shlib \
 	functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg \
 	functional/zvol/zvol_misc/zvol_misc_common.kshlib \
 	functional/zvol/zvol_swap/zvol_swap.cfg \
 	functional/idmap_mount/idmap_mount.cfg \
 	functional/idmap_mount/idmap_mount_common.kshlib
 
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/acl/off/cleanup.ksh \
 	functional/acl/off/dosmode.ksh \
 	functional/acl/off/posixmode.ksh \
 	functional/acl/off/setup.ksh \
 	functional/acl/posix/cleanup.ksh \
 	functional/acl/posix/posix_001_pos.ksh \
 	functional/acl/posix/posix_002_pos.ksh \
 	functional/acl/posix/posix_003_pos.ksh \
 	functional/acl/posix/posix_004_pos.ksh \
 	functional/acl/posix-sa/cleanup.ksh \
 	functional/acl/posix-sa/posix_001_pos.ksh \
 	functional/acl/posix-sa/posix_002_pos.ksh \
 	functional/acl/posix-sa/posix_003_pos.ksh \
 	functional/acl/posix-sa/posix_004_pos.ksh \
 	functional/acl/posix-sa/setup.ksh \
 	functional/acl/posix/setup.ksh \
 	functional/alloc_class/alloc_class_001_pos.ksh \
 	functional/alloc_class/alloc_class_002_neg.ksh \
 	functional/alloc_class/alloc_class_003_pos.ksh \
 	functional/alloc_class/alloc_class_004_pos.ksh \
 	functional/alloc_class/alloc_class_005_pos.ksh \
 	functional/alloc_class/alloc_class_006_pos.ksh \
 	functional/alloc_class/alloc_class_007_pos.ksh \
 	functional/alloc_class/alloc_class_008_pos.ksh \
 	functional/alloc_class/alloc_class_009_pos.ksh \
 	functional/alloc_class/alloc_class_010_pos.ksh \
 	functional/alloc_class/alloc_class_011_neg.ksh \
 	functional/alloc_class/alloc_class_012_pos.ksh \
 	functional/alloc_class/alloc_class_013_pos.ksh \
 	functional/alloc_class/alloc_class_014_neg.ksh \
 	functional/alloc_class/alloc_class_015_pos.ksh \
 	functional/alloc_class/cleanup.ksh \
 	functional/alloc_class/setup.ksh \
 	functional/append/file_append.ksh \
 	functional/append/threadsappend_001_pos.ksh \
 	functional/append/cleanup.ksh \
 	functional/append/setup.ksh \
 	functional/arc/arcstats_runtime_tuning.ksh \
 	functional/arc/cleanup.ksh \
 	functional/arc/dbufstats_001_pos.ksh \
 	functional/arc/dbufstats_002_pos.ksh \
 	functional/arc/dbufstats_003_pos.ksh \
 	functional/arc/setup.ksh \
 	functional/atime/atime_001_pos.ksh \
 	functional/atime/atime_002_neg.ksh \
 	functional/atime/atime_003_pos.ksh \
 	functional/atime/cleanup.ksh \
 	functional/atime/root_atime_off.ksh \
 	functional/atime/root_atime_on.ksh \
 	functional/atime/root_relatime_on.ksh \
 	functional/atime/setup.ksh \
 	functional/bclone/bclone_crossfs_corner_cases.ksh \
 	functional/bclone/bclone_crossfs_corner_cases_limited.ksh \
 	functional/bclone/bclone_crossfs_data.ksh \
 	functional/bclone/bclone_crossfs_embedded.ksh \
 	functional/bclone/bclone_crossfs_hole.ksh \
 	functional/bclone/bclone_diffprops_all.ksh \
 	functional/bclone/bclone_diffprops_checksum.ksh \
 	functional/bclone/bclone_diffprops_compress.ksh \
 	functional/bclone/bclone_diffprops_copies.ksh \
 	functional/bclone/bclone_diffprops_recordsize.ksh \
 	functional/bclone/bclone_prop_sync.ksh \
 	functional/bclone/bclone_samefs_corner_cases.ksh \
 	functional/bclone/bclone_samefs_corner_cases_limited.ksh \
 	functional/bclone/bclone_samefs_data.ksh \
 	functional/bclone/bclone_samefs_embedded.ksh \
 	functional/bclone/bclone_samefs_hole.ksh \
 	functional/bclone/cleanup.ksh \
 	functional/bclone/setup.ksh \
 	functional/block_cloning/cleanup.ksh \
 	functional/block_cloning/setup.ksh \
 	functional/block_cloning/block_cloning_clone_mmap_cached.ksh \
 	functional/block_cloning/block_cloning_clone_mmap_write.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \
 	functional/block_cloning/block_cloning_copyfilerange.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_partial.ksh \
 	functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \
 	functional/block_cloning/block_cloning_disabled_ficlone.ksh \
 	functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \
 	functional/block_cloning/block_cloning_ficlone.ksh \
 	functional/block_cloning/block_cloning_ficlonerange.ksh \
 	functional/block_cloning/block_cloning_ficlonerange_partial.ksh \
 	functional/block_cloning/block_cloning_cross_enc_dataset.ksh \
 	functional/block_cloning/block_cloning_replay.ksh \
 	functional/block_cloning/block_cloning_replay_encrypted.ksh \
 	functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \
 	functional/block_cloning/block_cloning_rlimit_fsize.ksh \
 	functional/block_cloning/block_cloning_large_offset.ksh \
 	functional/bootfs/bootfs_001_pos.ksh \
 	functional/bootfs/bootfs_002_neg.ksh \
 	functional/bootfs/bootfs_003_pos.ksh \
 	functional/bootfs/bootfs_004_neg.ksh \
 	functional/bootfs/bootfs_005_neg.ksh \
 	functional/bootfs/bootfs_006_pos.ksh \
 	functional/bootfs/bootfs_007_pos.ksh \
 	functional/bootfs/bootfs_008_pos.ksh \
 	functional/bootfs/cleanup.ksh \
 	functional/bootfs/setup.ksh \
 	functional/btree/btree_negative.ksh \
 	functional/btree/btree_positive.ksh \
 	functional/cache/cache_001_pos.ksh \
 	functional/cache/cache_002_pos.ksh \
 	functional/cache/cache_003_pos.ksh \
 	functional/cache/cache_004_neg.ksh \
 	functional/cache/cache_005_neg.ksh \
 	functional/cache/cache_006_pos.ksh \
 	functional/cache/cache_007_neg.ksh \
 	functional/cache/cache_008_neg.ksh \
 	functional/cache/cache_009_pos.ksh \
 	functional/cache/cache_010_pos.ksh \
 	functional/cache/cache_011_pos.ksh \
 	functional/cache/cache_012_pos.ksh \
 	functional/cache/cleanup.ksh \
 	functional/cachefile/cachefile_001_pos.ksh \
 	functional/cachefile/cachefile_002_pos.ksh \
 	functional/cachefile/cachefile_003_pos.ksh \
 	functional/cachefile/cachefile_004_pos.ksh \
 	functional/cachefile/cleanup.ksh \
 	functional/cachefile/setup.ksh \
 	functional/cache/setup.ksh \
 	functional/casenorm/case_all_values.ksh \
 	functional/casenorm/cleanup.ksh \
 	functional/casenorm/insensitive_formd_delete.ksh \
 	functional/casenorm/insensitive_formd_lookup.ksh \
 	functional/casenorm/insensitive_none_delete.ksh \
 	functional/casenorm/insensitive_none_lookup.ksh \
 	functional/casenorm/mixed_create_failure.ksh \
 	functional/casenorm/mixed_formd_delete.ksh \
 	functional/casenorm/mixed_formd_lookup_ci.ksh \
 	functional/casenorm/mixed_formd_lookup.ksh \
 	functional/casenorm/mixed_none_delete.ksh \
 	functional/casenorm/mixed_none_lookup_ci.ksh \
 	functional/casenorm/mixed_none_lookup.ksh \
 	functional/casenorm/norm_all_values.ksh \
 	functional/casenorm/sensitive_formd_delete.ksh \
 	functional/casenorm/sensitive_formd_lookup.ksh \
 	functional/casenorm/sensitive_none_delete.ksh \
 	functional/casenorm/sensitive_none_lookup.ksh \
 	functional/casenorm/setup.ksh \
 	functional/channel_program/lua_core/cleanup.ksh \
 	functional/channel_program/lua_core/setup.ksh \
 	functional/channel_program/lua_core/tst.args_to_lua.ksh \
 	functional/channel_program/lua_core/tst.divide_by_zero.ksh \
 	functional/channel_program/lua_core/tst.exists.ksh \
 	functional/channel_program/lua_core/tst.integer_illegal.ksh \
 	functional/channel_program/lua_core/tst.integer_overflow.ksh \
 	functional/channel_program/lua_core/tst.language_functions_neg.ksh \
 	functional/channel_program/lua_core/tst.language_functions_pos.ksh \
 	functional/channel_program/lua_core/tst.large_prog.ksh \
 	functional/channel_program/lua_core/tst.libraries.ksh \
 	functional/channel_program/lua_core/tst.memory_limit.ksh \
 	functional/channel_program/lua_core/tst.nested_neg.ksh \
 	functional/channel_program/lua_core/tst.nested_pos.ksh \
 	functional/channel_program/lua_core/tst.nvlist_to_lua.ksh \
 	functional/channel_program/lua_core/tst.recursive_neg.ksh \
 	functional/channel_program/lua_core/tst.recursive_pos.ksh \
 	functional/channel_program/lua_core/tst.return_large.ksh \
 	functional/channel_program/lua_core/tst.return_nvlist_neg.ksh \
 	functional/channel_program/lua_core/tst.return_nvlist_pos.ksh \
 	functional/channel_program/lua_core/tst.return_recursive_table.ksh \
 	functional/channel_program/lua_core/tst.stack_gsub.ksh \
 	functional/channel_program/lua_core/tst.timeout.ksh \
 	functional/channel_program/synctask_core/cleanup.ksh \
 	functional/channel_program/synctask_core/setup.ksh \
 	functional/channel_program/synctask_core/tst.bookmark.copy.ksh \
 	functional/channel_program/synctask_core/tst.bookmark.create.ksh \
 	functional/channel_program/synctask_core/tst.destroy_fs.ksh \
 	functional/channel_program/synctask_core/tst.destroy_snap.ksh \
 	functional/channel_program/synctask_core/tst.get_count_and_limit.ksh \
 	functional/channel_program/synctask_core/tst.get_index_props.ksh \
 	functional/channel_program/synctask_core/tst.get_mountpoint.ksh \
 	functional/channel_program/synctask_core/tst.get_neg.ksh \
 	functional/channel_program/synctask_core/tst.get_number_props.ksh \
 	functional/channel_program/synctask_core/tst.get_string_props.ksh \
 	functional/channel_program/synctask_core/tst.get_type.ksh \
 	functional/channel_program/synctask_core/tst.get_userquota.ksh \
 	functional/channel_program/synctask_core/tst.get_written.ksh \
 	functional/channel_program/synctask_core/tst.inherit.ksh \
 	functional/channel_program/synctask_core/tst.list_bookmarks.ksh \
 	functional/channel_program/synctask_core/tst.list_children.ksh \
 	functional/channel_program/synctask_core/tst.list_clones.ksh \
 	functional/channel_program/synctask_core/tst.list_holds.ksh \
 	functional/channel_program/synctask_core/tst.list_snapshots.ksh \
 	functional/channel_program/synctask_core/tst.list_system_props.ksh \
 	functional/channel_program/synctask_core/tst.list_user_props.ksh \
 	functional/channel_program/synctask_core/tst.parse_args_neg.ksh \
 	functional/channel_program/synctask_core/tst.promote_conflict.ksh \
 	functional/channel_program/synctask_core/tst.promote_multiple.ksh \
 	functional/channel_program/synctask_core/tst.promote_simple.ksh \
 	functional/channel_program/synctask_core/tst.rollback_mult.ksh \
 	functional/channel_program/synctask_core/tst.rollback_one.ksh \
 	functional/channel_program/synctask_core/tst.set_props.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_destroy.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_neg.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_recursive.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_rename.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_simple.ksh \
 	functional/channel_program/synctask_core/tst.terminate_by_signal.ksh \
 	functional/chattr/chattr_001_pos.ksh \
 	functional/chattr/chattr_002_neg.ksh \
 	functional/chattr/cleanup.ksh \
 	functional/chattr/setup.ksh \
 	functional/checksum/cleanup.ksh \
 	functional/checksum/filetest_001_pos.ksh \
 	functional/checksum/filetest_002_pos.ksh \
 	functional/checksum/run_blake3_test.ksh \
 	functional/checksum/run_edonr_test.ksh \
 	functional/checksum/run_sha2_test.ksh \
 	functional/checksum/run_skein_test.ksh \
 	functional/checksum/setup.ksh \
 	functional/clean_mirror/clean_mirror_001_pos.ksh \
 	functional/clean_mirror/clean_mirror_002_pos.ksh \
 	functional/clean_mirror/clean_mirror_003_pos.ksh \
 	functional/clean_mirror/clean_mirror_004_pos.ksh \
 	functional/clean_mirror/cleanup.ksh \
 	functional/clean_mirror/setup.ksh \
 	functional/cli_root/json/cleanup.ksh \
 	functional/cli_root/json/setup.ksh \
 	functional/cli_root/json/json_sanity.ksh \
 	functional/cli_root/zinject/zinject_args.ksh \
 	functional/cli_root/zinject/zinject_counts.ksh \
 	functional/cli_root/zinject/zinject_probe.ksh \
 	functional/cli_root/zdb/zdb_002_pos.ksh \
 	functional/cli_root/zdb/zdb_003_pos.ksh \
 	functional/cli_root/zdb/zdb_004_pos.ksh \
 	functional/cli_root/zdb/zdb_005_pos.ksh \
 	functional/cli_root/zdb/zdb_006_pos.ksh \
 	functional/cli_root/zdb/zdb_args_neg.ksh \
 	functional/cli_root/zdb/zdb_args_pos.ksh \
 	functional/cli_root/zdb/zdb_backup.ksh \
 	functional/cli_root/zdb/zdb_block_size_histogram.ksh \
 	functional/cli_root/zdb/zdb_checksum.ksh \
 	functional/cli_root/zdb/zdb_decompress.ksh \
 	functional/cli_root/zdb/zdb_decompress_zstd.ksh \
 	functional/cli_root/zdb/zdb_display_block.ksh \
 	functional/cli_root/zdb/zdb_encrypted.ksh \
 	functional/cli_root/zdb/zdb_label_checksum.ksh \
 	functional/cli_root/zdb/zdb_object_range_neg.ksh \
 	functional/cli_root/zdb/zdb_object_range_pos.ksh \
 	functional/cli_root/zdb/zdb_objset_id.ksh \
 	functional/cli_root/zdb/zdb_recover_2.ksh \
 	functional/cli_root/zdb/zdb_recover.ksh \
 	functional/cli_root/zfs_bookmark/cleanup.ksh \
 	functional/cli_root/zfs_bookmark/setup.ksh \
 	functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \
 	functional/cli_root/zfs_change-key/cleanup.ksh \
 	functional/cli_root/zfs_change-key/setup.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_child.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_format.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_load.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_location.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh \
 	functional/cli_root/zfs/cleanup.ksh \
 	functional/cli_root/zfs_clone/cleanup.ksh \
 	functional/cli_root/zfs_clone/setup.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_deeply_nested.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh \
 	functional/cli_root/zfs_copies/cleanup.ksh \
 	functional/cli_root/zfs_copies/setup.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_004_neg.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_005_neg.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh \
 	functional/cli_root/zfs_create/cleanup.ksh \
 	functional/cli_root/zfs_create/setup.ksh \
 	functional/cli_root/zfs_create/zfs_create_001_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_002_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_003_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_004_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_005_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_006_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_007_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_008_neg.ksh \
 	functional/cli_root/zfs_create/zfs_create_009_neg.ksh \
 	functional/cli_root/zfs_create/zfs_create_010_neg.ksh \
 	functional/cli_root/zfs_create/zfs_create_011_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_012_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_013_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_014_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh \
 	functional/cli_root/zfs_create/zfs_create_dryrun.ksh \
 	functional/cli_root/zfs_create/zfs_create_encrypted.ksh \
 	functional/cli_root/zfs_create/zfs_create_nomount.ksh \
 	functional/cli_root/zfs_create/zfs_create_verbose.ksh \
 	functional/cli_root/zfs_destroy/cleanup.ksh \
 	functional/cli_root/zfs_destroy/setup.ksh \
 	functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh \
 	functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh \
 	functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_006_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_008_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_009_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_011_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_012_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_013_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh \
 	functional/cli_root/zfs_diff/cleanup.ksh \
 	functional/cli_root/zfs_diff/setup.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_changes.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_mangle.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_types.ksh \
 	functional/cli_root/zfs_get/cleanup.ksh \
 	functional/cli_root/zfs_get/setup.ksh \
 	functional/cli_root/zfs_get/zfs_get_001_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_002_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_003_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_004_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_005_neg.ksh \
 	functional/cli_root/zfs_get/zfs_get_006_neg.ksh \
 	functional/cli_root/zfs_get/zfs_get_007_neg.ksh \
 	functional/cli_root/zfs_get/zfs_get_008_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_009_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_010_neg.ksh \
 	functional/cli_root/zfs_ids_to_path/cleanup.ksh \
 	functional/cli_root/zfs_ids_to_path/setup.ksh \
 	functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh \
 	functional/cli_root/zfs_inherit/cleanup.ksh \
 	functional/cli_root/zfs_inherit/setup.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_001_neg.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_mountpoint.ksh \
 	functional/cli_root/zfs_jail/cleanup.ksh \
 	functional/cli_root/zfs_jail/setup.ksh \
 	functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh \
 	functional/cli_root/zfs_load-key/cleanup.ksh \
 	functional/cli_root/zfs_load-key/setup.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_all.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_file.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_https.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_location.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh \
 	functional/cli_root/zfs_mount/cleanup.ksh \
 	functional/cli_root/zfs_mount/setup.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_001_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_002_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_003_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_004_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_009_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_001_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_remount.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \
 	functional/cli_root/zfs_mount/zfs_multi_mount.ksh \
 	functional/cli_root/zfs_program/cleanup.ksh \
 	functional/cli_root/zfs_program/setup.ksh \
 	functional/cli_root/zfs_program/zfs_program_json.ksh \
 	functional/cli_root/zfs_promote/cleanup.ksh \
 	functional/cli_root/zfs_promote/setup.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_002_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_008_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh \
 	functional/cli_root/zfs_property/cleanup.ksh \
 	functional/cli_root/zfs_property/setup.ksh \
 	functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh \
 	functional/cli_root/zfs_receive/cleanup.ksh \
 	functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh \
 	functional/cli_root/zfs_receive/receive-o-x_props_override.ksh \
 	functional/cli_root/zfs_receive/setup.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_006_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_007_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_008_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_011_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_012_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_015_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_-e.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_new_props.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_raw.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_large_block_corrective.ksh \
 	functional/cli_root/zfs_rename/cleanup.ksh \
 	functional/cli_root/zfs_rename/setup.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_002_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_004_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_009_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_010_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_012_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh \
 	functional/cli_root/zfs_reservation/cleanup.ksh \
 	functional/cli_root/zfs_reservation/setup.ksh \
 	functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh \
 	functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh \
 	functional/cli_root/zfs_rollback/cleanup.ksh \
 	functional/cli_root/zfs_rollback/setup.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_002_pos.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh \
 	functional/cli_root/zfs_send/cleanup.ksh \
 	functional/cli_root/zfs_send/setup.ksh \
 	functional/cli_root/zfs_send/zfs_send_001_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_002_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_003_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_004_neg.ksh \
 	functional/cli_root/zfs_send/zfs_send_005_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_006_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_007_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send-b.ksh \
 	functional/cli_root/zfs_send/zfs_send_encrypted.ksh \
 	functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh \
 	functional/cli_root/zfs_send/zfs_send_raw.ksh \
 	functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \
 	functional/cli_root/zfs_send/zfs_send_sparse.ksh \
 	functional/cli_root/zfs_set/cache_001_pos.ksh \
 	functional/cli_root/zfs_set/cache_002_neg.ksh \
 	functional/cli_root/zfs_set/canmount_001_pos.ksh \
 	functional/cli_root/zfs_set/canmount_002_pos.ksh \
 	functional/cli_root/zfs_set/canmount_003_pos.ksh \
 	functional/cli_root/zfs_set/canmount_004_pos.ksh \
 	functional/cli_root/zfs_set/checksum_001_pos.ksh \
 	functional/cli_root/zfs_set/cleanup.ksh \
 	functional/cli_root/zfs_set/compression_001_pos.ksh \
 	functional/cli_root/zfs_set/mountpoint_001_pos.ksh \
 	functional/cli_root/zfs_set/mountpoint_002_pos.ksh \
 	functional/cli_root/zfs_set/mountpoint_003_pos.ksh \
 	functional/cli_root/zfs_set/onoffs_001_pos.ksh \
 	functional/cli_root/zfs_set/property_alias_001_pos.ksh \
 	functional/cli_root/zfs_set/readonly_001_pos.ksh \
 	functional/cli_root/zfs_set/reservation_001_neg.ksh \
 	functional/cli_root/zfs_set/ro_props_001_pos.ksh \
 	functional/cli_root/zfs_set/setup.ksh \
 	functional/cli_root/zfs_set/share_mount_001_neg.ksh \
 	functional/cli_root/zfs_set/snapdir_001_pos.ksh \
 	functional/cli_root/zfs/setup.ksh \
 	functional/cli_root/zfs_set/user_property_001_pos.ksh \
 	functional/cli_root/zfs_set/user_property_002_pos.ksh \
 	functional/cli_root/zfs_set/user_property_003_neg.ksh \
 	functional/cli_root/zfs_set/user_property_004_pos.ksh \
 	functional/cli_root/zfs_set/version_001_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_001_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_002_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_003_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_feature_activation.ksh \
 	functional/cli_root/zfs_set/zfs_set_keylocation.ksh \
 	functional/cli_root/zfs_set/zfs_set_nomount.ksh \
 	functional/cli_root/zfs_share/cleanup.ksh \
 	functional/cli_root/zfs_share/setup.ksh \
 	functional/cli_root/zfs_share/zfs_share_001_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_002_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_003_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_004_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_005_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_006_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_007_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_008_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_009_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_010_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_011_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_012_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_013_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \
 	functional/cli_root/zfs_share/zfs_share_after_mount.ksh \
 	functional/cli_root/zfs_snapshot/cleanup.ksh \
 	functional/cli_root/zfs_snapshot/setup.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh \
 	functional/cli_root/zfs_sysfs/cleanup.ksh \
 	functional/cli_root/zfs_sysfs/setup.ksh \
 	functional/cli_root/zfs_sysfs/zfeature_set_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zfs_get_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zfs_set_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zfs_sysfs_live.ksh \
 	functional/cli_root/zfs_sysfs/zpool_get_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zpool_set_unsupported.ksh \
 	functional/cli_root/zfs_unload-key/cleanup.ksh \
 	functional/cli_root/zfs_unload-key/setup.ksh \
 	functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh \
 	functional/cli_root/zfs_unload-key/zfs_unload-key.ksh \
 	functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh \
 	functional/cli_root/zfs_unmount/cleanup.ksh \
 	functional/cli_root/zfs_unmount/setup.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_002_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_003_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_004_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_005_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_006_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_007_neg.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_all_001_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh \
 	functional/cli_root/zfs_unshare/cleanup.ksh \
 	functional/cli_root/zfs_unshare/setup.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_005_neg.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_008_pos.ksh \
 	functional/cli_root/zfs_upgrade/cleanup.ksh \
 	functional/cli_root/zfs_upgrade/setup.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh \
 	functional/cli_root/zfs_wait/cleanup.ksh \
 	functional/cli_root/zfs_wait/setup.ksh \
 	functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh \
 	functional/cli_root/zfs_wait/zfs_wait_getsubopt.ksh \
 	functional/cli_root/zfs/zfs_001_neg.ksh \
 	functional/cli_root/zfs/zfs_002_pos.ksh \
 	functional/cli_root/zfs/zfs_003_neg.ksh \
 	functional/cli_root/zhack/zhack_label_repair_001.ksh \
 	functional/cli_root/zhack/zhack_label_repair_002.ksh \
 	functional/cli_root/zhack/zhack_label_repair_003.ksh \
 	functional/cli_root/zhack/zhack_label_repair_004.ksh \
 	functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \
 	functional/cli_root/zpool_add/add-o_ashift.ksh \
 	functional/cli_root/zpool_add/add_prop_ashift.ksh \
 	functional/cli_root/zpool_add/cleanup.ksh \
 	functional/cli_root/zpool_add/setup.ksh \
 	functional/cli_root/zpool_add/zpool_add_001_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_002_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_003_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_004_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_005_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_006_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_007_neg.ksh \
 	functional/cli_root/zpool_add/zpool_add_008_neg.ksh \
 	functional/cli_root/zpool_add/zpool_add_009_neg.ksh \
 	functional/cli_root/zpool_add/zpool_add_010_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \
 	functional/cli_root/zpool_attach/attach-o_ashift.ksh \
 	functional/cli_root/zpool_attach/cleanup.ksh \
 	functional/cli_root/zpool_attach/setup.ksh \
 	functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \
 	functional/cli_root/zpool/cleanup.ksh \
 	functional/cli_root/zpool_clear/cleanup.ksh \
 	functional/cli_root/zpool_clear/setup.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_002_neg.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_003_neg.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_readonly.ksh \
 	functional/cli_root/zpool_create/cleanup.ksh \
 	functional/cli_root/zpool_create/create-o_ashift.ksh \
 	functional/cli_root/zpool_create/setup.ksh \
 	functional/cli_root/zpool_create/zpool_create_001_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_002_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_003_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_004_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_005_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_006_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_007_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_008_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_009_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_010_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_011_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_012_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_014_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_015_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_016_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_017_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_018_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_019_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_020_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_021_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_022_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_023_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_024_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \
 	functional/cli_root/zpool_create/zpool_create_encrypted.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_002_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_003_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_004_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_tempname.ksh \
 	functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh \
 	functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh \
 	functional/cli_root/zpool_destroy/zpool_destroy_003_neg.ksh \
 	functional/cli_root/zpool_detach/cleanup.ksh \
 	functional/cli_root/zpool_detach/setup.ksh \
 	functional/cli_root/zpool_detach/zpool_detach_001_neg.ksh \
 	functional/cli_root/zpool_events/cleanup.ksh \
 	functional/cli_root/zpool_events/setup.ksh \
 	functional/cli_root/zpool_events/zpool_events_clear.ksh \
 	functional/cli_root/zpool_events/zpool_events_clear_retained.ksh \
 	functional/cli_root/zpool_events/zpool_events_cliargs.ksh \
 	functional/cli_root/zpool_events/zpool_events_duplicates.ksh \
 	functional/cli_root/zpool_events/zpool_events_errors.ksh \
 	functional/cli_root/zpool_events/zpool_events_follow.ksh \
 	functional/cli_root/zpool_events/zpool_events_poolname.ksh \
 	functional/cli_root/zpool_expand/cleanup.ksh \
 	functional/cli_root/zpool_expand/setup.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh \
 	functional/cli_root/zpool_export/cleanup.ksh \
 	functional/cli_root/zpool_export/setup.ksh \
 	functional/cli_root/zpool_export/zpool_export_001_pos.ksh \
 	functional/cli_root/zpool_export/zpool_export_002_pos.ksh \
 	functional/cli_root/zpool_export/zpool_export_003_neg.ksh \
 	functional/cli_root/zpool_export/zpool_export_004_pos.ksh \
 	functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh \
 	functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh \
 	functional/cli_root/zpool_get/cleanup.ksh \
 	functional/cli_root/zpool_get/setup.ksh \
 	functional/cli_root/zpool_get/vdev_get_001_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_001_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_002_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_003_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_004_neg.ksh \
 	functional/cli_root/zpool_get/zpool_get_005_pos.ksh \
 	functional/cli_root/zpool_history/cleanup.ksh \
 	functional/cli_root/zpool_history/setup.ksh \
 	functional/cli_root/zpool_history/zpool_history_001_neg.ksh \
 	functional/cli_root/zpool_history/zpool_history_002_pos.ksh \
 	functional/cli_root/zpool_import/cleanup.ksh \
 	functional/cli_root/zpool_import/import_cachefile_device_added.ksh \
 	functional/cli_root/zpool_import/import_cachefile_device_removed.ksh \
 	functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh \
 	functional/cli_root/zpool_import/import_cachefile_mirror_attached.ksh \
 	functional/cli_root/zpool_import/import_cachefile_mirror_detached.ksh \
 	functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \
 	functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \
 	functional/cli_root/zpool_import/import_devices_missing.ksh \
 	functional/cli_root/zpool_import/import_log_missing.ksh \
 	functional/cli_root/zpool_import/import_paths_changed.ksh \
 	functional/cli_root/zpool_import/import_rewind_config_changed.ksh \
 	functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \
 	functional/cli_root/zpool_import/setup.ksh \
 	functional/cli_root/zpool_import/zpool_import_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_002_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_004_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_005_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_006_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_007_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_008_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_009_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_010_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_011_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_012_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_013_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_014_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_015_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_016_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_017_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_all_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_encrypted.ksh \
 	functional/cli_root/zpool_import/zpool_import_encrypted_load.ksh \
 	functional/cli_root/zpool_import/zpool_import_errata3.ksh \
 	functional/cli_root/zpool_import/zpool_import_errata4.ksh \
 	functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \
 	functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_status.ksh \
 	functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \
 	functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \
 	functional/cli_root/zpool_initialize/cleanup.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_removed.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh \
 	functional/cli_root/zpool_offline/cleanup.ksh \
 	functional/cli_root/zpool_offline/setup.ksh \
 	functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh \
 	functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh \
 	functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh \
 	functional/cli_root/zpool_online/cleanup.ksh \
 	functional/cli_root/zpool_online/setup.ksh \
 	functional/cli_root/zpool_online/zpool_online_001_pos.ksh \
 	functional/cli_root/zpool_online/zpool_online_002_neg.ksh \
 	functional/cli_root/zpool_prefetch/cleanup.ksh \
 	functional/cli_root/zpool_prefetch/setup.ksh \
 	functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh \
 	functional/cli_root/zpool_reguid/cleanup.ksh \
 	functional/cli_root/zpool_reguid/setup.ksh \
 	functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh \
 	functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh \
 	functional/cli_root/zpool_remove/cleanup.ksh \
 	functional/cli_root/zpool_remove/setup.ksh \
 	functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \
 	functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh \
 	functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh \
 	functional/cli_root/zpool_reopen/cleanup.ksh \
 	functional/cli_root/zpool_reopen/setup.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_001_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_002_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_006_neg.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh \
 	functional/cli_root/zpool_replace/cleanup.ksh \
 	functional/cli_root/zpool_replace/replace-o_ashift.ksh \
 	functional/cli_root/zpool_replace/replace_prop_ashift.ksh \
 	functional/cli_root/zpool_replace/setup.ksh \
 	functional/cli_root/zpool_replace/zpool_replace_001_neg.ksh \
 	functional/cli_root/zpool_resilver/cleanup.ksh \
 	functional/cli_root/zpool_resilver/setup.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \
 	functional/cli_root/zpool_scrub/cleanup.ksh \
 	functional/cli_root/zpool_scrub/setup.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \
 	functional/cli_root/zpool_set/cleanup.ksh \
 	functional/cli_root/zpool_set/setup.ksh \
 	functional/cli_root/zpool/setup.ksh \
 	functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
 	functional/cli_root/zpool_set/zpool_set_common.kshlib \
 	functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
 	functional/cli_root/zpool_set/zpool_set_002_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_003_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_ashift.ksh \
 	functional/cli_root/zpool_set/user_property_001_pos.ksh \
 	functional/cli_root/zpool_set/user_property_002_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_features.ksh \
 	functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh \
 	functional/cli_root/zpool_split/cleanup.ksh \
 	functional/cli_root/zpool_split/setup.ksh \
 	functional/cli_root/zpool_split/zpool_split_cliargs.ksh \
 	functional/cli_root/zpool_split/zpool_split_devices.ksh \
 	functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh \
 	functional/cli_root/zpool_split/zpool_split_encryption.ksh \
 	functional/cli_root/zpool_split/zpool_split_indirect.ksh \
 	functional/cli_root/zpool_split/zpool_split_props.ksh \
 	functional/cli_root/zpool_split/zpool_split_resilver.ksh \
 	functional/cli_root/zpool_split/zpool_split_vdevs.ksh \
 	functional/cli_root/zpool_split/zpool_split_wholedisk.ksh \
 	functional/cli_root/zpool_status/cleanup.ksh \
 	functional/cli_root/zpool_status/setup.ksh \
 	functional/cli_root/zpool_status/zpool_status_001_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_002_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_003_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_004_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_006_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_008_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
 	functional/cli_root/zpool_sync/cleanup.ksh \
 	functional/cli_root/zpool_sync/setup.ksh \
 	functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \
 	functional/cli_root/zpool_sync/zpool_sync_002_neg.ksh \
 	functional/cli_root/zpool_trim/cleanup.ksh \
 	functional/cli_root/zpool_trim/setup.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_partial.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_rate.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_secure.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_split.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh \
 	functional/cli_root/zpool_upgrade/cleanup.ksh \
 	functional/cli_root/zpool_upgrade/setup.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_001_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_002_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_003_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_004_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_005_neg.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_006_neg.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_009_neg.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh \
 	functional/cli_root/zpool_wait/cleanup.ksh \
 	functional/cli_root/zpool_wait/scan/cleanup.ksh \
 	functional/cli_root/zpool_wait/scan/setup.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh \
 	functional/cli_root/zpool_wait/setup.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_discard.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_freeing.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_multiple.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_remove.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_usage.ksh \
 	functional/cli_root/zpool/zpool_001_neg.ksh \
 	functional/cli_root/zpool/zpool_002_pos.ksh \
 	functional/cli_root/zpool/zpool_003_pos.ksh \
 	functional/cli_root/zpool/zpool_colors.ksh \
 	functional/cli_user/misc/arcstat_001_pos.ksh \
 	functional/cli_user/misc/arc_summary_001_pos.ksh \
 	functional/cli_user/misc/arc_summary_002_neg.ksh \
 	functional/cli_user/misc/zilstat_001_pos.ksh \
 	functional/cli_user/misc/cleanup.ksh \
 	functional/cli_user/misc/setup.ksh \
 	functional/cli_user/misc/zdb_001_neg.ksh \
 	functional/cli_user/misc/zfs_001_neg.ksh \
 	functional/cli_user/misc/zfs_allow_001_neg.ksh \
 	functional/cli_user/misc/zfs_clone_001_neg.ksh \
 	functional/cli_user/misc/zfs_create_001_neg.ksh \
 	functional/cli_user/misc/zfs_destroy_001_neg.ksh \
 	functional/cli_user/misc/zfs_get_001_neg.ksh \
 	functional/cli_user/misc/zfs_inherit_001_neg.ksh \
 	functional/cli_user/misc/zfs_mount_001_neg.ksh \
 	functional/cli_user/misc/zfs_promote_001_neg.ksh \
 	functional/cli_user/misc/zfs_receive_001_neg.ksh \
 	functional/cli_user/misc/zfs_rename_001_neg.ksh \
 	functional/cli_user/misc/zfs_rollback_001_neg.ksh \
 	functional/cli_user/misc/zfs_send_001_neg.ksh \
 	functional/cli_user/misc/zfs_set_001_neg.ksh \
 	functional/cli_user/misc/zfs_share_001_neg.ksh \
 	functional/cli_user/misc/zfs_snapshot_001_neg.ksh \
 	functional/cli_user/misc/zfs_unallow_001_neg.ksh \
 	functional/cli_user/misc/zfs_unmount_001_neg.ksh \
 	functional/cli_user/misc/zfs_unshare_001_neg.ksh \
 	functional/cli_user/misc/zfs_upgrade_001_neg.ksh \
 	functional/cli_user/misc/zpool_001_neg.ksh \
 	functional/cli_user/misc/zpool_add_001_neg.ksh \
 	functional/cli_user/misc/zpool_attach_001_neg.ksh \
 	functional/cli_user/misc/zpool_clear_001_neg.ksh \
 	functional/cli_user/misc/zpool_create_001_neg.ksh \
 	functional/cli_user/misc/zpool_destroy_001_neg.ksh \
 	functional/cli_user/misc/zpool_detach_001_neg.ksh \
 	functional/cli_user/misc/zpool_export_001_neg.ksh \
 	functional/cli_user/misc/zpool_get_001_neg.ksh \
 	functional/cli_user/misc/zpool_history_001_neg.ksh \
 	functional/cli_user/misc/zpool_import_001_neg.ksh \
 	functional/cli_user/misc/zpool_import_002_neg.ksh \
 	functional/cli_user/misc/zpool_offline_001_neg.ksh \
 	functional/cli_user/misc/zpool_online_001_neg.ksh \
 	functional/cli_user/misc/zpool_remove_001_neg.ksh \
 	functional/cli_user/misc/zpool_replace_001_neg.ksh \
 	functional/cli_user/misc/zpool_scrub_001_neg.ksh \
 	functional/cli_user/misc/zpool_set_001_neg.ksh \
 	functional/cli_user/misc/zpool_status_001_neg.ksh \
 	functional/cli_user/misc/zpool_upgrade_001_neg.ksh \
 	functional/cli_user/misc/zpool_wait_privilege.ksh \
 	functional/cli_user/zfs_list/cleanup.ksh \
 	functional/cli_user/zfs_list/setup.ksh \
 	functional/cli_user/zfs_list/zfs_list_001_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_002_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_003_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_004_neg.ksh \
 	functional/cli_user/zfs_list/zfs_list_005_neg.ksh \
 	functional/cli_user/zfs_list/zfs_list_007_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_008_neg.ksh \
 	functional/cli_user/zpool_iostat/cleanup.ksh \
 	functional/cli_user/zpool_iostat/setup.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_-c_disable.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh \
 	functional/cli_user/zpool_list/cleanup.ksh \
 	functional/cli_user/zpool_list/setup.ksh \
 	functional/cli_user/zpool_list/zpool_list_001_pos.ksh \
 	functional/cli_user/zpool_list/zpool_list_002_neg.ksh \
 	functional/cli_user/zpool_status/cleanup.ksh \
 	functional/cli_user/zpool_status/setup.ksh \
 	functional/cli_user/zpool_status/zpool_status_003_pos.ksh \
 	functional/cli_user/zpool_status/zpool_status_-c_disable.ksh \
 	functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh \
 	functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh \
 	functional/compression/cleanup.ksh \
 	functional/compression/compress_001_pos.ksh \
 	functional/compression/compress_002_pos.ksh \
 	functional/compression/compress_003_pos.ksh \
 	functional/compression/compress_004_pos.ksh \
 	functional/compression/compress_zstd_bswap.ksh \
 	functional/compression/l2arc_compressed_arc_disabled.ksh \
 	functional/compression/l2arc_compressed_arc.ksh \
 	functional/compression/l2arc_encrypted.ksh \
 	functional/compression/l2arc_encrypted_no_compressed_arc.ksh \
 	functional/compression/setup.ksh \
 	functional/cp_files/cleanup.ksh \
 	functional/cp_files/cp_files_001_pos.ksh \
 	functional/cp_files/cp_files_002_pos.ksh \
 	functional/cp_files/cp_stress.ksh \
 	functional/cp_files/setup.ksh \
 	functional/crtime/cleanup.ksh \
 	functional/crtime/crtime_001_pos.ksh \
 	functional/crtime/setup.ksh \
 	functional/crypto/icp_aes_ccm.ksh \
 	functional/crypto/icp_aes_gcm.ksh \
 	functional/deadman/deadman_ratelimit.ksh \
 	functional/deadman/deadman_sync.ksh \
 	functional/deadman/deadman_zio.ksh \
 	functional/dedup/cleanup.ksh \
 	functional/dedup/setup.ksh \
 	functional/dedup/dedup_fdt_create.ksh \
 	functional/dedup/dedup_fdt_import.ksh \
 	functional/dedup/dedup_fdt_pacing.ksh \
 	functional/dedup/dedup_legacy_create.ksh \
 	functional/dedup/dedup_legacy_import.ksh \
 	functional/dedup/dedup_legacy_fdt_upgrade.ksh \
 	functional/dedup/dedup_legacy_fdt_mixed.ksh \
 	functional/dedup/dedup_prune.ksh \
 	functional/dedup/dedup_quota.ksh \
 	functional/dedup/dedup_zap_shrink.ksh \
 	functional/delegate/cleanup.ksh \
 	functional/delegate/setup.ksh \
 	functional/delegate/zfs_allow_001_pos.ksh \
 	functional/delegate/zfs_allow_002_pos.ksh \
 	functional/delegate/zfs_allow_003_pos.ksh \
 	functional/delegate/zfs_allow_004_pos.ksh \
 	functional/delegate/zfs_allow_005_pos.ksh \
 	functional/delegate/zfs_allow_006_pos.ksh \
 	functional/delegate/zfs_allow_007_pos.ksh \
 	functional/delegate/zfs_allow_008_pos.ksh \
 	functional/delegate/zfs_allow_009_neg.ksh \
 	functional/delegate/zfs_allow_010_pos.ksh \
 	functional/delegate/zfs_allow_011_neg.ksh \
 	functional/delegate/zfs_allow_012_neg.ksh \
 	functional/delegate/zfs_unallow_001_pos.ksh \
 	functional/delegate/zfs_unallow_002_pos.ksh \
 	functional/delegate/zfs_unallow_003_pos.ksh \
 	functional/delegate/zfs_unallow_004_pos.ksh \
 	functional/delegate/zfs_unallow_005_pos.ksh \
 	functional/delegate/zfs_unallow_006_pos.ksh \
 	functional/delegate/zfs_unallow_007_neg.ksh \
 	functional/delegate/zfs_unallow_008_neg.ksh \
 	functional/devices/cleanup.ksh \
 	functional/devices/devices_001_pos.ksh \
 	functional/devices/devices_002_neg.ksh \
 	functional/devices/devices_003_pos.ksh \
 	functional/devices/setup.ksh \
 	functional/direct/dio_aligned_block.ksh \
 	functional/direct/dio_async_always.ksh \
 	functional/direct/dio_async_fio_ioengines.ksh \
 	functional/direct/dio_compression.ksh \
 	functional/direct/dio_dedup.ksh \
 	functional/direct/dio_encryption.ksh \
 	functional/direct/dio_grow_block.ksh \
 	functional/direct/dio_loopback_dev.ksh \
 	functional/direct/dio_max_recordsize.ksh \
 	functional/direct/dio_mixed.ksh \
 	functional/direct/dio_mmap.ksh \
 	functional/direct/dio_overwrites.ksh \
 	functional/direct/dio_property.ksh \
 	functional/direct/dio_random.ksh \
 	functional/direct/dio_read_verify.ksh \
 	functional/direct/dio_recordsize.ksh \
 	functional/direct/dio_unaligned_block.ksh \
 	functional/direct/dio_unaligned_filesize.ksh \
 	functional/direct/dio_write_verify.ksh \
 	functional/direct/dio_write_stable_pages.ksh \
 	functional/direct/setup.ksh \
 	functional/direct/cleanup.ksh \
 	functional/dos_attributes/cleanup.ksh \
 	functional/dos_attributes/read_dos_attrs_001.ksh \
 	functional/dos_attributes/setup.ksh \
 	functional/dos_attributes/write_dos_attrs_001.ksh \
 	functional/events/cleanup.ksh \
 	functional/events/events_001_pos.ksh \
 	functional/events/events_002_pos.ksh \
 	functional/events/setup.ksh \
 	functional/events/zed_cksum_config.ksh \
 	functional/events/zed_cksum_reported.ksh \
 	functional/events/zed_diagnose_multiple.ksh \
 	functional/events/zed_fd_spill.ksh \
 	functional/events/zed_io_config.ksh \
 	functional/events/zed_rc_filter.ksh \
 	functional/events/zed_slow_io.ksh \
 	functional/events/zed_slow_io_many_vdevs.ksh \
 	functional/exec/cleanup.ksh \
 	functional/exec/exec_001_pos.ksh \
 	functional/exec/exec_002_neg.ksh \
 	functional/exec/setup.ksh \
 	functional/fadvise/cleanup.ksh \
 	functional/fadvise/fadvise_sequential.ksh \
 	functional/fadvise/setup.ksh \
 	functional/fallocate/cleanup.ksh \
 	functional/fallocate/fallocate_prealloc.ksh \
 	functional/fallocate/fallocate_punch-hole.ksh \
 	functional/fallocate/fallocate_zero-range.ksh \
 	functional/fallocate/setup.ksh \
 	functional/fault/auto_offline_001_pos.ksh \
 	functional/fault/auto_online_001_pos.ksh \
 	functional/fault/auto_online_002_pos.ksh \
 	functional/fault/auto_replace_001_pos.ksh \
 	functional/fault/auto_replace_002_pos.ksh \
 	functional/fault/auto_spare_001_pos.ksh \
 	functional/fault/auto_spare_002_pos.ksh \
 	functional/fault/auto_spare_ashift.ksh \
 	functional/fault/auto_spare_multiple.ksh \
 	functional/fault/auto_spare_shared.ksh \
 	functional/fault/cleanup.ksh \
 	functional/fault/decompress_fault.ksh \
 	functional/fault/decrypt_fault.ksh \
 	functional/fault/fault_limits.ksh \
 	functional/fault/scrub_after_resilver.ksh \
 	functional/fault/suspend_on_probe_errors.ksh \
 	functional/fault/suspend_resume_single.ksh \
 	functional/fault/setup.ksh \
 	functional/fault/zpool_status_-s.ksh \
 	functional/features/async_destroy/async_destroy_001_pos.ksh \
 	functional/features/async_destroy/cleanup.ksh \
 	functional/features/async_destroy/setup.ksh \
 	functional/features/large_dnode/cleanup.ksh \
 	functional/features/large_dnode/large_dnode_001_pos.ksh \
 	functional/features/large_dnode/large_dnode_002_pos.ksh \
 	functional/features/large_dnode/large_dnode_003_pos.ksh \
 	functional/features/large_dnode/large_dnode_004_neg.ksh \
 	functional/features/large_dnode/large_dnode_005_pos.ksh \
 	functional/features/large_dnode/large_dnode_006_pos.ksh \
 	functional/features/large_dnode/large_dnode_007_neg.ksh \
 	functional/features/large_dnode/large_dnode_008_pos.ksh \
 	functional/features/large_dnode/large_dnode_009_pos.ksh \
 	functional/features/large_dnode/setup.ksh \
+	functional/gang_blocks/cleanup.ksh \
+	functional/gang_blocks/gang_blocks_redundant.ksh \
+	functional/gang_blocks/setup.ksh \
 	functional/grow/grow_pool_001_pos.ksh \
 	functional/grow/grow_replicas_001_pos.ksh \
 	functional/history/cleanup.ksh \
 	functional/history/history_001_pos.ksh \
 	functional/history/history_002_pos.ksh \
 	functional/history/history_003_pos.ksh \
 	functional/history/history_004_pos.ksh \
 	functional/history/history_005_neg.ksh \
 	functional/history/history_006_neg.ksh \
 	functional/history/history_007_pos.ksh \
 	functional/history/history_008_pos.ksh \
 	functional/history/history_009_pos.ksh \
 	functional/history/history_010_pos.ksh \
 	functional/history/setup.ksh \
 	functional/inheritance/cleanup.ksh \
 	functional/inheritance/inherit_001_pos.ksh \
 	functional/inuse/inuse_001_pos.ksh \
 	functional/inuse/inuse_003_pos.ksh \
 	functional/inuse/inuse_004_pos.ksh \
 	functional/inuse/inuse_005_pos.ksh \
 	functional/inuse/inuse_006_pos.ksh \
 	functional/inuse/inuse_007_pos.ksh \
 	functional/inuse/inuse_008_pos.ksh \
 	functional/inuse/inuse_009_pos.ksh \
 	functional/inuse/setup.ksh \
 	functional/io/cleanup.ksh \
 	functional/io/io_uring.ksh \
 	functional/io/libaio.ksh \
 	functional/io/mmap.ksh \
 	functional/io/posixaio.ksh \
 	functional/io/psync.ksh \
 	functional/io/setup.ksh \
 	functional/io/sync.ksh \
 	functional/l2arc/cleanup.ksh \
 	functional/l2arc/l2arc_arcstats_pos.ksh \
 	functional/l2arc/l2arc_l2miss_pos.ksh \
 	functional/l2arc/l2arc_mfuonly_pos.ksh \
 	functional/l2arc/persist_l2arc_001_pos.ksh \
 	functional/l2arc/persist_l2arc_002_pos.ksh \
 	functional/l2arc/persist_l2arc_003_neg.ksh \
 	functional/l2arc/persist_l2arc_004_pos.ksh \
 	functional/l2arc/persist_l2arc_005_pos.ksh \
 	functional/l2arc/setup.ksh \
 	functional/large_files/cleanup.ksh \
 	functional/large_files/large_files_001_pos.ksh \
 	functional/large_files/large_files_002_pos.ksh \
 	functional/large_files/setup.ksh \
 	functional/largest_pool/largest_pool_001_pos.ksh \
 	functional/libzfs/cleanup.ksh \
 	functional/libzfs/libzfs_input.ksh \
 	functional/libzfs/setup.ksh \
 	functional/limits/cleanup.ksh \
 	functional/limits/filesystem_count.ksh \
 	functional/limits/filesystem_limit.ksh \
 	functional/limits/setup.ksh \
 	functional/limits/snapshot_count.ksh \
 	functional/limits/snapshot_limit.ksh \
 	functional/link_count/cleanup.ksh \
 	functional/link_count/link_count_001.ksh \
 	functional/link_count/link_count_root_inode.ksh \
 	functional/link_count/setup.ksh \
 	functional/longname/cleanup.ksh \
 	functional/longname/longname_001_pos.ksh \
 	functional/longname/longname_002_pos.ksh \
 	functional/longname/longname_003_pos.ksh \
 	functional/longname/setup.ksh \
 	functional/log_spacemap/log_spacemap_import_logs.ksh \
 	functional/migration/cleanup.ksh \
 	functional/migration/migration_001_pos.ksh \
 	functional/migration/migration_002_pos.ksh \
 	functional/migration/migration_003_pos.ksh \
 	functional/migration/migration_004_pos.ksh \
 	functional/migration/migration_005_pos.ksh \
 	functional/migration/migration_006_pos.ksh \
 	functional/migration/migration_007_pos.ksh \
 	functional/migration/migration_008_pos.ksh \
 	functional/migration/migration_009_pos.ksh \
 	functional/migration/migration_010_pos.ksh \
 	functional/migration/migration_011_pos.ksh \
 	functional/migration/migration_012_pos.ksh \
 	functional/migration/setup.ksh \
 	functional/mmap/cleanup.ksh \
 	functional/mmap/mmap_libaio_001_pos.ksh \
 	functional/mmap/mmap_mixed.ksh \
 	functional/mmap/mmap_read_001_pos.ksh \
 	functional/mmap/mmap_seek_001_pos.ksh \
 	functional/mmap/mmap_sync_001_pos.ksh \
 	functional/mmap/mmap_write_001_pos.ksh \
 	functional/mmap/setup.ksh \
 	functional/mmp/cleanup.ksh \
 	functional/mmp/mmp_active_import.ksh \
 	functional/mmp/mmp_exported_import.ksh \
 	functional/mmp/mmp_hostid.ksh \
 	functional/mmp/mmp_inactive_import.ksh \
 	functional/mmp/mmp_interval.ksh \
 	functional/mmp/mmp_on_off.ksh \
 	functional/mmp/mmp_on_thread.ksh \
 	functional/mmp/mmp_on_uberblocks.ksh \
 	functional/mmp/mmp_on_zdb.ksh \
 	functional/mmp/mmp_reset_interval.ksh \
 	functional/mmp/mmp_write_distribution.ksh \
 	functional/mmp/mmp_write_slow_disk.ksh \
 	functional/mmp/mmp_write_uberblocks.ksh \
 	functional/mmp/multihost_history.ksh \
 	functional/mmp/setup.ksh \
 	functional/mount/cleanup.ksh \
 	functional/mount/setup.ksh \
 	functional/mount/umount_001.ksh \
 	functional/mount/umountall_001.ksh \
 	functional/mount/umount_unlinked_drain.ksh \
 	functional/mv_files/cleanup.ksh \
 	functional/mv_files/mv_files_001_pos.ksh \
 	functional/mv_files/mv_files_002_pos.ksh \
 	functional/mv_files/random_creation.ksh \
 	functional/mv_files/setup.ksh \
 	functional/nestedfs/cleanup.ksh \
 	functional/nestedfs/nestedfs_001_pos.ksh \
 	functional/nestedfs/setup.ksh \
 	functional/nopwrite/cleanup.ksh \
 	functional/nopwrite/nopwrite_copies.ksh \
 	functional/nopwrite/nopwrite_mtime.ksh \
 	functional/nopwrite/nopwrite_negative.ksh \
 	functional/nopwrite/nopwrite_promoted_clone.ksh \
 	functional/nopwrite/nopwrite_recsize.ksh \
 	functional/nopwrite/nopwrite_sync.ksh \
 	functional/nopwrite/nopwrite_varying_compression.ksh \
 	functional/nopwrite/nopwrite_volume.ksh \
 	functional/nopwrite/setup.ksh \
 	functional/no_space/cleanup.ksh \
 	functional/no_space/enospc_001_pos.ksh \
 	functional/no_space/enospc_002_pos.ksh \
 	functional/no_space/enospc_003_pos.ksh \
 	functional/no_space/enospc_df.ksh \
 	functional/no_space/enospc_ganging.ksh \
 	functional/no_space/enospc_rm.ksh \
 	functional/no_space/setup.ksh \
 	functional/online_offline/cleanup.ksh \
 	functional/online_offline/online_offline_001_pos.ksh \
 	functional/online_offline/online_offline_002_neg.ksh \
 	functional/online_offline/online_offline_003_neg.ksh \
 	functional/online_offline/setup.ksh \
 	functional/pam/cleanup.ksh \
 	functional/pam/pam_basic.ksh \
 	functional/pam/pam_change_unmounted.ksh \
 	functional/pam/pam_mount_recursively.ksh \
 	functional/pam/pam_nounmount.ksh \
 	functional/pam/pam_recursive.ksh \
 	functional/pam/pam_short_password.ksh \
 	functional/pam/setup.ksh \
 	functional/pool_checkpoint/checkpoint_after_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_big_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_capacity.ksh \
 	functional/pool_checkpoint/checkpoint_conf_change.ksh \
 	functional/pool_checkpoint/checkpoint_discard_busy.ksh \
 	functional/pool_checkpoint/checkpoint_discard.ksh \
 	functional/pool_checkpoint/checkpoint_discard_many.ksh \
 	functional/pool_checkpoint/checkpoint_indirect.ksh \
 	functional/pool_checkpoint/checkpoint_invalid.ksh \
 	functional/pool_checkpoint/checkpoint_lun_expsz.ksh \
 	functional/pool_checkpoint/checkpoint_open.ksh \
 	functional/pool_checkpoint/checkpoint_removal.ksh \
 	functional/pool_checkpoint/checkpoint_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_ro_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_sm_scale.ksh \
 	functional/pool_checkpoint/checkpoint_twice.ksh \
 	functional/pool_checkpoint/checkpoint_vdev_add.ksh \
 	functional/pool_checkpoint/checkpoint_zdb.ksh \
 	functional/pool_checkpoint/checkpoint_zhack_feat.ksh \
 	functional/pool_checkpoint/cleanup.ksh \
 	functional/pool_checkpoint/setup.ksh \
 	functional/pool_names/pool_names_001_pos.ksh \
 	functional/pool_names/pool_names_002_neg.ksh \
 	functional/poolversion/cleanup.ksh \
 	functional/poolversion/poolversion_001_pos.ksh \
 	functional/poolversion/poolversion_002_pos.ksh \
 	functional/poolversion/setup.ksh \
 	functional/privilege/cleanup.ksh \
 	functional/privilege/privilege_001_pos.ksh \
 	functional/privilege/privilege_002_pos.ksh \
 	functional/privilege/setup.ksh \
 	functional/procfs/cleanup.ksh \
 	functional/procfs/pool_state.ksh \
 	functional/procfs/procfs_list_basic.ksh \
 	functional/procfs/procfs_list_concurrent_readers.ksh \
 	functional/procfs/procfs_list_stale_read.ksh \
 	functional/procfs/setup.ksh \
 	functional/projectquota/cleanup.ksh \
 	functional/projectquota/projectid_001_pos.ksh \
 	functional/projectquota/projectid_002_pos.ksh \
 	functional/projectquota/projectid_003_pos.ksh \
 	functional/projectquota/projectquota_001_pos.ksh \
 	functional/projectquota/projectquota_002_pos.ksh \
 	functional/projectquota/projectquota_003_pos.ksh \
 	functional/projectquota/projectquota_004_neg.ksh \
 	functional/projectquota/projectquota_005_pos.ksh \
 	functional/projectquota/projectquota_006_pos.ksh \
 	functional/projectquota/projectquota_007_pos.ksh \
 	functional/projectquota/projectquota_008_pos.ksh \
 	functional/projectquota/projectquota_009_pos.ksh \
 	functional/projectquota/projectspace_001_pos.ksh \
 	functional/projectquota/projectspace_002_pos.ksh \
 	functional/projectquota/projectspace_003_pos.ksh \
 	functional/projectquota/projectspace_004_pos.ksh \
 	functional/projectquota/projecttree_001_pos.ksh \
 	functional/projectquota/projecttree_002_pos.ksh \
 	functional/projectquota/projecttree_003_neg.ksh \
 	functional/projectquota/setup.ksh \
 	functional/quota/cleanup.ksh \
 	functional/quota/quota_001_pos.ksh \
 	functional/quota/quota_002_pos.ksh \
 	functional/quota/quota_003_pos.ksh \
 	functional/quota/quota_004_pos.ksh \
 	functional/quota/quota_005_pos.ksh \
 	functional/quota/quota_006_neg.ksh \
 	functional/quota/setup.ksh \
 	functional/raidz/cleanup.ksh \
 	functional/raidz/raidz_001_neg.ksh \
 	functional/raidz/raidz_002_pos.ksh \
 	functional/raidz/raidz_expand_001_pos.ksh \
 	functional/raidz/raidz_expand_002_pos.ksh \
 	functional/raidz/raidz_expand_003_neg.ksh \
 	functional/raidz/raidz_expand_003_pos.ksh \
 	functional/raidz/raidz_expand_004_pos.ksh \
 	functional/raidz/raidz_expand_005_pos.ksh \
 	functional/raidz/raidz_expand_006_neg.ksh \
 	functional/raidz/raidz_expand_007_neg.ksh \
 	functional/raidz/setup.ksh \
 	functional/redacted_send/cleanup.ksh \
 	functional/redacted_send/redacted_compressed.ksh \
 	functional/redacted_send/redacted_contents.ksh \
 	functional/redacted_send/redacted_deleted.ksh \
 	functional/redacted_send/redacted_disabled_feature.ksh \
 	functional/redacted_send/redacted_embedded.ksh \
 	functional/redacted_send/redacted_holes.ksh \
 	functional/redacted_send/redacted_incrementals.ksh \
 	functional/redacted_send/redacted_largeblocks.ksh \
 	functional/redacted_send/redacted_many_clones.ksh \
 	functional/redacted_send/redacted_mixed_recsize.ksh \
 	functional/redacted_send/redacted_mounts.ksh \
 	functional/redacted_send/redacted_negative.ksh \
 	functional/redacted_send/redacted_origin.ksh \
 	functional/redacted_send/redacted_panic.ksh \
 	functional/redacted_send/redacted_props.ksh \
 	functional/redacted_send/redacted_resume.ksh \
 	functional/redacted_send/redacted_size.ksh \
 	functional/redacted_send/redacted_volume.ksh \
 	functional/redacted_send/setup.ksh \
 	functional/redundancy/cleanup.ksh \
 	functional/redundancy/redundancy_draid1.ksh \
 	functional/redundancy/redundancy_draid2.ksh \
 	functional/redundancy/redundancy_draid3.ksh \
 	functional/redundancy/redundancy_draid_damaged1.ksh \
 	functional/redundancy/redundancy_draid_damaged2.ksh \
 	functional/redundancy/redundancy_draid.ksh \
 	functional/redundancy/redundancy_draid_spare1.ksh \
 	functional/redundancy/redundancy_draid_spare2.ksh \
 	functional/redundancy/redundancy_draid_spare3.ksh \
 	functional/redundancy/redundancy_mirror.ksh \
 	functional/redundancy/redundancy_raidz1.ksh \
 	functional/redundancy/redundancy_raidz2.ksh \
 	functional/redundancy/redundancy_raidz3.ksh \
 	functional/redundancy/redundancy_raidz.ksh \
 	functional/redundancy/redundancy_stripe.ksh \
 	functional/redundancy/setup.ksh \
 	functional/refquota/cleanup.ksh \
 	functional/refquota/refquota_001_pos.ksh \
 	functional/refquota/refquota_002_pos.ksh \
 	functional/refquota/refquota_003_pos.ksh \
 	functional/refquota/refquota_004_pos.ksh \
 	functional/refquota/refquota_005_pos.ksh \
 	functional/refquota/refquota_006_neg.ksh \
 	functional/refquota/refquota_007_neg.ksh \
 	functional/refquota/refquota_008_neg.ksh \
 	functional/refquota/setup.ksh \
 	functional/refreserv/cleanup.ksh \
 	functional/refreserv/refreserv_001_pos.ksh \
 	functional/refreserv/refreserv_002_pos.ksh \
 	functional/refreserv/refreserv_003_pos.ksh \
 	functional/refreserv/refreserv_004_pos.ksh \
 	functional/refreserv/refreserv_005_pos.ksh \
 	functional/refreserv/refreserv_multi_raidz.ksh \
 	functional/refreserv/refreserv_raidz.ksh \
 	functional/refreserv/setup.ksh \
 	functional/removal/cleanup.ksh \
 	functional/removal/removal_all_vdev.ksh \
 	functional/removal/removal_cancel.ksh \
 	functional/removal/removal_check_space.ksh \
 	functional/removal/removal_condense_export.ksh \
 	functional/removal/removal_multiple_indirection.ksh \
 	functional/removal/removal_nopwrite.ksh \
 	functional/removal/removal_remap_deadlists.ksh \
 	functional/removal/removal_reservation.ksh \
 	functional/removal/removal_resume_export.ksh \
 	functional/removal/removal_sanity.ksh \
 	functional/removal/removal_with_add.ksh \
 	functional/removal/removal_with_create_fs.ksh \
 	functional/removal/removal_with_dedup.ksh \
 	functional/removal/removal_with_errors.ksh \
 	functional/removal/removal_with_export.ksh \
 	functional/removal/removal_with_faulted.ksh \
 	functional/removal/removal_with_ganging.ksh \
 	functional/removal/removal_with_hole.ksh \
 	functional/removal/removal_with_indirect.ksh \
 	functional/removal/removal_with_remove.ksh \
 	functional/removal/removal_with_scrub.ksh \
 	functional/removal/removal_with_send.ksh \
 	functional/removal/removal_with_send_recv.ksh \
 	functional/removal/removal_with_snapshot.ksh \
 	functional/removal/removal_with_write.ksh \
 	functional/removal/removal_with_zdb.ksh \
 	functional/removal/remove_attach_mirror.ksh \
 	functional/removal/remove_expanded.ksh \
 	functional/removal/remove_indirect.ksh \
 	functional/removal/remove_mirror.ksh \
 	functional/removal/remove_mirror_sanity.ksh \
 	functional/removal/remove_raidz.ksh \
 	functional/rename_dirs/cleanup.ksh \
 	functional/rename_dirs/rename_dirs_001_pos.ksh \
 	functional/rename_dirs/setup.ksh \
 	functional/renameat2/cleanup.ksh \
 	functional/renameat2/setup.ksh \
 	functional/renameat2/renameat2_exchange.ksh \
 	functional/renameat2/renameat2_noreplace.ksh \
 	functional/renameat2/renameat2_whiteout.ksh \
 	functional/replacement/attach_import.ksh \
 	functional/replacement/attach_multiple.ksh \
 	functional/replacement/attach_rebuild.ksh \
 	functional/replacement/attach_resilver.ksh \
 	functional/replacement/cleanup.ksh \
 	functional/replacement/detach.ksh \
 	functional/replacement/rebuild_disabled_feature.ksh \
 	functional/replacement/rebuild_multiple.ksh \
 	functional/replacement/rebuild_raidz.ksh \
 	functional/replacement/replace_import.ksh \
 	functional/replacement/replace_rebuild.ksh \
 	functional/replacement/replace_resilver.ksh \
 	functional/replacement/resilver_restart_001.ksh \
 	functional/replacement/resilver_restart_002.ksh \
 	functional/replacement/scrub_cancel.ksh \
 	functional/replacement/setup.ksh \
 	functional/reservation/cleanup.ksh \
 	functional/reservation/reservation_001_pos.ksh \
 	functional/reservation/reservation_002_pos.ksh \
 	functional/reservation/reservation_003_pos.ksh \
 	functional/reservation/reservation_004_pos.ksh \
 	functional/reservation/reservation_005_pos.ksh \
 	functional/reservation/reservation_006_pos.ksh \
 	functional/reservation/reservation_007_pos.ksh \
 	functional/reservation/reservation_008_pos.ksh \
 	functional/reservation/reservation_009_pos.ksh \
 	functional/reservation/reservation_010_pos.ksh \
 	functional/reservation/reservation_011_pos.ksh \
 	functional/reservation/reservation_012_pos.ksh \
 	functional/reservation/reservation_013_pos.ksh \
 	functional/reservation/reservation_014_pos.ksh \
 	functional/reservation/reservation_015_pos.ksh \
 	functional/reservation/reservation_016_pos.ksh \
 	functional/reservation/reservation_017_pos.ksh \
 	functional/reservation/reservation_018_pos.ksh \
 	functional/reservation/reservation_019_pos.ksh \
 	functional/reservation/reservation_020_pos.ksh \
 	functional/reservation/reservation_021_neg.ksh \
 	functional/reservation/reservation_022_pos.ksh \
 	functional/reservation/setup.ksh \
 	functional/rootpool/cleanup.ksh \
 	functional/rootpool/rootpool_002_neg.ksh \
 	functional/rootpool/rootpool_003_neg.ksh \
 	functional/rootpool/rootpool_007_pos.ksh \
 	functional/rootpool/setup.ksh \
 	functional/rsend/cleanup.ksh \
 	functional/rsend/recv_dedup_encrypted_zvol.ksh \
 	functional/rsend/recv_dedup.ksh \
 	functional/rsend/rsend_001_pos.ksh \
 	functional/rsend/rsend_002_pos.ksh \
 	functional/rsend/rsend_003_pos.ksh \
 	functional/rsend/rsend_004_pos.ksh \
 	functional/rsend/rsend_005_pos.ksh \
 	functional/rsend/rsend_006_pos.ksh \
 	functional/rsend/rsend_007_pos.ksh \
 	functional/rsend/rsend_008_pos.ksh \
 	functional/rsend/rsend_009_pos.ksh \
 	functional/rsend/rsend_010_pos.ksh \
 	functional/rsend/rsend_011_pos.ksh \
 	functional/rsend/rsend_012_pos.ksh \
 	functional/rsend/rsend_013_pos.ksh \
 	functional/rsend/rsend_014_pos.ksh \
 	functional/rsend/rsend_016_neg.ksh \
 	functional/rsend/rsend_019_pos.ksh \
 	functional/rsend/rsend_020_pos.ksh \
 	functional/rsend/rsend_021_pos.ksh \
 	functional/rsend/rsend_022_pos.ksh \
 	functional/rsend/rsend_024_pos.ksh \
 	functional/rsend/rsend_025_pos.ksh \
 	functional/rsend/rsend_026_neg.ksh \
 	functional/rsend/rsend_027_pos.ksh \
 	functional/rsend/rsend_028_neg.ksh \
 	functional/rsend/rsend_029_neg.ksh \
 	functional/rsend/rsend_030_pos.ksh \
 	functional/rsend/rsend_031_pos.ksh \
 	functional/rsend/send-c_embedded_blocks.ksh \
 	functional/rsend/send-c_incremental.ksh \
 	functional/rsend/send-c_longname.ksh \
 	functional/rsend/send-c_lz4_disabled.ksh \
 	functional/rsend/send-c_mixed_compression.ksh \
 	functional/rsend/send-c_props.ksh \
 	functional/rsend/send-c_recv_dedup.ksh \
 	functional/rsend/send-c_recv_lz4_disabled.ksh \
 	functional/rsend/send-c_resume.ksh \
 	functional/rsend/send-c_stream_size_estimate.ksh \
 	functional/rsend/send-c_verify_contents.ksh \
 	functional/rsend/send-c_verify_ratio.ksh \
 	functional/rsend/send-c_volume.ksh \
 	functional/rsend/send-c_zstream_recompress.ksh \
 	functional/rsend/send-c_zstreamdump.ksh \
 	functional/rsend/send-cpL_varied_recsize.ksh \
 	functional/rsend/send_doall.ksh \
 	functional/rsend/send_encrypted_incremental.ksh \
 	functional/rsend/send_encrypted_files.ksh \
 	functional/rsend/send_encrypted_freeobjects.ksh \
 	functional/rsend/send_encrypted_hierarchy.ksh \
 	functional/rsend/send_encrypted_props.ksh \
 	functional/rsend/send_encrypted_truncated_files.ksh \
 	functional/rsend/send_freeobjects.ksh \
 	functional/rsend/send_holds.ksh \
 	functional/rsend/send_hole_birth.ksh \
 	functional/rsend/send_invalid.ksh \
 	functional/rsend/send-L_toggle.ksh \
 	functional/rsend/send_mixed_raw.ksh \
 	functional/rsend/send_partial_dataset.ksh \
 	functional/rsend/send_raw_ashift.ksh \
 	functional/rsend/send_raw_spill_block.ksh \
 	functional/rsend/send_raw_large_blocks.ksh \
 	functional/rsend/send_realloc_dnode_size.ksh \
 	functional/rsend/send_realloc_encrypted_files.ksh \
 	functional/rsend/send_realloc_files.ksh \
 	functional/rsend/send_spill_block.ksh \
 	functional/rsend/send-wR_encrypted_zvol.ksh \
 	functional/rsend/setup.ksh \
 	functional/scrub_mirror/cleanup.ksh \
 	functional/scrub_mirror/scrub_mirror_001_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_002_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_003_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_004_pos.ksh \
 	functional/scrub_mirror/setup.ksh \
 	functional/slog/cleanup.ksh \
 	functional/slog/setup.ksh \
 	functional/slog/slog_001_pos.ksh \
 	functional/slog/slog_002_pos.ksh \
 	functional/slog/slog_003_pos.ksh \
 	functional/slog/slog_004_pos.ksh \
 	functional/slog/slog_005_pos.ksh \
 	functional/slog/slog_006_pos.ksh \
 	functional/slog/slog_007_pos.ksh \
 	functional/slog/slog_008_neg.ksh \
 	functional/slog/slog_009_neg.ksh \
 	functional/slog/slog_010_neg.ksh \
 	functional/slog/slog_011_neg.ksh \
 	functional/slog/slog_012_neg.ksh \
 	functional/slog/slog_013_pos.ksh \
 	functional/slog/slog_014_pos.ksh \
 	functional/slog/slog_015_neg.ksh \
 	functional/slog/slog_016_pos.ksh \
 	functional/slog/slog_replay_fs_001.ksh \
 	functional/slog/slog_replay_fs_002.ksh \
 	functional/slog/slog_replay_volume.ksh \
 	functional/snapshot/cleanup.ksh \
 	functional/snapshot/clone_001_pos.ksh \
 	functional/snapshot/rollback_001_pos.ksh \
 	functional/snapshot/rollback_002_pos.ksh \
 	functional/snapshot/rollback_003_pos.ksh \
 	functional/snapshot/setup.ksh \
 	functional/snapshot/snapshot_001_pos.ksh \
 	functional/snapshot/snapshot_002_pos.ksh \
 	functional/snapshot/snapshot_003_pos.ksh \
 	functional/snapshot/snapshot_004_pos.ksh \
 	functional/snapshot/snapshot_005_pos.ksh \
 	functional/snapshot/snapshot_006_pos.ksh \
 	functional/snapshot/snapshot_007_pos.ksh \
 	functional/snapshot/snapshot_008_pos.ksh \
 	functional/snapshot/snapshot_009_pos.ksh \
 	functional/snapshot/snapshot_010_pos.ksh \
 	functional/snapshot/snapshot_011_pos.ksh \
 	functional/snapshot/snapshot_012_pos.ksh \
 	functional/snapshot/snapshot_013_pos.ksh \
 	functional/snapshot/snapshot_014_pos.ksh \
 	functional/snapshot/snapshot_015_pos.ksh \
 	functional/snapshot/snapshot_016_pos.ksh \
 	functional/snapshot/snapshot_017_pos.ksh \
 	functional/snapshot/snapshot_018_pos.ksh \
 	functional/snapused/cleanup.ksh \
 	functional/snapused/setup.ksh \
 	functional/snapused/snapused_001_pos.ksh \
 	functional/snapused/snapused_002_pos.ksh \
 	functional/snapused/snapused_003_pos.ksh \
 	functional/snapused/snapused_004_pos.ksh \
 	functional/snapused/snapused_005_pos.ksh \
 	functional/sparse/cleanup.ksh \
 	functional/sparse/setup.ksh \
 	functional/sparse/sparse_001_pos.ksh \
 	functional/stat/cleanup.ksh \
 	functional/stat/setup.ksh \
 	functional/stat/stat_001_pos.ksh \
 	functional/stat/statx_dioalign.ksh \
 	functional/suid/cleanup.ksh \
 	functional/suid/setup.ksh \
 	functional/suid/suid_write_to_none.ksh \
 	functional/suid/suid_write_to_sgid.ksh \
 	functional/suid/suid_write_to_suid.ksh \
 	functional/suid/suid_write_to_suid_sgid.ksh \
 	functional/suid/suid_write_zil_replay.ksh \
 	functional/trim/autotrim_config.ksh \
 	functional/trim/autotrim_integrity.ksh \
 	functional/trim/autotrim_trim_integrity.ksh \
 	functional/trim/cleanup.ksh \
 	functional/trim/setup.ksh \
 	functional/trim/trim_config.ksh \
 	functional/trim/trim_integrity.ksh \
 	functional/trim/trim_l2arc.ksh \
 	functional/truncate/cleanup.ksh \
 	functional/truncate/setup.ksh \
 	functional/truncate/truncate_001_pos.ksh \
 	functional/truncate/truncate_002_pos.ksh \
 	functional/truncate/truncate_timestamps.ksh \
 	functional/upgrade/cleanup.ksh \
 	functional/upgrade/setup.ksh \
 	functional/upgrade/upgrade_projectquota_001_pos.ksh \
 	functional/upgrade/upgrade_projectquota_002_pos.ksh \
 	functional/upgrade/upgrade_readonly_pool.ksh \
 	functional/upgrade/upgrade_userobj_001_pos.ksh \
 	functional/user_namespace/cleanup.ksh \
 	functional/user_namespace/setup.ksh \
 	functional/user_namespace/user_namespace_001.ksh \
 	functional/user_namespace/user_namespace_002.ksh \
 	functional/user_namespace/user_namespace_003.ksh \
 	functional/user_namespace/user_namespace_004.ksh \
 	functional/userquota/cleanup.ksh \
 	functional/userquota/groupspace_001_pos.ksh \
 	functional/userquota/groupspace_002_pos.ksh \
 	functional/userquota/groupspace_003_pos.ksh \
 	functional/userquota/setup.ksh \
 	functional/userquota/userquota_001_pos.ksh \
 	functional/userquota/userquota_002_pos.ksh \
 	functional/userquota/userquota_003_pos.ksh \
 	functional/userquota/userquota_004_pos.ksh \
 	functional/userquota/userquota_005_neg.ksh \
 	functional/userquota/userquota_006_pos.ksh \
 	functional/userquota/userquota_007_pos.ksh \
 	functional/userquota/userquota_008_pos.ksh \
 	functional/userquota/userquota_009_pos.ksh \
 	functional/userquota/userquota_010_pos.ksh \
 	functional/userquota/userquota_011_pos.ksh \
 	functional/userquota/userquota_012_neg.ksh \
 	functional/userquota/userquota_013_pos.ksh \
 	functional/userquota/userspace_001_pos.ksh \
 	functional/userquota/userspace_002_pos.ksh \
 	functional/userquota/userspace_003_pos.ksh \
 	functional/userquota/userspace_encrypted.ksh \
 	functional/userquota/userspace_send_encrypted.ksh \
 	functional/userquota/userspace_encrypted_13709.ksh \
 	functional/vdev_zaps/cleanup.ksh \
 	functional/vdev_zaps/setup.ksh \
 	functional/vdev_zaps/vdev_zaps_001_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_002_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_003_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_004_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_005_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_006_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_007_pos.ksh \
 	functional/write_dirs/cleanup.ksh \
 	functional/write_dirs/setup.ksh \
 	functional/write_dirs/write_dirs_001_pos.ksh \
 	functional/write_dirs/write_dirs_002_pos.ksh \
 	functional/xattr/cleanup.ksh \
 	functional/xattr/setup.ksh \
 	functional/xattr/xattr_001_pos.ksh \
 	functional/xattr/xattr_002_neg.ksh \
 	functional/xattr/xattr_003_neg.ksh \
 	functional/xattr/xattr_004_pos.ksh \
 	functional/xattr/xattr_005_pos.ksh \
 	functional/xattr/xattr_006_pos.ksh \
 	functional/xattr/xattr_007_neg.ksh \
 	functional/xattr/xattr_008_pos.ksh \
 	functional/xattr/xattr_009_neg.ksh \
 	functional/xattr/xattr_010_neg.ksh \
 	functional/xattr/xattr_011_pos.ksh \
 	functional/xattr/xattr_012_pos.ksh \
 	functional/xattr/xattr_013_pos.ksh \
 	functional/xattr/xattr_compat.ksh \
 	functional/zap_shrink/cleanup.ksh \
 	functional/zap_shrink/zap_shrink_001_pos.ksh \
 	functional/zap_shrink/setup.ksh \
 	functional/zpool_influxdb/cleanup.ksh \
 	functional/zpool_influxdb/setup.ksh \
 	functional/zpool_influxdb/zpool_influxdb.ksh \
 	functional/zvol/zvol_cli/cleanup.ksh \
 	functional/zvol/zvol_cli/setup.ksh \
 	functional/zvol/zvol_cli/zvol_cli_001_pos.ksh \
 	functional/zvol/zvol_cli/zvol_cli_002_pos.ksh \
 	functional/zvol/zvol_cli/zvol_cli_003_neg.ksh \
 	functional/zvol/zvol_ENOSPC/cleanup.ksh \
 	functional/zvol/zvol_ENOSPC/setup.ksh \
 	functional/zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh \
 	functional/zvol/zvol_misc/cleanup.ksh \
 	functional/zvol/zvol_misc/setup.ksh \
 	functional/zvol/zvol_misc/zvol_misc_001_neg.ksh \
 	functional/zvol/zvol_misc/zvol_misc_002_pos.ksh \
 	functional/zvol/zvol_misc/zvol_misc_003_neg.ksh \
 	functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \
 	functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \
 	functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \
 	functional/zvol/zvol_misc/zvol_misc_fua.ksh \
 	functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \
 	functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \
 	functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \
 	functional/zvol/zvol_misc/zvol_misc_trim.ksh \
 	functional/zvol/zvol_misc/zvol_misc_volmode.ksh \
 	functional/zvol/zvol_misc/zvol_misc_zil.ksh \
 	functional/zvol/zvol_stress/cleanup.ksh \
 	functional/zvol/zvol_stress/setup.ksh \
 	functional/zvol/zvol_stress/zvol_stress.ksh \
 	functional/zvol/zvol_swap/cleanup.ksh \
 	functional/zvol/zvol_swap/setup.ksh \
 	functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_002_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_003_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_004_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_005_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_006_pos.ksh \
 	functional/idmap_mount/cleanup.ksh \
 	functional/idmap_mount/setup.ksh \
 	functional/idmap_mount/idmap_mount_001.ksh \
 	functional/idmap_mount/idmap_mount_002.ksh \
 	functional/idmap_mount/idmap_mount_003.ksh \
 	functional/idmap_mount/idmap_mount_004.ksh \
 	functional/idmap_mount/idmap_mount_005.ksh
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh b/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh
new file mode 100755
index 000000000000..4ae6ec16fae4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh
@@ -0,0 +1,31 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+# NOTE(review): setup.ksh never calls save_tunable; confirm these restores work.
+restore_tunable METASLAB_FORCE_GANGING
+restore_tunable METASLAB_FORCE_GANGING_PCT
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib
new file mode 100644
index 000000000000..8799a1436c56
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib
@@ -0,0 +1,120 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 By Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Print the verbose zdb dump (zdb -dddddd) of one object in a dataset
+#
+# $1 filesystem
+# $2 object number
+#
+function get_object_info
+{
+	typeset fs=$1
+	typeset obj=$2
+
+	zdb -dddddd $fs $obj
+}
+
+#
+# Print the zdb lines of a file's object that match a filter (not "Dataset")
+# $1 filesystem
+# $2 path to file, relative to the filesystem's mountpoint
+# $3 block filter (grep pattern, e.g. L0)
+function get_blocks_filter
+{
+	typeset fs=$1
+	typeset path=$2
+	typeset filter=$3
+	typeset full_path="$(get_prop mountpoint "$fs")/$path"
+	typeset obj="$(ls -i "$full_path" | awk '{print $1}')"
+
+	get_object_info "$fs" "$obj" | grep "$filter" | grep -v Dataset
+}
+
+function get_first_block
+{	# $1 filesystem, $2 path to file: print the first L0 line of that file
+	get_blocks_filter $1 $2 L0 | head -n 1
+}
+
+function get_first_block_dva
+{	# $1 filesystem, $2 path to file: print the field right after "L0 "
+	get_first_block $1 $2 | sed 's/.*L0 \([^ ]*\).*/\1/'
+}
+
+# Takes a zdb compressed blkptr line on stdin; prints its number of DVAs
+function get_num_dvas
+{	# counts the fields between "L<digit> " and the following " <hex>L" token
+	sed 's/.*L[0-9] \(.*\) [a-f0-9]*L.*/\1/' | awk '{print NF}'
+}
+
+function check_gang_dva
+{
+	# A DVA string counts as a gang DVA when its final byte is "G";
+	# the status of the test below (0 or 1) is the function's result.
+	typeset dva_suffix="$(echo -n $1 | tail -c 1)"
+	[[ "$dva_suffix" == "G" ]]
+}
+
+function check_is_gang_dva
+{	# log_fail unless $1 passes check_gang_dva
+	check_gang_dva $1 || log_fail "Not a gang DVA: \"$1\""
+}
+
+function check_not_gang_dva
+{	# log_fail if $1 is a gang DVA; return 0 (not 1) when it is not
+	! check_gang_dva $1 || log_fail "Gang DVA: \"$1\""
+}
+
+#
+# Get the gang header contents of the given dva in the given pool
+#
+# $1 pool
+# $2 dva
+# $3 size (in hexadecimal)
+#
+function read_gang_header
+{
+	typeset pool=$1
+	typeset dva=$2
+	typeset size=$3
+
+	check_is_gang_dva $dva
+	# Replace the DVA's last ':' field with $size and append ":g" for zdb -R.
+	zdb -R $pool "${dva%:*}:$size:g" 2>&1 | grep -v "Found vdev:"
+}
+
+function preamble
+{	# save_tunable both ganging tunables; cleanup() calls restore_tunable on them
+	save_tunable METASLAB_FORCE_GANGING
+	save_tunable METASLAB_FORCE_GANGING_PCT
+}
+
+function cleanup
+{	# destroy_pool $TESTPOOL, then restore_tunable both ganging tunables
+	destroy_pool $TESTPOOL
+	restore_tunable METASLAB_FORCE_GANGING
+	restore_tunable METASLAB_FORCE_GANGING_PCT
+}
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh
new file mode 100755
index 000000000000..1c44a7c5e598
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh
@@ -0,0 +1,88 @@
+#!/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that the redundant_metadata setting is respected by gang headers
+#
+# Strategy:
+# 1. Create a filesystem with redundant_metadata={all,most,some,none}
+# 2. Verify that gang blocks at different levels have the right amount of redundancy
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Verify that gang blocks at different levels have the right amount of redundancy."
+
+function cleanup2
+{	# Destroy the four per-setting datasets (status unchecked), then run cleanup
+	for red in all most some none; do zfs destroy $TESTPOOL/$TESTFS-$red; done
+	cleanup
+}
+
+preamble
+log_onexit cleanup2
+
+log_must zpool create -f -o ashift=9 $TESTPOOL $DISKS
+# Without forced ganging the DVA counts checked below would say nothing
+# about gang headers, so fail the test if a tunable cannot be set.
+log_must set_tunable64 METASLAB_FORCE_GANGING 1500
+log_must set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+for red in all most some none; do
+	log_must zfs create -o redundant_metadata=$red -o recordsize=512 \
+		 $TESTPOOL/$TESTFS-$red
+	if [[ "$red" == "all" ]]; then
+		log_must zfs set recordsize=8k $TESTPOOL/$TESTFS-$red
+	fi
+	mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS-$red)
+
+	# File data: the first L0 block must have 2 DVAs for "all", else 1.
+	path="${mountpoint}/file"
+	log_must dd if=/dev/urandom of=$path bs=1M count=1
+	log_must zpool sync $TESTPOOL
+	num_l0_dvas=$(get_first_block $TESTPOOL/$TESTFS-$red file | get_num_dvas)
+	want=1
+	[[ "$red" == "all" ]] && want=2
+	[[ "$num_l0_dvas" -eq $want ]] || \
+		log_fail "wrong number of DVAs for L0 in $red: $num_l0_dvas"
+
+	# First L1 block: 2 DVAs for "all" and "most", else 1.
+	num_l1_dvas=$(get_blocks_filter $TESTPOOL/$TESTFS-$red file L1 | head -n 1 | get_num_dvas)
+	want=1
+	[[ "$red" == "all" || "$red" == "most" ]] && want=2
+	[[ "$num_l1_dvas" -eq $want ]] || \
+		log_fail "wrong number of DVAs for L1 in $red: $num_l1_dvas"
+
+	# Write 80 more 512-byte files before looking at object 0.
+	for i in $(seq 1 80); do
+		dd if=/dev/urandom of=$mountpoint/f$i bs=512 count=1 2>/dev/null || log_fail "dd failed"
+	done
+	log_must zpool sync $TESTPOOL
+
+	# Object 0: the first gang L0 block must have 2 DVAs unless "none".
+	# get_object_info accepts only a dataset and an object number, so the
+	# level is selected with grep rather than passed as a third argument.
+	obj_0_gangs=$(get_object_info $TESTPOOL/$TESTFS-$red 0 | grep L0 | grep G)
+	num_obj_0_dvas=$(echo "$obj_0_gangs" | head -n 1 | get_num_dvas)
+	want=2
+	[[ "$red" == "none" ]] && want=1
+	[[ "$num_obj_0_dvas" -eq $want ]] || \
+		log_fail "wrong number of DVAs for obj 0 in $red: $num_obj_0_dvas"
+	log_note "Level $red passed"
+done
+
+log_pass "Gang blocks at different levels have the right amount of redundancy."
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh b/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh
new file mode 100755
index 000000000000..0d2b239a069d
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+# 16777217 is 2^24 + 1; presumably this leaves forced ganging off -- confirm.
+set_tunable64 METASLAB_FORCE_GANGING 16777217
+set_tunable32 METASLAB_FORCE_GANGING_PCT 0