diff --git a/spl-git/20140411-git-master.patch b/spl-git/20140411-git-master.patch deleted file mode 100644 index 1502991..0000000 --- a/spl-git/20140411-git-master.patch +++ /dev/null @@ -1,1797 +0,0 @@ -diff --git a/config/spl-build.m4 b/config/spl-build.m4 -index f54c5b1..8426780 100644 ---- a/config/spl-build.m4 -+++ b/config/spl-build.m4 -@@ -29,4 +29,3 @@ AC_DEFUN([SPL_AC_CONFIG_KERNEL], [ - SPL_AC_2ARGS_REGISTER_SYSCTL -- SPL_AC_SET_SHRINKER -- SPL_AC_3ARGS_SHRINKER_CALLBACK -+ SPL_AC_SHRINKER_CALLBACK - SPL_AC_PATH_IN_NAMEIDATA -@@ -95,2 +94,3 @@ AC_DEFUN([SPL_AC_CONFIG_KERNEL], [ - SPL_AC_2ARGS_VFS_GETATTR -+ SPL_AC_USLEEP_RANGE - ]) -@@ -886,29 +886,10 @@ AC_DEFUN([SPL_AC_2ARGS_REGISTER_SYSCTL], - --dnl # --dnl # 2.6.23 API change --dnl # Old set_shrinker API replaced with register_shrinker --dnl # --AC_DEFUN([SPL_AC_SET_SHRINKER], [ -- AC_MSG_CHECKING([whether set_shrinker() available]) -- SPL_LINUX_TRY_COMPILE([ -- #include -- ],[ -- return set_shrinker(DEFAULT_SEEKS, NULL); -- ],[ -- AC_MSG_RESULT([yes]) -- AC_DEFINE(HAVE_SET_SHRINKER, 1, -- [set_shrinker() available]) -- ],[ -- AC_MSG_RESULT([no]) -- ]) --]) -- --dnl # --dnl # 2.6.35 API change, --dnl # Add context to shrinker callback --dnl # --AC_DEFUN([SPL_AC_3ARGS_SHRINKER_CALLBACK], -- [AC_MSG_CHECKING([whether shrinker callback wants 3 args]) -+AC_DEFUN([SPL_AC_SHRINKER_CALLBACK],[ - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" -+ dnl # -+ dnl # 2.6.23 to 2.6.34 API change -+ dnl # ->shrink(int nr_to_scan, gfp_t gfp_mask) -+ dnl # -+ AC_MSG_CHECKING([whether old 2-argument shrinker exists]) - SPL_LINUX_TRY_COMPILE([ -@@ -916,3 +897,3 @@ AC_DEFUN([SPL_AC_3ARGS_SHRINKER_CALLBACK], - -- int shrinker_cb(struct shrinker *, int, unsigned int); -+ int shrinker_cb(int nr_to_scan, gfp_t gfp_mask); - ],[ -@@ -925,6 +906,82 @@ AC_DEFUN([SPL_AC_3ARGS_SHRINKER_CALLBACK], - AC_MSG_RESULT(yes) -- AC_DEFINE(HAVE_3ARGS_SHRINKER_CALLBACK, 1, -- [shrinker callback wants 3 args]) -+ AC_DEFINE(HAVE_2ARGS_OLD_SHRINKER_CALLBACK, 1, -+ [old shrinker callback wants 2 args]) - ],[ - AC_MSG_RESULT(no) -+ dnl # -+ dnl # 2.6.35 - 2.6.39 API change -+ dnl # ->shrink(struct shrinker *, -+ dnl # int nr_to_scan, gfp_t gfp_mask) -+ dnl # -+ AC_MSG_CHECKING([whether old 3-argument shrinker exists]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ -+ int shrinker_cb(struct shrinker *, int nr_to_scan, -+ gfp_t gfp_mask); -+ ],[ -+ struct shrinker cache_shrinker = { -+ .shrink = shrinker_cb, -+ .seeks = DEFAULT_SEEKS, -+ }; -+ register_shrinker(&cache_shrinker); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_3ARGS_SHRINKER_CALLBACK, 1, -+ [old shrinker callback wants 3 args]) -+ ],[ -+ AC_MSG_RESULT(no) -+ dnl # -+ dnl # 3.0 - 3.11 API change -+ dnl # ->shrink(struct shrinker *, -+ dnl # struct shrink_control *sc) -+ dnl # -+ AC_MSG_CHECKING( -+ [whether new 2-argument shrinker exists]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ -+ int shrinker_cb(struct shrinker *, -+ struct shrink_control *sc); -+ ],[ -+ struct shrinker cache_shrinker = { -+ .shrink = shrinker_cb, -+ .seeks = DEFAULT_SEEKS, -+ }; -+ register_shrinker(&cache_shrinker); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_2ARGS_NEW_SHRINKER_CALLBACK, 1, -+ [new shrinker callback wants 2 args]) -+ ],[ -+ AC_MSG_RESULT(no) -+ dnl # -+ dnl # 3.12 API change, -+ dnl # ->shrink() is logically split in to -+ dnl # ->count_objects() and ->scan_objects() -+ dnl # -+ AC_MSG_CHECKING( -+ [whether ->count_objects callback exists]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ -+ unsigned long shrinker_cb( -+ 
struct shrinker *, -+ struct shrink_control *sc); -+ ],[ -+ struct shrinker cache_shrinker = { -+ .count_objects = shrinker_cb, -+ .scan_objects = shrinker_cb, -+ .seeks = DEFAULT_SEEKS, -+ }; -+ register_shrinker(&cache_shrinker); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, -+ 1, [->count_objects exists]) -+ ],[ -+ AC_MSG_ERROR(error) -+ ]) -+ ]) -+ ]) - ]) -@@ -1787,6 +1844,2 @@ AC_DEFUN([SPL_AC_SET_FS_PWD_WITH_CONST], - --dnl # --dnl # SLES API change, never adopted in mainline, --dnl # Third 'struct vfsmount *' argument removed. --dnl # - AC_DEFUN([SPL_AC_2ARGS_VFS_UNLINK], -@@ -1796,3 +1849,3 @@ AC_DEFUN([SPL_AC_2ARGS_VFS_UNLINK], - ],[ -- vfs_unlink(NULL, NULL); -+ vfs_unlink((struct inode *) NULL, (struct dentry *) NULL); - ],[ -@@ -1803,2 +1856,21 @@ AC_DEFUN([SPL_AC_2ARGS_VFS_UNLINK], - AC_MSG_RESULT(no) -+ dnl # -+ dnl # Linux 3.13 API change -+ dnl # Added delegated inode -+ dnl # -+ AC_MSG_CHECKING([whether vfs_unlink() wants 3 args]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ vfs_unlink((struct inode *) NULL, -+ (struct dentry *) NULL, -+ (struct inode **) NULL); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_3ARGS_VFS_UNLINK, 1, -+ [vfs_unlink() wants 3 args]) -+ ],[ -+ AC_MSG_ERROR(no) -+ ]) -+ - ]) -@@ -1806,6 +1878,2 @@ AC_DEFUN([SPL_AC_2ARGS_VFS_UNLINK], - --dnl # --dnl # SLES API change, never adopted in mainline, --dnl # Third and sixth 'struct vfsmount *' argument removed. --dnl # - AC_DEFUN([SPL_AC_4ARGS_VFS_RENAME], -@@ -1815,3 +1883,4 @@ AC_DEFUN([SPL_AC_4ARGS_VFS_RENAME], - ],[ -- vfs_rename(NULL, NULL, NULL, NULL); -+ vfs_rename((struct inode *) NULL, (struct dentry *) NULL, -+ (struct inode *) NULL, (struct dentry *) NULL); - ],[ -@@ -1822,2 +1891,22 @@ AC_DEFUN([SPL_AC_4ARGS_VFS_RENAME], - AC_MSG_RESULT(no) -+ dnl # -+ dnl # Linux 3.13 API change -+ dnl # Added delegated inode -+ dnl # -+ AC_MSG_CHECKING([whether vfs_rename() wants 5 args]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ vfs_rename((struct inode *) NULL, -+ (struct dentry *) NULL, -+ (struct inode *) NULL, -+ (struct dentry *) NULL, -+ (struct inode **) NULL); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_5ARGS_VFS_RENAME, 1, -+ [vfs_rename() wants 5 args]) -+ ],[ -+ AC_MSG_ERROR(no) -+ ]) - ]) -@@ -2402 +2491,23 @@ AC_DEFUN([SPL_AC_2ARGS_VFS_GETATTR], [ - ]) -+ -+dnl # -+dnl # 2.6.36 API compatibility. -+dnl # Added usleep_range timer. -+dnl # usleep_range is a finer precision implementation of msleep -+dnl # designed to be a drop-in replacement for udelay where a precise -+dnl # sleep / busy-wait is unnecessary. 
-+dnl # -+AC_DEFUN([SPL_AC_USLEEP_RANGE], [ -+ AC_MSG_CHECKING([whether usleep_range() is available]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ usleep_range(0, 0); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_USLEEP_RANGE, 1, -+ [usleep_range is available]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -diff --git a/configure.ac b/configure.ac -index 4772298..e81ddfb 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -52,2 +52,3 @@ AC_CONFIG_FILES([ - man/man1/Makefile -+ man/man5/Makefile - lib/Makefile -diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am -index 730f769..59f2ec5 100644 ---- a/include/linux/Makefile.am -+++ b/include/linux/Makefile.am -@@ -5,2 +5,3 @@ KERNEL_H = \ - $(top_srcdir)/include/linux/compiler_compat.h \ -+ $(top_srcdir)/include/linux/delay_compat.h \ - $(top_srcdir)/include/linux/file_compat.h \ -diff --git a/include/linux/delay_compat.h b/include/linux/delay_compat.h -new file mode 100644 -index 0000000..fc9ff66 ---- /dev/null -+++ b/include/linux/delay_compat.h -@@ -0,0 +1,47 @@ -+/*****************************************************************************\ -+ * Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC. -+ * Copyright (C) 2007 The Regents of the University of California. -+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -+ * Written by Brian Behlendorf . -+ * UCRL-CODE-235197 -+ * -+ * This file is part of the SPL, Solaris Porting Layer. -+ * For details, see . -+ * -+ * The SPL is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. -+ * -+ * The SPL is distributed in the hope that it will be useful, but WITHOUT -+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * You should have received a copy of the GNU General Public License along -+ * with the SPL. If not, see . -+\*****************************************************************************/ -+ -+#ifndef _SPL_DELAY_COMPAT_H -+#define _SPL_DELAY_COMPAT_H -+ -+#include -+#include -+ -+/* usleep_range() introduced in 2.6.36 */ -+#ifndef HAVE_USLEEP_RANGE -+ -+static inline void -+usleep_range(unsigned long min, unsigned long max) -+{ -+ unsigned int min_ms = min / USEC_PER_MSEC; -+ -+ if (min >= MAX_UDELAY_MS) -+ msleep(min_ms); -+ else -+ udelay(min); -+} -+ -+#endif /* HAVE_USLEEP_RANGE */ -+ -+#endif /* _SPL_DELAY_COMPAT_H */ -diff --git a/include/linux/mm_compat.h b/include/linux/mm_compat.h -index cb1bef9..37c9b08 100644 ---- a/include/linux/mm_compat.h -+++ b/include/linux/mm_compat.h -@@ -150,64 +150,98 @@ extern shrink_icache_memory_t shrink_icache_memory_fn; - /* -- * Linux 2.6. - 2.6. Shrinker API Compatibility. -+ * Due to frequent changes in the shrinker API the following -+ * compatibility wrappers should be used. They are as follows: -+ * -+ * SPL_SHRINKER_DECLARE is used to declare the shrinker which is -+ * passed to spl_register_shrinker()/spl_unregister_shrinker(). Use -+ * shrinker_name to set the shrinker variable name, shrinker_callback -+ * to set the callback function, and seek_cost to define the cost of -+ * reclaiming an object. 
-+ * -+ * SPL_SHRINKER_DECLARE(shrinker_name, shrinker_callback, seek_cost); -+ * -+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE is used when a forward declaration -+ * of the shrinker callback function is required. Only the callback -+ * function needs to be passed. -+ * -+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE(shrinker_callback); -+ * -+ * SPL_SHRINKER_CALLBACK_WRAPPER is used to declare the callback function -+ * which is registered with the shrinker. This function will call your -+ * custom shrinker which must use the following prototype. Notice the -+ * leading __'s, these must be appended to the callback_function name. -+ * -+ * int __shrinker_callback(struct shrinker *, struct shrink_control *) -+ * SPL_SHRINKER_CALLBACK_WRAPPER(shrinker_callback);a -+ * -+ * -+ * Example: -+ * -+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE(my_shrinker_fn); -+ * SPL_SHRINKER_DECLARE(my_shrinker, my_shrinker_fn, 1); -+ * -+ * static int -+ * __my_shrinker_fn(struct shrinker *shrink, struct shrink_control *sc) -+ * { -+ * if (sc->nr_to_scan) { -+ * ...scan objects in the cache and reclaim them... -+ * } -+ * -+ * ...calculate number of objects in the cache... -+ * -+ * return (number of objects in the cache); -+ * } -+ * SPL_SHRINKER_CALLBACK_WRAPPER(my_shrinker_fn); - */ --#ifdef HAVE_SET_SHRINKER --typedef struct spl_shrinker { -- struct shrinker *shrinker; -- shrinker_t fn; -- int seeks; --} spl_shrinker_t; -- --static inline void --spl_register_shrinker(spl_shrinker_t *ss) --{ -- ss->shrinker = set_shrinker(ss->seeks, ss->fn); --} - --static inline void --spl_unregister_shrinker(spl_shrinker_t *ss) --{ -- remove_shrinker(ss->shrinker); --} -+#define spl_register_shrinker(x) register_shrinker(x) -+#define spl_unregister_shrinker(x) unregister_shrinker(x) - --# define SPL_SHRINKER_DECLARE(s, x, y) \ -- static spl_shrinker_t s = { \ -- .shrinker = NULL, \ -- .fn = x, \ -- .seeks = y \ -- } -- --# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -- static int fn(int, unsigned int) --# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ --static int \ --fn(int nr_to_scan, unsigned int gfp_mask) \ --{ \ -- struct shrink_control sc; \ -- \ -- sc.nr_to_scan = nr_to_scan; \ -- sc.gfp_mask = gfp_mask; \ -- \ -- return __ ## fn(NULL, &sc); \ -+/* -+ * Linux 2.6.23 - 2.6.34 Shrinker API Compatibility. -+ */ -+#if defined(HAVE_2ARGS_OLD_SHRINKER_CALLBACK) -+#define SPL_SHRINKER_DECLARE(s, x, y) \ -+static struct shrinker s = { \ -+ .shrink = x, \ -+ .seeks = y \ - } - --#else -+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -+static int fn(int nr_to_scan, unsigned int gfp_mask) - --# define spl_register_shrinker(x) register_shrinker(x) --# define spl_unregister_shrinker(x) unregister_shrinker(x) --# define SPL_SHRINKER_DECLARE(s, x, y) \ -- static struct shrinker s = { \ -- .shrink = x, \ -- .seeks = y \ -- } -+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -+static int \ -+fn(int nr_to_scan, unsigned int gfp_mask) \ -+{ \ -+ struct shrink_control sc; \ -+ \ -+ sc.nr_to_scan = nr_to_scan; \ -+ sc.gfp_mask = gfp_mask; \ -+ \ -+ return (__ ## fn(NULL, &sc)); \ -+} - - /* -- * Linux 2.6. - 2.6. Shrinker API Compatibility. -+ * Linux 2.6.35 to 2.6.39 Shrinker API Compatibility. 
- */ --# if defined(HAVE_SHRINK_CONTROL_STRUCT) --# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -- static int fn(struct shrinker *, struct shrink_control *) --# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ --static int \ --fn(struct shrinker *shrink, struct shrink_control *sc) { \ -- return __ ## fn(shrink, sc); \ -+#elif defined(HAVE_3ARGS_SHRINKER_CALLBACK) -+#define SPL_SHRINKER_DECLARE(s, x, y) \ -+static struct shrinker s = { \ -+ .shrink = x, \ -+ .seeks = y \ -+} -+ -+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -+static int fn(struct shrinker *, int, unsigned int) -+ -+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -+static int \ -+fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \ -+{ \ -+ struct shrink_control sc; \ -+ \ -+ sc.nr_to_scan = nr_to_scan; \ -+ sc.gfp_mask = gfp_mask; \ -+ \ -+ return (__ ## fn(shrink, &sc)); \ - } -@@ -215,17 +249,19 @@ fn(struct shrinker *shrink, struct shrink_control *sc) { \ - /* -- * Linux 2.6. - 2.6. Shrinker API Compatibility. -+ * Linux 3.0 to 3.11 Shrinker API Compatibility. - */ --# elif defined(HAVE_3ARGS_SHRINKER_CALLBACK) --# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -- static int fn(struct shrinker *, int, unsigned int) --# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ --static int \ --fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \ --{ \ -- struct shrink_control sc; \ -- \ -- sc.nr_to_scan = nr_to_scan; \ -- sc.gfp_mask = gfp_mask; \ -- \ -- return __ ## fn(shrink, &sc); \ -+#elif defined(HAVE_2ARGS_NEW_SHRINKER_CALLBACK) -+#define SPL_SHRINKER_DECLARE(s, x, y) \ -+static struct shrinker s = { \ -+ .shrink = x, \ -+ .seeks = y \ -+} -+ -+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -+static int fn(struct shrinker *, struct shrink_control *) -+ -+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -+static int \ -+fn(struct shrinker *shrink, struct shrink_control *sc) \ -+{ \ -+ return (__ ## fn(shrink, sc)); \ - } -@@ -233,21 +269,45 @@ fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \ - /* -- * Linux 2.6. - 2.6. Shrinker API Compatibility. -+ * Linux 3.12 and later Shrinker API Compatibility. - */ --# else --# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -- static int fn(int, unsigned int) --# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ --static int \ --fn(int nr_to_scan, unsigned int gfp_mask) \ --{ \ -- struct shrink_control sc; \ -- \ -- sc.nr_to_scan = nr_to_scan; \ -- sc.gfp_mask = gfp_mask; \ -- \ -- return __ ## fn(NULL, &sc); \ -+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) -+#define SPL_SHRINKER_DECLARE(s, x, y) \ -+static struct shrinker s = { \ -+ .count_objects = x ## _count_objects, \ -+ .scan_objects = x ## _scan_objects, \ -+ .seeks = y \ - } - --# endif --#endif /* HAVE_SET_SHRINKER */ -+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -+static unsigned long fn ## _count_objects(struct shrinker *, \ -+ struct shrink_control *); \ -+static unsigned long fn ## _scan_objects(struct shrinker *, \ -+ struct shrink_control *) -+ -+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -+static unsigned long \ -+fn ## _count_objects(struct shrinker *shrink, struct shrink_control *sc)\ -+{ \ -+ int __ret__; \ -+ \ -+ sc->nr_to_scan = 0; \ -+ __ret__ = __ ## fn(NULL, sc); \ -+ \ -+ /* Errors may not be returned and must be converted to zeros */ \ -+ return ((__ret__ < 0) ? 
0 : __ret__); \ -+} \ -+ \ -+static unsigned long \ -+fn ## _scan_objects(struct shrinker *shrink, struct shrink_control *sc) \ -+{ \ -+ int __ret__; \ -+ \ -+ __ret__ = __ ## fn(NULL, sc); \ -+ return ((__ret__ < 0) ? SHRINK_STOP : __ret__); \ -+} -+#else -+/* -+ * Linux 2.x to 2.6.22, or a newer shrinker API has been introduced. -+ */ -+#error "Unknown shrinker callback" -+#endif - -diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am -index 0e86a28..9d82636 100644 ---- a/include/sys/Makefile.am -+++ b/include/sys/Makefile.am -@@ -15,2 +15,3 @@ KERNEL_H = \ - $(top_srcdir)/include/sys/callb.h \ -+ $(top_srcdir)/include/sys/callo.h \ - $(top_srcdir)/include/sys/cmn_err.h \ -diff --git a/include/sys/callo.h b/include/sys/callo.h -new file mode 100644 -index 0000000..0d9fbcb ---- /dev/null -+++ b/include/sys/callo.h -@@ -0,0 +1,52 @@ -+/*****************************************************************************\ -+ * Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC. -+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -+ * Written by Brian Behlendorf . -+ * UCRL-CODE-235197 -+ * -+ * This file is part of the SPL, Solaris Porting Layer. -+ * For details, see . -+ * -+ * The SPL is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. -+ * -+ * The SPL is distributed in the hope that it will be useful, but WITHOUT -+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * You should have received a copy of the GNU General Public License along -+ * with the SPL. If not, see . -+\*****************************************************************************/ -+ -+#ifndef _SPL_CALLO_H -+#define _SPL_CALLO_H -+ -+/* -+ * Callout flags: -+ * -+ * CALLOUT_FLAG_ROUNDUP -+ * Roundup the expiration time to the next resolution boundary. -+ * If this flag is not specified, the expiration time is rounded down. -+ * CALLOUT_FLAG_ABSOLUTE -+ * Normally, the expiration passed to the timeout API functions is an -+ * expiration interval. If this flag is specified, then it is -+ * interpreted as the expiration time itself. -+ * CALLOUT_FLAG_HRESTIME -+ * Normally, callouts are not affected by changes to system time -+ * (hrestime). This flag is used to create a callout that is affected -+ * by system time. If system time changes, these timers must be -+ * handled in a special way (see callout.c). These are used by condition -+ * variables and LWP timers that need this behavior. -+ * CALLOUT_FLAG_32BIT -+ * Legacy interfaces timeout() and realtime_timeout() pass this flag -+ * to timeout_generic() to indicate that a 32-bit ID should be allocated. 
-+ */ -+#define CALLOUT_FLAG_ROUNDUP 0x1 -+#define CALLOUT_FLAG_ABSOLUTE 0x2 -+#define CALLOUT_FLAG_HRESTIME 0x4 -+#define CALLOUT_FLAG_32BIT 0x8 -+ -+#endif /* _SPL_CALLB_H */ -diff --git a/include/sys/condvar.h b/include/sys/condvar.h -index c825bd2..c9f2bea 100644 ---- a/include/sys/condvar.h -+++ b/include/sys/condvar.h -@@ -29,4 +29,6 @@ - #include -+#include - #include - #include -+#include - -@@ -58,2 +60,4 @@ extern clock_t __cv_timedwait_interruptible(kcondvar_t *cvp, kmutex_t *mp, - clock_t exp_time); -+extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, -+ hrtime_t tim, hrtime_t res, int flag); - extern void __cv_signal(kcondvar_t *cvp); -diff --git a/include/sys/disp.h b/include/sys/disp.h -index 9614a47..c3077a7 100644 ---- a/include/sys/disp.h -+++ b/include/sys/disp.h -@@ -29,2 +29,3 @@ - -+#define kpreempt(unused) schedule() - #define kpreempt_disable() preempt_disable() -diff --git a/include/sys/isa_defs.h b/include/sys/isa_defs.h -index 35aee61..cc59a3a 100644 ---- a/include/sys/isa_defs.h -+++ b/include/sys/isa_defs.h -@@ -93,3 +93,31 @@ - --#else /* Currently only x86_64, i386, arm, and powerpc arches supported */ -+/* sparc arch specific defines */ -+#elif defined(__sparc) || defined(__sparc__) -+ -+#if !defined(__sparc) -+#define __sparc -+#endif -+ -+#if !defined(__sparc__) -+#define __sparc__ -+#endif -+ -+#define _BIG_ENDIAN -+#define _SUNOS_VTOC_16 -+ -+/* sparc64 arch specific defines */ -+#elif defined(__sparc64) || defined(__sparc64__) -+ -+#if !defined(__sparc64) -+#define __sparc64 -+#endif -+ -+#if !defined(__sparc64__) -+#define __sparc64__ -+#endif -+ -+#define _BIG_ENDIAN -+#define _SUNOS_VTOC_16 -+ -+#else /* Currently x86_64, i386, arm, powerpc, and sparc are supported */ - #error "Unsupported ISA type" -diff --git a/include/sys/kstat.h b/include/sys/kstat.h -index da3c589..faf6b81 100644 ---- a/include/sys/kstat.h -+++ b/include/sys/kstat.h -@@ -35,2 +35,3 @@ - #define KSTAT_STRLEN 31 -+#define KSTAT_RAW_MAX (128*1024) - -@@ -45,4 +46,3 @@ - #define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ --#define KSTAT_TYPE_TXG 5 /* txg sync; ks_ndata >= 1 */ --#define KSTAT_NUM_TYPES 6 -+#define KSTAT_NUM_TYPES 5 - -@@ -81,2 +81,3 @@ - struct kstat_s; -+typedef struct kstat_s kstat_t; - -@@ -92,3 +93,9 @@ typedef struct kstat_module { - --typedef struct kstat_s { -+typedef struct kstat_raw_ops { -+ int (*headers)(char *buf, size_t size); -+ int (*data)(char *buf, size_t size, void *data); -+ void *(*addr)(kstat_t *ksp, loff_t index); -+} kstat_raw_ops_t; -+ -+struct kstat_s { - int ks_magic; /* magic value */ -@@ -109,6 +116,10 @@ typedef struct kstat_s { - void *ks_private; /* private data */ -- kmutex_t ks_lock; /* kstat data lock */ -+ kmutex_t ks_private_lock; /* kstat private data lock */ -+ kmutex_t *ks_lock; /* kstat data lock */ - struct list_head ks_list; /* kstat linkage */ - kstat_module_t *ks_owner; /* kstat module linkage */ --} kstat_t; -+ kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ -+ char *ks_raw_buf; /* buf used for raw ops */ -+ size_t ks_raw_bufsize; /* size of raw ops buffer */ -+}; - -@@ -167,22 +178,2 @@ typedef struct kstat_timer { - --typedef enum kstat_txg_state { -- TXG_STATE_OPEN = 1, -- TXG_STATE_QUIESCING = 2, -- TXG_STATE_SYNCING = 3, -- TXG_STATE_COMMITTED = 4, --} kstat_txg_state_t; -- --typedef struct kstat_txg { -- u_longlong_t txg; /* txg id */ -- kstat_txg_state_t state; /* txg state */ -- hrtime_t birth; /* birth time stamp */ -- u_longlong_t nread; /* number of bytes read */ -- u_longlong_t 
nwritten; /* number of bytes written */ -- uint_t reads; /* number of read operations */ -- uint_t writes; /* number of write operations */ -- hrtime_t open_time; /* open time */ -- hrtime_t quiesce_time;/* quiesce time */ -- hrtime_t sync_time; /* sync time */ --} kstat_txg_t; -- - int spl_kstat_init(void); -@@ -190,2 +181,6 @@ void spl_kstat_fini(void); - -+extern void __kstat_set_raw_ops(kstat_t *ksp, -+ int (*headers)(char *buf, size_t size), -+ int (*data)(char *buf, size_t size, void *data), -+ void* (*addr)(kstat_t *ksp, loff_t index)); - extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, -@@ -196,3 +191,8 @@ extern void __kstat_install(kstat_t *ksp); - extern void __kstat_delete(kstat_t *ksp); -+extern void kstat_waitq_enter(kstat_io_t *); -+extern void kstat_waitq_exit(kstat_io_t *); -+extern void kstat_runq_enter(kstat_io_t *); -+extern void kstat_runq_exit(kstat_io_t *); - -+#define kstat_set_raw_ops(k,h,d,a) __kstat_set_raw_ops(k,h,d,a) - #define kstat_create(m,i,n,c,t,s,f) __kstat_create(m,i,n,c,t,s,f) -diff --git a/include/sys/sdt.h b/include/sys/sdt.h -index 6c8395f..287bfaa 100644 ---- a/include/sys/sdt.h -+++ b/include/sys/sdt.h -@@ -27,2 +27,4 @@ - -+#define SET_ERROR(x) (x) -+ - #endif /* SPL_SDT_H */ -diff --git a/include/sys/thread.h b/include/sys/thread.h -index 369b306..433a076 100644 ---- a/include/sys/thread.h -+++ b/include/sys/thread.h -@@ -53,2 +53,4 @@ typedef void (*thread_func_t)(void *); - #define curthread current -+#define getcomm() current->comm -+#define getpid() current->pid - -@@ -59,2 +61,4 @@ extern kthread_t *__thread_create(caddr_t stk, size_t stksize, - extern void __thread_exit(void); -+extern struct task_struct *spl_kthread_create(int (*func)(void *), -+ void *data, const char namefmt[], ...); - -diff --git a/include/sys/time.h b/include/sys/time.h -index f8d78d1..d8e81c9 100644 ---- a/include/sys/time.h -+++ b/include/sys/time.h -@@ -49,2 +49,5 @@ - -+#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) -+#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) -+ - /* Already defined in include/linux/time.h */ -diff --git a/include/sys/vmsystm.h b/include/sys/vmsystm.h -index 9097491..adff774 100644 ---- a/include/sys/vmsystm.h -+++ b/include/sys/vmsystm.h -@@ -31,2 +31,3 @@ - #include -+#include - #include -diff --git a/man/Makefile.am b/man/Makefile.am -index 7dc2a57..7791945 100644 ---- a/man/Makefile.am -+++ b/man/Makefile.am -@@ -1 +1 @@ --SUBDIRS = man1 -+SUBDIRS = man1 man5 -diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am -index c91f638..d6becca 100644 ---- a/man/man1/Makefile.am -+++ b/man/man1/Makefile.am -@@ -1,3 +1,2 @@ --man_MANS = splat.1 --EXTRA_DIST = $(man_MANS) -+dist_man_MANS = splat.1 - -diff --git a/man/man5/Makefile.am b/man/man5/Makefile.am -new file mode 100644 -index 0000000..fb22beb ---- /dev/null -+++ b/man/man5/Makefile.am -@@ -0,0 +1,4 @@ -+dist_man_MANS = spl-module-parameters.5 -+ -+install-data-local: -+ $(INSTALL) -d -m 0755 "$(DESTDIR)$(mandir)/man5" -diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 -new file mode 100644 -index 0000000..3c134f7 ---- /dev/null -+++ b/man/man5/spl-module-parameters.5 -@@ -0,0 +1,126 @@ -+'\" te -+.\" -+.\" Copyright 2013 Turbo Fredriksson . All rights reserved. -+.\" -+.TH SPL-MODULE-PARAMETERS 5 "Nov 18, 2013" -+.SH NAME -+spl\-module\-parameters \- SPL module parameters -+.SH DESCRIPTION -+.sp -+.LP -+Description of the different parameters to the SPL module. 
-+ -+.SS "Module parameters" -+.sp -+.LP -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_subsys\fR (ulong) -+.ad -+.RS 12n -+Subsystem debugging level mask. -+.sp -+Default value: \fB~0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_mask\fR (ulong) -+.ad -+.RS 12n -+Debugging level mask. -+.sp -+Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_printk\fR (ulong) -+.ad -+.RS 12n -+Console printk level mask. -+.sp -+Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_mb\fR (int) -+.ad -+.RS 12n -+Total debug buffer size. -+.sp -+Default value: \fB-1\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_panic_on_bug\fR (int) -+.ad -+.RS 12n -+Panic on BUG -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_kmem_cache_expire\fR (uint) -+.ad -+.RS 12n -+By age (0x1) or low memory (0x2) -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_hostid\fR (ulong) -+.ad -+.RS 12n -+The system hostid. -+.sp -+Default value: \fB0xFFFFFFFF\fR (an invalid hostid!) -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_hostid_path\fR (charp) -+.ad -+.RS 12n -+The system hostid file -+.sp -+Default value: \fB/etc/hostid\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBmutex_spin_max\fR (int) -+.ad -+.RS 12n -+Spin a maximum of N times to acquire lock -+.sp -+.ne 2 -+.na -+\fBPossible values:\fR -+.sp -+.RS 12n -+ \fB0\fR Never spin when trying to acquire lock -+.sp -+\fB-1\fR Spin until acquired or holder yields without dropping lock -+.sp -+\fB1-MAX_INT\fR Spin for N attempts before sleeping for lock -+.RE -+.sp -+.ne -4 -+Default value: \fB0\fR. -diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c -index 60cf726..8236412 100644 ---- a/module/spl/spl-condvar.c -+++ b/module/spl/spl-condvar.c -@@ -38,4 +38,2 @@ __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) - { -- int flags = KM_SLEEP; -- - SENTRY; -@@ -53,8 +51,2 @@ __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) - -- /* We may be called when there is a non-zero preempt_count or -- * interrupts are disabled is which case we must not sleep. -- */ -- if (current_thread_info()->preempt_count || irqs_disabled()) -- flags = KM_NOSLEEP; -- - SEXIT; -@@ -228,2 +220,83 @@ EXPORT_SYMBOL(__cv_timedwait_interruptible); - -+/* -+ *'expire_time' argument is an absolute clock time in nanoseconds. -+ * Return value is time left (expire_time - now) or -1 if timeout occurred. -+ */ -+static clock_t -+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, -+ hrtime_t expire_time, int state) -+{ -+ DEFINE_WAIT(wait); -+ hrtime_t time_left, now; -+ unsigned long time_left_us; -+ SENTRY; -+ -+ ASSERT(cvp); -+ ASSERT(mp); -+ ASSERT(cvp->cv_magic == CV_MAGIC); -+ ASSERT(mutex_owned(mp)); -+ atomic_inc(&cvp->cv_refs); -+ -+ if (cvp->cv_mutex == NULL) -+ cvp->cv_mutex = mp; -+ -+ /* Ensure the same mutex is used by all callers */ -+ ASSERT(cvp->cv_mutex == mp); -+ -+ now = gethrtime(); -+ time_left = expire_time - now; -+ if (time_left <= 0) { -+ atomic_dec(&cvp->cv_refs); -+ SRETURN(-1); -+ } -+ time_left_us = time_left / NSEC_PER_USEC; -+ -+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); -+ atomic_inc(&cvp->cv_waiters); -+ -+ /* Mutex should be dropped after prepare_to_wait() this -+ * ensures we're linked in to the waiters list and avoids the -+ * race where 'cvp->cv_waiters > 0' but the list is empty. 
*/ -+ mutex_exit(mp); -+ /* Allow a 100 us range to give kernel an opportunity to coalesce -+ * interrupts */ -+ usleep_range(time_left_us, time_left_us + 100); -+ mutex_enter(mp); -+ -+ /* No more waiters a different mutex could be used */ -+ if (atomic_dec_and_test(&cvp->cv_waiters)) { -+ cvp->cv_mutex = NULL; -+ wake_up(&cvp->cv_destroy); -+ } -+ -+ finish_wait(&cvp->cv_event, &wait); -+ atomic_dec(&cvp->cv_refs); -+ -+ time_left = expire_time - gethrtime(); -+ SRETURN(time_left > 0 ? time_left : -1); -+} -+ -+/* -+ * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. -+ */ -+clock_t -+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, -+ hrtime_t res, int flag) -+{ -+ if (res > 1) { -+ /* -+ * Align expiration to the specified resolution. -+ */ -+ if (flag & CALLOUT_FLAG_ROUNDUP) -+ tim += res - 1; -+ tim = (tim / res) * res; -+ } -+ -+ if (!(flag & CALLOUT_FLAG_ABSOLUTE)) -+ tim += gethrtime(); -+ -+ return __cv_timedwait_hires(cvp, mp, tim, TASK_UNINTERRUPTIBLE); -+} -+EXPORT_SYMBOL(cv_timedwait_hires); -+ - void -diff --git a/module/spl/spl-cred.c b/module/spl/spl-cred.c -index 0ed6572..602bd74 100644 ---- a/module/spl/spl-cred.c -+++ b/module/spl/spl-cred.c -@@ -46,3 +46,4 @@ cr_groups_search(const struct group_info *group_info, gid_t grp) - { -- unsigned int left, right; -+ unsigned int left, right, mid; -+ int cmp; - -@@ -54,4 +55,6 @@ cr_groups_search(const struct group_info *group_info, gid_t grp) - while (left < right) { -- unsigned int mid = (left+right)/2; -- int cmp = KGID_TO_SGID(grp) - KGID_TO_SGID(GROUP_AT(group_info, mid)); -+ mid = (left + right) / 2; -+ cmp = KGID_TO_SGID(grp) - -+ KGID_TO_SGID(GROUP_AT(group_info, mid)); -+ - if (cmp > 0) -@@ -122,3 +125,3 @@ crgetgroups(const cred_t *cr) - --/* Check if the passed gid is available is in supplied credential. */ -+/* Check if the passed gid is available in supplied credential. 
*/ - int -@@ -130,3 +133,3 @@ groupmember(gid_t gid, const cred_t *cr) - gi = get_group_info(cr->group_info); -- rc = cr_groups_search(cr->group_info, SGID_TO_KGID(gid)); -+ rc = cr_groups_search(gi, SGID_TO_KGID(gid)); - put_group_info(gi); -diff --git a/module/spl/spl-debug.c b/module/spl/spl-debug.c -index d450368..93c3f31 100644 ---- a/module/spl/spl-debug.c -+++ b/module/spl/spl-debug.c -@@ -40,2 +40,3 @@ - #include -+#include - #include -@@ -417,3 +418,3 @@ spl_debug_dumplog(int flags) - -- tsk = kthread_create(spl_debug_dumplog_thread,(void *)&dp,"spl_debug"); -+ tsk = spl_kthread_create(spl_debug_dumplog_thread,(void *)&dp,"spl_debug"); - if (tsk == NULL) -diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c -index 3cef489..351f536 100644 ---- a/module/spl/spl-generic.c -+++ b/module/spl/spl-generic.c -@@ -761 +761,2 @@ MODULE_DESCRIPTION("Solaris Porting Layer"); - MODULE_LICENSE("GPL"); -+MODULE_VERSION(SPL_META_VERSION "-" SPL_META_RELEASE); -diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c -index a2dcea0..23e4780 100644 ---- a/module/spl/spl-kmem.c -+++ b/module/spl/spl-kmem.c -@@ -1997,3 +1997,2 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) - spl_kmem_magazine_t *skm; -- unsigned long irq_flags; - void *obj = NULL; -@@ -2005,3 +2004,3 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) - atomic_inc(&skc->skc_ref); -- local_irq_save(irq_flags); -+ local_irq_disable(); - -@@ -2027,3 +2026,3 @@ restart: - -- local_irq_restore(irq_flags); -+ local_irq_enable(); - ASSERT(obj); -diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c -index 4e900c0..c604a32 100644 ---- a/module/spl/spl-kstat.c -+++ b/module/spl/spl-kstat.c -@@ -43,3 +43,82 @@ static kid_t kstat_id; - --static void -+static int -+kstat_resize_raw(kstat_t *ksp) -+{ -+ if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) -+ return ENOMEM; -+ -+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); -+ ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); -+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); -+ -+ return 0; -+} -+ -+void -+kstat_waitq_enter(kstat_io_t *kiop) -+{ -+ hrtime_t new, delta; -+ ulong_t wcnt; -+ -+ new = gethrtime(); -+ delta = new - kiop->wlastupdate; -+ kiop->wlastupdate = new; -+ wcnt = kiop->wcnt++; -+ if (wcnt != 0) { -+ kiop->wlentime += delta * wcnt; -+ kiop->wtime += delta; -+ } -+} -+EXPORT_SYMBOL(kstat_waitq_enter); -+ -+void -+kstat_waitq_exit(kstat_io_t *kiop) -+{ -+ hrtime_t new, delta; -+ ulong_t wcnt; -+ -+ new = gethrtime(); -+ delta = new - kiop->wlastupdate; -+ kiop->wlastupdate = new; -+ wcnt = kiop->wcnt--; -+ ASSERT((int)wcnt > 0); -+ kiop->wlentime += delta * wcnt; -+ kiop->wtime += delta; -+} -+EXPORT_SYMBOL(kstat_waitq_exit); -+ -+void -+kstat_runq_enter(kstat_io_t *kiop) -+{ -+ hrtime_t new, delta; -+ ulong_t rcnt; -+ -+ new = gethrtime(); -+ delta = new - kiop->rlastupdate; -+ kiop->rlastupdate = new; -+ rcnt = kiop->rcnt++; -+ if (rcnt != 0) { -+ kiop->rlentime += delta * rcnt; -+ kiop->rtime += delta; -+ } -+} -+EXPORT_SYMBOL(kstat_runq_enter); -+ -+void -+kstat_runq_exit(kstat_io_t *kiop) -+{ -+ hrtime_t new, delta; -+ ulong_t rcnt; -+ -+ new = gethrtime(); -+ delta = new - kiop->rlastupdate; -+ kiop->rlastupdate = new; -+ rcnt = kiop->rcnt--; -+ ASSERT((int)rcnt > 0); -+ kiop->rlentime += delta * rcnt; -+ kiop->rtime += delta; -+} -+EXPORT_SYMBOL(kstat_runq_exit); -+ -+static int - kstat_seq_show_headers(struct seq_file *f) -@@ -47,2 +126,4 @@ kstat_seq_show_headers(struct seq_file *f) - kstat_t *ksp = (kstat_t 
*)f->private; -+ int rc = 0; -+ - ASSERT(ksp->ks_magic == KS_MAGIC); -@@ -56,3 +137,13 @@ kstat_seq_show_headers(struct seq_file *f) - case KSTAT_TYPE_RAW: -- seq_printf(f, "raw data"); -+restart: -+ if (ksp->ks_raw_ops.headers) { -+ rc = ksp->ks_raw_ops.headers( -+ ksp->ks_raw_buf, ksp->ks_raw_bufsize); -+ if (rc == ENOMEM && !kstat_resize_raw(ksp)) -+ goto restart; -+ if (!rc) -+ seq_puts(f, ksp->ks_raw_buf); -+ } else { -+ seq_printf(f, "raw data\n"); -+ } - break; -@@ -83,10 +174,2 @@ kstat_seq_show_headers(struct seq_file *f) - break; -- case KSTAT_TYPE_TXG: -- seq_printf(f, -- "%-8s %-5s %-13s %-12s %-12s %-8s %-8s " -- "%-12s %-12s %-12s\n", -- "txg", "state", "birth", -- "nread", "nwritten", "reads", "writes", -- "otime", "qtime", "stime"); -- break; - default: -@@ -94,2 +177,4 @@ kstat_seq_show_headers(struct seq_file *f) - } -+ -+ return -rc; - } -@@ -204,23 +289,2 @@ kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) - static int --kstat_seq_show_txg(struct seq_file *f, kstat_txg_t *ktp) --{ -- char state; -- -- switch (ktp->state) { -- case TXG_STATE_OPEN: state = 'O'; break; -- case TXG_STATE_QUIESCING: state = 'Q'; break; -- case TXG_STATE_SYNCING: state = 'S'; break; -- case TXG_STATE_COMMITTED: state = 'C'; break; -- default: state = '?'; break; -- } -- -- seq_printf(f, -- "%-8llu %-5c %-13llu %-12llu %-12llu %-8u %-8u " -- "%12lld %12lld %12lld\n", ktp->txg, state, ktp->birth, -- ktp->nread, ktp->nwritten, ktp->reads, ktp->writes, -- ktp->open_time, ktp->quiesce_time, ktp->sync_time); -- return 0; --} -- --static int - kstat_seq_show(struct seq_file *f, void *p) -@@ -234,5 +298,15 @@ kstat_seq_show(struct seq_file *f, void *p) - case KSTAT_TYPE_RAW: -- ASSERT(ksp->ks_ndata == 1); -- rc = kstat_seq_show_raw(f, ksp->ks_data, -- ksp->ks_data_size); -+restart: -+ if (ksp->ks_raw_ops.data) { -+ rc = ksp->ks_raw_ops.data( -+ ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); -+ if (rc == ENOMEM && !kstat_resize_raw(ksp)) -+ goto restart; -+ if (!rc) -+ seq_puts(f, ksp->ks_raw_buf); -+ } else { -+ ASSERT(ksp->ks_ndata == 1); -+ rc = kstat_seq_show_raw(f, ksp->ks_data, -+ ksp->ks_data_size); -+ } - break; -@@ -250,5 +324,2 @@ kstat_seq_show(struct seq_file *f, void *p) - break; -- case KSTAT_TYPE_TXG: -- rc = kstat_seq_show_txg(f, (kstat_txg_t *)p); -- break; - default: -@@ -257,3 +328,3 @@ kstat_seq_show(struct seq_file *f, void *p) - -- return rc; -+ return -rc; - } -@@ -264,2 +335,6 @@ kstat_default_update(kstat_t *ksp, int rw) - ASSERT(ksp != NULL); -+ -+ if (rw == KSTAT_WRITE) -+ return (EACCES); -+ - return 0; -@@ -275,3 +350,6 @@ kstat_seq_data_addr(kstat_t *ksp, loff_t n) - case KSTAT_TYPE_RAW: -- rc = ksp->ks_data; -+ if (ksp->ks_raw_ops.addr) -+ rc = ksp->ks_raw_ops.addr(ksp, n); -+ else -+ rc = ksp->ks_data; - break; -@@ -289,5 +367,2 @@ kstat_seq_data_addr(kstat_t *ksp, loff_t n) - break; -- case KSTAT_TYPE_TXG: -- rc = ksp->ks_data + n * sizeof(kstat_txg_t); -- break; - default: -@@ -307,3 +382,8 @@ kstat_seq_start(struct seq_file *f, loff_t *pos) - -- mutex_enter(&ksp->ks_lock); -+ mutex_enter(ksp->ks_lock); -+ -+ if (ksp->ks_type == KSTAT_TYPE_RAW) { -+ ksp->ks_raw_bufsize = PAGE_SIZE; -+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); -+ } - -@@ -314,4 +394,4 @@ kstat_seq_start(struct seq_file *f, loff_t *pos) - -- if (!n) -- kstat_seq_show_headers(f); -+ if (!n && kstat_seq_show_headers(f)) -+ SRETURN(NULL); - -@@ -340,6 +420,9 @@ kstat_seq_stop(struct seq_file *f, void *v) - { -- kstat_t *ksp = (kstat_t *)f->private; -- ASSERT(ksp->ks_magic == 
KS_MAGIC); -+ kstat_t *ksp = (kstat_t *)f->private; -+ ASSERT(ksp->ks_magic == KS_MAGIC); -+ -+ if (ksp->ks_type == KSTAT_TYPE_RAW) -+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); - -- mutex_exit(&ksp->ks_lock); -+ mutex_exit(ksp->ks_lock); - } -@@ -410,9 +493,43 @@ proc_kstat_open(struct inode *inode, struct file *filp) - -+static ssize_t -+proc_kstat_write(struct file *filp, const char __user *buf, -+ size_t len, loff_t *ppos) -+{ -+ struct seq_file *f = filp->private_data; -+ kstat_t *ksp = f->private; -+ int rc; -+ -+ ASSERT(ksp->ks_magic == KS_MAGIC); -+ -+ mutex_enter(ksp->ks_lock); -+ rc = ksp->ks_update(ksp, KSTAT_WRITE); -+ mutex_exit(ksp->ks_lock); -+ -+ if (rc) -+ return (-rc); -+ -+ *ppos += len; -+ return (len); -+} -+ - static struct file_operations proc_kstat_operations = { -- .open = proc_kstat_open, -- .read = seq_read, -- .llseek = seq_lseek, -- .release = seq_release, -+ .open = proc_kstat_open, -+ .write = proc_kstat_write, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, - }; - -+void -+__kstat_set_raw_ops(kstat_t *ksp, -+ int (*headers)(char *buf, size_t size), -+ int (*data)(char *buf, size_t size, void *data), -+ void *(*addr)(kstat_t *ksp, loff_t index)) -+{ -+ ksp->ks_raw_ops.headers = headers; -+ ksp->ks_raw_ops.data = data; -+ ksp->ks_raw_ops.addr = addr; -+} -+EXPORT_SYMBOL(__kstat_set_raw_ops); -+ - kstat_t * -@@ -442,3 +559,4 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, - ksp->ks_magic = KS_MAGIC; -- mutex_init(&ksp->ks_lock, NULL, MUTEX_DEFAULT, NULL); -+ mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); -+ ksp->ks_lock = &ksp->ks_private_lock; - INIT_LIST_HEAD(&ksp->ks_list); -@@ -455,2 +573,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, - ksp->ks_private = NULL; -+ ksp->ks_raw_ops.headers = NULL; -+ ksp->ks_raw_ops.data = NULL; -+ ksp->ks_raw_ops.addr = NULL; -+ ksp->ks_raw_buf = NULL; -+ ksp->ks_raw_bufsize = 0; - -@@ -477,6 +600,2 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, - break; -- case KSTAT_TYPE_TXG: -- ksp->ks_ndata = ks_ndata; -- ksp->ks_data_size = ks_ndata * sizeof(kstat_timer_t); -- break; - default: -@@ -488,3 +607,3 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, - } else { -- ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_SLEEP); -+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); - if (ksp->ks_data == NULL) { -@@ -526,5 +645,5 @@ __kstat_install(kstat_t *ksp) - -- mutex_enter(&ksp->ks_lock); -+ mutex_enter(ksp->ks_lock); - ksp->ks_owner = module; -- ksp->ks_proc = proc_create_data(ksp->ks_name, 0444, -+ ksp->ks_proc = proc_create_data(ksp->ks_name, 0644, - module->ksm_proc, &proc_kstat_operations, (void *)ksp); -@@ -535,3 +654,3 @@ __kstat_install(kstat_t *ksp) - } -- mutex_exit(&ksp->ks_lock); -+ mutex_exit(ksp->ks_lock); - out: -@@ -561,3 +680,4 @@ __kstat_delete(kstat_t *ksp) - -- mutex_destroy(&ksp->ks_lock); -+ ksp->ks_lock = NULL; -+ mutex_destroy(&ksp->ks_private_lock); - kmem_free(ksp, sizeof(*ksp)); -diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c -index bcdc98f..48feb1d 100644 ---- a/module/spl/spl-taskq.c -+++ b/module/spl/spl-taskq.c -@@ -841,3 +841,3 @@ taskq_create(const char *name, int nthreads, pri_t pri, - -- tqt->tqt_thread = kthread_create(taskq_thread, tqt, -+ tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, - "%s/%d", name, i); -@@ -845,3 +845,2 @@ taskq_create(const char *name, int nthreads, pri_t pri, - 
list_add(&tqt->tqt_thread_list, &tq->tq_thread_list); -- kthread_bind(tqt->tqt_thread, i % num_online_cpus()); - set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(pri)); -diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c -index 6b3bec5..5c85140 100644 ---- a/module/spl/spl-thread.c -+++ b/module/spl/spl-thread.c -@@ -128,3 +128,3 @@ __thread_create(caddr_t stk, size_t stksize, thread_func_t func, - -- tsk = kthread_create(thread_generic_wrapper, (void *)tp, -+ tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp, - "%s", tp->tp_name); -@@ -139 +139,32 @@ __thread_create(caddr_t stk, size_t stksize, thread_func_t func, - EXPORT_SYMBOL(__thread_create); -+ -+/* -+ * spl_kthread_create - Wrapper providing pre-3.13 semantics for -+ * kthread_create() in which it is not killable and less likely -+ * to return -ENOMEM. -+ */ -+struct task_struct * -+spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) -+{ -+ struct task_struct *tsk; -+ va_list args; -+ char name[TASK_COMM_LEN]; -+ -+ va_start(args, namefmt); -+ vsnprintf(name, sizeof(name), namefmt, args); -+ va_end(args); -+ do { -+ tsk = kthread_create(func, data, "%s", name); -+ if (IS_ERR(tsk)) { -+ if (signal_pending(current)) { -+ clear_thread_flag(TIF_SIGPENDING); -+ continue; -+ } -+ if (PTR_ERR(tsk) == -ENOMEM) -+ continue; -+ return (NULL); -+ } else -+ return (tsk); -+ } while (1); -+} -+EXPORT_SYMBOL(spl_kthread_create); -diff --git a/module/spl/spl-time.c b/module/spl/spl-time.c -index 20fd0e3..0ed49cc 100644 ---- a/module/spl/spl-time.c -+++ b/module/spl/spl-time.c -@@ -42,3 +42,5 @@ __gethrestime(timestruc_t *ts) - { -- struct timespec tspec = current_kernel_time(); -+ struct timespec tspec; -+ -+ getnstimeofday(&tspec); - -diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c -index 0784ff2..5496067 100644 ---- a/module/spl/spl-vnode.c -+++ b/module/spl/spl-vnode.c -@@ -336,3 +336,7 @@ vn_remove(const char *path, uio_seg_t seg, int flags) - -+#ifdef HAVE_2ARGS_VFS_UNLINK - rc = vfs_unlink(parent.dentry->d_inode, dentry); -+#else -+ rc = vfs_unlink(parent.dentry->d_inode, dentry, NULL); -+#endif /* HAVE_2ARGS_VFS_UNLINK */ - exit1: -@@ -414,6 +418,6 @@ vn_rename(const char *oldname, const char *newname, int x1) - rc = vfs_rename(old_dir->d_inode, old_dentry, -- new_dir->d_inode, new_dentry); -+ new_dir->d_inode, new_dentry); - #else -- rc = vfs_rename(old_dir->d_inode, old_dentry, oldnd.nd_mnt, -- new_dir->d_inode, new_dentry, newnd.nd_mnt); -+ rc = vfs_rename(old_dir->d_inode, old_dentry, -+ new_dir->d_inode, new_dentry, NULL); - #endif /* HAVE_4ARGS_VFS_RENAME */ -@@ -480,5 +484,5 @@ vn_remove(const char *path, uio_seg_t seg, int flags) - #ifdef HAVE_2ARGS_VFS_UNLINK -- rc = vfs_unlink(nd.nd_dentry->d_inode, dentry); -+ rc = vfs_unlink(nd.nd_dentry->d_inode, dentry); - #else -- rc = vfs_unlink(nd.nd_dentry->d_inode, dentry, nd.nd_mnt); -+ rc = vfs_unlink(nd.nd_dentry->d_inode, dentry, NULL); - #endif /* HAVE_2ARGS_VFS_UNLINK */ -@@ -573,7 +577,7 @@ vn_rename(const char *oldname, const char *newname, int x1) - #ifdef HAVE_4ARGS_VFS_RENAME -- rc = vfs_rename(old_dir->d_inode, old_dentry, -- new_dir->d_inode, new_dentry); -+ rc = vfs_rename(old_dir->d_inode, old_dentry, -+ new_dir->d_inode, new_dentry); - #else -- rc = vfs_rename(old_dir->d_inode, old_dentry, oldnd.nd_mnt, -- new_dir->d_inode, new_dentry, newnd.nd_mnt); -+ rc = vfs_rename(old_dir->d_inode, old_dentry, -+ new_dir->d_inode, new_dentry, NULL); - #endif /* HAVE_4ARGS_VFS_RENAME */ -diff --git a/module/splat/splat-condvar.c 
b/module/splat/splat-condvar.c -index 1ddde39..3ee2ffc 100644 ---- a/module/splat/splat-condvar.c -+++ b/module/splat/splat-condvar.c -@@ -110,3 +110,3 @@ splat_condvar_test1(struct file *file, void *arg) - ct[i].ct_rc = 0; -- ct[i].ct_thread = kthread_create(splat_condvar_test12_thread, -+ ct[i].ct_thread = spl_kthread_create(splat_condvar_test12_thread, - &ct[i], "%s/%d", SPLAT_CONDVAR_TEST_NAME, i); -@@ -175,3 +175,3 @@ splat_condvar_test2(struct file *file, void *arg) - ct[i].ct_rc = 0; -- ct[i].ct_thread = kthread_create(splat_condvar_test12_thread, -+ ct[i].ct_thread = spl_kthread_create(splat_condvar_test12_thread, - &ct[i], "%s/%d", SPLAT_CONDVAR_TEST_NAME, i); -@@ -256,3 +256,3 @@ splat_condvar_test3(struct file *file, void *arg) - ct[i].ct_rc = 0; -- ct[i].ct_thread = kthread_create(splat_condvar_test34_thread, -+ ct[i].ct_thread = spl_kthread_create(splat_condvar_test34_thread, - &ct[i], "%s/%d", SPLAT_CONDVAR_TEST_NAME, i); -@@ -326,3 +326,3 @@ splat_condvar_test4(struct file *file, void *arg) - ct[i].ct_rc = 0; -- ct[i].ct_thread = kthread_create(splat_condvar_test34_thread, -+ ct[i].ct_thread = spl_kthread_create(splat_condvar_test34_thread, - &ct[i], "%s/%d", SPLAT_CONDVAR_TEST_NAME, i); -diff --git a/module/splat/splat-cred.c b/module/splat/splat-cred.c -index 47dfa02..fadf9bc 100644 ---- a/module/splat/splat-cred.c -+++ b/module/splat/splat-cred.c -@@ -27,2 +27,3 @@ - #include -+#include - #include "splat-internal.h" -@@ -168,8 +169,4 @@ splat_cred_test2(struct file *file, void *arg) - /* -- * On most/all systems it can be expected that a task with root -- * permissions also is a member of the root group, Since the -- * test suite is always run as root we check first that CRED() is -- * a member of the root group, and secondly that it is not a member -- * of our fake group. This test will break is someone happens to -- * create group number NGROUPS_MAX-1 and then added root to it. -+ * Verify the groupmember() works correctly by constructing an interesting -+ * CRED() and checking that the expected gids are part of it. - */ -@@ -178,28 +175,79 @@ splat_cred_test3(struct file *file, void *arg) - { -- gid_t root_gid, fake_gid; -- int rc; -+ gid_t known_gid, missing_gid, tmp_gid; -+ unsigned char rnd; -+ struct group_info *gi; -+ int i, rc; -+ -+ get_random_bytes((void *)&rnd, 1); -+ known_gid = (rnd > 0) ? rnd : 1; -+ missing_gid = 0; -+ -+ /* -+ * Create an interesting known set of gids for test purposes. The -+ * gids are pseudo randomly selected are will be in the range of -+ * 1:(NGROUPS_MAX-1). Gid 0 is explicitly avoided so we can reliably -+ * test for its absence in the test cases. -+ */ -+ gi = groups_alloc(NGROUPS_SMALL); -+ if (gi == NULL) { -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Failed create " -+ "group_info for known gids: %d\n", -ENOMEM); -+ rc = -ENOMEM; -+ goto show_groups; -+ } -+ -+ for (i = 0, tmp_gid = known_gid; i < NGROUPS_SMALL; i++) { -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Adding gid %d " -+ "to current CRED() (%d/%d)\n", tmp_gid, i, gi->ngroups); -+#ifdef HAVE_KUIDGID_T -+ GROUP_AT(gi, i) = make_kgid(current_user_ns(), tmp_gid); -+#else -+ GROUP_AT(gi, i) = tmp_gid; -+#endif /* HAVE_KUIDGID_T */ -+ tmp_gid = ((tmp_gid * 17) % (NGROUPS_MAX - 1)) + 1; -+ } - -- root_gid = 0; -- fake_gid = NGROUPS_MAX-1; -+ /* Set the new groups in the CRED() and release our reference. 
*/ -+ rc = set_current_groups(gi); -+ put_group_info(gi); - -- rc = groupmember(root_gid, CRED()); -+ if (rc) { -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Failed to add " -+ "gid %d to current group: %d\n", known_gid, rc); -+ goto show_groups; -+ } -+ -+ /* Verify groupmember() finds the known_gid in the CRED() */ -+ rc = groupmember(known_gid, CRED()); - if (!rc) { -- splat_vprint(file, SPLAT_CRED_TEST3_NAME, -- "Failed root git %d expected to be member " -- "of CRED() groups: %d\n", root_gid, rc); -- return -EIDRM; -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Failed to find " -+ "known gid %d in CRED()'s groups.\n", known_gid); -+ rc = -EIDRM; -+ goto show_groups; - } - -- rc = groupmember(fake_gid, CRED()); -+ /* Verify groupmember() does NOT finds the missing gid in the CRED() */ -+ rc = groupmember(missing_gid, CRED()); - if (rc) { -- splat_vprint(file, SPLAT_CRED_TEST3_NAME, -- "Failed fake git %d expected not to be member " -- "of CRED() groups: %d\n", fake_gid, rc); -- return -EIDRM; -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Failed missing " -+ "gid %d was found in CRED()'s groups.\n", missing_gid); -+ rc = -EIDRM; -+ goto show_groups; -+ } -+ -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Success groupmember() " -+ "correctly detects expected gids in CRED(): %d\n", rc); -+ -+show_groups: -+ if (rc) { -+ int i, grps = crgetngroups(CRED()); -+ -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "%d groups: ", grps); -+ for (i = 0; i < grps; i++) -+ splat_print(file, "%d ", crgetgroups(CRED())[i]); -+ splat_print(file, "%s", "\n"); - } - -- splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Success root gid " -- "is a member of the expected groups: %d\n", rc); - -- return rc; -+ return (rc); - } /* splat_cred_test3() */ -diff --git a/module/splat/splat-ctl.c b/module/splat/splat-ctl.c -index 54b2ff4..6bbe0ab 100644 ---- a/module/splat/splat-ctl.c -+++ b/module/splat/splat-ctl.c -@@ -723 +723,2 @@ MODULE_DESCRIPTION("Solaris Porting LAyer Tests"); - MODULE_LICENSE("GPL"); -+MODULE_VERSION(SPL_META_VERSION "-" SPL_META_RELEASE); -diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c -index c7f36ca..25a52b4 100644 ---- a/module/splat/splat-kmem.c -+++ b/module/splat/splat-kmem.c -@@ -246,3 +246,3 @@ splat_kmem_test4(struct file *file, void *arg) - #define SPLAT_KMEM_OBJ_COUNT 1024 --#define SPLAT_KMEM_OBJ_RECLAIM 1000 /* objects */ -+#define SPLAT_KMEM_OBJ_RECLAIM 32 /* objects */ - #define SPLAT_KMEM_THREADS 32 -@@ -684,3 +684,3 @@ splat_kmem_cache_thread_test(struct file *file, void *arg, char *name, - -- start = current_kernel_time(); -+ getnstimeofday(&start); - -@@ -709,3 +709,3 @@ splat_kmem_cache_thread_test(struct file *file, void *arg, char *name, - -- stop = current_kernel_time(); -+ getnstimeofday(&stop); - delta = timespec_sub(stop, start); -@@ -897,3 +897,4 @@ splat_kmem_test8(struct file *file, void *arg) - -- for (i = 0; i < 60; i++) { -+ /* Force reclaim every 1/10 a second for 60 seconds. 
*/ -+ for (i = 0; i < 600; i++) { - kmem_cache_reap_now(kcp->kcp_cache); -@@ -905,3 +906,3 @@ splat_kmem_test8(struct file *file, void *arg) - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(HZ); -+ schedule_timeout(HZ / 10); - } -@@ -1205,3 +1206,3 @@ splat_kmem_test13(struct file *file, void *arg) - struct list_head list; -- struct timespec start, delta = { 0, 0 }; -+ struct timespec start, stop, delta = { 0, 0 }; - int size, count, slabs, fails = 0; -@@ -1252,3 +1253,3 @@ splat_kmem_test13(struct file *file, void *arg) - INIT_LIST_HEAD(&list); -- start = current_kernel_time(); -+ getnstimeofday(&start); - -@@ -1261,3 +1262,4 @@ splat_kmem_test13(struct file *file, void *arg) - -- delta = timespec_sub(current_kernel_time(), start); -+ getnstimeofday(&stop); -+ delta = timespec_sub(stop, start); - if (delta.tv_sec >= max_time) { -@@ -1272,3 +1274,3 @@ splat_kmem_test13(struct file *file, void *arg) - -- dp = (dummy_page_t *)__get_free_page(GFP_KERNEL | __GFP_NORETRY); -+ dp = (dummy_page_t *)__get_free_page(GFP_KERNEL); - if (!dp) { -diff --git a/module/splat/splat-rwlock.c b/module/splat/splat-rwlock.c -index a865fb3..6faf7d2 100644 ---- a/module/splat/splat-rwlock.c -+++ b/module/splat/splat-rwlock.c -@@ -217,6 +217,6 @@ splat_rwlock_test1(struct file *file, void *arg) - if (i == 0) -- rwt[i].rwt_thread = kthread_create(splat_rwlock_wr_thr, -+ rwt[i].rwt_thread = spl_kthread_create(splat_rwlock_wr_thr, - &rwt[i], "%s/%d", SPLAT_RWLOCK_TEST_NAME, i); - else -- rwt[i].rwt_thread = kthread_create(splat_rwlock_rd_thr, -+ rwt[i].rwt_thread = spl_kthread_create(splat_rwlock_rd_thr, - &rwt[i], "%s/%d", SPLAT_RWLOCK_TEST_NAME, i); -diff --git a/rpm/generic/spl-kmod.spec.in b/rpm/generic/spl-kmod.spec.in -index e8d88c1..50947c0 100644 ---- a/rpm/generic/spl-kmod.spec.in -+++ b/rpm/generic/spl-kmod.spec.in -@@ -164,3 +164,3 @@ rm -rf $RPM_BUILD_ROOT - - Released 0.6.2-1 --* Tue Mar 22 2013 Brian Behlendorf - 0.6.1-1 -+* Fri Mar 22 2013 Brian Behlendorf - 0.6.1-1 - - First official stable release. 
-diff --git a/rpm/generic/spl.spec.in b/rpm/generic/spl.spec.in -index 804584a..a0fe298 100644 ---- a/rpm/generic/spl.spec.in -+++ b/rpm/generic/spl.spec.in -@@ -15,3 +15,3 @@ ExcludeArch: ppc ppc64 - --Requires: %{name}-kmod >= %{version} -+Requires: %{name}-kmod = %{version} - Provides: %{name}-kmod-common = %{version} -@@ -37,2 +37,3 @@ make install DESTDIR=%{?buildroot} - %{_mandir}/man1/* -+%{_mandir}/man5/* - --- -1.9.2 - diff --git a/spl-utils-git/20140411-git-master.patch b/spl-utils-git/20140411-git-master.patch deleted file mode 100644 index 1502991..0000000 --- a/spl-utils-git/20140411-git-master.patch +++ /dev/null @@ -1,1797 +0,0 @@ -diff --git a/config/spl-build.m4 b/config/spl-build.m4 -index f54c5b1..8426780 100644 ---- a/config/spl-build.m4 -+++ b/config/spl-build.m4 -@@ -29,4 +29,3 @@ AC_DEFUN([SPL_AC_CONFIG_KERNEL], [ - SPL_AC_2ARGS_REGISTER_SYSCTL -- SPL_AC_SET_SHRINKER -- SPL_AC_3ARGS_SHRINKER_CALLBACK -+ SPL_AC_SHRINKER_CALLBACK - SPL_AC_PATH_IN_NAMEIDATA -@@ -95,2 +94,3 @@ AC_DEFUN([SPL_AC_CONFIG_KERNEL], [ - SPL_AC_2ARGS_VFS_GETATTR -+ SPL_AC_USLEEP_RANGE - ]) -@@ -886,29 +886,10 @@ AC_DEFUN([SPL_AC_2ARGS_REGISTER_SYSCTL], - --dnl # --dnl # 2.6.23 API change --dnl # Old set_shrinker API replaced with register_shrinker --dnl # --AC_DEFUN([SPL_AC_SET_SHRINKER], [ -- AC_MSG_CHECKING([whether set_shrinker() available]) -- SPL_LINUX_TRY_COMPILE([ -- #include -- ],[ -- return set_shrinker(DEFAULT_SEEKS, NULL); -- ],[ -- AC_MSG_RESULT([yes]) -- AC_DEFINE(HAVE_SET_SHRINKER, 1, -- [set_shrinker() available]) -- ],[ -- AC_MSG_RESULT([no]) -- ]) --]) -- --dnl # --dnl # 2.6.35 API change, --dnl # Add context to shrinker callback --dnl # --AC_DEFUN([SPL_AC_3ARGS_SHRINKER_CALLBACK], -- [AC_MSG_CHECKING([whether shrinker callback wants 3 args]) -+AC_DEFUN([SPL_AC_SHRINKER_CALLBACK],[ - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="-Werror" -+ dnl # -+ dnl # 2.6.23 to 2.6.34 API change -+ dnl # ->shrink(int nr_to_scan, gfp_t gfp_mask) -+ dnl # -+ AC_MSG_CHECKING([whether old 2-argument shrinker exists]) - SPL_LINUX_TRY_COMPILE([ -@@ -916,3 +897,3 @@ AC_DEFUN([SPL_AC_3ARGS_SHRINKER_CALLBACK], - -- int shrinker_cb(struct shrinker *, int, unsigned int); -+ int shrinker_cb(int nr_to_scan, gfp_t gfp_mask); - ],[ -@@ -925,6 +906,82 @@ AC_DEFUN([SPL_AC_3ARGS_SHRINKER_CALLBACK], - AC_MSG_RESULT(yes) -- AC_DEFINE(HAVE_3ARGS_SHRINKER_CALLBACK, 1, -- [shrinker callback wants 3 args]) -+ AC_DEFINE(HAVE_2ARGS_OLD_SHRINKER_CALLBACK, 1, -+ [old shrinker callback wants 2 args]) - ],[ - AC_MSG_RESULT(no) -+ dnl # -+ dnl # 2.6.35 - 2.6.39 API change -+ dnl # ->shrink(struct shrinker *, -+ dnl # int nr_to_scan, gfp_t gfp_mask) -+ dnl # -+ AC_MSG_CHECKING([whether old 3-argument shrinker exists]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ -+ int shrinker_cb(struct shrinker *, int nr_to_scan, -+ gfp_t gfp_mask); -+ ],[ -+ struct shrinker cache_shrinker = { -+ .shrink = shrinker_cb, -+ .seeks = DEFAULT_SEEKS, -+ }; -+ register_shrinker(&cache_shrinker); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_3ARGS_SHRINKER_CALLBACK, 1, -+ [old shrinker callback wants 3 args]) -+ ],[ -+ AC_MSG_RESULT(no) -+ dnl # -+ dnl # 3.0 - 3.11 API change -+ dnl # ->shrink(struct shrinker *, -+ dnl # struct shrink_control *sc) -+ dnl # -+ AC_MSG_CHECKING( -+ [whether new 2-argument shrinker exists]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ -+ int shrinker_cb(struct shrinker *, -+ struct shrink_control *sc); -+ ],[ -+ struct shrinker cache_shrinker = { -+ .shrink = shrinker_cb, -+ .seeks = DEFAULT_SEEKS, -+ }; -+ 
register_shrinker(&cache_shrinker); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_2ARGS_NEW_SHRINKER_CALLBACK, 1, -+ [new shrinker callback wants 2 args]) -+ ],[ -+ AC_MSG_RESULT(no) -+ dnl # -+ dnl # 3.12 API change, -+ dnl # ->shrink() is logically split in to -+ dnl # ->count_objects() and ->scan_objects() -+ dnl # -+ AC_MSG_CHECKING( -+ [whether ->count_objects callback exists]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ -+ unsigned long shrinker_cb( -+ struct shrinker *, -+ struct shrink_control *sc); -+ ],[ -+ struct shrinker cache_shrinker = { -+ .count_objects = shrinker_cb, -+ .scan_objects = shrinker_cb, -+ .seeks = DEFAULT_SEEKS, -+ }; -+ register_shrinker(&cache_shrinker); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, -+ 1, [->count_objects exists]) -+ ],[ -+ AC_MSG_ERROR(error) -+ ]) -+ ]) -+ ]) - ]) -@@ -1787,6 +1844,2 @@ AC_DEFUN([SPL_AC_SET_FS_PWD_WITH_CONST], - --dnl # --dnl # SLES API change, never adopted in mainline, --dnl # Third 'struct vfsmount *' argument removed. --dnl # - AC_DEFUN([SPL_AC_2ARGS_VFS_UNLINK], -@@ -1796,3 +1849,3 @@ AC_DEFUN([SPL_AC_2ARGS_VFS_UNLINK], - ],[ -- vfs_unlink(NULL, NULL); -+ vfs_unlink((struct inode *) NULL, (struct dentry *) NULL); - ],[ -@@ -1803,2 +1856,21 @@ AC_DEFUN([SPL_AC_2ARGS_VFS_UNLINK], - AC_MSG_RESULT(no) -+ dnl # -+ dnl # Linux 3.13 API change -+ dnl # Added delegated inode -+ dnl # -+ AC_MSG_CHECKING([whether vfs_unlink() wants 3 args]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ vfs_unlink((struct inode *) NULL, -+ (struct dentry *) NULL, -+ (struct inode **) NULL); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_3ARGS_VFS_UNLINK, 1, -+ [vfs_unlink() wants 3 args]) -+ ],[ -+ AC_MSG_ERROR(no) -+ ]) -+ - ]) -@@ -1806,6 +1878,2 @@ AC_DEFUN([SPL_AC_2ARGS_VFS_UNLINK], - --dnl # --dnl # SLES API change, never adopted in mainline, --dnl # Third and sixth 'struct vfsmount *' argument removed. --dnl # - AC_DEFUN([SPL_AC_4ARGS_VFS_RENAME], -@@ -1815,3 +1883,4 @@ AC_DEFUN([SPL_AC_4ARGS_VFS_RENAME], - ],[ -- vfs_rename(NULL, NULL, NULL, NULL); -+ vfs_rename((struct inode *) NULL, (struct dentry *) NULL, -+ (struct inode *) NULL, (struct dentry *) NULL); - ],[ -@@ -1822,2 +1891,22 @@ AC_DEFUN([SPL_AC_4ARGS_VFS_RENAME], - AC_MSG_RESULT(no) -+ dnl # -+ dnl # Linux 3.13 API change -+ dnl # Added delegated inode -+ dnl # -+ AC_MSG_CHECKING([whether vfs_rename() wants 5 args]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ vfs_rename((struct inode *) NULL, -+ (struct dentry *) NULL, -+ (struct inode *) NULL, -+ (struct dentry *) NULL, -+ (struct inode **) NULL); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_5ARGS_VFS_RENAME, 1, -+ [vfs_rename() wants 5 args]) -+ ],[ -+ AC_MSG_ERROR(no) -+ ]) - ]) -@@ -2402 +2491,23 @@ AC_DEFUN([SPL_AC_2ARGS_VFS_GETATTR], [ - ]) -+ -+dnl # -+dnl # 2.6.36 API compatibility. -+dnl # Added usleep_range timer. -+dnl # usleep_range is a finer precision implementation of msleep -+dnl # designed to be a drop-in replacement for udelay where a precise -+dnl # sleep / busy-wait is unnecessary. 
-+dnl # -+AC_DEFUN([SPL_AC_USLEEP_RANGE], [ -+ AC_MSG_CHECKING([whether usleep_range() is available]) -+ SPL_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ usleep_range(0, 0); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_USLEEP_RANGE, 1, -+ [usleep_range is available]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -diff --git a/configure.ac b/configure.ac -index 4772298..e81ddfb 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -52,2 +52,3 @@ AC_CONFIG_FILES([ - man/man1/Makefile -+ man/man5/Makefile - lib/Makefile -diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am -index 730f769..59f2ec5 100644 ---- a/include/linux/Makefile.am -+++ b/include/linux/Makefile.am -@@ -5,2 +5,3 @@ KERNEL_H = \ - $(top_srcdir)/include/linux/compiler_compat.h \ -+ $(top_srcdir)/include/linux/delay_compat.h \ - $(top_srcdir)/include/linux/file_compat.h \ -diff --git a/include/linux/delay_compat.h b/include/linux/delay_compat.h -new file mode 100644 -index 0000000..fc9ff66 ---- /dev/null -+++ b/include/linux/delay_compat.h -@@ -0,0 +1,47 @@ -+/*****************************************************************************\ -+ * Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC. -+ * Copyright (C) 2007 The Regents of the University of California. -+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -+ * Written by Brian Behlendorf . -+ * UCRL-CODE-235197 -+ * -+ * This file is part of the SPL, Solaris Porting Layer. -+ * For details, see . -+ * -+ * The SPL is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. -+ * -+ * The SPL is distributed in the hope that it will be useful, but WITHOUT -+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * You should have received a copy of the GNU General Public License along -+ * with the SPL. If not, see . -+\*****************************************************************************/ -+ -+#ifndef _SPL_DELAY_COMPAT_H -+#define _SPL_DELAY_COMPAT_H -+ -+#include -+#include -+ -+/* usleep_range() introduced in 2.6.36 */ -+#ifndef HAVE_USLEEP_RANGE -+ -+static inline void -+usleep_range(unsigned long min, unsigned long max) -+{ -+ unsigned int min_ms = min / USEC_PER_MSEC; -+ -+ if (min >= MAX_UDELAY_MS) -+ msleep(min_ms); -+ else -+ udelay(min); -+} -+ -+#endif /* HAVE_USLEEP_RANGE */ -+ -+#endif /* _SPL_DELAY_COMPAT_H */ -diff --git a/include/linux/mm_compat.h b/include/linux/mm_compat.h -index cb1bef9..37c9b08 100644 ---- a/include/linux/mm_compat.h -+++ b/include/linux/mm_compat.h -@@ -150,64 +150,98 @@ extern shrink_icache_memory_t shrink_icache_memory_fn; - /* -- * Linux 2.6. - 2.6. Shrinker API Compatibility. -+ * Due to frequent changes in the shrinker API the following -+ * compatibility wrappers should be used. They are as follows: -+ * -+ * SPL_SHRINKER_DECLARE is used to declare the shrinker which is -+ * passed to spl_register_shrinker()/spl_unregister_shrinker(). Use -+ * shrinker_name to set the shrinker variable name, shrinker_callback -+ * to set the callback function, and seek_cost to define the cost of -+ * reclaiming an object. 
-+ * -+ * SPL_SHRINKER_DECLARE(shrinker_name, shrinker_callback, seek_cost); -+ * -+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE is used when a forward declaration -+ * of the shrinker callback function is required. Only the callback -+ * function needs to be passed. -+ * -+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE(shrinker_callback); -+ * -+ * SPL_SHRINKER_CALLBACK_WRAPPER is used to declare the callback function -+ * which is registered with the shrinker. This function will call your -+ * custom shrinker which must use the following prototype. Notice the -+ * leading __'s, these must be appended to the callback_function name. -+ * -+ * int __shrinker_callback(struct shrinker *, struct shrink_control *) -+ * SPL_SHRINKER_CALLBACK_WRAPPER(shrinker_callback);a -+ * -+ * -+ * Example: -+ * -+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE(my_shrinker_fn); -+ * SPL_SHRINKER_DECLARE(my_shrinker, my_shrinker_fn, 1); -+ * -+ * static int -+ * __my_shrinker_fn(struct shrinker *shrink, struct shrink_control *sc) -+ * { -+ * if (sc->nr_to_scan) { -+ * ...scan objects in the cache and reclaim them... -+ * } -+ * -+ * ...calculate number of objects in the cache... -+ * -+ * return (number of objects in the cache); -+ * } -+ * SPL_SHRINKER_CALLBACK_WRAPPER(my_shrinker_fn); - */ --#ifdef HAVE_SET_SHRINKER --typedef struct spl_shrinker { -- struct shrinker *shrinker; -- shrinker_t fn; -- int seeks; --} spl_shrinker_t; -- --static inline void --spl_register_shrinker(spl_shrinker_t *ss) --{ -- ss->shrinker = set_shrinker(ss->seeks, ss->fn); --} - --static inline void --spl_unregister_shrinker(spl_shrinker_t *ss) --{ -- remove_shrinker(ss->shrinker); --} -+#define spl_register_shrinker(x) register_shrinker(x) -+#define spl_unregister_shrinker(x) unregister_shrinker(x) - --# define SPL_SHRINKER_DECLARE(s, x, y) \ -- static spl_shrinker_t s = { \ -- .shrinker = NULL, \ -- .fn = x, \ -- .seeks = y \ -- } -- --# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -- static int fn(int, unsigned int) --# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ --static int \ --fn(int nr_to_scan, unsigned int gfp_mask) \ --{ \ -- struct shrink_control sc; \ -- \ -- sc.nr_to_scan = nr_to_scan; \ -- sc.gfp_mask = gfp_mask; \ -- \ -- return __ ## fn(NULL, &sc); \ -+/* -+ * Linux 2.6.23 - 2.6.34 Shrinker API Compatibility. -+ */ -+#if defined(HAVE_2ARGS_OLD_SHRINKER_CALLBACK) -+#define SPL_SHRINKER_DECLARE(s, x, y) \ -+static struct shrinker s = { \ -+ .shrink = x, \ -+ .seeks = y \ - } - --#else -+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -+static int fn(int nr_to_scan, unsigned int gfp_mask) - --# define spl_register_shrinker(x) register_shrinker(x) --# define spl_unregister_shrinker(x) unregister_shrinker(x) --# define SPL_SHRINKER_DECLARE(s, x, y) \ -- static struct shrinker s = { \ -- .shrink = x, \ -- .seeks = y \ -- } -+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -+static int \ -+fn(int nr_to_scan, unsigned int gfp_mask) \ -+{ \ -+ struct shrink_control sc; \ -+ \ -+ sc.nr_to_scan = nr_to_scan; \ -+ sc.gfp_mask = gfp_mask; \ -+ \ -+ return (__ ## fn(NULL, &sc)); \ -+} - - /* -- * Linux 2.6. - 2.6. Shrinker API Compatibility. -+ * Linux 2.6.35 to 2.6.39 Shrinker API Compatibility. 
- */ --# if defined(HAVE_SHRINK_CONTROL_STRUCT) --# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -- static int fn(struct shrinker *, struct shrink_control *) --# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ --static int \ --fn(struct shrinker *shrink, struct shrink_control *sc) { \ -- return __ ## fn(shrink, sc); \ -+#elif defined(HAVE_3ARGS_SHRINKER_CALLBACK) -+#define SPL_SHRINKER_DECLARE(s, x, y) \ -+static struct shrinker s = { \ -+ .shrink = x, \ -+ .seeks = y \ -+} -+ -+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -+static int fn(struct shrinker *, int, unsigned int) -+ -+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -+static int \ -+fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \ -+{ \ -+ struct shrink_control sc; \ -+ \ -+ sc.nr_to_scan = nr_to_scan; \ -+ sc.gfp_mask = gfp_mask; \ -+ \ -+ return (__ ## fn(shrink, &sc)); \ - } -@@ -215,17 +249,19 @@ fn(struct shrinker *shrink, struct shrink_control *sc) { \ - /* -- * Linux 2.6. - 2.6. Shrinker API Compatibility. -+ * Linux 3.0 to 3.11 Shrinker API Compatibility. - */ --# elif defined(HAVE_3ARGS_SHRINKER_CALLBACK) --# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -- static int fn(struct shrinker *, int, unsigned int) --# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ --static int \ --fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \ --{ \ -- struct shrink_control sc; \ -- \ -- sc.nr_to_scan = nr_to_scan; \ -- sc.gfp_mask = gfp_mask; \ -- \ -- return __ ## fn(shrink, &sc); \ -+#elif defined(HAVE_2ARGS_NEW_SHRINKER_CALLBACK) -+#define SPL_SHRINKER_DECLARE(s, x, y) \ -+static struct shrinker s = { \ -+ .shrink = x, \ -+ .seeks = y \ -+} -+ -+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -+static int fn(struct shrinker *, struct shrink_control *) -+ -+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -+static int \ -+fn(struct shrinker *shrink, struct shrink_control *sc) \ -+{ \ -+ return (__ ## fn(shrink, sc)); \ - } -@@ -233,21 +269,45 @@ fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \ - /* -- * Linux 2.6. - 2.6. Shrinker API Compatibility. -+ * Linux 3.12 and later Shrinker API Compatibility. - */ --# else --# define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -- static int fn(int, unsigned int) --# define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ --static int \ --fn(int nr_to_scan, unsigned int gfp_mask) \ --{ \ -- struct shrink_control sc; \ -- \ -- sc.nr_to_scan = nr_to_scan; \ -- sc.gfp_mask = gfp_mask; \ -- \ -- return __ ## fn(NULL, &sc); \ -+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) -+#define SPL_SHRINKER_DECLARE(s, x, y) \ -+static struct shrinker s = { \ -+ .count_objects = x ## _count_objects, \ -+ .scan_objects = x ## _scan_objects, \ -+ .seeks = y \ - } - --# endif --#endif /* HAVE_SET_SHRINKER */ -+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ -+static unsigned long fn ## _count_objects(struct shrinker *, \ -+ struct shrink_control *); \ -+static unsigned long fn ## _scan_objects(struct shrinker *, \ -+ struct shrink_control *) -+ -+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ -+static unsigned long \ -+fn ## _count_objects(struct shrinker *shrink, struct shrink_control *sc)\ -+{ \ -+ int __ret__; \ -+ \ -+ sc->nr_to_scan = 0; \ -+ __ret__ = __ ## fn(NULL, sc); \ -+ \ -+ /* Errors may not be returned and must be converted to zeros */ \ -+ return ((__ret__ < 0) ? 
0 : __ret__); \ -+} \ -+ \ -+static unsigned long \ -+fn ## _scan_objects(struct shrinker *shrink, struct shrink_control *sc) \ -+{ \ -+ int __ret__; \ -+ \ -+ __ret__ = __ ## fn(NULL, sc); \ -+ return ((__ret__ < 0) ? SHRINK_STOP : __ret__); \ -+} -+#else -+/* -+ * Linux 2.x to 2.6.22, or a newer shrinker API has been introduced. -+ */ -+#error "Unknown shrinker callback" -+#endif - -diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am -index 0e86a28..9d82636 100644 ---- a/include/sys/Makefile.am -+++ b/include/sys/Makefile.am -@@ -15,2 +15,3 @@ KERNEL_H = \ - $(top_srcdir)/include/sys/callb.h \ -+ $(top_srcdir)/include/sys/callo.h \ - $(top_srcdir)/include/sys/cmn_err.h \ -diff --git a/include/sys/callo.h b/include/sys/callo.h -new file mode 100644 -index 0000000..0d9fbcb ---- /dev/null -+++ b/include/sys/callo.h -@@ -0,0 +1,52 @@ -+/*****************************************************************************\ -+ * Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC. -+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -+ * Written by Brian Behlendorf . -+ * UCRL-CODE-235197 -+ * -+ * This file is part of the SPL, Solaris Porting Layer. -+ * For details, see . -+ * -+ * The SPL is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the -+ * Free Software Foundation; either version 2 of the License, or (at your -+ * option) any later version. -+ * -+ * The SPL is distributed in the hope that it will be useful, but WITHOUT -+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -+ * for more details. -+ * -+ * You should have received a copy of the GNU General Public License along -+ * with the SPL. If not, see . -+\*****************************************************************************/ -+ -+#ifndef _SPL_CALLO_H -+#define _SPL_CALLO_H -+ -+/* -+ * Callout flags: -+ * -+ * CALLOUT_FLAG_ROUNDUP -+ * Roundup the expiration time to the next resolution boundary. -+ * If this flag is not specified, the expiration time is rounded down. -+ * CALLOUT_FLAG_ABSOLUTE -+ * Normally, the expiration passed to the timeout API functions is an -+ * expiration interval. If this flag is specified, then it is -+ * interpreted as the expiration time itself. -+ * CALLOUT_FLAG_HRESTIME -+ * Normally, callouts are not affected by changes to system time -+ * (hrestime). This flag is used to create a callout that is affected -+ * by system time. If system time changes, these timers must be -+ * handled in a special way (see callout.c). These are used by condition -+ * variables and LWP timers that need this behavior. -+ * CALLOUT_FLAG_32BIT -+ * Legacy interfaces timeout() and realtime_timeout() pass this flag -+ * to timeout_generic() to indicate that a 32-bit ID should be allocated. 
-+ */ -+#define CALLOUT_FLAG_ROUNDUP 0x1 -+#define CALLOUT_FLAG_ABSOLUTE 0x2 -+#define CALLOUT_FLAG_HRESTIME 0x4 -+#define CALLOUT_FLAG_32BIT 0x8 -+ -+#endif /* _SPL_CALLB_H */ -diff --git a/include/sys/condvar.h b/include/sys/condvar.h -index c825bd2..c9f2bea 100644 ---- a/include/sys/condvar.h -+++ b/include/sys/condvar.h -@@ -29,4 +29,6 @@ - #include -+#include - #include - #include -+#include - -@@ -58,2 +60,4 @@ extern clock_t __cv_timedwait_interruptible(kcondvar_t *cvp, kmutex_t *mp, - clock_t exp_time); -+extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, -+ hrtime_t tim, hrtime_t res, int flag); - extern void __cv_signal(kcondvar_t *cvp); -diff --git a/include/sys/disp.h b/include/sys/disp.h -index 9614a47..c3077a7 100644 ---- a/include/sys/disp.h -+++ b/include/sys/disp.h -@@ -29,2 +29,3 @@ - -+#define kpreempt(unused) schedule() - #define kpreempt_disable() preempt_disable() -diff --git a/include/sys/isa_defs.h b/include/sys/isa_defs.h -index 35aee61..cc59a3a 100644 ---- a/include/sys/isa_defs.h -+++ b/include/sys/isa_defs.h -@@ -93,3 +93,31 @@ - --#else /* Currently only x86_64, i386, arm, and powerpc arches supported */ -+/* sparc arch specific defines */ -+#elif defined(__sparc) || defined(__sparc__) -+ -+#if !defined(__sparc) -+#define __sparc -+#endif -+ -+#if !defined(__sparc__) -+#define __sparc__ -+#endif -+ -+#define _BIG_ENDIAN -+#define _SUNOS_VTOC_16 -+ -+/* sparc64 arch specific defines */ -+#elif defined(__sparc64) || defined(__sparc64__) -+ -+#if !defined(__sparc64) -+#define __sparc64 -+#endif -+ -+#if !defined(__sparc64__) -+#define __sparc64__ -+#endif -+ -+#define _BIG_ENDIAN -+#define _SUNOS_VTOC_16 -+ -+#else /* Currently x86_64, i386, arm, powerpc, and sparc are supported */ - #error "Unsupported ISA type" -diff --git a/include/sys/kstat.h b/include/sys/kstat.h -index da3c589..faf6b81 100644 ---- a/include/sys/kstat.h -+++ b/include/sys/kstat.h -@@ -35,2 +35,3 @@ - #define KSTAT_STRLEN 31 -+#define KSTAT_RAW_MAX (128*1024) - -@@ -45,4 +46,3 @@ - #define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ --#define KSTAT_TYPE_TXG 5 /* txg sync; ks_ndata >= 1 */ --#define KSTAT_NUM_TYPES 6 -+#define KSTAT_NUM_TYPES 5 - -@@ -81,2 +81,3 @@ - struct kstat_s; -+typedef struct kstat_s kstat_t; - -@@ -92,3 +93,9 @@ typedef struct kstat_module { - --typedef struct kstat_s { -+typedef struct kstat_raw_ops { -+ int (*headers)(char *buf, size_t size); -+ int (*data)(char *buf, size_t size, void *data); -+ void *(*addr)(kstat_t *ksp, loff_t index); -+} kstat_raw_ops_t; -+ -+struct kstat_s { - int ks_magic; /* magic value */ -@@ -109,6 +116,10 @@ typedef struct kstat_s { - void *ks_private; /* private data */ -- kmutex_t ks_lock; /* kstat data lock */ -+ kmutex_t ks_private_lock; /* kstat private data lock */ -+ kmutex_t *ks_lock; /* kstat data lock */ - struct list_head ks_list; /* kstat linkage */ - kstat_module_t *ks_owner; /* kstat module linkage */ --} kstat_t; -+ kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ -+ char *ks_raw_buf; /* buf used for raw ops */ -+ size_t ks_raw_bufsize; /* size of raw ops buffer */ -+}; - -@@ -167,22 +178,2 @@ typedef struct kstat_timer { - --typedef enum kstat_txg_state { -- TXG_STATE_OPEN = 1, -- TXG_STATE_QUIESCING = 2, -- TXG_STATE_SYNCING = 3, -- TXG_STATE_COMMITTED = 4, --} kstat_txg_state_t; -- --typedef struct kstat_txg { -- u_longlong_t txg; /* txg id */ -- kstat_txg_state_t state; /* txg state */ -- hrtime_t birth; /* birth time stamp */ -- u_longlong_t nread; /* number of bytes read */ -- u_longlong_t 
nwritten; /* number of bytes written */ -- uint_t reads; /* number of read operations */ -- uint_t writes; /* number of write operations */ -- hrtime_t open_time; /* open time */ -- hrtime_t quiesce_time;/* quiesce time */ -- hrtime_t sync_time; /* sync time */ --} kstat_txg_t; -- - int spl_kstat_init(void); -@@ -190,2 +181,6 @@ void spl_kstat_fini(void); - -+extern void __kstat_set_raw_ops(kstat_t *ksp, -+ int (*headers)(char *buf, size_t size), -+ int (*data)(char *buf, size_t size, void *data), -+ void* (*addr)(kstat_t *ksp, loff_t index)); - extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, -@@ -196,3 +191,8 @@ extern void __kstat_install(kstat_t *ksp); - extern void __kstat_delete(kstat_t *ksp); -+extern void kstat_waitq_enter(kstat_io_t *); -+extern void kstat_waitq_exit(kstat_io_t *); -+extern void kstat_runq_enter(kstat_io_t *); -+extern void kstat_runq_exit(kstat_io_t *); - -+#define kstat_set_raw_ops(k,h,d,a) __kstat_set_raw_ops(k,h,d,a) - #define kstat_create(m,i,n,c,t,s,f) __kstat_create(m,i,n,c,t,s,f) -diff --git a/include/sys/sdt.h b/include/sys/sdt.h -index 6c8395f..287bfaa 100644 ---- a/include/sys/sdt.h -+++ b/include/sys/sdt.h -@@ -27,2 +27,4 @@ - -+#define SET_ERROR(x) (x) -+ - #endif /* SPL_SDT_H */ -diff --git a/include/sys/thread.h b/include/sys/thread.h -index 369b306..433a076 100644 ---- a/include/sys/thread.h -+++ b/include/sys/thread.h -@@ -53,2 +53,4 @@ typedef void (*thread_func_t)(void *); - #define curthread current -+#define getcomm() current->comm -+#define getpid() current->pid - -@@ -59,2 +61,4 @@ extern kthread_t *__thread_create(caddr_t stk, size_t stksize, - extern void __thread_exit(void); -+extern struct task_struct *spl_kthread_create(int (*func)(void *), -+ void *data, const char namefmt[], ...); - -diff --git a/include/sys/time.h b/include/sys/time.h -index f8d78d1..d8e81c9 100644 ---- a/include/sys/time.h -+++ b/include/sys/time.h -@@ -49,2 +49,5 @@ - -+#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) -+#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) -+ - /* Already defined in include/linux/time.h */ -diff --git a/include/sys/vmsystm.h b/include/sys/vmsystm.h -index 9097491..adff774 100644 ---- a/include/sys/vmsystm.h -+++ b/include/sys/vmsystm.h -@@ -31,2 +31,3 @@ - #include -+#include - #include -diff --git a/man/Makefile.am b/man/Makefile.am -index 7dc2a57..7791945 100644 ---- a/man/Makefile.am -+++ b/man/Makefile.am -@@ -1 +1 @@ --SUBDIRS = man1 -+SUBDIRS = man1 man5 -diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am -index c91f638..d6becca 100644 ---- a/man/man1/Makefile.am -+++ b/man/man1/Makefile.am -@@ -1,3 +1,2 @@ --man_MANS = splat.1 --EXTRA_DIST = $(man_MANS) -+dist_man_MANS = splat.1 - -diff --git a/man/man5/Makefile.am b/man/man5/Makefile.am -new file mode 100644 -index 0000000..fb22beb ---- /dev/null -+++ b/man/man5/Makefile.am -@@ -0,0 +1,4 @@ -+dist_man_MANS = spl-module-parameters.5 -+ -+install-data-local: -+ $(INSTALL) -d -m 0755 "$(DESTDIR)$(mandir)/man5" -diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 -new file mode 100644 -index 0000000..3c134f7 ---- /dev/null -+++ b/man/man5/spl-module-parameters.5 -@@ -0,0 +1,126 @@ -+'\" te -+.\" -+.\" Copyright 2013 Turbo Fredriksson . All rights reserved. -+.\" -+.TH SPL-MODULE-PARAMETERS 5 "Nov 18, 2013" -+.SH NAME -+spl\-module\-parameters \- SPL module parameters -+.SH DESCRIPTION -+.sp -+.LP -+Description of the different parameters to the SPL module. 
-+ -+.SS "Module parameters" -+.sp -+.LP -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_subsys\fR (ulong) -+.ad -+.RS 12n -+Subsystem debugging level mask. -+.sp -+Default value: \fB~0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_mask\fR (ulong) -+.ad -+.RS 12n -+Debugging level mask. -+.sp -+Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_printk\fR (ulong) -+.ad -+.RS 12n -+Console printk level mask. -+.sp -+Default value: \fB8 | 10 | 4 | 20\fR (SD_ERROR | SD_EMERG | SD_WARNING | SD_CONSOLE). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_mb\fR (int) -+.ad -+.RS 12n -+Total debug buffer size. -+.sp -+Default value: \fB-1\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_debug_panic_on_bug\fR (int) -+.ad -+.RS 12n -+Panic on BUG -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_kmem_cache_expire\fR (uint) -+.ad -+.RS 12n -+By age (0x1) or low memory (0x2) -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_hostid\fR (ulong) -+.ad -+.RS 12n -+The system hostid. -+.sp -+Default value: \fB0xFFFFFFFF\fR (an invalid hostid!) -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspl_hostid_path\fR (charp) -+.ad -+.RS 12n -+The system hostid file -+.sp -+Default value: \fB/etc/hostid\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBmutex_spin_max\fR (int) -+.ad -+.RS 12n -+Spin a maximum of N times to acquire lock -+.sp -+.ne 2 -+.na -+\fBPossible values:\fR -+.sp -+.RS 12n -+ \fB0\fR Never spin when trying to acquire lock -+.sp -+\fB-1\fR Spin until acquired or holder yields without dropping lock -+.sp -+\fB1-MAX_INT\fR Spin for N attempts before sleeping for lock -+.RE -+.sp -+.ne -4 -+Default value: \fB0\fR. -diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c -index 60cf726..8236412 100644 ---- a/module/spl/spl-condvar.c -+++ b/module/spl/spl-condvar.c -@@ -38,4 +38,2 @@ __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) - { -- int flags = KM_SLEEP; -- - SENTRY; -@@ -53,8 +51,2 @@ __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) - -- /* We may be called when there is a non-zero preempt_count or -- * interrupts are disabled is which case we must not sleep. -- */ -- if (current_thread_info()->preempt_count || irqs_disabled()) -- flags = KM_NOSLEEP; -- - SEXIT; -@@ -228,2 +220,83 @@ EXPORT_SYMBOL(__cv_timedwait_interruptible); - -+/* -+ *'expire_time' argument is an absolute clock time in nanoseconds. -+ * Return value is time left (expire_time - now) or -1 if timeout occurred. -+ */ -+static clock_t -+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, -+ hrtime_t expire_time, int state) -+{ -+ DEFINE_WAIT(wait); -+ hrtime_t time_left, now; -+ unsigned long time_left_us; -+ SENTRY; -+ -+ ASSERT(cvp); -+ ASSERT(mp); -+ ASSERT(cvp->cv_magic == CV_MAGIC); -+ ASSERT(mutex_owned(mp)); -+ atomic_inc(&cvp->cv_refs); -+ -+ if (cvp->cv_mutex == NULL) -+ cvp->cv_mutex = mp; -+ -+ /* Ensure the same mutex is used by all callers */ -+ ASSERT(cvp->cv_mutex == mp); -+ -+ now = gethrtime(); -+ time_left = expire_time - now; -+ if (time_left <= 0) { -+ atomic_dec(&cvp->cv_refs); -+ SRETURN(-1); -+ } -+ time_left_us = time_left / NSEC_PER_USEC; -+ -+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); -+ atomic_inc(&cvp->cv_waiters); -+ -+ /* Mutex should be dropped after prepare_to_wait() this -+ * ensures we're linked in to the waiters list and avoids the -+ * race where 'cvp->cv_waiters > 0' but the list is empty. 
*/ -+ mutex_exit(mp); -+ /* Allow a 100 us range to give kernel an opportunity to coalesce -+ * interrupts */ -+ usleep_range(time_left_us, time_left_us + 100); -+ mutex_enter(mp); -+ -+ /* No more waiters a different mutex could be used */ -+ if (atomic_dec_and_test(&cvp->cv_waiters)) { -+ cvp->cv_mutex = NULL; -+ wake_up(&cvp->cv_destroy); -+ } -+ -+ finish_wait(&cvp->cv_event, &wait); -+ atomic_dec(&cvp->cv_refs); -+ -+ time_left = expire_time - gethrtime(); -+ SRETURN(time_left > 0 ? time_left : -1); -+} -+ -+/* -+ * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. -+ */ -+clock_t -+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, -+ hrtime_t res, int flag) -+{ -+ if (res > 1) { -+ /* -+ * Align expiration to the specified resolution. -+ */ -+ if (flag & CALLOUT_FLAG_ROUNDUP) -+ tim += res - 1; -+ tim = (tim / res) * res; -+ } -+ -+ if (!(flag & CALLOUT_FLAG_ABSOLUTE)) -+ tim += gethrtime(); -+ -+ return __cv_timedwait_hires(cvp, mp, tim, TASK_UNINTERRUPTIBLE); -+} -+EXPORT_SYMBOL(cv_timedwait_hires); -+ - void -diff --git a/module/spl/spl-cred.c b/module/spl/spl-cred.c -index 0ed6572..602bd74 100644 ---- a/module/spl/spl-cred.c -+++ b/module/spl/spl-cred.c -@@ -46,3 +46,4 @@ cr_groups_search(const struct group_info *group_info, gid_t grp) - { -- unsigned int left, right; -+ unsigned int left, right, mid; -+ int cmp; - -@@ -54,4 +55,6 @@ cr_groups_search(const struct group_info *group_info, gid_t grp) - while (left < right) { -- unsigned int mid = (left+right)/2; -- int cmp = KGID_TO_SGID(grp) - KGID_TO_SGID(GROUP_AT(group_info, mid)); -+ mid = (left + right) / 2; -+ cmp = KGID_TO_SGID(grp) - -+ KGID_TO_SGID(GROUP_AT(group_info, mid)); -+ - if (cmp > 0) -@@ -122,3 +125,3 @@ crgetgroups(const cred_t *cr) - --/* Check if the passed gid is available is in supplied credential. */ -+/* Check if the passed gid is available in supplied credential. 
*/ - int -@@ -130,3 +133,3 @@ groupmember(gid_t gid, const cred_t *cr) - gi = get_group_info(cr->group_info); -- rc = cr_groups_search(cr->group_info, SGID_TO_KGID(gid)); -+ rc = cr_groups_search(gi, SGID_TO_KGID(gid)); - put_group_info(gi); -diff --git a/module/spl/spl-debug.c b/module/spl/spl-debug.c -index d450368..93c3f31 100644 ---- a/module/spl/spl-debug.c -+++ b/module/spl/spl-debug.c -@@ -40,2 +40,3 @@ - #include -+#include - #include -@@ -417,3 +418,3 @@ spl_debug_dumplog(int flags) - -- tsk = kthread_create(spl_debug_dumplog_thread,(void *)&dp,"spl_debug"); -+ tsk = spl_kthread_create(spl_debug_dumplog_thread,(void *)&dp,"spl_debug"); - if (tsk == NULL) -diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c -index 3cef489..351f536 100644 ---- a/module/spl/spl-generic.c -+++ b/module/spl/spl-generic.c -@@ -761 +761,2 @@ MODULE_DESCRIPTION("Solaris Porting Layer"); - MODULE_LICENSE("GPL"); -+MODULE_VERSION(SPL_META_VERSION "-" SPL_META_RELEASE); -diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c -index a2dcea0..23e4780 100644 ---- a/module/spl/spl-kmem.c -+++ b/module/spl/spl-kmem.c -@@ -1997,3 +1997,2 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) - spl_kmem_magazine_t *skm; -- unsigned long irq_flags; - void *obj = NULL; -@@ -2005,3 +2004,3 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) - atomic_inc(&skc->skc_ref); -- local_irq_save(irq_flags); -+ local_irq_disable(); - -@@ -2027,3 +2026,3 @@ restart: - -- local_irq_restore(irq_flags); -+ local_irq_enable(); - ASSERT(obj); -diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c -index 4e900c0..c604a32 100644 ---- a/module/spl/spl-kstat.c -+++ b/module/spl/spl-kstat.c -@@ -43,3 +43,82 @@ static kid_t kstat_id; - --static void -+static int -+kstat_resize_raw(kstat_t *ksp) -+{ -+ if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) -+ return ENOMEM; -+ -+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); -+ ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); -+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); -+ -+ return 0; -+} -+ -+void -+kstat_waitq_enter(kstat_io_t *kiop) -+{ -+ hrtime_t new, delta; -+ ulong_t wcnt; -+ -+ new = gethrtime(); -+ delta = new - kiop->wlastupdate; -+ kiop->wlastupdate = new; -+ wcnt = kiop->wcnt++; -+ if (wcnt != 0) { -+ kiop->wlentime += delta * wcnt; -+ kiop->wtime += delta; -+ } -+} -+EXPORT_SYMBOL(kstat_waitq_enter); -+ -+void -+kstat_waitq_exit(kstat_io_t *kiop) -+{ -+ hrtime_t new, delta; -+ ulong_t wcnt; -+ -+ new = gethrtime(); -+ delta = new - kiop->wlastupdate; -+ kiop->wlastupdate = new; -+ wcnt = kiop->wcnt--; -+ ASSERT((int)wcnt > 0); -+ kiop->wlentime += delta * wcnt; -+ kiop->wtime += delta; -+} -+EXPORT_SYMBOL(kstat_waitq_exit); -+ -+void -+kstat_runq_enter(kstat_io_t *kiop) -+{ -+ hrtime_t new, delta; -+ ulong_t rcnt; -+ -+ new = gethrtime(); -+ delta = new - kiop->rlastupdate; -+ kiop->rlastupdate = new; -+ rcnt = kiop->rcnt++; -+ if (rcnt != 0) { -+ kiop->rlentime += delta * rcnt; -+ kiop->rtime += delta; -+ } -+} -+EXPORT_SYMBOL(kstat_runq_enter); -+ -+void -+kstat_runq_exit(kstat_io_t *kiop) -+{ -+ hrtime_t new, delta; -+ ulong_t rcnt; -+ -+ new = gethrtime(); -+ delta = new - kiop->rlastupdate; -+ kiop->rlastupdate = new; -+ rcnt = kiop->rcnt--; -+ ASSERT((int)rcnt > 0); -+ kiop->rlentime += delta * rcnt; -+ kiop->rtime += delta; -+} -+EXPORT_SYMBOL(kstat_runq_exit); -+ -+static int - kstat_seq_show_headers(struct seq_file *f) -@@ -47,2 +126,4 @@ kstat_seq_show_headers(struct seq_file *f) - kstat_t *ksp = (kstat_t 
*)f->private; -+ int rc = 0; -+ - ASSERT(ksp->ks_magic == KS_MAGIC); -@@ -56,3 +137,13 @@ kstat_seq_show_headers(struct seq_file *f) - case KSTAT_TYPE_RAW: -- seq_printf(f, "raw data"); -+restart: -+ if (ksp->ks_raw_ops.headers) { -+ rc = ksp->ks_raw_ops.headers( -+ ksp->ks_raw_buf, ksp->ks_raw_bufsize); -+ if (rc == ENOMEM && !kstat_resize_raw(ksp)) -+ goto restart; -+ if (!rc) -+ seq_puts(f, ksp->ks_raw_buf); -+ } else { -+ seq_printf(f, "raw data\n"); -+ } - break; -@@ -83,10 +174,2 @@ kstat_seq_show_headers(struct seq_file *f) - break; -- case KSTAT_TYPE_TXG: -- seq_printf(f, -- "%-8s %-5s %-13s %-12s %-12s %-8s %-8s " -- "%-12s %-12s %-12s\n", -- "txg", "state", "birth", -- "nread", "nwritten", "reads", "writes", -- "otime", "qtime", "stime"); -- break; - default: -@@ -94,2 +177,4 @@ kstat_seq_show_headers(struct seq_file *f) - } -+ -+ return -rc; - } -@@ -204,23 +289,2 @@ kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) - static int --kstat_seq_show_txg(struct seq_file *f, kstat_txg_t *ktp) --{ -- char state; -- -- switch (ktp->state) { -- case TXG_STATE_OPEN: state = 'O'; break; -- case TXG_STATE_QUIESCING: state = 'Q'; break; -- case TXG_STATE_SYNCING: state = 'S'; break; -- case TXG_STATE_COMMITTED: state = 'C'; break; -- default: state = '?'; break; -- } -- -- seq_printf(f, -- "%-8llu %-5c %-13llu %-12llu %-12llu %-8u %-8u " -- "%12lld %12lld %12lld\n", ktp->txg, state, ktp->birth, -- ktp->nread, ktp->nwritten, ktp->reads, ktp->writes, -- ktp->open_time, ktp->quiesce_time, ktp->sync_time); -- return 0; --} -- --static int - kstat_seq_show(struct seq_file *f, void *p) -@@ -234,5 +298,15 @@ kstat_seq_show(struct seq_file *f, void *p) - case KSTAT_TYPE_RAW: -- ASSERT(ksp->ks_ndata == 1); -- rc = kstat_seq_show_raw(f, ksp->ks_data, -- ksp->ks_data_size); -+restart: -+ if (ksp->ks_raw_ops.data) { -+ rc = ksp->ks_raw_ops.data( -+ ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); -+ if (rc == ENOMEM && !kstat_resize_raw(ksp)) -+ goto restart; -+ if (!rc) -+ seq_puts(f, ksp->ks_raw_buf); -+ } else { -+ ASSERT(ksp->ks_ndata == 1); -+ rc = kstat_seq_show_raw(f, ksp->ks_data, -+ ksp->ks_data_size); -+ } - break; -@@ -250,5 +324,2 @@ kstat_seq_show(struct seq_file *f, void *p) - break; -- case KSTAT_TYPE_TXG: -- rc = kstat_seq_show_txg(f, (kstat_txg_t *)p); -- break; - default: -@@ -257,3 +328,3 @@ kstat_seq_show(struct seq_file *f, void *p) - -- return rc; -+ return -rc; - } -@@ -264,2 +335,6 @@ kstat_default_update(kstat_t *ksp, int rw) - ASSERT(ksp != NULL); -+ -+ if (rw == KSTAT_WRITE) -+ return (EACCES); -+ - return 0; -@@ -275,3 +350,6 @@ kstat_seq_data_addr(kstat_t *ksp, loff_t n) - case KSTAT_TYPE_RAW: -- rc = ksp->ks_data; -+ if (ksp->ks_raw_ops.addr) -+ rc = ksp->ks_raw_ops.addr(ksp, n); -+ else -+ rc = ksp->ks_data; - break; -@@ -289,5 +367,2 @@ kstat_seq_data_addr(kstat_t *ksp, loff_t n) - break; -- case KSTAT_TYPE_TXG: -- rc = ksp->ks_data + n * sizeof(kstat_txg_t); -- break; - default: -@@ -307,3 +382,8 @@ kstat_seq_start(struct seq_file *f, loff_t *pos) - -- mutex_enter(&ksp->ks_lock); -+ mutex_enter(ksp->ks_lock); -+ -+ if (ksp->ks_type == KSTAT_TYPE_RAW) { -+ ksp->ks_raw_bufsize = PAGE_SIZE; -+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); -+ } - -@@ -314,4 +394,4 @@ kstat_seq_start(struct seq_file *f, loff_t *pos) - -- if (!n) -- kstat_seq_show_headers(f); -+ if (!n && kstat_seq_show_headers(f)) -+ SRETURN(NULL); - -@@ -340,6 +420,9 @@ kstat_seq_stop(struct seq_file *f, void *v) - { -- kstat_t *ksp = (kstat_t *)f->private; -- ASSERT(ksp->ks_magic == 
KS_MAGIC); -+ kstat_t *ksp = (kstat_t *)f->private; -+ ASSERT(ksp->ks_magic == KS_MAGIC); -+ -+ if (ksp->ks_type == KSTAT_TYPE_RAW) -+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); - -- mutex_exit(&ksp->ks_lock); -+ mutex_exit(ksp->ks_lock); - } -@@ -410,9 +493,43 @@ proc_kstat_open(struct inode *inode, struct file *filp) - -+static ssize_t -+proc_kstat_write(struct file *filp, const char __user *buf, -+ size_t len, loff_t *ppos) -+{ -+ struct seq_file *f = filp->private_data; -+ kstat_t *ksp = f->private; -+ int rc; -+ -+ ASSERT(ksp->ks_magic == KS_MAGIC); -+ -+ mutex_enter(ksp->ks_lock); -+ rc = ksp->ks_update(ksp, KSTAT_WRITE); -+ mutex_exit(ksp->ks_lock); -+ -+ if (rc) -+ return (-rc); -+ -+ *ppos += len; -+ return (len); -+} -+ - static struct file_operations proc_kstat_operations = { -- .open = proc_kstat_open, -- .read = seq_read, -- .llseek = seq_lseek, -- .release = seq_release, -+ .open = proc_kstat_open, -+ .write = proc_kstat_write, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, - }; - -+void -+__kstat_set_raw_ops(kstat_t *ksp, -+ int (*headers)(char *buf, size_t size), -+ int (*data)(char *buf, size_t size, void *data), -+ void *(*addr)(kstat_t *ksp, loff_t index)) -+{ -+ ksp->ks_raw_ops.headers = headers; -+ ksp->ks_raw_ops.data = data; -+ ksp->ks_raw_ops.addr = addr; -+} -+EXPORT_SYMBOL(__kstat_set_raw_ops); -+ - kstat_t * -@@ -442,3 +559,4 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, - ksp->ks_magic = KS_MAGIC; -- mutex_init(&ksp->ks_lock, NULL, MUTEX_DEFAULT, NULL); -+ mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); -+ ksp->ks_lock = &ksp->ks_private_lock; - INIT_LIST_HEAD(&ksp->ks_list); -@@ -455,2 +573,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, - ksp->ks_private = NULL; -+ ksp->ks_raw_ops.headers = NULL; -+ ksp->ks_raw_ops.data = NULL; -+ ksp->ks_raw_ops.addr = NULL; -+ ksp->ks_raw_buf = NULL; -+ ksp->ks_raw_bufsize = 0; - -@@ -477,6 +600,2 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, - break; -- case KSTAT_TYPE_TXG: -- ksp->ks_ndata = ks_ndata; -- ksp->ks_data_size = ks_ndata * sizeof(kstat_timer_t); -- break; - default: -@@ -488,3 +607,3 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, - } else { -- ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_SLEEP); -+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); - if (ksp->ks_data == NULL) { -@@ -526,5 +645,5 @@ __kstat_install(kstat_t *ksp) - -- mutex_enter(&ksp->ks_lock); -+ mutex_enter(ksp->ks_lock); - ksp->ks_owner = module; -- ksp->ks_proc = proc_create_data(ksp->ks_name, 0444, -+ ksp->ks_proc = proc_create_data(ksp->ks_name, 0644, - module->ksm_proc, &proc_kstat_operations, (void *)ksp); -@@ -535,3 +654,3 @@ __kstat_install(kstat_t *ksp) - } -- mutex_exit(&ksp->ks_lock); -+ mutex_exit(ksp->ks_lock); - out: -@@ -561,3 +680,4 @@ __kstat_delete(kstat_t *ksp) - -- mutex_destroy(&ksp->ks_lock); -+ ksp->ks_lock = NULL; -+ mutex_destroy(&ksp->ks_private_lock); - kmem_free(ksp, sizeof(*ksp)); -diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c -index bcdc98f..48feb1d 100644 ---- a/module/spl/spl-taskq.c -+++ b/module/spl/spl-taskq.c -@@ -841,3 +841,3 @@ taskq_create(const char *name, int nthreads, pri_t pri, - -- tqt->tqt_thread = kthread_create(taskq_thread, tqt, -+ tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, - "%s/%d", name, i); -@@ -845,3 +845,2 @@ taskq_create(const char *name, int nthreads, pri_t pri, - 
list_add(&tqt->tqt_thread_list, &tq->tq_thread_list); -- kthread_bind(tqt->tqt_thread, i % num_online_cpus()); - set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(pri)); -diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c -index 6b3bec5..5c85140 100644 ---- a/module/spl/spl-thread.c -+++ b/module/spl/spl-thread.c -@@ -128,3 +128,3 @@ __thread_create(caddr_t stk, size_t stksize, thread_func_t func, - -- tsk = kthread_create(thread_generic_wrapper, (void *)tp, -+ tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp, - "%s", tp->tp_name); -@@ -139 +139,32 @@ __thread_create(caddr_t stk, size_t stksize, thread_func_t func, - EXPORT_SYMBOL(__thread_create); -+ -+/* -+ * spl_kthread_create - Wrapper providing pre-3.13 semantics for -+ * kthread_create() in which it is not killable and less likely -+ * to return -ENOMEM. -+ */ -+struct task_struct * -+spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) -+{ -+ struct task_struct *tsk; -+ va_list args; -+ char name[TASK_COMM_LEN]; -+ -+ va_start(args, namefmt); -+ vsnprintf(name, sizeof(name), namefmt, args); -+ va_end(args); -+ do { -+ tsk = kthread_create(func, data, "%s", name); -+ if (IS_ERR(tsk)) { -+ if (signal_pending(current)) { -+ clear_thread_flag(TIF_SIGPENDING); -+ continue; -+ } -+ if (PTR_ERR(tsk) == -ENOMEM) -+ continue; -+ return (NULL); -+ } else -+ return (tsk); -+ } while (1); -+} -+EXPORT_SYMBOL(spl_kthread_create); -diff --git a/module/spl/spl-time.c b/module/spl/spl-time.c -index 20fd0e3..0ed49cc 100644 ---- a/module/spl/spl-time.c -+++ b/module/spl/spl-time.c -@@ -42,3 +42,5 @@ __gethrestime(timestruc_t *ts) - { -- struct timespec tspec = current_kernel_time(); -+ struct timespec tspec; -+ -+ getnstimeofday(&tspec); - -diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c -index 0784ff2..5496067 100644 ---- a/module/spl/spl-vnode.c -+++ b/module/spl/spl-vnode.c -@@ -336,3 +336,7 @@ vn_remove(const char *path, uio_seg_t seg, int flags) - -+#ifdef HAVE_2ARGS_VFS_UNLINK - rc = vfs_unlink(parent.dentry->d_inode, dentry); -+#else -+ rc = vfs_unlink(parent.dentry->d_inode, dentry, NULL); -+#endif /* HAVE_2ARGS_VFS_UNLINK */ - exit1: -@@ -414,6 +418,6 @@ vn_rename(const char *oldname, const char *newname, int x1) - rc = vfs_rename(old_dir->d_inode, old_dentry, -- new_dir->d_inode, new_dentry); -+ new_dir->d_inode, new_dentry); - #else -- rc = vfs_rename(old_dir->d_inode, old_dentry, oldnd.nd_mnt, -- new_dir->d_inode, new_dentry, newnd.nd_mnt); -+ rc = vfs_rename(old_dir->d_inode, old_dentry, -+ new_dir->d_inode, new_dentry, NULL); - #endif /* HAVE_4ARGS_VFS_RENAME */ -@@ -480,5 +484,5 @@ vn_remove(const char *path, uio_seg_t seg, int flags) - #ifdef HAVE_2ARGS_VFS_UNLINK -- rc = vfs_unlink(nd.nd_dentry->d_inode, dentry); -+ rc = vfs_unlink(nd.nd_dentry->d_inode, dentry); - #else -- rc = vfs_unlink(nd.nd_dentry->d_inode, dentry, nd.nd_mnt); -+ rc = vfs_unlink(nd.nd_dentry->d_inode, dentry, NULL); - #endif /* HAVE_2ARGS_VFS_UNLINK */ -@@ -573,7 +577,7 @@ vn_rename(const char *oldname, const char *newname, int x1) - #ifdef HAVE_4ARGS_VFS_RENAME -- rc = vfs_rename(old_dir->d_inode, old_dentry, -- new_dir->d_inode, new_dentry); -+ rc = vfs_rename(old_dir->d_inode, old_dentry, -+ new_dir->d_inode, new_dentry); - #else -- rc = vfs_rename(old_dir->d_inode, old_dentry, oldnd.nd_mnt, -- new_dir->d_inode, new_dentry, newnd.nd_mnt); -+ rc = vfs_rename(old_dir->d_inode, old_dentry, -+ new_dir->d_inode, new_dentry, NULL); - #endif /* HAVE_4ARGS_VFS_RENAME */ -diff --git a/module/splat/splat-condvar.c 
b/module/splat/splat-condvar.c -index 1ddde39..3ee2ffc 100644 ---- a/module/splat/splat-condvar.c -+++ b/module/splat/splat-condvar.c -@@ -110,3 +110,3 @@ splat_condvar_test1(struct file *file, void *arg) - ct[i].ct_rc = 0; -- ct[i].ct_thread = kthread_create(splat_condvar_test12_thread, -+ ct[i].ct_thread = spl_kthread_create(splat_condvar_test12_thread, - &ct[i], "%s/%d", SPLAT_CONDVAR_TEST_NAME, i); -@@ -175,3 +175,3 @@ splat_condvar_test2(struct file *file, void *arg) - ct[i].ct_rc = 0; -- ct[i].ct_thread = kthread_create(splat_condvar_test12_thread, -+ ct[i].ct_thread = spl_kthread_create(splat_condvar_test12_thread, - &ct[i], "%s/%d", SPLAT_CONDVAR_TEST_NAME, i); -@@ -256,3 +256,3 @@ splat_condvar_test3(struct file *file, void *arg) - ct[i].ct_rc = 0; -- ct[i].ct_thread = kthread_create(splat_condvar_test34_thread, -+ ct[i].ct_thread = spl_kthread_create(splat_condvar_test34_thread, - &ct[i], "%s/%d", SPLAT_CONDVAR_TEST_NAME, i); -@@ -326,3 +326,3 @@ splat_condvar_test4(struct file *file, void *arg) - ct[i].ct_rc = 0; -- ct[i].ct_thread = kthread_create(splat_condvar_test34_thread, -+ ct[i].ct_thread = spl_kthread_create(splat_condvar_test34_thread, - &ct[i], "%s/%d", SPLAT_CONDVAR_TEST_NAME, i); -diff --git a/module/splat/splat-cred.c b/module/splat/splat-cred.c -index 47dfa02..fadf9bc 100644 ---- a/module/splat/splat-cred.c -+++ b/module/splat/splat-cred.c -@@ -27,2 +27,3 @@ - #include -+#include - #include "splat-internal.h" -@@ -168,8 +169,4 @@ splat_cred_test2(struct file *file, void *arg) - /* -- * On most/all systems it can be expected that a task with root -- * permissions also is a member of the root group, Since the -- * test suite is always run as root we check first that CRED() is -- * a member of the root group, and secondly that it is not a member -- * of our fake group. This test will break is someone happens to -- * create group number NGROUPS_MAX-1 and then added root to it. -+ * Verify the groupmember() works correctly by constructing an interesting -+ * CRED() and checking that the expected gids are part of it. - */ -@@ -178,28 +175,79 @@ splat_cred_test3(struct file *file, void *arg) - { -- gid_t root_gid, fake_gid; -- int rc; -+ gid_t known_gid, missing_gid, tmp_gid; -+ unsigned char rnd; -+ struct group_info *gi; -+ int i, rc; -+ -+ get_random_bytes((void *)&rnd, 1); -+ known_gid = (rnd > 0) ? rnd : 1; -+ missing_gid = 0; -+ -+ /* -+ * Create an interesting known set of gids for test purposes. The -+ * gids are pseudo randomly selected are will be in the range of -+ * 1:(NGROUPS_MAX-1). Gid 0 is explicitly avoided so we can reliably -+ * test for its absence in the test cases. -+ */ -+ gi = groups_alloc(NGROUPS_SMALL); -+ if (gi == NULL) { -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Failed create " -+ "group_info for known gids: %d\n", -ENOMEM); -+ rc = -ENOMEM; -+ goto show_groups; -+ } -+ -+ for (i = 0, tmp_gid = known_gid; i < NGROUPS_SMALL; i++) { -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Adding gid %d " -+ "to current CRED() (%d/%d)\n", tmp_gid, i, gi->ngroups); -+#ifdef HAVE_KUIDGID_T -+ GROUP_AT(gi, i) = make_kgid(current_user_ns(), tmp_gid); -+#else -+ GROUP_AT(gi, i) = tmp_gid; -+#endif /* HAVE_KUIDGID_T */ -+ tmp_gid = ((tmp_gid * 17) % (NGROUPS_MAX - 1)) + 1; -+ } - -- root_gid = 0; -- fake_gid = NGROUPS_MAX-1; -+ /* Set the new groups in the CRED() and release our reference. 
*/ -+ rc = set_current_groups(gi); -+ put_group_info(gi); - -- rc = groupmember(root_gid, CRED()); -+ if (rc) { -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Failed to add " -+ "gid %d to current group: %d\n", known_gid, rc); -+ goto show_groups; -+ } -+ -+ /* Verify groupmember() finds the known_gid in the CRED() */ -+ rc = groupmember(known_gid, CRED()); - if (!rc) { -- splat_vprint(file, SPLAT_CRED_TEST3_NAME, -- "Failed root git %d expected to be member " -- "of CRED() groups: %d\n", root_gid, rc); -- return -EIDRM; -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Failed to find " -+ "known gid %d in CRED()'s groups.\n", known_gid); -+ rc = -EIDRM; -+ goto show_groups; - } - -- rc = groupmember(fake_gid, CRED()); -+ /* Verify groupmember() does NOT finds the missing gid in the CRED() */ -+ rc = groupmember(missing_gid, CRED()); - if (rc) { -- splat_vprint(file, SPLAT_CRED_TEST3_NAME, -- "Failed fake git %d expected not to be member " -- "of CRED() groups: %d\n", fake_gid, rc); -- return -EIDRM; -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Failed missing " -+ "gid %d was found in CRED()'s groups.\n", missing_gid); -+ rc = -EIDRM; -+ goto show_groups; -+ } -+ -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Success groupmember() " -+ "correctly detects expected gids in CRED(): %d\n", rc); -+ -+show_groups: -+ if (rc) { -+ int i, grps = crgetngroups(CRED()); -+ -+ splat_vprint(file, SPLAT_CRED_TEST3_NAME, "%d groups: ", grps); -+ for (i = 0; i < grps; i++) -+ splat_print(file, "%d ", crgetgroups(CRED())[i]); -+ splat_print(file, "%s", "\n"); - } - -- splat_vprint(file, SPLAT_CRED_TEST3_NAME, "Success root gid " -- "is a member of the expected groups: %d\n", rc); - -- return rc; -+ return (rc); - } /* splat_cred_test3() */ -diff --git a/module/splat/splat-ctl.c b/module/splat/splat-ctl.c -index 54b2ff4..6bbe0ab 100644 ---- a/module/splat/splat-ctl.c -+++ b/module/splat/splat-ctl.c -@@ -723 +723,2 @@ MODULE_DESCRIPTION("Solaris Porting LAyer Tests"); - MODULE_LICENSE("GPL"); -+MODULE_VERSION(SPL_META_VERSION "-" SPL_META_RELEASE); -diff --git a/module/splat/splat-kmem.c b/module/splat/splat-kmem.c -index c7f36ca..25a52b4 100644 ---- a/module/splat/splat-kmem.c -+++ b/module/splat/splat-kmem.c -@@ -246,3 +246,3 @@ splat_kmem_test4(struct file *file, void *arg) - #define SPLAT_KMEM_OBJ_COUNT 1024 --#define SPLAT_KMEM_OBJ_RECLAIM 1000 /* objects */ -+#define SPLAT_KMEM_OBJ_RECLAIM 32 /* objects */ - #define SPLAT_KMEM_THREADS 32 -@@ -684,3 +684,3 @@ splat_kmem_cache_thread_test(struct file *file, void *arg, char *name, - -- start = current_kernel_time(); -+ getnstimeofday(&start); - -@@ -709,3 +709,3 @@ splat_kmem_cache_thread_test(struct file *file, void *arg, char *name, - -- stop = current_kernel_time(); -+ getnstimeofday(&stop); - delta = timespec_sub(stop, start); -@@ -897,3 +897,4 @@ splat_kmem_test8(struct file *file, void *arg) - -- for (i = 0; i < 60; i++) { -+ /* Force reclaim every 1/10 a second for 60 seconds. 
*/ -+ for (i = 0; i < 600; i++) { - kmem_cache_reap_now(kcp->kcp_cache); -@@ -905,3 +906,3 @@ splat_kmem_test8(struct file *file, void *arg) - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(HZ); -+ schedule_timeout(HZ / 10); - } -@@ -1205,3 +1206,3 @@ splat_kmem_test13(struct file *file, void *arg) - struct list_head list; -- struct timespec start, delta = { 0, 0 }; -+ struct timespec start, stop, delta = { 0, 0 }; - int size, count, slabs, fails = 0; -@@ -1252,3 +1253,3 @@ splat_kmem_test13(struct file *file, void *arg) - INIT_LIST_HEAD(&list); -- start = current_kernel_time(); -+ getnstimeofday(&start); - -@@ -1261,3 +1262,4 @@ splat_kmem_test13(struct file *file, void *arg) - -- delta = timespec_sub(current_kernel_time(), start); -+ getnstimeofday(&stop); -+ delta = timespec_sub(stop, start); - if (delta.tv_sec >= max_time) { -@@ -1272,3 +1274,3 @@ splat_kmem_test13(struct file *file, void *arg) - -- dp = (dummy_page_t *)__get_free_page(GFP_KERNEL | __GFP_NORETRY); -+ dp = (dummy_page_t *)__get_free_page(GFP_KERNEL); - if (!dp) { -diff --git a/module/splat/splat-rwlock.c b/module/splat/splat-rwlock.c -index a865fb3..6faf7d2 100644 ---- a/module/splat/splat-rwlock.c -+++ b/module/splat/splat-rwlock.c -@@ -217,6 +217,6 @@ splat_rwlock_test1(struct file *file, void *arg) - if (i == 0) -- rwt[i].rwt_thread = kthread_create(splat_rwlock_wr_thr, -+ rwt[i].rwt_thread = spl_kthread_create(splat_rwlock_wr_thr, - &rwt[i], "%s/%d", SPLAT_RWLOCK_TEST_NAME, i); - else -- rwt[i].rwt_thread = kthread_create(splat_rwlock_rd_thr, -+ rwt[i].rwt_thread = spl_kthread_create(splat_rwlock_rd_thr, - &rwt[i], "%s/%d", SPLAT_RWLOCK_TEST_NAME, i); -diff --git a/rpm/generic/spl-kmod.spec.in b/rpm/generic/spl-kmod.spec.in -index e8d88c1..50947c0 100644 ---- a/rpm/generic/spl-kmod.spec.in -+++ b/rpm/generic/spl-kmod.spec.in -@@ -164,3 +164,3 @@ rm -rf $RPM_BUILD_ROOT - - Released 0.6.2-1 --* Tue Mar 22 2013 Brian Behlendorf - 0.6.1-1 -+* Fri Mar 22 2013 Brian Behlendorf - 0.6.1-1 - - First official stable release. -diff --git a/rpm/generic/spl.spec.in b/rpm/generic/spl.spec.in -index 804584a..a0fe298 100644 ---- a/rpm/generic/spl.spec.in -+++ b/rpm/generic/spl.spec.in -@@ -15,3 +15,3 @@ ExcludeArch: ppc ppc64 - --Requires: %{name}-kmod >= %{version} -+Requires: %{name}-kmod = %{version} - Provides: %{name}-kmod-common = %{version} -@@ -37,2 +37,3 @@ make install DESTDIR=%{?buildroot} - %{_mandir}/man1/* -+%{_mandir}/man5/* - --- -1.9.2 - diff --git a/zfs-git/20140411-zfs-git-master.patch b/zfs-git/20140411-zfs-git-master.patch deleted file mode 100644 index cd9fc1b..0000000 --- a/zfs-git/20140411-zfs-git-master.patch +++ /dev/null @@ -1,57337 +0,0 @@ -diff --git a/.gitmodules b/.gitmodules -new file mode 100644 -index 0000000..d400f10 ---- /dev/null -+++ b/.gitmodules -@@ -0,0 +1,3 @@ -+[submodule "scripts/zfs-images"] -+ path = scripts/zfs-images -+ url = https://github.com/zfsonlinux/zfs-images -diff --git a/Makefile.am b/Makefile.am -index 9c299a9..dfb006b 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -42,2 +42,6 @@ dist-hook: - -+checkstyle: -+ @find ${top_srcdir} -name '*.[hc]' ! -name 'zfs_config.*' \ -+ ! 
-name '*.mod.c' -type f -exec scripts/cstyle.pl {} \+ -+ - ctags: -diff --git a/cmd/Makefile.am b/cmd/Makefile.am -index bad1af6..968c6c1 100644 ---- a/cmd/Makefile.am -+++ b/cmd/Makefile.am -@@ -1,2 +1,2 @@ - SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios --SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat -+SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat zed -diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py -index e01dd8b..ba79235 100755 ---- a/cmd/arcstat/arcstat.py -+++ b/cmd/arcstat/arcstat.py -@@ -53,3 +53,3 @@ import copy - from decimal import Decimal --from signal import signal, SIGINT -+from signal import signal, SIGINT, SIG_DFL - -@@ -92,2 +92,3 @@ cols = { - "l2miss%": [7, 100, "L2ARC access miss percentage"], -+ "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], - "l2size": [6, 1024, "Size of the L2ARC"], -@@ -98,5 +99,5 @@ v = {} - hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis", -- "mm%", "arcsz", "c"] -+ "mm%", "arcsz", "c"] - xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "rmis", -- "dread", "pread", "read"] -+ "dread", "pread", "read"] - sint = 1 # Default interval is 1 second -@@ -108,4 +109,4 @@ version = "0.4" - l2exist = False --cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval " -- "[count]]\n") -+cmd = ("Usage: arcstat.py [-hvx] [-f fields] [-o file] [-s string] [interval " -+ "[count]]\n") - cur = {} -@@ -131,3 +132,3 @@ def usage(): - sys.stderr.write("\t -v : List all possible field headers and definitions" -- "\n") -+ "\n") - sys.stderr.write("\t -x : Print extended stats\n") -@@ -136,8 +137,8 @@ def usage(): - sys.stderr.write("\t -s : Override default field separator with custom " -- "character or string\n") -+ "character or string\n") - sys.stderr.write("\nExamples:\n") -- sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") -- sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") -- sys.stderr.write("\tarcstat -v\n") -- sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") -+ sys.stderr.write("\tarcstat.py -o /tmp/a.log 2 10\n") -+ sys.stderr.write("\tarcstat.py -s \",\" -o /tmp/a.log 2 10\n") -+ sys.stderr.write("\tarcstat.py -v\n") -+ sys.stderr.write("\tarcstat.py -f time,hit%,dh%,ph%,mh% 1\n") - sys.stderr.write("\n") -@@ -193,3 +194,3 @@ def prettynum(sz, scale, num=0): - # Rounding error, return 0 -- elif num > 0 and num < 1: -+ elif 0 < num < 1: - num = 0 -@@ -219,3 +220,3 @@ def print_values(): - sep -- )) -+ )) - sys.stdout.write("\n") -@@ -231,2 +232,10 @@ def print_header(): - -+def get_terminal_lines(): -+ try: -+ import fcntl, termios, struct -+ data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234') -+ sz = struct.unpack('hh', data) -+ return sz[0] -+ except: -+ pass - -@@ -236,2 +245,3 @@ def init(): - global hdr -+ global hdr_intr - global xhdr -@@ -261,6 +271,6 @@ def init(): - ) -- -- except getopt.error, msg: -+ except getopt.error as msg: - sys.stderr.write(msg) - usage() -+ opts = None - -@@ -305,2 +315,6 @@ def init(): - -+ lines = get_terminal_lines() -+ if lines: -+ hdr_intr = lines - 3 -+ - # check if L2ARC exists -@@ -328,5 +342,4 @@ def init(): - if len(incompat) > 0: -- sys.stderr.write("Incompatible field specified! -- %s\n" % ( -- incompat, -- )) -+ sys.stderr.write("Incompatible field specified! 
-- %s\n" % -+ incompat) - usage() -@@ -338,3 +351,3 @@ def init(): - -- except: -+ except IOError: - sys.stderr.write("Cannot open %s for writing\n" % opfile) -@@ -348,3 +361,3 @@ def calculate(): - -- v = {} -+ v = dict() - v["time"] = time.strftime("%H:%M:%S", time.localtime()) -@@ -365,3 +378,3 @@ def calculate(): - v["pmis"] = (d["prefetch_data_misses"] + -- d["prefetch_metadata_misses"]) / sint -+ d["prefetch_metadata_misses"]) / sint - -@@ -372,5 +385,5 @@ def calculate(): - v["mhit"] = (d["prefetch_metadata_hits"] + -- d["demand_metadata_hits"]) / sint -+ d["demand_metadata_hits"]) / sint - v["mmis"] = (d["prefetch_metadata_misses"] + -- d["demand_metadata_misses"]) / sint -+ d["demand_metadata_misses"]) / sint - -@@ -397,2 +410,3 @@ def calculate(): - v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 -+ v["l2asize"] = cur["l2_asize"] - v["l2size"] = cur["l2_size"] -@@ -401,6 +415,2 @@ def calculate(): - --def sighandler(*args): -- sys.exit(0) -- -- - def main(): -@@ -417,3 +427,3 @@ def main(): - -- signal(SIGINT, sighandler) -+ signal(SIGINT, SIG_DFL) - while True: -diff --git a/cmd/dbufstat/Makefile.am b/cmd/dbufstat/Makefile.am -new file mode 100644 -index 0000000..0548b24 ---- /dev/null -+++ b/cmd/dbufstat/Makefile.am -@@ -0,0 +1,2 @@ -+bin_SCRIPTS = dbufstat.py -+EXTRA_DIST = $(bin_SCRIPTS) -diff --git a/cmd/dbufstat/dbufstat.py b/cmd/dbufstat/dbufstat.py -new file mode 100755 -index 0000000..5f75376 ---- /dev/null -+++ b/cmd/dbufstat/dbufstat.py -@@ -0,0 +1,582 @@ -+#!/usr/bin/python -+# -+# Print out statistics for all cached dmu buffers. This information -+# is available through the dbufs kstat and may be post-processed as -+# needed by the script. -+# -+# CDDL HEADER START -+# -+# The contents of this file are subject to the terms of the -+# Common Development and Distribution License, Version 1.0 only -+# (the "License"). You may not use this file except in compliance -+# with the License. -+# -+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+# or http://www.opensolaris.org/os/licensing. -+# See the License for the specific language governing permissions -+# and limitations under the License. -+# -+# When distributing Covered Code, include this CDDL HEADER in each -+# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+# If applicable, add the following below this CDDL HEADER, with the -+# fields enclosed by brackets "[]" replaced with your own identifying -+# information: Portions Copyright [yyyy] [name of copyright owner] -+# -+# CDDL HEADER END -+# -+# Copyright (C) 2013 Lawrence Livermore National Security, LLC. -+# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
-+# -+ -+import sys -+import getopt -+import errno -+ -+bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] -+bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", -+ "meta", "state", "dbholds", "list", "atype", "index", "flags", -+ "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", -+ "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", -+ "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] -+bincompat = ["cached", "direct", "indirect", "bonus", "spill"] -+ -+dhdr = ["pool", "objset", "object", "dtype", "cached"] -+dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", -+ "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", -+ "indirect", "bonus", "spill"] -+dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", -+ "list", "atype", "index", "flags", "count", "asize", "access", -+ "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", -+ "l2_comp", "aholds"] -+ -+thdr = ["pool", "objset", "dtype", "cached"] -+txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", -+ "bonus", "spill"] -+tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", -+ "dbholds", "list", "atype", "index", "flags", "count", "asize", -+ "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", -+ "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs", -+ "bsize", "lvls", "dholds", "blocks", "dsize"] -+ -+cols = { -+ # hdr: [size, scale, description] -+ "pool": [15, -1, "pool name"], -+ "objset": [6, -1, "dataset identification number"], -+ "object": [10, -1, "object number"], -+ "level": [5, -1, "indirection level of buffer"], -+ "blkid": [8, -1, "block number of buffer"], -+ "offset": [12, 1024, "offset in object of buffer"], -+ "dbsize": [7, 1024, "size of buffer"], -+ "meta": [4, -1, "is this buffer metadata?"], -+ "state": [5, -1, "state of buffer (read, cached, etc)"], -+ "dbholds": [7, 1000, "number of holds on buffer"], -+ "list": [4, -1, "which ARC list contains this buffer"], -+ "atype": [7, -1, "ARC header type (data or metadata)"], -+ "index": [5, -1, "buffer's index into its ARC list"], -+ "flags": [8, -1, "ARC read flags"], -+ "count": [5, -1, "ARC data count"], -+ "asize": [7, 1024, "size of this ARC buffer"], -+ "access": [10, -1, "time this ARC buffer was last accessed"], -+ "mru": [5, 1000, "hits while on the ARC's MRU list"], -+ "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"], -+ "mfu": [5, 1000, "hits while on the ARC's MFU list"], -+ "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"], -+ "l2": [5, 1000, "hits while on the L2ARC"], -+ "l2_dattr": [8, -1, "L2ARC disk address/offset"], -+ "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"], -+ "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"], -+ "aholds": [6, 1000, "number of holds on this ARC buffer"], -+ "dtype": [27, -1, "dnode type"], -+ "btype": [27, -1, "bonus buffer type"], -+ "data_bs": [7, 1024, "data block size"], -+ "meta_bs": [7, 1024, "metadata block size"], -+ "bsize": [6, 1024, "bonus buffer size"], -+ "lvls": [6, -1, "number of indirection levels"], -+ "dholds": [6, 1000, "number of holds on dnode"], -+ "blocks": [8, 1000, "number of allocated blocks"], -+ "dsize": [12, 1024, "size of dnode"], -+ "cached": [6, 1024, "bytes cached for all blocks"], -+ "direct": [6, 1024, "bytes cached for direct blocks"], -+ "indirect": [8, 1024, "bytes cached for indirect blocks"], -+ "bonus": [5, 1024, "bytes 
cached for bonus buffer"], -+ "spill": [5, 1024, "bytes cached for spill block"], -+} -+ -+hdr = None -+xhdr = None -+sep = " " # Default separator is 2 spaces -+cmd = ("Usage: dbufstat.py [-bdhrtvx] [-i file] [-f fields] [-o file] " -+ "[-s string]\n") -+raw = 0 -+ -+ -+def print_incompat_helper(incompat): -+ cnt = 0 -+ for key in sorted(incompat): -+ if cnt is 0: -+ sys.stderr.write("\t") -+ elif cnt > 8: -+ sys.stderr.write(",\n\t") -+ cnt = 0 -+ else: -+ sys.stderr.write(", ") -+ -+ sys.stderr.write("%s" % key) -+ cnt += 1 -+ -+ sys.stderr.write("\n\n") -+ -+ -+def detailed_usage(): -+ sys.stderr.write("%s\n" % cmd) -+ -+ sys.stderr.write("Field definitions incompatible with '-b' option:\n") -+ print_incompat_helper(bincompat) -+ -+ sys.stderr.write("Field definitions incompatible with '-d' option:\n") -+ print_incompat_helper(dincompat) -+ -+ sys.stderr.write("Field definitions incompatible with '-t' option:\n") -+ print_incompat_helper(tincompat) -+ -+ sys.stderr.write("Field definitions are as follows:\n") -+ for key in sorted(cols.keys()): -+ sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) -+ sys.stderr.write("\n") -+ -+ sys.exit(1) -+ -+ -+def usage(): -+ sys.stderr.write("%s\n" % cmd) -+ sys.stderr.write("\t -b : Print table of information for each dbuf\n") -+ sys.stderr.write("\t -d : Print table of information for each dnode\n") -+ sys.stderr.write("\t -h : Print this help message\n") -+ sys.stderr.write("\t -r : Print raw values\n") -+ sys.stderr.write("\t -t : Print table of information for each dnode type" -+ "\n") -+ sys.stderr.write("\t -v : List all possible field headers and definitions" -+ "\n") -+ sys.stderr.write("\t -x : Print extended stats\n") -+ sys.stderr.write("\t -i : Redirect input from the specified file\n") -+ sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") -+ sys.stderr.write("\t -o : Redirect output to the specified file\n") -+ sys.stderr.write("\t -s : Override default field separator with custom " -+ "character or string\n") -+ sys.stderr.write("\nExamples:\n") -+ sys.stderr.write("\tdbufstat.py -d -o /tmp/d.log\n") -+ sys.stderr.write("\tdbufstat.py -t -s \",\" -o /tmp/t.log\n") -+ sys.stderr.write("\tdbufstat.py -v\n") -+ sys.stderr.write("\tdbufstat.py -d -f pool,object,objset,dsize,cached\n") -+ sys.stderr.write("\n") -+ -+ sys.exit(1) -+ -+ -+def prettynum(sz, scale, num=0): -+ global raw -+ -+ suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] -+ index = 0 -+ save = 0 -+ -+ if raw or scale == -1: -+ return "%*s" % (sz, num) -+ -+ # Rounding error, return 0 -+ elif 0 < num < 1: -+ num = 0 -+ -+ while num > scale and index < 5: -+ save = num -+ num = num / scale -+ index += 1 -+ -+ if index == 0: -+ return "%*d" % (sz, num) -+ -+ if (save / scale) < 10: -+ return "%*.1f%s" % (sz - 1, num, suffix[index]) -+ else: -+ return "%*d%s" % (sz - 1, num, suffix[index]) -+ -+ -+def print_values(v): -+ global hdr -+ global sep -+ -+ try: -+ for col in hdr: -+ sys.stdout.write("%s%s" % ( -+ prettynum(cols[col][0], cols[col][1], v[col]), sep)) -+ sys.stdout.write("\n") -+ except IOError as e: -+ if e.errno == errno.EPIPE: -+ sys.exit(1) -+ -+ -+def print_header(): -+ global hdr -+ global sep -+ -+ try: -+ for col in hdr: -+ sys.stdout.write("%*s%s" % (cols[col][0], col, sep)) -+ sys.stdout.write("\n") -+ except IOError as e: -+ if e.errno == errno.EPIPE: -+ sys.exit(1) -+ -+ -+def get_typestring(t): -+ type_strings = ["DMU_OT_NONE", -+ # general: -+ "DMU_OT_OBJECT_DIRECTORY", -+ "DMU_OT_OBJECT_ARRAY", -+ "DMU_OT_PACKED_NVLIST", -+ 
"DMU_OT_PACKED_NVLIST_SIZE", -+ "DMU_OT_BPOBJ", -+ "DMU_OT_BPOBJ_HDR", -+ # spa: -+ "DMU_OT_SPACE_MAP_HEADER", -+ "DMU_OT_SPACE_MAP", -+ # zil: -+ "DMU_OT_INTENT_LOG", -+ # dmu: -+ "DMU_OT_DNODE", -+ "DMU_OT_OBJSET", -+ # dsl: -+ "DMU_OT_DSL_DIR", -+ "DMU_OT_DSL_DIR_CHILD_MAP", -+ "DMU_OT_DSL_DS_SNAP_MAP", -+ "DMU_OT_DSL_PROPS", -+ "DMU_OT_DSL_DATASET", -+ # zpl: -+ "DMU_OT_ZNODE", -+ "DMU_OT_OLDACL", -+ "DMU_OT_PLAIN_FILE_CONTENTS", -+ "DMU_OT_DIRECTORY_CONTENTS", -+ "DMU_OT_MASTER_NODE", -+ "DMU_OT_UNLINKED_SET", -+ # zvol: -+ "DMU_OT_ZVOL", -+ "DMU_OT_ZVOL_PROP", -+ # other; for testing only! -+ "DMU_OT_PLAIN_OTHER", -+ "DMU_OT_UINT64_OTHER", -+ "DMU_OT_ZAP_OTHER", -+ # new object types: -+ "DMU_OT_ERROR_LOG", -+ "DMU_OT_SPA_HISTORY", -+ "DMU_OT_SPA_HISTORY_OFFSETS", -+ "DMU_OT_POOL_PROPS", -+ "DMU_OT_DSL_PERMS", -+ "DMU_OT_ACL", -+ "DMU_OT_SYSACL", -+ "DMU_OT_FUID", -+ "DMU_OT_FUID_SIZE", -+ "DMU_OT_NEXT_CLONES", -+ "DMU_OT_SCAN_QUEUE", -+ "DMU_OT_USERGROUP_USED", -+ "DMU_OT_USERGROUP_QUOTA", -+ "DMU_OT_USERREFS", -+ "DMU_OT_DDT_ZAP", -+ "DMU_OT_DDT_STATS", -+ "DMU_OT_SA", -+ "DMU_OT_SA_MASTER_NODE", -+ "DMU_OT_SA_ATTR_REGISTRATION", -+ "DMU_OT_SA_ATTR_LAYOUTS", -+ "DMU_OT_SCAN_XLATE", -+ "DMU_OT_DEDUP", -+ "DMU_OT_DEADLIST", -+ "DMU_OT_DEADLIST_HDR", -+ "DMU_OT_DSL_CLONES", -+ "DMU_OT_BPOBJ_SUBOBJ"] -+ -+ # If "-rr" option is used, don't convert to string representation -+ if raw > 1: -+ return "%i" % t -+ -+ try: -+ return type_strings[t] -+ except IndexError: -+ return "%i" % t -+ -+ -+def get_compstring(c): -+ comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON", -+ "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB", -+ "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1", -+ "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3", -+ "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5", -+ "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7", -+ "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9", -+ "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4", -+ "ZIO_COMPRESS_FUNCTION"] -+ -+ # If "-rr" option is used, don't convert to string representation -+ if raw > 1: -+ return "%i" % c -+ -+ try: -+ return comp_strings[c] -+ except IndexError: -+ return "%i" % c -+ -+ -+def parse_line(line, labels): -+ global hdr -+ -+ new = dict() -+ val = None -+ for col in hdr: -+ # These are "special" fields computed in the update_dict -+ # function, prevent KeyError exception on labels[col] for these. 
-+ if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']: -+ val = line[labels[col]] -+ -+ if col in ['pool', 'flags']: -+ new[col] = str(val) -+ elif col in ['dtype', 'btype']: -+ new[col] = get_typestring(int(val)) -+ elif col in ['l2_comp']: -+ new[col] = get_compstring(int(val)) -+ else: -+ new[col] = int(val) -+ -+ return new -+ -+ -+def update_dict(d, k, line, labels): -+ pool = line[labels['pool']] -+ objset = line[labels['objset']] -+ key = line[labels[k]] -+ -+ dbsize = int(line[labels['dbsize']]) -+ blkid = int(line[labels['blkid']]) -+ level = int(line[labels['level']]) -+ -+ if pool not in d: -+ d[pool] = dict() -+ -+ if objset not in d[pool]: -+ d[pool][objset] = dict() -+ -+ if key not in d[pool][objset]: -+ d[pool][objset][key] = parse_line(line, labels) -+ d[pool][objset][key]['bonus'] = 0 -+ d[pool][objset][key]['cached'] = 0 -+ d[pool][objset][key]['direct'] = 0 -+ d[pool][objset][key]['indirect'] = 0 -+ d[pool][objset][key]['spill'] = 0 -+ -+ d[pool][objset][key]['cached'] += dbsize -+ -+ if blkid == -1: -+ d[pool][objset][key]['bonus'] += dbsize -+ elif blkid == -2: -+ d[pool][objset][key]['spill'] += dbsize -+ else: -+ if level == 0: -+ d[pool][objset][key]['direct'] += dbsize -+ else: -+ d[pool][objset][key]['indirect'] += dbsize -+ -+ return d -+ -+ -+def print_dict(d): -+ print_header() -+ for pool in d.keys(): -+ for objset in d[pool].keys(): -+ for v in d[pool][objset].values(): -+ print_values(v) -+ -+ -+def dnodes_build_dict(filehandle): -+ labels = dict() -+ dnodes = dict() -+ -+ # First 3 lines are header information, skip the first two -+ for i in range(2): -+ next(filehandle) -+ -+ # The third line contains the labels and index locations -+ for i, v in enumerate(next(filehandle).split()): -+ labels[v] = i -+ -+ # The rest of the file is buffer information -+ for line in filehandle: -+ update_dict(dnodes, 'object', line.split(), labels) -+ -+ return dnodes -+ -+ -+def types_build_dict(filehandle): -+ labels = dict() -+ types = dict() -+ -+ # First 3 lines are header information, skip the first two -+ for i in range(2): -+ next(filehandle) -+ -+ # The third line contains the labels and index locations -+ for i, v in enumerate(next(filehandle).split()): -+ labels[v] = i -+ -+ # The rest of the file is buffer information -+ for line in filehandle: -+ update_dict(types, 'dtype', line.split(), labels) -+ -+ return types -+ -+ -+def buffers_print_all(filehandle): -+ labels = dict() -+ -+ # First 3 lines are header information, skip the first two -+ for i in range(2): -+ next(filehandle) -+ -+ # The third line contains the labels and index locations -+ for i, v in enumerate(next(filehandle).split()): -+ labels[v] = i -+ -+ print_header() -+ -+ # The rest of the file is buffer information -+ for line in filehandle: -+ print_values(parse_line(line.split(), labels)) -+ -+ -+def main(): -+ global hdr -+ global sep -+ global raw -+ -+ desired_cols = None -+ bflag = False -+ dflag = False -+ hflag = False -+ ifile = None -+ ofile = None -+ tflag = False -+ vflag = False -+ xflag = False -+ -+ try: -+ opts, args = getopt.getopt( -+ sys.argv[1:], -+ "bdf:hi:o:rs:tvx", -+ [ -+ "buffers", -+ "dnodes", -+ "columns", -+ "help", -+ "infile", -+ "outfile", -+ "seperator", -+ "types", -+ "verbose", -+ "extended" -+ ] -+ ) -+ except getopt.error: -+ usage() -+ opts = None -+ -+ for opt, arg in opts: -+ if opt in ('-b', '--buffers'): -+ bflag = True -+ if opt in ('-d', '--dnodes'): -+ dflag = True -+ if opt in ('-f', '--columns'): -+ desired_cols = arg -+ if opt in ('-h', 
'--help'): -+ hflag = True -+ if opt in ('-i', '--infile'): -+ ifile = arg -+ if opt in ('-o', '--outfile'): -+ ofile = arg -+ if opt in ('-r', '--raw'): -+ raw += 1 -+ if opt in ('-s', '--seperator'): -+ sep = arg -+ if opt in ('-t', '--types'): -+ tflag = True -+ if opt in ('-v', '--verbose'): -+ vflag = True -+ if opt in ('-x', '--extended'): -+ xflag = True -+ -+ if hflag or (xflag and desired_cols): -+ usage() -+ -+ if vflag: -+ detailed_usage() -+ -+ # Ensure at most only one of b, d, or t flags are set -+ if (bflag and dflag) or (bflag and tflag) or (dflag and tflag): -+ usage() -+ -+ if bflag: -+ hdr = bxhdr if xflag else bhdr -+ elif tflag: -+ hdr = txhdr if xflag else thdr -+ else: # Even if dflag is False, it's the default if none set -+ dflag = True -+ hdr = dxhdr if xflag else dhdr -+ -+ if desired_cols: -+ hdr = desired_cols.split(",") -+ -+ invalid = [] -+ incompat = [] -+ for ele in hdr: -+ if ele not in cols: -+ invalid.append(ele) -+ elif ((bflag and bincompat and ele in bincompat) or -+ (dflag and dincompat and ele in dincompat) or -+ (tflag and tincompat and ele in tincompat)): -+ incompat.append(ele) -+ -+ if len(invalid) > 0: -+ sys.stderr.write("Invalid column definition! -- %s\n" % invalid) -+ usage() -+ -+ if len(incompat) > 0: -+ sys.stderr.write("Incompatible field specified! -- %s\n" % -+ incompat) -+ usage() -+ -+ if ofile: -+ try: -+ tmp = open(ofile, "w") -+ sys.stdout = tmp -+ -+ except IOError: -+ sys.stderr.write("Cannot open %s for writing\n" % ofile) -+ sys.exit(1) -+ -+ if not ifile: -+ ifile = '/proc/spl/kstat/zfs/dbufs' -+ -+ if ifile is not "-": -+ try: -+ tmp = open(ifile, "r") -+ sys.stdin = tmp -+ except IOError: -+ sys.stderr.write("Cannot open %s for reading\n" % ifile) -+ sys.exit(1) -+ -+ if bflag: -+ buffers_print_all(sys.stdin) -+ -+ if dflag: -+ print_dict(dnodes_build_dict(sys.stdin)) -+ -+ if tflag: -+ print_dict(types_build_dict(sys.stdin)) -+ -+if __name__ == '__main__': -+ main() -diff --git a/cmd/mount_zfs/Makefile.am b/cmd/mount_zfs/Makefile.am -index 7abcc30..e5f3d08 100644 ---- a/cmd/mount_zfs/Makefile.am -+++ b/cmd/mount_zfs/Makefile.am -@@ -20,5 +20,3 @@ mount_zfs_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -- --mount_zfs_LDFLAGS = \ -- -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) $(LIBSELINUX) -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la -diff --git a/cmd/mount_zfs/mount_zfs.c b/cmd/mount_zfs/mount_zfs.c -index 4db33ed..6cb23d1 100644 ---- a/cmd/mount_zfs/mount_zfs.c -+++ b/cmd/mount_zfs/mount_zfs.c -@@ -33,5 +33,2 @@ - #include --#ifdef HAVE_LIBSELINUX --#include --#endif /* HAVE_LIBSELINUX */ - -@@ -63,2 +60,6 @@ static const option_map_t option_map[] = { - { MNTOPT_USERS, MS_USERS, ZS_COMMENT }, -+ /* acl flags passed with util-linux-2.24 mount command */ -+ { MNTOPT_ACL, MS_POSIXACL, ZS_COMMENT }, -+ { MNTOPT_NOACL, MS_COMMENT, ZS_COMMENT }, -+ { MNTOPT_POSIXACL, MS_POSIXACL, ZS_COMMENT }, - #ifdef MS_NOATIME -@@ -75,7 +76,6 @@ static const option_map_t option_map[] = { - #endif -- { MNTOPT_CONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -- { MNTOPT_NOCONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -- { MNTOPT_FSCONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -- { MNTOPT_DEFCONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -- { MNTOPT_ROOTCONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -+ { MNTOPT_CONTEXT, MS_COMMENT, ZS_COMMENT }, -+ { MNTOPT_FSCONTEXT, MS_COMMENT, ZS_COMMENT }, -+ { MNTOPT_DEFCONTEXT, MS_COMMENT, ZS_COMMENT }, -+ { MNTOPT_ROOTCONTEXT, MS_COMMENT, ZS_COMMENT 
}, - #ifdef MS_I_VERSION -@@ -270,3 +270,3 @@ out: - /* Do not add one when cwd already ends in a trailing '/' */ -- if (!strncmp(cwd, dataset, len)) -+ if (strncmp(cwd, dataset, len) == 0) - return (dataset + len + (cwd[len-1] != '/')); -@@ -336,2 +336,26 @@ mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts) - -+static void -+__zfs_selinux_setcontext(const char *name, const char *context, char *mntopts, -+ char *mtabopt) -+{ -+ char tmp[MNT_LINE_MAX]; -+ -+ snprintf(tmp, MNT_LINE_MAX, ",%s=\"%s\"", name, context); -+ strlcat(mntopts, tmp, MNT_LINE_MAX); -+ strlcat(mtabopt, tmp, MNT_LINE_MAX); -+} -+ -+static void -+zfs_selinux_setcontext(zfs_handle_t *zhp, zfs_prop_t zpt, const char *name, -+ char *mntopts, char *mtabopt) -+{ -+ char context[ZFS_MAXPROPLEN]; -+ -+ if (zfs_prop_get(zhp, zpt, context, sizeof (context), -+ NULL, NULL, 0, B_FALSE) == 0) { -+ if (strcmp(context, "none") != 0) -+ __zfs_selinux_setcontext(name, context, mntopts, mtabopt); -+ } -+} -+ - int -@@ -340,3 +364,3 @@ main(int argc, char **argv) - zfs_handle_t *zhp; -- char legacy[ZFS_MAXPROPLEN]; -+ char prop[ZFS_MAXPROPLEN]; - char mntopts[MNT_LINE_MAX] = { '\0' }; -@@ -422,3 +446,3 @@ main(int argc, char **argv) - (void) fprintf(stderr, gettext("filesystem '%s' " -- "cannot be mounted of due invalid option " -+ "cannot be mounted due to invalid option " - "'%s'.\n"), dataset, badopt); -@@ -435,18 +459,2 @@ main(int argc, char **argv) - --#ifdef HAVE_LIBSELINUX -- /* -- * Automatically add the default zfs context when selinux is enabled -- * and the caller has not specified their own context. This must be -- * done until zfs is added to the default selinux policy configuration -- * as a known filesystem type which supports xattrs. -- */ -- if (is_selinux_enabled() && !(zfsflags & ZS_NOCONTEXT)) { -- (void) strlcat(mntopts, ",context=\"system_u:" -- "object_r:file_t:s0\"", sizeof (mntopts)); -- (void) strlcat(mtabopt, ",context=\"system_u:" -- "object_r:file_t:s0\"", sizeof (mtabopt)); -- } --#endif /* HAVE_LIBSELINUX */ -- -- - if (verbose) -@@ -478,8 +486,32 @@ main(int argc, char **argv) - -+ /* -+ * Checks to see if the ZFS_PROP_SELINUX_CONTEXT exists -+ * if it does, create a tmp variable in case it's needed -+ * checks to see if the selinux context is set to the default -+ * if it is, allow the setting of the other context properties -+ * this is needed because the 'context' property overrides others -+ * if it is not the default, set the 'context' property -+ */ -+ if (zfs_prop_get(zhp, ZFS_PROP_SELINUX_CONTEXT, prop, sizeof (prop), -+ NULL, NULL, 0, B_FALSE) == 0) { -+ if (strcmp(prop, "none") == 0) { -+ zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_FSCONTEXT, -+ MNTOPT_FSCONTEXT, mntopts, mtabopt); -+ zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_DEFCONTEXT, -+ MNTOPT_DEFCONTEXT, mntopts, mtabopt); -+ zfs_selinux_setcontext(zhp, -+ ZFS_PROP_SELINUX_ROOTCONTEXT, MNTOPT_ROOTCONTEXT, -+ mntopts, mtabopt); -+ } else { -+ __zfs_selinux_setcontext(MNTOPT_CONTEXT, -+ prop, mntopts, mtabopt); -+ } -+ } -+ - /* treat all snapshots as legacy mount points */ - if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) -- (void) strlcpy(legacy, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN); -+ (void) strlcpy(prop, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN); - else -- (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, legacy, -- sizeof (legacy), NULL, NULL, 0, B_FALSE); -+ (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, prop, -+ sizeof (prop), NULL, NULL, 0, B_FALSE); - -@@ -499,3 +531,3 @@ main(int argc, char **argv) - */ -- if (zfsutil 
&& !strcmp(legacy, ZFS_MOUNTPOINT_LEGACY)) { -+ if (zfsutil && (strcmp(prop, ZFS_MOUNTPOINT_LEGACY) == 0)) { - (void) fprintf(stderr, gettext( -@@ -504,3 +536,3 @@ main(int argc, char **argv) - "See zfs(8) for more information.\n"), -- dataset, mntpoint, dataset, mntpoint); -+ dataset, mntpoint, dataset, mntpoint); - return (MOUNT_USAGE); -@@ -509,3 +541,3 @@ main(int argc, char **argv) - if (!zfsutil && !(remount || fake) && -- strcmp(legacy, ZFS_MOUNTPOINT_LEGACY)) { -+ strcmp(prop, ZFS_MOUNTPOINT_LEGACY)) { - (void) fprintf(stderr, gettext( -@@ -530,3 +562,3 @@ main(int argc, char **argv) - "'%s' is already mounted\n"), dataset); -- return (MOUNT_SYSERR); -+ return (MOUNT_BUSY); - default: -diff --git a/cmd/vdev_id/vdev_id b/cmd/vdev_id/vdev_id -index 3cf1b58..b6752ba 100755 ---- a/cmd/vdev_id/vdev_id -+++ b/cmd/vdev_id/vdev_id -@@ -41,14 +41,16 @@ - # --# # Linux Mapped --# # Slot Slot --# slot 1 7 --# slot 2 10 --# slot 3 3 --# slot 4 6 --# slot 5 2 --# slot 6 8 --# slot 7 1 --# slot 8 4 --# slot 9 9 --# slot 10 5 -+# # Custom mapping for Channel A -+# -+# # Linux Mapped -+# # Slot Slot Channel -+# slot 1 7 A -+# slot 2 10 A -+# slot 3 3 A -+# slot 4 6 A -+# -+# # Default mapping for B, C, and D -+# slot 1 4 -+# slot 2 2 -+# slot 3 1 -+# slot 4 3 - -@@ -112,6 +114,7 @@ map_slot() { - local LINUX_SLOT=$1 -+ local CHANNEL=$2 - local MAPPED_SLOT= - -- MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} \ -- { print \\$3; exit }" $CONFIG` -+ MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \ -+ \\$4 ~ /^(${CHANNEL}|)$/ { print \\$3; exit }" $CONFIG` - if [ -z "$MAPPED_SLOT" ] ; then -@@ -181,4 +184,6 @@ sas_handler() { - -- # Get the raw scsi device name from multipath -l. -- DEV=`multipath -l $DM_NAME |awk '/running/{print $3 ; exit}'` -+ # Get the raw scsi device name from multipath -l. Strip off -+ # leading pipe symbols to make field numbering consistent. -+ DEV=`multipath -l $DM_NAME | -+ awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` - if [ -z "$DEV" ] ; then -@@ -254,4 +259,4 @@ sas_handler() { - -- SLOT=`map_slot $SLOT` - CHAN=`map_channel $PCI_ID $PORT` -+ SLOT=`map_slot $SLOT $CHAN` - if [ -z "$CHAN" ] ; then -diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am -index f82f1a3..854fbab 100644 ---- a/cmd/zdb/Makefile.am -+++ b/cmd/zdb/Makefile.am -@@ -16,4 +16,5 @@ zdb_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --zdb_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+zdb_LDADD += $(ZLIB) -diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c -index 82491ad..8e60b9b 100644 ---- a/cmd/zdb/zdb.c -+++ b/cmd/zdb/zdb.c -@@ -59,2 +59,3 @@ - #include -+#include - #undef ZFS_MAXNAMELEN -@@ -166,3 +167,4 @@ usage(void) - (void) fprintf(stderr, " -M -- " -- "specify the maximum number of checksumming I/Os [default is 200]\n"); -+ "specify the maximum number of checksumming I/Os " -+ "[default is 200]\n"); - (void) fprintf(stderr, "Specify an option more than once (e.g. 
-bb) " -@@ -211,2 +213,23 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) - -+/* ARGSUSED */ -+static void -+dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) -+{ -+ spa_history_phys_t *shp = data; -+ -+ if (shp == NULL) -+ return; -+ -+ (void) printf("\t\tpool_create_len = %llu\n", -+ (u_longlong_t)shp->sh_pool_create_len); -+ (void) printf("\t\tphys_max_off = %llu\n", -+ (u_longlong_t)shp->sh_phys_max_off); -+ (void) printf("\t\tbof = %llu\n", -+ (u_longlong_t)shp->sh_bof); -+ (void) printf("\t\teof = %llu\n", -+ (u_longlong_t)shp->sh_eof); -+ (void) printf("\t\trecords_lost = %llu\n", -+ (u_longlong_t)shp->sh_records_lost); -+} -+ - static void -@@ -871,2 +894,3 @@ dump_history(spa_t *spa) - char *cmd, *intstr; -+ boolean_t printed = B_FALSE; - -@@ -874,3 +898,3 @@ dump_history(spa_t *spa) - &time) != 0) -- continue; -+ goto next; - if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, -@@ -879,3 +903,3 @@ dump_history(spa_t *spa) - ZPOOL_HIST_INT_EVENT, &ievent) != 0) -- continue; -+ goto next; - verify(nvlist_lookup_uint64(events[i], -@@ -884,4 +908,4 @@ dump_history(spa_t *spa) - ZPOOL_HIST_INT_STR, &intstr) == 0); -- if (ievent >= LOG_END) -- continue; -+ if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) -+ goto next; - -@@ -898,2 +922,10 @@ dump_history(spa_t *spa) - (void) printf("%s %s\n", tbuf, cmd); -+ printed = B_TRUE; -+ -+next: -+ if (dump_opt['h'] > 1) { -+ if (!printed) -+ (void) printf("unrecognized record:\n"); -+ dump_nvlist(events[i], 2); -+ } - } -@@ -1204,3 +1236,3 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - static void --dump_bpobj(bpobj_t *bpo, char *name) -+dump_bpobj(bpobj_t *bpo, char *name, int indent) - { -@@ -1209,2 +1241,3 @@ dump_bpobj(bpobj_t *bpo, char *name) - char uncomp[32]; -+ uint64_t i; - -@@ -1214,13 +1247,35 @@ dump_bpobj(bpobj_t *bpo, char *name) - zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes); -- if (bpo->bpo_havesubobj) { -+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { - zdb_nicenum(bpo->bpo_phys->bpo_comp, comp); - zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp); -- (void) printf("\n %s: %llu local blkptrs, %llu subobjs, " -- "%s (%s/%s comp)\n", -- name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, -+ (void) printf(" %*s: object %llu, %llu local blkptrs, " -+ "%llu subobjs, %s (%s/%s comp)\n", -+ indent * 8, name, -+ (u_longlong_t)bpo->bpo_object, -+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, - (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, - bytes, comp, uncomp); -+ -+ for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { -+ uint64_t subobj; -+ bpobj_t subbpo; -+ int error; -+ VERIFY0(dmu_read(bpo->bpo_os, -+ bpo->bpo_phys->bpo_subobjs, -+ i * sizeof (subobj), sizeof (subobj), &subobj, 0)); -+ error = bpobj_open(&subbpo, bpo->bpo_os, subobj); -+ if (error != 0) { -+ (void) printf("ERROR %u while trying to open " -+ "subobj id %llu\n", -+ error, (u_longlong_t)subobj); -+ continue; -+ } -+ dump_bpobj(&subbpo, "subobj", indent + 1); -+ } - } else { -- (void) printf("\n %s: %llu blkptrs, %s\n", -- name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); -+ (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", -+ indent * 8, name, -+ (u_longlong_t)bpo->bpo_object, -+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, -+ bytes); - } -@@ -1230,5 +1285,7 @@ dump_bpobj(bpobj_t *bpo, char *name) - -- (void) printf("\n"); - -- (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); -+ if (indent == 0) { -+ (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, 
NULL); -+ (void) printf("\n"); -+ } - } -@@ -1239,2 +1296,3 @@ dump_deadlist(dsl_deadlist_t *dl) - dsl_deadlist_entry_t *dle; -+ uint64_t unused; - char bytes[32]; -@@ -1257,10 +1315,21 @@ dump_deadlist(dsl_deadlist_t *dl) - -+ /* force the tree to be loaded */ -+ dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); -+ - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { -- (void) printf(" mintxg %llu -> obj %llu\n", -- (longlong_t)dle->dle_mintxg, -- (longlong_t)dle->dle_bpobj.bpo_object); -+ if (dump_opt['d'] >= 5) { -+ char buf[128]; -+ (void) snprintf(buf, sizeof (buf), -+ "mintxg %llu -> obj %llu", -+ (longlong_t)dle->dle_mintxg, -+ (longlong_t)dle->dle_bpobj.bpo_object); -+ -+ dump_bpobj(&dle->dle_bpobj, buf, 0); -+ } else { -+ (void) printf("mintxg %llu -> obj %llu\n", -+ (longlong_t)dle->dle_mintxg, -+ (longlong_t)dle->dle_bpobj.bpo_object); - -- if (dump_opt['d'] >= 5) -- dump_bpobj(&dle->dle_bpobj, ""); -+ } - } -@@ -1287,3 +1356,3 @@ fuid_table_destroy(void) - * For CIFS files with FUID the fuid is printed in hex followed by -- * the doman-rid string. -+ * the domain-rid string. - */ -@@ -1371,3 +1440,3 @@ dump_znode_sa_xattr(sa_handle_t *hdl) - nvpair_value_byte_array(elem, &value, &cnt); -- for (idx = 0 ; idx < cnt ; ++idx) { -+ for (idx = 0; idx < cnt; ++idx) { - if (isprint(value[idx])) -@@ -1531,3 +1600,3 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { - dump_uint8, /* SPA history */ -- dump_uint64, /* SPA history offsets */ -+ dump_history_offsets, /* SPA history offsets */ - dump_zap, /* Pool properties */ -@@ -1696,3 +1765,5 @@ dump_dir(objset_t *os) - -+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - dmu_objset_fast_stat(os, &dds); -+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - -@@ -2142,3 +2213,2 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); -- - } -@@ -2328,3 +2398,3 @@ dump_block_stats(spa_t *spa) - */ -- bzero(&zcb, sizeof(zdb_cb_t)); -+ bzero(&zcb, sizeof (zdb_cb_t)); - zdb_leak_init(spa, &zcb); -@@ -2336,4 +2406,6 @@ dump_block_stats(spa_t *spa) - count_block_cb, &zcb, NULL); -- (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, -- count_block_cb, &zcb, NULL); -+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { -+ (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, -+ count_block_cb, &zcb, NULL); -+ } - if (spa_feature_is_active(spa, -@@ -2637,6 +2709,7 @@ dump_zpool(spa_t *spa) - if (dump_opt['d'] >= 3) { -- dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees"); -+ dump_bpobj(&spa->spa_deferred_bpobj, -+ "Deferred frees", 0); - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { - dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj, -- "Pool snapshot frees"); -+ "Pool snapshot frees", 0); - } -diff --git a/cmd/zed/.gitignore b/cmd/zed/.gitignore -new file mode 100644 -index 0000000..76557bb ---- /dev/null -+++ b/cmd/zed/.gitignore -@@ -0,0 +1 @@ -+/zed -diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am -new file mode 100644 -index 0000000..f1404de ---- /dev/null -+++ b/cmd/zed/Makefile.am -@@ -0,0 +1,66 @@ -+include $(top_srcdir)/config/Rules.am -+ -+DEFAULT_INCLUDES += \ -+ -I$(top_srcdir)/include \ -+ -I$(top_srcdir)/lib/libspl/include -+ -+sbin_PROGRAMS = zed -+ -+zed_SOURCES = \ -+ $(top_srcdir)/cmd/zed/zed.c \ -+ $(top_srcdir)/cmd/zed/zed.h \ -+ $(top_srcdir)/cmd/zed/zed_conf.c \ -+ $(top_srcdir)/cmd/zed/zed_conf.h \ -+ $(top_srcdir)/cmd/zed/zed_event.c \ -+ 
$(top_srcdir)/cmd/zed/zed_event.h \ -+ $(top_srcdir)/cmd/zed/zed_exec.c \ -+ $(top_srcdir)/cmd/zed/zed_exec.h \ -+ $(top_srcdir)/cmd/zed/zed_file.c \ -+ $(top_srcdir)/cmd/zed/zed_file.h \ -+ $(top_srcdir)/cmd/zed/zed_log.c \ -+ $(top_srcdir)/cmd/zed/zed_log.h \ -+ $(top_srcdir)/cmd/zed/zed_strings.c \ -+ $(top_srcdir)/cmd/zed/zed_strings.h -+ -+zed_LDADD = \ -+ $(top_builddir)/lib/libavl/libavl.la \ -+ $(top_builddir)/lib/libnvpair/libnvpair.la \ -+ $(top_builddir)/lib/libspl/libspl.la \ -+ $(top_builddir)/lib/libzfs/libzfs.la -+ -+zedconfdir = $(sysconfdir)/zfs/zed.d -+ -+dist_zedconf_DATA = \ -+ $(top_srcdir)/cmd/zed/zed.d/zed.rc -+ -+zedexecdir = $(libexecdir)/zfs/zed.d -+ -+dist_zedexec_SCRIPTS = \ -+ $(top_srcdir)/cmd/zed/zed.d/all-debug.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/all-syslog.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/checksum-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/checksum-spare.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/data-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/generic-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/io-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/io-spare.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/resilver.finish-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/scrub.finish-email.sh -+ -+zedconfdefaults = \ -+ all-syslog.sh \ -+ checksum-email.sh \ -+ checksum-spare.sh \ -+ data-email.sh \ -+ io-email.sh \ -+ io-spare.sh \ -+ resilver.finish-email.sh \ -+ scrub.finish-email.sh -+ -+install-data-local: -+ $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" -+ for f in $(zedconfdefaults); do \ -+ test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \ -+ -L "$(DESTDIR)$(zedconfdir)/$${f}" || \ -+ ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \ -+ done -diff --git a/cmd/zed/zed.c b/cmd/zed/zed.c -new file mode 100644 -index 0000000..c54a59b ---- /dev/null -+++ b/cmd/zed/zed.c -@@ -0,0 +1,235 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed.h" -+#include "zed_conf.h" -+#include "zed_event.h" -+#include "zed_file.h" -+#include "zed_log.h" -+ -+static volatile sig_atomic_t _got_exit = 0; -+static volatile sig_atomic_t _got_hup = 0; -+ -+/* -+ * Signal handler for SIGINT & SIGTERM. -+ */ -+static void -+_exit_handler(int signum) -+{ -+ _got_exit = 1; -+} -+ -+/* -+ * Signal handler for SIGHUP. -+ */ -+static void -+_hup_handler(int signum) -+{ -+ _got_hup = 1; -+} -+ -+/* -+ * Register signal handlers. 
-+ */ -+static void -+_setup_sig_handlers(void) -+{ -+ struct sigaction sa; -+ -+ if (sigemptyset(&sa.sa_mask) < 0) -+ zed_log_die("Failed to initialize sigset"); -+ -+ sa.sa_flags = SA_RESTART; -+ sa.sa_handler = SIG_IGN; -+ -+ if (sigaction(SIGPIPE, &sa, NULL) < 0) -+ zed_log_die("Failed to ignore SIGPIPE"); -+ -+ sa.sa_handler = _exit_handler; -+ if (sigaction(SIGINT, &sa, NULL) < 0) -+ zed_log_die("Failed to register SIGINT handler"); -+ -+ if (sigaction(SIGTERM, &sa, NULL) < 0) -+ zed_log_die("Failed to register SIGTERM handler"); -+ -+ sa.sa_handler = _hup_handler; -+ if (sigaction(SIGHUP, &sa, NULL) < 0) -+ zed_log_die("Failed to register SIGHUP handler"); -+} -+ -+/* -+ * Lock all current and future pages in the virtual memory address space. -+ * Access to locked pages will never be delayed by a page fault. -+ * EAGAIN is tested up to max_tries in case this is a transient error. -+ */ -+static void -+_lock_memory(void) -+{ -+#if HAVE_MLOCKALL -+ int i = 0; -+ const int max_tries = 10; -+ -+ for (i = 0; i < max_tries; i++) { -+ if (mlockall(MCL_CURRENT | MCL_FUTURE) == 0) { -+ zed_log_msg(LOG_INFO, "Locked all pages in memory"); -+ return; -+ } -+ if (errno != EAGAIN) -+ break; -+ } -+ zed_log_die("Failed to lock memory pages: %s", strerror(errno)); -+ -+#else /* HAVE_MLOCKALL */ -+ zed_log_die("Failed to lock memory pages: mlockall() not supported"); -+#endif /* HAVE_MLOCKALL */ -+} -+ -+/* -+ * Transform the process into a daemon. -+ */ -+static void -+_become_daemon(void) -+{ -+ pid_t pid; -+ int fd; -+ -+ pid = fork(); -+ if (pid < 0) { -+ zed_log_die("Failed to create child process: %s", -+ strerror(errno)); -+ } else if (pid > 0) { -+ _exit(EXIT_SUCCESS); -+ } -+ if (setsid() < 0) -+ zed_log_die("Failed to create new session: %s", -+ strerror(errno)); -+ -+ pid = fork(); -+ if (pid < 0) { -+ zed_log_die("Failed to create grandchild process: %s", -+ strerror(errno)); -+ } else if (pid > 0) { -+ _exit(EXIT_SUCCESS); -+ } -+ fd = open("/dev/null", O_RDWR); -+ -+ if (fd < 0) -+ zed_log_die("Failed to open /dev/null: %s", strerror(errno)); -+ -+ if (dup2(fd, STDIN_FILENO) < 0) -+ zed_log_die("Failed to dup /dev/null onto stdin: %s", -+ strerror(errno)); -+ -+ if (dup2(fd, STDOUT_FILENO) < 0) -+ zed_log_die("Failed to dup /dev/null onto stdout: %s", -+ strerror(errno)); -+ -+ if (dup2(fd, STDERR_FILENO) < 0) -+ zed_log_die("Failed to dup /dev/null onto stderr: %s", -+ strerror(errno)); -+ -+ if (close(fd) < 0) -+ zed_log_die("Failed to close /dev/null: %s", strerror(errno)); -+} -+ -+/* -+ * ZFS Event Daemon (ZED). 
-+ */ -+int -+main(int argc, char *argv[]) -+{ -+ struct zed_conf *zcp; -+ uint64_t saved_eid; -+ int64_t saved_etime[2]; -+ -+ zed_log_init(argv[0]); -+ zed_log_stderr_open(LOG_NOTICE); -+ zcp = zed_conf_create(); -+ zed_conf_parse_opts(zcp, argc, argv); -+ if (zcp->do_verbose) -+ zed_log_stderr_open(LOG_INFO); -+ -+ if (geteuid() != 0) -+ zed_log_die("Must be run as root"); -+ -+ (void) umask(0); -+ -+ _setup_sig_handlers(); -+ -+ zed_conf_parse_file(zcp); -+ -+ zed_file_close_from(STDERR_FILENO + 1); -+ -+ if (chdir("/") < 0) -+ zed_log_die("Failed to change to root directory"); -+ -+ if (zed_conf_scan_dir(zcp) < 0) -+ exit(EXIT_FAILURE); -+ -+ if (zcp->do_memlock) -+ _lock_memory(); -+ -+ if (!zcp->do_foreground) { -+ _become_daemon(); -+ zed_log_syslog_open(LOG_DAEMON); -+ zed_log_stderr_close(); -+ } -+ zed_log_msg(LOG_NOTICE, -+ "ZFS Event Daemon %s-%s", ZFS_META_VERSION, ZFS_META_RELEASE); -+ -+ (void) zed_conf_write_pid(zcp); -+ -+ if (zed_conf_open_state(zcp) < 0) -+ exit(EXIT_FAILURE); -+ -+ if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0) -+ exit(EXIT_FAILURE); -+ -+ zed_event_init(zcp); -+ zed_event_seek(zcp, saved_eid, saved_etime); -+ -+ while (!_got_exit) { -+ if (_got_hup) { -+ _got_hup = 0; -+ (void) zed_conf_scan_dir(zcp); -+ } -+ zed_event_service(zcp); -+ } -+ zed_log_msg(LOG_NOTICE, "Exiting"); -+ zed_event_fini(zcp); -+ zed_conf_destroy(zcp); -+ zed_log_fini(); -+ exit(EXIT_SUCCESS); -+} -diff --git a/cmd/zed/zed.d/all-debug.sh b/cmd/zed/zed.d/all-debug.sh -new file mode 100755 -index 0000000..ae64e0a ---- /dev/null -+++ b/cmd/zed/zed.d/all-debug.sh -@@ -0,0 +1,17 @@ -+#!/bin/sh -+# -+# Log all environment variables to ZED_DEBUG_LOG. -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+# Override the default umask to restrict access to a newly-created logfile. -+umask 077 -+ -+# Append stdout to the logfile after obtaining an advisory lock. -+exec >> "${ZED_DEBUG_LOG:=/tmp/zed.debug.log}" -+flock -x 1 -+ -+printenv | sort -+echo -+ -+exit 0 -diff --git a/cmd/zed/zed.d/all-syslog.sh b/cmd/zed/zed.d/all-syslog.sh -new file mode 100755 -index 0000000..b8bd307 ---- /dev/null -+++ b/cmd/zed/zed.d/all-syslog.sh -@@ -0,0 +1,11 @@ -+#!/bin/sh -+# -+# Log the zevent via syslog. -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+logger -t "${ZED_SYSLOG_TAG:=zed}" -p "${ZED_SYSLOG_PRIORITY:=daemon.notice}" \ -+ eid="${ZEVENT_EID}" class="${ZEVENT_SUBCLASS}" \ -+ "${ZEVENT_POOL:+pool=$ZEVENT_POOL}" -+ -+exit 0 -diff --git a/cmd/zed/zed.d/checksum-email.sh b/cmd/zed/zed.d/checksum-email.sh -new file mode 120000 -index 0000000..f95bec2 ---- /dev/null -+++ b/cmd/zed/zed.d/checksum-email.sh -@@ -0,0 +1 @@ -+io-email.sh -\ No newline at end of file -diff --git a/cmd/zed/zed.d/checksum-spare.sh b/cmd/zed/zed.d/checksum-spare.sh -new file mode 120000 -index 0000000..f564f93 ---- /dev/null -+++ b/cmd/zed/zed.d/checksum-spare.sh -@@ -0,0 +1 @@ -+io-spare.sh -\ No newline at end of file -diff --git a/cmd/zed/zed.d/data-email.sh b/cmd/zed/zed.d/data-email.sh -new file mode 100755 -index 0000000..9f83161 ---- /dev/null -+++ b/cmd/zed/zed.d/data-email.sh -@@ -0,0 +1,81 @@ -+#!/bin/sh -+# -+# Send email to ZED_EMAIL in response to a DATA zevent. -+# Only one message per ZED_EMAIL_INTERVAL_SECS will be sent for a given -+# class/pool combination. This protects against spamming the recipient -+# should multiple events occur together in time for the same pool. 
-+# Exit codes: -+# 0: email sent -+# 1: email failed -+# 2: email suppressed -+# 3: missing executable -+# 4: unsupported event class -+# 5: internal error -+# State File Format: -+# POOL:TIME_OF_LAST_EMAIL -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+test -n "${ZEVENT_POOL}" || exit 5 -+test -n "${ZEVENT_SUBCLASS}" || exit 5 -+ -+if test "${ZEVENT_SUBCLASS}" != "data"; then \ -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" -+ exit 4 -+fi -+ -+# Only send email if ZED_EMAIL has been configured. -+test -n "${ZED_EMAIL}" || exit 2 -+ -+# Ensure requisite executables are installed. -+if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" not installed -+ exit 3 -+fi -+ -+NAME="zed.${ZEVENT_SUBCLASS}.email" -+LOCKFILE="${ZED_LOCKDIR:=/var/lock}/${NAME}.lock" -+STATEFILE="${ZED_RUNDIR:=/var/run}/${NAME}.state" -+ -+# Obtain lock to ensure mutual exclusion for accessing state. -+exec 8> "${LOCKFILE}" -+flock -x 8 -+ -+# Query state for last time email was sent for this pool. -+TIME_NOW=`date +%s` -+TIME_LAST=`egrep "^${ZEVENT_POOL}:" "${STATEFILE}" 2>/dev/null | cut -d: -f2` -+if test -n "${TIME_LAST}"; then -+ TIME_DELTA=`expr "${TIME_NOW}" - "${TIME_LAST}"` -+ if test "${TIME_DELTA}" -lt "${ZED_EMAIL_INTERVAL_SECS:=3600}"; then -+ exit 2 -+ fi -+fi -+ -+"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on `hostname`" \ -+ "${ZED_EMAIL}" </dev/null > "${STATEFILE}.$$" -+echo "${ZEVENT_POOL}:${TIME_NOW}" >> "${STATEFILE}.$$" -+mv -f "${STATEFILE}.$$" "${STATEFILE}" -+ -+if test "${MAIL_STATUS}" -ne 0; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" -+ exit 1 -+fi -+ -+exit 0 -diff --git a/cmd/zed/zed.d/generic-email.sh b/cmd/zed/zed.d/generic-email.sh -new file mode 100755 -index 0000000..16bbdb1 ---- /dev/null -+++ b/cmd/zed/zed.d/generic-email.sh -@@ -0,0 +1,59 @@ -+#!/bin/sh -+# -+# Send email to ZED_EMAIL in response to a given zevent. -+# This is a generic script than can be symlinked to a file in the zed -+# enabled-scripts directory in order to have email sent when a particular -+# class of zevents occurs. The symlink filename must begin with the zevent -+# (sub)class string (eg, "probe_failure-email.sh" for the "probe_failure" -+# subclass). Refer to the zed(8) manpage for details. -+# Exit codes: -+# 0: email sent -+# 1: email failed -+# 2: email suppressed -+# 3: missing executable -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+# Only send email if ZED_EMAIL has been configured. -+test -n "${ZED_EMAIL}" || exit 2 -+ -+# Ensure requisite executables are installed. -+if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" not installed -+ exit 3 -+fi -+ -+# Override the default umask to restrict access to the msgbody tmpfile. 
-+umask 077 -+ -+SUBJECT="ZFS ${ZEVENT_SUBCLASS} event" -+test -n "${ZEVENT_POOL}" && SUBJECT="${SUBJECT} for ${ZEVENT_POOL}" -+SUBJECT="${SUBJECT} on `hostname`" -+ -+MSGBODY="${TMPDIR:=/tmp}/`basename \"$0\"`.$$" -+{ -+ echo "A ZFS ${ZEVENT_SUBCLASS} event has been posted:" -+ echo -+ echo " eid: ${ZEVENT_EID}" -+ echo " host: `hostname`" -+ echo " time: ${ZEVENT_TIME_STRING}" -+ test -n "${ZEVENT_VDEV_TYPE}" -a -n "${ZEVENT_VDEV_PATH}" && \ -+ echo " vdev: ${ZEVENT_VDEV_TYPE}:${ZEVENT_VDEV_PATH}" -+ test -n "${ZEVENT_POOL}" -a -x "${ZPOOL}" && \ -+ "${ZPOOL}" status "${ZEVENT_POOL}" -+} > "${MSGBODY}" -+ -+test -f "${MSGBODY}" && "${MAIL}" -s "${SUBJECT}" "${ZED_EMAIL}" < "${MSGBODY}" -+MAIL_STATUS=$? -+rm -f "${MSGBODY}" -+ -+if test "${MAIL_STATUS}" -ne 0; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" -+ exit 1 -+fi -+ -+exit 0 -diff --git a/cmd/zed/zed.d/io-email.sh b/cmd/zed/zed.d/io-email.sh -new file mode 100755 -index 0000000..6cfe3c7 ---- /dev/null -+++ b/cmd/zed/zed.d/io-email.sh -@@ -0,0 +1,86 @@ -+#!/bin/sh -+# -+# Send email to ZED_EMAIL in response to a CHECKSUM or IO zevent. -+# Only one message per ZED_EMAIL_INTERVAL_SECS will be sent for a given -+# class/pool/vdev combination. This protects against spamming the recipient -+# should multiple events occur together in time for the same pool/device. -+# Exit codes: -+# 0: email sent -+# 1: email failed -+# 2: email suppressed -+# 3: missing executable -+# 4: unsupported event class -+# 5: internal error -+# State File Format: -+# POOL:VDEV_PATH:TIME_OF_LAST_EMAIL -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+test -n "${ZEVENT_POOL}" || exit 5 -+test -n "${ZEVENT_SUBCLASS}" || exit 5 -+test -n "${ZEVENT_VDEV_PATH}" || exit 5 -+ -+if test "${ZEVENT_SUBCLASS}" != "checksum" \ -+ -a "${ZEVENT_SUBCLASS}" != "io"; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" -+ exit 4 -+fi -+ -+# Only send email if ZED_EMAIL has been configured. -+test -n "${ZED_EMAIL}" || exit 2 -+ -+# Ensure requisite executables are installed. -+if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" not installed -+ exit 3 -+fi -+ -+NAME="zed.${ZEVENT_SUBCLASS}.email" -+LOCKFILE="${ZED_LOCKDIR:=/var/lock}/${NAME}.lock" -+STATEFILE="${ZED_RUNDIR:=/var/run}/${NAME}.state" -+ -+# Obtain lock to ensure mutual exclusion for accessing state. -+exec 8> "${LOCKFILE}" -+flock -x 8 -+ -+# Query state for last time email was sent for this pool/vdev. 
-+TIME_NOW=`date +%s` -+TIME_LAST=`egrep "^${ZEVENT_POOL}:${ZEVENT_VDEV_PATH}:" "${STATEFILE}" \ -+ 2>/dev/null | cut -d: -f3` -+if test -n "${TIME_LAST}"; then -+ TIME_DELTA=`expr "${TIME_NOW}" - "${TIME_LAST}"` -+ if test "${TIME_DELTA}" -lt "${ZED_EMAIL_INTERVAL_SECS:=3600}"; then -+ exit 2 -+ fi -+fi -+ -+"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on `hostname`" \ -+ "${ZED_EMAIL}" </dev/null > "${STATEFILE}.$$" -+echo "${ZEVENT_POOL}:${ZEVENT_VDEV_PATH}:${TIME_NOW}" >> "${STATEFILE}.$$" -+mv -f "${STATEFILE}.$$" "${STATEFILE}" -+ -+if test "${MAIL_STATUS}" -ne 0; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" -+ exit 1 -+fi -+ -+exit 0 -diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh -new file mode 100755 -index 0000000..dd5bf4e ---- /dev/null -+++ b/cmd/zed/zed.d/io-spare.sh -@@ -0,0 +1,125 @@ -+#!/bin/sh -+# -+# Replace a device with a hot spare in response to IO or checksum errors. -+# The following actions will be performed automatically when the number -+# of errors exceed the limit set by ZED_SPARE_ON_IO_ERRORS or -+# ZED_SPARE_ON_CHECKSUM_ERRORS. -+# -+# 1) FAULT the device on IO errors, no futher IO will be attempted. -+# DEGRADE the device on checksum errors, the device is still -+# functional and can be used to service IO requests. -+# 2) Set the SES fault beacon for the device. -+# 3) Replace the device with a hot spare if any are available. -+# -+# Once the hot sparing operation is complete either the failed device or -+# the hot spare must be manually retired using the 'zpool detach' command. -+# The 'autoreplace' functionality which would normally take care of this -+# under Illumos has not yet been implemented. -+# -+# Full support for autoreplace is planned, but it requires that the full -+# ZFS Diagnosis Engine be ported. In the meanwhile this script provides -+# the majority of the expected hot spare functionality. -+# -+# Exit codes: -+# 0: replaced by hot spare -+# 1: no hot spare device available -+# 2: hot sparing disabled -+# 3: already faulted or degraded -+# 4: unsupported event class -+# 5: internal error -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+test -n "${ZEVENT_POOL}" || exit 5 -+test -n "${ZEVENT_SUBCLASS}" || exit 5 -+test -n "${ZEVENT_VDEV_PATH}" || exit 5 -+test -n "${ZEVENT_VDEV_GUID}" || exit 5 -+ -+# Defaults to disabled, enable in the zed.rc file. -+ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0} -+ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0} -+ -+if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \ -+ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then -+ exit 2 -+fi -+ -+# A lock file is used to serialize execution. -+ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock} -+LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock" -+ -+exec 8> "${LOCKFILE}" -+flock -x 8 -+ -+# Given a and return the status, (ONLINE, FAULTED, etc...). -+vdev_status() { -+ local POOL=$1 -+ local VDEV=`basename $2` -+ -+ ${ZPOOL} status ${POOL} | \ -+ awk -v pat="${VDEV}|${VDEV/-part?}" '$0 ~ pat { print $1" "$2 }' -+ return 0 -+} -+ -+# Fault devices after N I/O errors. -+if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then -+ ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}` -+ -+ if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \ -+ ${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then -+ ACTION="fault" -+ fi -+# Degrade devices after N checksum errors. 
-+elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then -+ ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS} -+ -+ if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \ -+ ${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then -+ ACTION="degrade" -+ fi -+else -+ ACTION= -+fi -+ -+if [ -n "${ACTION}" ]; then -+ -+ # Device is already FAULTED or DEGRADED -+ set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}` -+ ZEVENT_VDEV_PATH_FOUND=$1 -+ STATUS=$2 -+ if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then -+ exit 3 -+ fi -+ -+ # Step 1) FAULT or DEGRADE the device -+ # -+ ${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL} -+ -+ # Step 2) Set the SES fault beacon. -+ # -+ # XXX: Set the 'fault' or 'ident' beacon for the device. This can -+ # be done through the sg_ses utility, the only hard part is to map -+ # the sd device to its corresponding enclosure and slot. We may -+ # be able to leverage the existing vdev_id scripts for this. -+ # -+ # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 -+ # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 -+ -+ # Step 3) Replace the device with a hot spare. -+ # -+ # Round robin through the spares selecting those which are available. -+ # -+ for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do -+ set -- `vdev_status ${ZEVENT_POOL} ${SPARE}` -+ SPARE_VDEV_FOUND=$1 -+ STATUS=$2 -+ if [ "${STATUS}" = "AVAIL" ]; then -+ ${ZPOOL} replace ${ZEVENT_POOL} \ -+ ${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0 -+ fi -+ done -+ -+ exit 1 -+fi -+ -+exit 4 -diff --git a/cmd/zed/zed.d/resilver.finish-email.sh b/cmd/zed/zed.d/resilver.finish-email.sh -new file mode 120000 -index 0000000..1afad32 ---- /dev/null -+++ b/cmd/zed/zed.d/resilver.finish-email.sh -@@ -0,0 +1 @@ -+scrub.finish-email.sh -\ No newline at end of file -diff --git a/cmd/zed/zed.d/scrub.finish-email.sh b/cmd/zed/zed.d/scrub.finish-email.sh -new file mode 100755 -index 0000000..b5ce3f7 ---- /dev/null -+++ b/cmd/zed/zed.d/scrub.finish-email.sh -@@ -0,0 +1,73 @@ -+#!/bin/sh -+# -+# Send email to ZED_EMAIL in response to a RESILVER.FINISH or SCRUB.FINISH. -+# By default, "zpool status" output will only be included in the email for -+# a scrub.finish zevent if the pool is not healthy; to always include its -+# output, set ZED_EMAIL_VERBOSE=1. -+# Exit codes: -+# 0: email sent -+# 1: email failed -+# 2: email suppressed -+# 3: missing executable -+# 4: unsupported event class -+# 5: internal error -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+test -n "${ZEVENT_POOL}" || exit 5 -+test -n "${ZEVENT_SUBCLASS}" || exit 5 -+ -+if test "${ZEVENT_SUBCLASS}" = "resilver.finish"; then -+ ACTION="resilvering" -+elif test "${ZEVENT_SUBCLASS}" = "scrub.finish"; then -+ ACTION="scrubbing" -+else -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" -+ exit 4 -+fi -+ -+# Only send email if ZED_EMAIL has been configured. -+test -n "${ZED_EMAIL}" || exit 2 -+ -+# Ensure requisite executables are installed. -+if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" not installed -+ exit 3 -+fi -+if ! test -x "${ZPOOL}"; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${ZPOOL}" not installed -+ exit 3 -+fi -+ -+# For scrub, suppress email if pool is healthy and verbosity is not enabled. 
-+if test "${ZEVENT_SUBCLASS}" = "scrub.finish"; then -+ HEALTHY=`"${ZPOOL}" status -x "${ZEVENT_POOL}" | \ -+ grep "'${ZEVENT_POOL}' is healthy"` -+ test -n "${HEALTHY}" -a "${ZED_EMAIL_VERBOSE:=0}" = 0 && exit 2 -+fi -+ -+"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on `hostname`" \ -+ "${ZED_EMAIL}" <. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_H -+#define ZED_H -+ -+/* -+ * Absolute path for the default zed configuration file. -+ */ -+#define ZED_CONF_FILE SYSCONFDIR "/zfs/zed.conf" -+ -+/* -+ * Absolute path for the default zed pid file. -+ */ -+#define ZED_PID_FILE RUNSTATEDIR "/zed.pid" -+ -+/* -+ * Absolute path for the default zed state file. -+ */ -+#define ZED_STATE_FILE RUNSTATEDIR "/zed.state" -+ -+/* -+ * Absolute path for the default zed script directory. -+ */ -+#define ZED_SCRIPT_DIR SYSCONFDIR "/zfs/zed.d" -+ -+/* -+ * Reserved for future use. -+ */ -+#define ZED_MAX_EVENTS 0 -+ -+/* -+ * Reserved for future use. -+ */ -+#define ZED_MIN_EVENTS 0 -+ -+/* -+ * String prefix for ZED variables passed via environment variables. -+ */ -+#define ZED_VAR_PREFIX "ZED_" -+ -+/* -+ * String prefix for ZFS event names passed via environment variables. -+ */ -+#define ZEVENT_VAR_PREFIX "ZEVENT_" -+ -+#endif /* !ZED_H */ -diff --git a/cmd/zed/zed_conf.c b/cmd/zed/zed_conf.c -new file mode 100644 -index 0000000..78b45e9 ---- /dev/null -+++ b/cmd/zed/zed_conf.c -@@ -0,0 +1,680 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed.h" -+#include "zed_conf.h" -+#include "zed_file.h" -+#include "zed_log.h" -+#include "zed_strings.h" -+ -+/* -+ * Return a new configuration with default values. 
-+ */ -+struct zed_conf * -+zed_conf_create(void) -+{ -+ struct zed_conf *zcp; -+ -+ zcp = malloc(sizeof (*zcp)); -+ if (!zcp) -+ goto nomem; -+ -+ memset(zcp, 0, sizeof (*zcp)); -+ -+ zcp->syslog_facility = LOG_DAEMON; -+ zcp->min_events = ZED_MIN_EVENTS; -+ zcp->max_events = ZED_MAX_EVENTS; -+ zcp->scripts = NULL; /* created via zed_conf_scan_dir() */ -+ zcp->state_fd = -1; /* opened via zed_conf_open_state() */ -+ zcp->zfs_hdl = NULL; /* opened via zed_event_init() */ -+ zcp->zevent_fd = -1; /* opened via zed_event_init() */ -+ -+ if (!(zcp->conf_file = strdup(ZED_CONF_FILE))) -+ goto nomem; -+ -+ if (!(zcp->pid_file = strdup(ZED_PID_FILE))) -+ goto nomem; -+ -+ if (!(zcp->script_dir = strdup(ZED_SCRIPT_DIR))) -+ goto nomem; -+ -+ if (!(zcp->state_file = strdup(ZED_STATE_FILE))) -+ goto nomem; -+ -+ return (zcp); -+ -+nomem: -+ zed_log_die("Failed to create conf: %s", strerror(errno)); -+ return (NULL); -+} -+ -+/* -+ * Destroy the configuration [zcp]. -+ * Note: zfs_hdl & zevent_fd are destroyed via zed_event_fini(). -+ */ -+void -+zed_conf_destroy(struct zed_conf *zcp) -+{ -+ if (!zcp) -+ return; -+ -+ if (zcp->state_fd >= 0) { -+ if (close(zcp->state_fd) < 0) -+ zed_log_msg(LOG_WARNING, -+ "Failed to close state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ } -+ if (zcp->pid_file) { -+ if ((unlink(zcp->pid_file) < 0) && (errno != ENOENT)) -+ zed_log_msg(LOG_WARNING, -+ "Failed to remove pid file \"%s\": %s", -+ zcp->pid_file, strerror(errno)); -+ } -+ if (zcp->conf_file) -+ free(zcp->conf_file); -+ -+ if (zcp->pid_file) -+ free(zcp->pid_file); -+ -+ if (zcp->script_dir) -+ free(zcp->script_dir); -+ -+ if (zcp->state_file) -+ free(zcp->state_file); -+ -+ if (zcp->scripts) -+ zed_strings_destroy(zcp->scripts); -+ -+ free(zcp); -+} -+ -+/* -+ * Display command-line help and exit. -+ * If [got_err] is 0, output to stdout and exit normally; -+ * otherwise, output to stderr and exit with a failure status. -+ */ -+static void -+_zed_conf_display_help(const char *prog, int got_err) -+{ -+ FILE *fp = got_err ? stderr : stdout; -+ int w1 = 4; /* width of leading whitespace */ -+ int w2 = 8; /* width of L-justified option field */ -+ -+ fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? prog : "zed")); -+ fprintf(fp, "\n"); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h", -+ "Display help."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L", -+ "Display license information."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V", -+ "Display version information."); -+ fprintf(fp, "\n"); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v", -+ "Be verbose."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f", -+ "Force daemon to run."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F", -+ "Run daemon in the foreground."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M", -+ "Lock all pages in memory."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z", -+ "Zero state file."); -+ fprintf(fp, "\n"); -+#if 0 -+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE", -+ "Read configuration from FILE.", ZED_CONF_FILE); -+#endif -+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR", -+ "Read enabled scripts from DIR.", ZED_SCRIPT_DIR); -+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE", -+ "Write daemon's PID to FILE.", ZED_PID_FILE); -+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE", -+ "Write daemon's state to FILE.", ZED_STATE_FILE); -+ fprintf(fp, "\n"); -+ -+ exit(got_err ? 
EXIT_FAILURE : EXIT_SUCCESS); -+} -+ -+/* -+ * Display license information to stdout and exit. -+ */ -+static void -+_zed_conf_display_license(void) -+{ -+ const char **pp; -+ const char *text[] = { -+ "The ZFS Event Daemon (ZED) is distributed under the terms of the", -+ " Common Development and Distribution License (CDDL-1.0)", -+ " .", -+ "Developed at Lawrence Livermore National Laboratory" -+ " (LLNL-CODE-403049).", -+ "Copyright (C) 2013-2014" -+ " Lawrence Livermore National Security, LLC.", -+ "", -+ NULL -+ }; -+ -+ for (pp = text; *pp; pp++) -+ printf("%s\n", *pp); -+ -+ exit(EXIT_SUCCESS); -+} -+ -+/* -+ * Display version information to stdout and exit. -+ */ -+static void -+_zed_conf_display_version(void) -+{ -+ printf("%s-%s-%s\n", -+ ZFS_META_NAME, ZFS_META_VERSION, ZFS_META_RELEASE); -+ -+ exit(EXIT_SUCCESS); -+} -+ -+/* -+ * Copy the [path] string to the [resultp] ptr. -+ * If [path] is not an absolute path, prefix it with the current working dir. -+ * If [resultp] is non-null, free its existing string before assignment. -+ */ -+static void -+_zed_conf_parse_path(char **resultp, const char *path) -+{ -+ char buf[PATH_MAX]; -+ -+ assert(resultp != NULL); -+ assert(path != NULL); -+ -+ if (*resultp) -+ free(*resultp); -+ -+ if (path[0] == '/') { -+ *resultp = strdup(path); -+ } else if (!getcwd(buf, sizeof (buf))) { -+ zed_log_die("Failed to get current working dir: %s", -+ strerror(errno)); -+ } else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) { -+ zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); -+ } else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) { -+ zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); -+ } else { -+ *resultp = strdup(buf); -+ } -+ if (!*resultp) -+ zed_log_die("Failed to copy path: %s", strerror(ENOMEM)); -+} -+ -+/* -+ * Parse the command-line options into the configuration [zcp]. -+ */ -+void -+zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) -+{ -+ const char * const opts = ":hLVc:d:p:s:vfFMZ"; -+ int opt; -+ -+ if (!zcp || !argv || !argv[0]) -+ zed_log_die("Failed to parse options: Internal error"); -+ -+ opterr = 0; /* suppress default getopt err msgs */ -+ -+ while ((opt = getopt(argc, argv, opts)) != -1) { -+ switch (opt) { -+ case 'h': -+ _zed_conf_display_help(argv[0], EXIT_SUCCESS); -+ break; -+ case 'L': -+ _zed_conf_display_license(); -+ break; -+ case 'V': -+ _zed_conf_display_version(); -+ break; -+ case 'c': -+ _zed_conf_parse_path(&zcp->conf_file, optarg); -+ break; -+ case 'd': -+ _zed_conf_parse_path(&zcp->script_dir, optarg); -+ break; -+ case 'p': -+ _zed_conf_parse_path(&zcp->pid_file, optarg); -+ break; -+ case 's': -+ _zed_conf_parse_path(&zcp->state_file, optarg); -+ break; -+ case 'v': -+ zcp->do_verbose = 1; -+ break; -+ case 'f': -+ zcp->do_force = 1; -+ break; -+ case 'F': -+ zcp->do_foreground = 1; -+ break; -+ case 'M': -+ zcp->do_memlock = 1; -+ break; -+ case 'Z': -+ zcp->do_zero = 1; -+ break; -+ case '?': -+ default: -+ if (optopt == '?') -+ _zed_conf_display_help(argv[0], EXIT_SUCCESS); -+ -+ fprintf(stderr, "%s: %s '-%c'\n\n", argv[0], -+ "Invalid option", optopt); -+ _zed_conf_display_help(argv[0], EXIT_FAILURE); -+ break; -+ } -+ } -+} -+ -+/* -+ * Parse the configuration file into the configuration [zcp]. -+ * FIXME: Not yet implemented. 
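A note on the formatting idiom in _zed_conf_display_help() above: "%*c" with the value 0x20 prints a single space right-justified in a w1-wide field (the leading indent), and passing a negative width through "%*s" left-justifies the option name, since C99 treats a negative field-width argument as a '-' flag followed by a positive width. A standalone illustration (an editorial example, not from the patch):

#include <stdio.h>

int
main(void)
{
        int w1 = 4;     /* width of leading whitespace */
        int w2 = 8;     /* width of left-justified option field */

        /* Same idiom as _zed_conf_display_help(). */
        printf("%*c%*s %s\n", w1, ' ', -w2, "-h", "Display help.");
        printf("%*c%*s %s\n", w1, ' ', -w2, "-d DIR",
            "Read enabled scripts from DIR.");
        return (0);
}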
-+ */ -+void -+zed_conf_parse_file(struct zed_conf *zcp) -+{ -+ if (!zcp) -+ zed_log_die("Failed to parse config: %s", strerror(EINVAL)); -+} -+ -+/* -+ * Scan the [zcp] script_dir for files to exec based on the event class. -+ * Files must be executable by user, but not writable by group or other. -+ * Dotfiles are ignored. -+ * Return 0 on success with an updated set of scripts, -+ * or -1 on error with errno set. -+ * FIXME: Check if script_dir and all parent dirs are secure. -+ */ -+int -+zed_conf_scan_dir(struct zed_conf *zcp) -+{ -+ zed_strings_t *scripts; -+ DIR *dirp; -+ struct dirent *direntp; -+ char pathname[PATH_MAX]; -+ struct stat st; -+ int n; -+ -+ if (!zcp) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to scan script dir: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ scripts = zed_strings_create(); -+ if (!scripts) { -+ errno = ENOMEM; -+ zed_log_msg(LOG_WARNING, "Failed to scan dir \"%s\": %s", -+ zcp->script_dir, strerror(errno)); -+ return (-1); -+ } -+ dirp = opendir(zcp->script_dir); -+ if (!dirp) { -+ int errno_bak = errno; -+ zed_log_msg(LOG_WARNING, "Failed to open dir \"%s\": %s", -+ zcp->script_dir, strerror(errno)); -+ zed_strings_destroy(scripts); -+ errno = errno_bak; -+ return (-1); -+ } -+ while ((direntp = readdir(dirp))) { -+ if (direntp->d_name[0] == '.') -+ continue; -+ -+ n = snprintf(pathname, sizeof (pathname), -+ "%s/%s", zcp->script_dir, direntp->d_name); -+ if ((n < 0) || (n >= sizeof (pathname))) { -+ zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", -+ direntp->d_name, strerror(ENAMETOOLONG)); -+ continue; -+ } -+ if (stat(pathname, &st) < 0) { -+ zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", -+ pathname, strerror(errno)); -+ continue; -+ } -+ if (!S_ISREG(st.st_mode)) { -+ zed_log_msg(LOG_INFO, -+ "Ignoring \"%s\": not a regular file", -+ direntp->d_name); -+ continue; -+ } -+ if ((st.st_uid != 0) && !zcp->do_force) { -+ zed_log_msg(LOG_NOTICE, -+ "Ignoring \"%s\": not owned by root", -+ direntp->d_name); -+ continue; -+ } -+ if (!(st.st_mode & S_IXUSR)) { -+ zed_log_msg(LOG_INFO, -+ "Ignoring \"%s\": not executable by user", -+ direntp->d_name); -+ continue; -+ } -+ if ((st.st_mode & S_IWGRP) & !zcp->do_force) { -+ zed_log_msg(LOG_NOTICE, -+ "Ignoring \"%s\": writable by group", -+ direntp->d_name); -+ continue; -+ } -+ if ((st.st_mode & S_IWOTH) & !zcp->do_force) { -+ zed_log_msg(LOG_NOTICE, -+ "Ignoring \"%s\": writable by other", -+ direntp->d_name); -+ continue; -+ } -+ if (zed_strings_add(scripts, direntp->d_name) < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to register \"%s\": %s", -+ direntp->d_name, strerror(errno)); -+ continue; -+ } -+ if (zcp->do_verbose) -+ zed_log_msg(LOG_INFO, -+ "Registered script \"%s\"", direntp->d_name); -+ } -+ if (closedir(dirp) < 0) { -+ int errno_bak = errno; -+ zed_log_msg(LOG_WARNING, "Failed to close dir \"%s\": %s", -+ zcp->script_dir, strerror(errno)); -+ zed_strings_destroy(scripts); -+ errno = errno_bak; -+ return (-1); -+ } -+ if (zcp->scripts) -+ zed_strings_destroy(zcp->scripts); -+ -+ zcp->scripts = scripts; -+ return (0); -+} -+ -+/* -+ * Write the PID file specified in [zcp]. -+ * Return 0 on success, -1 on error. -+ * XXX: This must be called after fork()ing to become a daemon. 
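The XXX note just above matters for correctness: zed_conf_write_pid() records getpid(), so if it ran before the daemon forked, the pid file would name the short-lived parent. A minimal sketch of the intended ordering, assuming a conventional daemonization sequence (the real one lives in zed.c, which is outside this hunk):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void
daemonize(void)
{
        pid_t pid = fork();

        if (pid < 0)
                exit(EXIT_FAILURE);
        if (pid > 0)
                exit(EXIT_SUCCESS);     /* parent exits */
        if (setsid() < 0)
                exit(EXIT_FAILURE);
        /* second fork, chdir("/"), and fd redirection elided */
}

int
main(void)
{
        daemonize();
        /* Only now does getpid() name the daemon, so the pid file is valid: */
        /* zed_conf_write_pid(zcp); */
        printf("daemon pid=%d\n", (int) getpid());
        return (0);
}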
-+ */ -+int -+zed_conf_write_pid(struct zed_conf *zcp) -+{ -+ char dirbuf[PATH_MAX]; -+ mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; -+ int n; -+ char *p; -+ mode_t mask; -+ FILE *fp; -+ -+ if (!zcp || !zcp->pid_file) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to write pid file: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ n = strlcpy(dirbuf, zcp->pid_file, sizeof (dirbuf)); -+ if (n >= sizeof (dirbuf)) { -+ errno = ENAMETOOLONG; -+ zed_log_msg(LOG_WARNING, "Failed to write pid file: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ p = strrchr(dirbuf, '/'); -+ if (p) -+ *p = '\0'; -+ -+ if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to create directory \"%s\": %s", -+ dirbuf, strerror(errno)); -+ return (-1); -+ } -+ (void) unlink(zcp->pid_file); -+ -+ mask = umask(0); -+ umask(mask | 022); -+ fp = fopen(zcp->pid_file, "w"); -+ umask(mask); -+ -+ if (!fp) { -+ zed_log_msg(LOG_WARNING, "Failed to open pid file \"%s\": %s", -+ zcp->pid_file, strerror(errno)); -+ } else if (fprintf(fp, "%d\n", (int) getpid()) == EOF) { -+ zed_log_msg(LOG_WARNING, "Failed to write pid file \"%s\": %s", -+ zcp->pid_file, strerror(errno)); -+ } else if (fclose(fp) == EOF) { -+ zed_log_msg(LOG_WARNING, "Failed to close pid file \"%s\": %s", -+ zcp->pid_file, strerror(errno)); -+ } else { -+ return (0); -+ } -+ (void) unlink(zcp->pid_file); -+ return (-1); -+} -+ -+/* -+ * Open and lock the [zcp] state_file. -+ * Return 0 on success, -1 on error. -+ * FIXME: If state_file exists, verify ownership & permissions. -+ * FIXME: Move lock to pid_file instead. -+ */ -+int -+zed_conf_open_state(struct zed_conf *zcp) -+{ -+ char dirbuf[PATH_MAX]; -+ mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; -+ int n; -+ char *p; -+ int rv; -+ -+ if (!zcp || !zcp->state_file) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to open state file: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ n = strlcpy(dirbuf, zcp->state_file, sizeof (dirbuf)); -+ if (n >= sizeof (dirbuf)) { -+ errno = ENAMETOOLONG; -+ zed_log_msg(LOG_WARNING, "Failed to open state file: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ p = strrchr(dirbuf, '/'); -+ if (p) -+ *p = '\0'; -+ -+ if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to create directory \"%s\": %s", -+ dirbuf, strerror(errno)); -+ return (-1); -+ } -+ if (zcp->state_fd >= 0) { -+ if (close(zcp->state_fd) < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to close state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ } -+ if (zcp->do_zero) -+ (void) unlink(zcp->state_file); -+ -+ zcp->state_fd = open(zcp->state_file, -+ (O_RDWR | O_CREAT), (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); -+ if (zcp->state_fd < 0) { -+ zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ rv = zed_file_lock(zcp->state_fd); -+ if (rv < 0) { -+ zed_log_msg(LOG_WARNING, "Failed to lock state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ if (rv > 0) { -+ pid_t pid = zed_file_is_locked(zcp->state_fd); -+ if (pid < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to test lock on state file \"%s\"", -+ zcp->state_file); -+ } else if (pid > 0) { -+ zed_log_msg(LOG_WARNING, -+ "Found pid %d bound to state file \"%s\"", -+ pid, zcp->state_file); -+ } else { -+ zed_log_msg(LOG_WARNING, -+ "Inconsistent lock state on state file \"%s\"", -+ 
zcp->state_file); -+ } -+ return (-1); -+ } -+ return (0); -+} -+ -+/* -+ * Read the opened [zcp] state_file to obtain the eid & etime -+ * of the last event processed. -+ * Write the state from the last event to the [eidp] & [etime] args -+ * passed by reference. -+ * Note that etime[] is an array of size 2. -+ * Return 0 on success, -1 on error. -+ */ -+int -+zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]) -+{ -+ ssize_t len; -+ struct iovec iov[3]; -+ ssize_t n; -+ -+ if (!zcp || !eidp || !etime) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, -+ "Failed to read state file: %s", strerror(errno)); -+ return (-1); -+ } -+ if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t) -1) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to reposition state file offset: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ len = 0; -+ iov[0].iov_base = eidp; -+ len += iov[0].iov_len = sizeof (*eidp); -+ iov[1].iov_base = &etime[0]; -+ len += iov[1].iov_len = sizeof (etime[0]); -+ iov[2].iov_base = &etime[1]; -+ len += iov[2].iov_len = sizeof (etime[1]); -+ -+ n = readv(zcp->state_fd, iov, 3); -+ if (n == 0) { -+ *eidp = 0; -+ } else if (n < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to read state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } else if (n != len) { -+ errno = EIO; -+ zed_log_msg(LOG_WARNING, -+ "Failed to read state file \"%s\": Read %d of %d bytes", -+ zcp->state_file, n, len); -+ return (-1); -+ } -+ return (0); -+} -+ -+/* -+ * Write the [eid] & [etime] of the last processed event to the opened -+ * [zcp] state_file. -+ * Note that etime[] is an array of size 2. -+ * Return 0 on success, -1 on error. -+ */ -+int -+zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]) -+{ -+ ssize_t len; -+ struct iovec iov[3]; -+ ssize_t n; -+ -+ if (!zcp) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, -+ "Failed to write state file: %s", strerror(errno)); -+ return (-1); -+ } -+ if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t) -1) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to reposition state file offset: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ len = 0; -+ iov[0].iov_base = &eid; -+ len += iov[0].iov_len = sizeof (eid); -+ iov[1].iov_base = &etime[0]; -+ len += iov[1].iov_len = sizeof (etime[0]); -+ iov[2].iov_base = &etime[1]; -+ len += iov[2].iov_len = sizeof (etime[1]); -+ -+ n = writev(zcp->state_fd, iov, 3); -+ if (n < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to write state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ if (n != len) { -+ errno = EIO; -+ zed_log_msg(LOG_WARNING, -+ "Failed to write state file \"%s\": Wrote %d of %d bytes", -+ zcp->state_file, n, len); -+ return (-1); -+ } -+ if (fdatasync(zcp->state_fd) < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to sync state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ return (0); -+} -diff --git a/cmd/zed/zed_conf.h b/cmd/zed/zed_conf.h -new file mode 100644 -index 0000000..51b98ea ---- /dev/null -+++ b/cmd/zed/zed_conf.h -@@ -0,0 +1,71 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. 
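The zed_conf_read_state()/zed_conf_write_state() pair above persists a fixed 24-byte record: a uint64_t eid followed by the two int64_t halves of the event time, written with writev() in host byte order and flushed with fdatasync(). A throwaway dumper for that layout is sketched below; it is a hypothetical tool, not shipped by the patch, and its default path assumes RUNSTATEDIR resolves to /var/run as in ZED_STATE_FILE.

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
        const char *path = (argc > 1) ? argv[1] : "/var/run/zed.state";
        uint64_t eid;
        int64_t etime[2];
        struct iovec iov[3];
        int fd;

        /* Same three-iovec layout as zed_conf_read_state(). */
        iov[0].iov_base = &eid;      iov[0].iov_len = sizeof (eid);
        iov[1].iov_base = &etime[0]; iov[1].iov_len = sizeof (etime[0]);
        iov[2].iov_base = &etime[1]; iov[2].iov_len = sizeof (etime[1]);

        fd = open(path, O_RDONLY);
        if ((fd < 0) ||
            (readv(fd, iov, 3) != (ssize_t) (sizeof (eid) + sizeof (etime)))) {
                perror(path);
                return (1);
        }
        printf("eid=%" PRIu64 " time=%" PRId64 ".%09" PRId64 "\n",
            eid, etime[0], etime[1]);
        (void) close(fd);
        return (0);
}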
-+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_CONF_H -+#define ZED_CONF_H -+ -+#include -+#include -+#include "zed_strings.h" -+ -+struct zed_conf { -+ unsigned do_force:1; /* true if force enabled */ -+ unsigned do_foreground:1; /* true if run in foreground */ -+ unsigned do_memlock:1; /* true if locking memory */ -+ unsigned do_verbose:1; /* true if verbosity enabled */ -+ unsigned do_zero:1; /* true if zeroing state */ -+ int syslog_facility; /* syslog facility value */ -+ int min_events; /* RESERVED FOR FUTURE USE */ -+ int max_events; /* RESERVED FOR FUTURE USE */ -+ char *conf_file; /* abs path to config file */ -+ char *pid_file; /* abs path to pid file */ -+ char *script_dir; /* abs path to script dir */ -+ zed_strings_t *scripts; /* names of enabled scripts */ -+ char *state_file; /* abs path to state file */ -+ int state_fd; /* fd to state file */ -+ libzfs_handle_t *zfs_hdl; /* handle to libzfs */ -+ int zevent_fd; /* fd for access to zevents */ -+}; -+ -+struct zed_conf *zed_conf_create(void); -+ -+void zed_conf_destroy(struct zed_conf *zcp); -+ -+void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv); -+ -+void zed_conf_parse_file(struct zed_conf *zcp); -+ -+int zed_conf_scan_dir(struct zed_conf *zcp); -+ -+int zed_conf_write_pid(struct zed_conf *zcp); -+ -+int zed_conf_open_state(struct zed_conf *zcp); -+ -+int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]); -+ -+int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]); -+ -+#endif /* !ZED_CONF_H */ -diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c -new file mode 100644 -index 0000000..e504aef ---- /dev/null -+++ b/cmd/zed/zed_event.c -@@ -0,0 +1,829 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include /* FIXME: Replace with libzfs_core. 
*/ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed.h" -+#include "zed_conf.h" -+#include "zed_exec.h" -+#include "zed_file.h" -+#include "zed_log.h" -+#include "zed_strings.h" -+ -+/* -+ * Open the libzfs interface. -+ */ -+void -+zed_event_init(struct zed_conf *zcp) -+{ -+ if (!zcp) -+ zed_log_die("Failed zed_event_init: %s", strerror(EINVAL)); -+ -+ zcp->zfs_hdl = libzfs_init(); -+ if (!zcp->zfs_hdl) -+ zed_log_die("Failed to initialize libzfs"); -+ -+ zcp->zevent_fd = open(ZFS_DEV, O_RDWR); -+ if (zcp->zevent_fd < 0) -+ zed_log_die("Failed to open \"%s\": %s", -+ ZFS_DEV, strerror(errno)); -+} -+ -+/* -+ * Close the libzfs interface. -+ */ -+void -+zed_event_fini(struct zed_conf *zcp) -+{ -+ if (!zcp) -+ zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL)); -+ -+ if (zcp->zevent_fd >= 0) { -+ if (close(zcp->zevent_fd) < 0) -+ zed_log_msg(LOG_WARNING, "Failed to close \"%s\": %s", -+ ZFS_DEV, strerror(errno)); -+ -+ zcp->zevent_fd = -1; -+ } -+ if (zcp->zfs_hdl) { -+ libzfs_fini(zcp->zfs_hdl); -+ zcp->zfs_hdl = NULL; -+ } -+} -+ -+/* -+ * Seek to the event specified by [saved_eid] and [saved_etime]. -+ * This protects against processing a given event more than once. -+ * Return 0 upon a successful seek to the specified event, or -1 otherwise. -+ * A zevent is considered to be uniquely specified by its (eid,time) tuple. -+ * The unsigned 64b eid is set to 1 when the kernel module is loaded, and -+ * incremented by 1 for each new event. Since the state file can persist -+ * across a kernel module reload, the time must be checked to ensure a match. -+ */ -+int -+zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[]) -+{ -+ uint64_t eid; -+ int found; -+ nvlist_t *nvl; -+ int n_dropped; -+ int64_t *etime; -+ uint_t nelem; -+ int rv; -+ -+ if (!zcp) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to seek zevent: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ eid = 0; -+ found = 0; -+ while ((eid < saved_eid) && !found) { -+ rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, -+ ZEVENT_NONBLOCK, zcp->zevent_fd); -+ -+ if ((rv != 0) || !nvl) -+ break; -+ -+ if (n_dropped > 0) { -+ zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); -+ /* -+ * FIXME: Increase max size of event nvlist in -+ * /sys/module/zfs/parameters/zfs_zevent_len_max ? -+ */ -+ } -+ if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { -+ zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); -+ } else if (nvlist_lookup_int64_array(nvl, "time", -+ &etime, &nelem) != 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent time (eid=%llu)", eid); -+ } else if (nelem != 2) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent time (eid=%llu, nelem=%u)", -+ eid, nelem); -+ } else if ((eid != saved_eid) || -+ (etime[0] != saved_etime[0]) || -+ (etime[1] != saved_etime[1])) { -+ /* no-op */ -+ } else { -+ found = 1; -+ } -+ free(nvl); -+ } -+ if (!found && (saved_eid > 0)) { -+ if (zpool_events_seek(zcp->zfs_hdl, ZEVENT_SEEK_START, -+ zcp->zevent_fd) < 0) -+ zed_log_msg(LOG_WARNING, "Failed to seek to eid=0"); -+ else -+ eid = 0; -+ } -+ zed_log_msg(LOG_NOTICE, "Processing events since eid=%llu", eid); -+ return (found ? 
0 : -1); -+} -+ -+static int -+_zed_event_convert_int8_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ int8_t *i8p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_int8_array(nvp, &i8p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%d ", i8p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_uint8_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ uint8_t *u8p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_uint8_array(nvp, &u8p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%u ", u8p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_int16_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ int16_t *i16p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_int16_array(nvp, &i16p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%d ", i16p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_uint16_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ uint16_t *u16p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_uint16_array(nvp, &u16p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%u ", u16p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_int32_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ int32_t *i32p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_int32_array(nvp, &i32p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%d ", i32p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_uint32_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ uint32_t *u32p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_uint32_array(nvp, &u32p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%u ", u32p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_int64_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ int64_t *i64p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_int64_array(nvp, &i64p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%lld ", (u_longlong_t) i64p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ 
p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_uint64_array(char *buf, int buflen, nvpair_t *nvp, -+ const char *fmt) -+{ -+ uint64_t *u64p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_uint64_array(nvp, &u64p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, fmt, (u_longlong_t) u64p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_string_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ char **strp; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_string_array(nvp, &strp, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%s ", strp[i] ? strp[i] : ""); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+/* -+ * Return non-zero if nvpair [name] should be formatted in hex; o/w, return 0. -+ */ -+static int -+_zed_event_value_is_hex(const char *name) -+{ -+ const char *hex_suffix[] = { -+ "_guid", -+ "_guids", -+ NULL -+ }; -+ const char **pp; -+ char *p; -+ -+ if (!name) -+ return (0); -+ -+ for (pp = hex_suffix; *pp; pp++) { -+ p = strstr(name, *pp); -+ if (p && strlen(p) == strlen(*pp)) -+ return (1); -+ } -+ return (0); -+} -+ -+/* -+ * Convert the nvpair [nvp] to a string which is added to the environment -+ * of the child process. -+ * Return 0 on success, -1 on error. -+ * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()? -+ */ -+static void -+_zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp) -+{ -+ const char *name; -+ data_type_t type; -+ char buf[4096]; -+ int buflen; -+ int n; -+ char *p; -+ const char *q; -+ const char *fmt; -+ -+ boolean_t b; -+ double d; -+ uint8_t i8; -+ uint16_t i16; -+ uint32_t i32; -+ uint64_t i64; -+ char *str; -+ -+ assert(zsp != NULL); -+ assert(nvp != NULL); -+ -+ name = nvpair_name(nvp); -+ type = nvpair_type(nvp); -+ buflen = sizeof (buf); -+ -+ /* Copy NAME prefix for ZED zevent namespace. */ -+ n = strlcpy(buf, ZEVENT_VAR_PREFIX, sizeof (buf)); -+ if (n >= sizeof (buf)) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to convert nvpair \"%s\" for eid=%llu: %s", -+ name, eid, "Exceeded buffer size"); -+ return; -+ } -+ buflen -= n; -+ p = buf + n; -+ -+ /* Convert NAME to alphanumeric uppercase. */ -+ for (q = name; *q && (buflen > 0); q++) { -+ *p++ = isalnum(*q) ? toupper(*q) : '_'; -+ buflen--; -+ } -+ -+ /* Separate NAME from VALUE. */ -+ if (buflen > 0) { -+ *p++ = '='; -+ buflen--; -+ } -+ *p = '\0'; -+ -+ /* Convert VALUE. */ -+ switch (type) { -+ case DATA_TYPE_BOOLEAN: -+ n = snprintf(p, buflen, "%s", "1"); -+ break; -+ case DATA_TYPE_BOOLEAN_VALUE: -+ (void) nvpair_value_boolean_value(nvp, &b); -+ n = snprintf(p, buflen, "%s", b ? 
"1" : "0"); -+ break; -+ case DATA_TYPE_BYTE: -+ (void) nvpair_value_byte(nvp, &i8); -+ n = snprintf(p, buflen, "%d", i8); -+ break; -+ case DATA_TYPE_INT8: -+ (void) nvpair_value_int8(nvp, (int8_t *) &i8); -+ n = snprintf(p, buflen, "%d", i8); -+ break; -+ case DATA_TYPE_UINT8: -+ (void) nvpair_value_uint8(nvp, &i8); -+ n = snprintf(p, buflen, "%u", i8); -+ break; -+ case DATA_TYPE_INT16: -+ (void) nvpair_value_int16(nvp, (int16_t *) &i16); -+ n = snprintf(p, buflen, "%d", i16); -+ break; -+ case DATA_TYPE_UINT16: -+ (void) nvpair_value_uint16(nvp, &i16); -+ n = snprintf(p, buflen, "%u", i16); -+ break; -+ case DATA_TYPE_INT32: -+ (void) nvpair_value_int32(nvp, (int32_t *) &i32); -+ n = snprintf(p, buflen, "%d", i32); -+ break; -+ case DATA_TYPE_UINT32: -+ (void) nvpair_value_uint32(nvp, &i32); -+ n = snprintf(p, buflen, "%u", i32); -+ break; -+ case DATA_TYPE_INT64: -+ (void) nvpair_value_int64(nvp, (int64_t *) &i64); -+ n = snprintf(p, buflen, "%lld", (longlong_t) i64); -+ break; -+ case DATA_TYPE_UINT64: -+ (void) nvpair_value_uint64(nvp, &i64); -+ fmt = _zed_event_value_is_hex(name) ? "0x%.16llX" : "%llu"; -+ n = snprintf(p, buflen, fmt, (u_longlong_t) i64); -+ break; -+ case DATA_TYPE_DOUBLE: -+ (void) nvpair_value_double(nvp, &d); -+ n = snprintf(p, buflen, "%g", d); -+ break; -+ case DATA_TYPE_HRTIME: -+ (void) nvpair_value_hrtime(nvp, (hrtime_t *) &i64); -+ n = snprintf(p, buflen, "%llu", (u_longlong_t) i64); -+ break; -+ case DATA_TYPE_NVLIST: -+ /* FIXME */ -+ n = snprintf(p, buflen, "%s", "_NOT_IMPLEMENTED_"); -+ break; -+ case DATA_TYPE_STRING: -+ (void) nvpair_value_string(nvp, &str); -+ n = snprintf(p, buflen, "%s", (str ? str : "")); -+ break; -+ case DATA_TYPE_BOOLEAN_ARRAY: -+ /* FIXME */ -+ n = snprintf(p, buflen, "%s", "_NOT_IMPLEMENTED_"); -+ break; -+ case DATA_TYPE_BYTE_ARRAY: -+ /* FIXME */ -+ n = snprintf(p, buflen, "%s", "_NOT_IMPLEMENTED_"); -+ break; -+ case DATA_TYPE_INT8_ARRAY: -+ n = _zed_event_convert_int8_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_UINT8_ARRAY: -+ n = _zed_event_convert_uint8_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_INT16_ARRAY: -+ n = _zed_event_convert_int16_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_UINT16_ARRAY: -+ n = _zed_event_convert_uint16_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_INT32_ARRAY: -+ n = _zed_event_convert_int32_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_UINT32_ARRAY: -+ n = _zed_event_convert_uint32_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_INT64_ARRAY: -+ n = _zed_event_convert_int64_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_UINT64_ARRAY: -+ fmt = _zed_event_value_is_hex(name) ? 
"0x%.16llX " : "%llu "; -+ n = _zed_event_convert_uint64_array(p, buflen, nvp, fmt); -+ break; -+ case DATA_TYPE_STRING_ARRAY: -+ n = _zed_event_convert_string_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_NVLIST_ARRAY: -+ /* FIXME */ -+ n = snprintf(p, buflen, "%s", "_NOT_IMPLEMENTED_"); -+ break; -+ default: -+ zed_log_msg(LOG_WARNING, -+ "Failed to convert nvpair \"%s\" for eid=%llu: " -+ "Unrecognized type=%u", name, eid, (unsigned int) type); -+ return; -+ } -+ if ((n < 0) || (n >= sizeof (buf))) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to convert nvpair \"%s\" for eid=%llu: %s", -+ name, eid, "Exceeded buffer size"); -+ return; -+ } -+ if (zed_strings_add(zsp, buf) < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to convert nvpair \"%s\" for eid=%llu: %s", -+ name, eid, strerror(ENOMEM)); -+ return; -+ } -+} -+ -+/* -+ * Add the environment variable specified by the format string [fmt]. -+ */ -+static void -+_zed_event_add_var(uint64_t eid, zed_strings_t *zsp, const char *fmt, ...) -+{ -+ char buf[4096]; -+ va_list vargs; -+ int n; -+ const char *p; -+ size_t namelen; -+ -+ assert(zsp != NULL); -+ assert(fmt != NULL); -+ -+ va_start(vargs, fmt); -+ n = vsnprintf(buf, sizeof (buf), fmt, vargs); -+ va_end(vargs); -+ p = strchr(buf, '='); -+ namelen = (p) ? p - buf : strlen(buf); -+ -+ if ((n < 0) || (n >= sizeof (buf))) { -+ zed_log_msg(LOG_WARNING, "Failed to add %.*s for eid=%llu: %s", -+ namelen, buf, eid, "Exceeded buffer size"); -+ } else if (!p) { -+ zed_log_msg(LOG_WARNING, "Failed to add %.*s for eid=%llu: %s", -+ namelen, buf, eid, "Missing assignment"); -+ } else if (zed_strings_add(zsp, buf) < 0) { -+ zed_log_msg(LOG_WARNING, "Failed to add %.*s for eid=%llu: %s", -+ namelen, buf, eid, strerror(ENOMEM)); -+ } -+} -+ -+/* -+ * Restrict various environment variables to safe and sane values -+ * when constructing the environment for the child process. -+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1. -+ */ -+static void -+_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp) -+{ -+ const char *env_restrict[] = { -+ "IFS= \t\n", -+ "PATH=" _PATH_STDPATH, -+ "ZDB=" SBINDIR "/zdb", -+ "ZED=" SBINDIR "/zed", -+ "ZFS=" SBINDIR "/zfs", -+ "ZINJECT=" SBINDIR "/zinject", -+ "ZPOOL=" SBINDIR "/zpool", -+ "ZFS_ALIAS=" ZFS_META_ALIAS, -+ "ZFS_VERSION=" ZFS_META_VERSION, -+ "ZFS_RELEASE=" ZFS_META_RELEASE, -+ NULL -+ }; -+ const char **pp; -+ -+ assert(zsp != NULL); -+ -+ for (pp = env_restrict; *pp; pp++) { -+ _zed_event_add_var(eid, zsp, "%s", *pp); -+ } -+} -+ -+/* -+ * Preserve specified variables from the parent environment -+ * when constructing the environment for the child process. -+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1. -+ */ -+static void -+_zed_event_add_env_preserve(uint64_t eid, zed_strings_t *zsp) -+{ -+ const char *env_preserve[] = { -+ "TZ", -+ NULL -+ }; -+ const char **pp; -+ const char *p; -+ -+ assert(zsp != NULL); -+ -+ for (pp = env_preserve; *pp; pp++) { -+ if ((p = getenv(*pp))) -+ _zed_event_add_var(eid, zsp, "%s=%s", *pp, p); -+ } -+} -+ -+/* -+ * Compute the "subclass" by removing the first 3 components of [class] -+ * (which seem to always be either "ereport.fs.zfs" or "resource.fs.zfs"). -+ * Return a pointer inside the string [class], or NULL if insufficient -+ * components exist. 
-+ */ -+static const char * -+_zed_event_get_subclass(const char *class) -+{ -+ const char *p; -+ int i; -+ -+ if (!class) -+ return (NULL); -+ -+ p = class; -+ for (i = 0; i < 3; i++) { -+ p = strchr(p, '.'); -+ if (!p) -+ break; -+ p++; -+ } -+ return (p); -+} -+ -+/* -+ * Convert the zevent time from a 2-element array of 64b integers -+ * into a more convenient form: -+ * TIME_SECS is the second component of the time. -+ * TIME_NSECS is the nanosecond component of the time. -+ * TIME_STRING is an almost-RFC3339-compliant string representation. -+ */ -+static void -+_zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) -+{ -+ struct tm *stp; -+ char buf[32]; -+ -+ assert(zsp != NULL); -+ assert(etime != NULL); -+ -+ _zed_event_add_var(eid, zsp, "%s%s=%lld", -+ ZEVENT_VAR_PREFIX, "TIME_SECS", (long long int) etime[0]); -+ _zed_event_add_var(eid, zsp, "%s%s=%lld", -+ ZEVENT_VAR_PREFIX, "TIME_NSECS", (long long int) etime[1]); -+ -+ if (!(stp = localtime((const time_t *) &etime[0]))) { -+ zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", -+ ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "localtime error"); -+ } else if (!strftime(buf, sizeof (buf), "%Y-%m-%d %H:%M:%S%z", stp)) { -+ zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", -+ ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "strftime error"); -+ } else { -+ _zed_event_add_var(eid, zsp, "%s%s=%s", -+ ZEVENT_VAR_PREFIX, "TIME_STRING", buf); -+ } -+} -+ -+/* -+ * Service the next zevent, blocking until one is available. -+ */ -+void -+zed_event_service(struct zed_conf *zcp) -+{ -+ nvlist_t *nvl; -+ nvpair_t *nvp; -+ int n_dropped; -+ zed_strings_t *zsp; -+ uint64_t eid; -+ int64_t *etime; -+ uint_t nelem; -+ char *class; -+ const char *subclass; -+ int rv; -+ -+ if (!zcp) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to service zevent: %s", -+ strerror(errno)); -+ return; -+ } -+ rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, ZEVENT_NONE, -+ zcp->zevent_fd); -+ -+ if ((rv != 0) || !nvl) -+ return; -+ -+ if (n_dropped > 0) { -+ zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); -+ /* -+ * FIXME: Increase max size of event nvlist in -+ * /sys/module/zfs/parameters/zfs_zevent_len_max ? -+ */ -+ } -+ if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { -+ zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); -+ } else if (nvlist_lookup_int64_array( -+ nvl, "time", &etime, &nelem) != 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent time (eid=%llu)", eid); -+ } else if (nelem != 2) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent time (eid=%llu, nelem=%u)", -+ eid, nelem); -+ } else if (nvlist_lookup_string(nvl, "class", &class) != 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent class (eid=%llu)", eid); -+ } else { -+ zsp = zed_strings_create(); -+ -+ nvp = NULL; -+ while ((nvp = nvlist_next_nvpair(nvl, nvp))) -+ _zed_event_add_nvpair(eid, zsp, nvp); -+ -+ _zed_event_add_env_restrict(eid, zsp); -+ _zed_event_add_env_preserve(eid, zsp); -+ -+ _zed_event_add_var(eid, zsp, "%s%s=%d", -+ ZED_VAR_PREFIX, "PID", (int) getpid()); -+ _zed_event_add_var(eid, zsp, "%s%s=%s", -+ ZED_VAR_PREFIX, "SCRIPT_DIR", zcp->script_dir); -+ -+ subclass = _zed_event_get_subclass(class); -+ _zed_event_add_var(eid, zsp, "%s%s=%s", -+ ZEVENT_VAR_PREFIX, "SUBCLASS", -+ (subclass ? 
subclass : class)); -+ _zed_event_add_time_strings(eid, zsp, etime); -+ -+ zed_exec_process(eid, class, subclass, -+ zcp->script_dir, zcp->scripts, zsp, zcp->zevent_fd); -+ -+ zed_conf_write_state(zcp, eid, etime); -+ -+ zed_strings_destroy(zsp); -+ } -+ nvlist_free(nvl); -+} -diff --git a/cmd/zed/zed_event.h b/cmd/zed/zed_event.h -new file mode 100644 -index 0000000..71b3a2b ---- /dev/null -+++ b/cmd/zed/zed_event.h -@@ -0,0 +1,41 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_EVENT_H -+#define ZED_EVENT_H -+ -+#include -+ -+void zed_event_init(struct zed_conf *zcp); -+ -+void zed_event_fini(struct zed_conf *zcp); -+ -+int zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, -+ int64_t saved_etime[]); -+ -+void zed_event_service(struct zed_conf *zcp); -+ -+#endif /* !ZED_EVENT_H */ -diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c -new file mode 100644 -index 0000000..f461b78 ---- /dev/null -+++ b/cmd/zed/zed_exec.c -@@ -0,0 +1,207 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed_file.h" -+#include "zed_log.h" -+#include "zed_strings.h" -+ -+#define ZEVENT_FILENO 3 -+ -+/* -+ * Create an environment string array for passing to execve() using the -+ * NAME=VALUE strings in container [zsp]. -+ * Return a newly-allocated environment, or NULL on error. 
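_zed_exec_create_env() below packs the pointer array and all of the NAME=VALUE strings into one allocation, so the single free() in zed_exec_process() releases the whole environment. What it produces is an ordinary NULL-terminated envp; for comparison, a hand-built equivalent handed straight to execve() (a standalone example, not from the patch; the pool name "tank" and the /usr/bin/env path are assumptions):

#include <unistd.h>

int
main(void)
{
        char *argv[] = { "env", NULL };
        /* NULL-terminated NAME=VALUE block, like the one zed builds per event. */
        char *envp[] = {
                "ZEVENT_SUBCLASS=scrub.finish",
                "ZEVENT_POOL=tank",
                NULL
        };

        execve("/usr/bin/env", argv, envp);
        return (1);     /* reached only if execve() failed */
}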
-+ */ -+static char ** -+_zed_exec_create_env(zed_strings_t *zsp) -+{ -+ int num_ptrs; -+ int buflen; -+ char *buf; -+ char **pp; -+ char *p; -+ const char *q; -+ int i; -+ int len; -+ -+ num_ptrs = zed_strings_count(zsp) + 1; -+ buflen = num_ptrs * sizeof (char *); -+ for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) -+ buflen += strlen(q) + 1; -+ -+ buf = malloc(buflen); -+ if (!buf) -+ return (NULL); -+ -+ pp = (char **) buf; -+ p = buf + (num_ptrs * sizeof (char *)); -+ i = 0; -+ for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) { -+ pp[i] = p; -+ len = strlen(q) + 1; -+ memcpy(p, q, len); -+ p += len; -+ i++; -+ } -+ pp[i] = NULL; -+ assert(buf + buflen == p); -+ return ((char **) buf); -+} -+ -+/* -+ * Fork a child process to handle event [eid]. The program [prog] -+ * in directory [dir] is executed with the envionment [env]. -+ * The file descriptor [zfd] is the zevent_fd used to track the -+ * current cursor location within the zevent nvlist. -+ */ -+static void -+_zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog, -+ char *env[], int zfd) -+{ -+ char path[PATH_MAX]; -+ int n; -+ pid_t pid; -+ int fd; -+ pid_t wpid; -+ int status; -+ -+ assert(dir != NULL); -+ assert(prog != NULL); -+ assert(env != NULL); -+ assert(zfd >= 0); -+ -+ n = snprintf(path, sizeof (path), "%s/%s", dir, prog); -+ if ((n < 0) || (n >= sizeof (path))) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to fork \"%s\" for eid=%llu: %s", -+ prog, eid, strerror(ENAMETOOLONG)); -+ return; -+ } -+ pid = fork(); -+ if (pid < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to fork \"%s\" for eid=%llu: %s", -+ prog, eid, strerror(errno)); -+ return; -+ } else if (pid == 0) { -+ (void) umask(022); -+ fd = open("/dev/null", O_RDWR); -+ (void) dup2(fd, STDIN_FILENO); -+ (void) dup2(fd, STDOUT_FILENO); -+ (void) dup2(fd, STDERR_FILENO); -+ (void) dup2(zfd, ZEVENT_FILENO); -+ zed_file_close_from(ZEVENT_FILENO + 1); -+ execle(path, prog, NULL, env); -+ _exit(127); -+ } else { -+ zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d", -+ prog, eid, pid); -+ /* FIXME: Timeout rogue child processes with sigalarm? */ -+restart: -+ wpid = waitpid(pid, &status, 0); -+ if (wpid == (pid_t) -1) { -+ if (errno == EINTR) -+ goto restart; -+ zed_log_msg(LOG_WARNING, -+ "Failed to wait for \"%s\" eid=%llu pid=%d", -+ prog, eid, pid); -+ } else if (WIFEXITED(status)) { -+ zed_log_msg(LOG_INFO, -+ "Finished \"%s\" eid=%llu pid=%d exit=%d", -+ prog, eid, pid, WEXITSTATUS(status)); -+ } else if (WIFSIGNALED(status)) { -+ zed_log_msg(LOG_INFO, -+ "Finished \"%s\" eid=%llu pid=%d sig=%d/%s", -+ prog, eid, pid, WTERMSIG(status), -+ strsignal(WTERMSIG(status))); -+ } else { -+ zed_log_msg(LOG_INFO, -+ "Finished \"%s\" eid=%llu pid=%d status=0x%X", -+ prog, eid, (unsigned int) status); -+ } -+ } -+} -+ -+/* -+ * Process the event [eid] by synchronously invoking all scripts with a -+ * matching class prefix. -+ * Each executable in [scripts] from the directory [dir] is matched against -+ * the event's [class], [subclass], and the "all" class (which matches -+ * all events). Every script with a matching class prefix is invoked. -+ * The NAME=VALUE strings in [envs] will be passed to the script as -+ * environment variables. -+ * The file descriptor [zfd] is the zevent_fd used to track the -+ * current cursor location within the zevent nvlist. -+ * Return 0 on success, -1 on error. 
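The matching rule this comment describes, and which the function below implements, is a prefix test: a zedlet runs when its filename begins with the event's class, its subclass, or the literal "all", and the character after that prefix is not a letter. So "scrub.finish-email.sh" matches the subclass "scrub.finish", while a name that merely continues with letters does not. A standalone check of that predicate (the filenames are illustrative):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Same prefix test as zed_exec_process(). */
static int
matches(const char *script, const char *class)
{
        size_t n = strlen(class);

        return ((strncmp(script, class, n) == 0) &&
            !isalpha((unsigned char) script[n]));
}

int
main(void)
{
        printf("%d\n", matches("scrub.finish-email.sh", "scrub.finish")); /* 1 */
        printf("%d\n", matches("all-syslog.sh", "all"));                  /* 1 */
        printf("%d\n", matches("allocator.sh", "all"));                   /* 0 */
        return (0);
}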
-+ */ -+int -+zed_exec_process(uint64_t eid, const char *class, const char *subclass, -+ const char *dir, zed_strings_t *scripts, zed_strings_t *envs, int zfd) -+{ -+ const char *class_strings[4]; -+ const char *allclass = "all"; -+ const char **csp; -+ const char *s; -+ char **e; -+ int n; -+ -+ if (!dir || !scripts || !envs || zfd < 0) -+ return (-1); -+ -+ csp = class_strings; -+ -+ if (class) -+ *csp++ = class; -+ -+ if (subclass) -+ *csp++ = subclass; -+ -+ if (allclass) -+ *csp++ = allclass; -+ -+ *csp = NULL; -+ -+ e = _zed_exec_create_env(envs); -+ -+ for (s = zed_strings_first(scripts); s; s = zed_strings_next(scripts)) { -+ for (csp = class_strings; *csp; csp++) { -+ n = strlen(*csp); -+ if ((strncmp(s, *csp, n) == 0) && !isalpha(s[n])) -+ _zed_exec_fork_child(eid, dir, s, e, zfd); -+ } -+ } -+ free(e); -+ return (0); -+} -diff --git a/cmd/zed/zed_exec.h b/cmd/zed/zed_exec.h -new file mode 100644 -index 0000000..52bdc12 ---- /dev/null -+++ b/cmd/zed/zed_exec.h -@@ -0,0 +1,36 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_EXEC_H -+#define ZED_EXEC_H -+ -+#include -+ -+int zed_exec_process(uint64_t eid, const char *class, const char *subclass, -+ const char *dir, zed_strings_t *scripts, zed_strings_t *envs, -+ int zevent_fd); -+ -+#endif /* !ZED_EXEC_H */ -diff --git a/cmd/zed/zed_file.c b/cmd/zed/zed_file.c -new file mode 100644 -index 0000000..7b77345 ---- /dev/null -+++ b/cmd/zed/zed_file.c -@@ -0,0 +1,227 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed_log.h" -+ -+/* -+ * Read up to [n] bytes from [fd] into [buf]. -+ * Return the number of bytes read, 0 on EOF, or -1 on error. -+ */ -+ssize_t -+zed_file_read_n(int fd, void *buf, size_t n) -+{ -+ unsigned char *p; -+ size_t n_left; -+ ssize_t n_read; -+ -+ p = buf; -+ n_left = n; -+ while (n_left > 0) { -+ if ((n_read = read(fd, p, n_left)) < 0) { -+ if (errno == EINTR) -+ continue; -+ else -+ return (-1); -+ -+ } else if (n_read == 0) { -+ break; -+ } -+ n_left -= n_read; -+ p += n_read; -+ } -+ return (n - n_left); -+} -+ -+/* -+ * Write [n] bytes from [buf] out to [fd]. -+ * Return the number of bytes written, or -1 on error. -+ */ -+ssize_t -+zed_file_write_n(int fd, void *buf, size_t n) -+{ -+ const unsigned char *p; -+ size_t n_left; -+ ssize_t n_written; -+ -+ p = buf; -+ n_left = n; -+ while (n_left > 0) { -+ if ((n_written = write(fd, p, n_left)) < 0) { -+ if (errno == EINTR) -+ continue; -+ else -+ return (-1); -+ -+ } -+ n_left -= n_written; -+ p += n_written; -+ } -+ return (n); -+} -+ -+/* -+ * Set an exclusive advisory lock on the open file descriptor [fd]. -+ * Return 0 on success, 1 if a conflicting lock is held by another process, -+ * or -1 on error (with errno set). -+ */ -+int -+zed_file_lock(int fd) -+{ -+ struct flock lock; -+ -+ if (fd < 0) { -+ errno = EBADF; -+ return (-1); -+ } -+ lock.l_type = F_WRLCK; -+ lock.l_whence = SEEK_SET; -+ lock.l_start = 0; -+ lock.l_len = 0; -+ -+ if (fcntl(fd, F_SETLK, &lock) < 0) { -+ if ((errno == EACCES) || (errno == EAGAIN)) -+ return (1); -+ -+ return (-1); -+ } -+ return (0); -+} -+ -+/* -+ * Release an advisory lock held on the open file descriptor [fd]. -+ * Return 0 on success, or -1 on error (with errno set). -+ */ -+int -+zed_file_unlock(int fd) -+{ -+ struct flock lock; -+ -+ if (fd < 0) { -+ errno = EBADF; -+ return (-1); -+ } -+ lock.l_type = F_UNLCK; -+ lock.l_whence = SEEK_SET; -+ lock.l_start = 0; -+ lock.l_len = 0; -+ -+ if (fcntl(fd, F_SETLK, &lock) < 0) -+ return (-1); -+ -+ return (0); -+} -+ -+/* -+ * Test whether an exclusive advisory lock could be obtained for the open -+ * file descriptor [fd]. -+ * Return 0 if the file is not locked, >0 for the pid of another process -+ * holding a conflicting lock, or -1 on error (with errno set). -+ */ -+pid_t -+zed_file_is_locked(int fd) -+{ -+ struct flock lock; -+ -+ if (fd < 0) { -+ errno = EBADF; -+ return (-1); -+ } -+ lock.l_type = F_WRLCK; -+ lock.l_whence = SEEK_SET; -+ lock.l_start = 0; -+ lock.l_len = 0; -+ -+ if (fcntl(fd, F_GETLK, &lock) < 0) -+ return (-1); -+ -+ if (lock.l_type == F_UNLCK) -+ return (0); -+ -+ return (lock.l_pid); -+} -+ -+/* -+ * Close all open file descriptors greater than or equal to [lowfd]. -+ * Any errors encountered while closing file descriptors are ignored. -+ */ -+void -+zed_file_close_from(int lowfd) -+{ -+ const int maxfd_def = 256; -+ int errno_bak; -+ struct rlimit rl; -+ int maxfd; -+ int fd; -+ -+ errno_bak = errno; -+ -+ if (getrlimit(RLIMIT_NOFILE, &rl) < 0) { -+ maxfd = maxfd_def; -+ } else if (rl.rlim_max == RLIM_INFINITY) { -+ maxfd = maxfd_def; -+ } else { -+ maxfd = rl.rlim_max; -+ } -+ for (fd = lowfd; fd < maxfd; fd++) -+ (void) close(fd); -+ -+ errno = errno_bak; -+} -+ -+/* -+ * Set the CLOEXEC flag on file descriptor [fd] so it will be automatically -+ * closed upon successful execution of one of the exec functions. -+ * Return 0 on success, or -1 on error. -+ * FIXME: No longer needed? 
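zed_file_lock() and zed_file_is_locked() above are the classic fcntl() advisory-lock pair: take a whole-file write lock with F_SETLK, and when that fails with EACCES or EAGAIN, query F_GETLK for the pid holding the conflicting lock. zed_conf_open_state() earlier uses exactly this to keep a second zed instance from starting. A self-contained sketch of the same pattern (the /tmp path is an assumption for demonstration):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
        int fd = open("/tmp/example.lock", O_RDWR | O_CREAT, 0644);

        if (fd < 0) {
                perror("open");
                return (1);
        }
        if (fcntl(fd, F_SETLK, &fl) < 0) {
                if ((errno == EACCES) || (errno == EAGAIN)) {
                        (void) fcntl(fd, F_GETLK, &fl);
                        fprintf(stderr, "already locked by pid %d\n",
                            (int) fl.l_pid);
                }
                return (1);
        }
        printf("lock held by pid %d\n", (int) getpid());
        pause();        /* hold the lock until killed */
        return (0);
}

Running a second copy while the first is paused prints the holder's pid, which is the same diagnostic zed_conf_open_state() logs when its state file is already locked.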
-+ */ -+int -+zed_file_close_on_exec(int fd) -+{ -+ int flags; -+ -+ if (fd < 0) { -+ errno = EBADF; -+ return (-1); -+ } -+ flags = fcntl(fd, F_GETFD); -+ if (flags == -1) -+ return (-1); -+ -+ flags |= FD_CLOEXEC; -+ -+ if (fcntl(fd, F_SETFD, flags) == -1) -+ return (-1); -+ -+ return (0); -+} -diff --git a/cmd/zed/zed_file.h b/cmd/zed/zed_file.h -new file mode 100644 -index 0000000..df70201 ---- /dev/null -+++ b/cmd/zed/zed_file.h -@@ -0,0 +1,47 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_FILE_H -+#define ZED_FILE_H -+ -+#include -+#include -+ -+ssize_t zed_file_read_n(int fd, void *buf, size_t n); -+ -+ssize_t zed_file_write_n(int fd, void *buf, size_t n); -+ -+int zed_file_lock(int fd); -+ -+int zed_file_unlock(int fd); -+ -+pid_t zed_file_is_locked(int fd); -+ -+void zed_file_close_from(int fd); -+ -+int zed_file_close_on_exec(int fd); -+ -+#endif /* !ZED_FILE_H */ -diff --git a/cmd/zed/zed_log.c b/cmd/zed/zed_log.c -new file mode 100644 -index 0000000..bc432bc ---- /dev/null -+++ b/cmd/zed/zed_log.c -@@ -0,0 +1,171 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed_log.h" -+ -+#define ZED_LOG_MAX_ID_LEN 64 -+#define ZED_LOG_MAX_LOG_LEN 1024 -+ -+static struct { -+ unsigned do_stderr:1; -+ unsigned do_syslog:1; -+ int level; -+ char id[ZED_LOG_MAX_ID_LEN]; -+} _ctx; -+ -+void -+zed_log_init(const char *identity) -+{ -+ const char *p; -+ -+ if (identity) { -+ p = (p = strrchr(identity, '/')) ? 
p + 1 : identity; -+ strlcpy(_ctx.id, p, sizeof (_ctx.id)); -+ } else { -+ _ctx.id[0] = '\0'; -+ } -+} -+ -+void -+zed_log_fini() -+{ -+ if (_ctx.do_syslog) { -+ closelog(); -+ } -+} -+ -+void -+zed_log_stderr_open(int level) -+{ -+ _ctx.do_stderr = 1; -+ _ctx.level = level; -+} -+ -+void -+zed_log_stderr_close(void) -+{ -+ _ctx.do_stderr = 0; -+} -+ -+void -+zed_log_syslog_open(int facility) -+{ -+ const char *identity; -+ -+ _ctx.do_syslog = 1; -+ identity = (_ctx.id[0] == '\0') ? NULL : _ctx.id; -+ openlog(identity, LOG_NDELAY, facility); -+} -+ -+void -+zed_log_syslog_close(void) -+{ -+ _ctx.do_syslog = 0; -+ closelog(); -+} -+ -+static void -+_zed_log_aux(int priority, const char *fmt, va_list vargs) -+{ -+ char buf[ZED_LOG_MAX_LOG_LEN]; -+ char *syslogp; -+ char *p; -+ int len; -+ int n; -+ -+ assert(fmt != NULL); -+ -+ syslogp = NULL; -+ p = buf; -+ len = sizeof (buf); -+ -+ if (_ctx.id[0] != '\0') { -+ n = snprintf(p, len, "%s: ", _ctx.id); -+ if ((n < 0) || (n >= len)) { -+ p += len - 1; -+ len = 0; -+ } else { -+ p += n; -+ len -= n; -+ } -+ } -+ if ((len > 0) && fmt) { -+ syslogp = p; -+ n = vsnprintf(p, len, fmt, vargs); -+ if ((n < 0) || (n >= len)) { -+ p += len - 1; -+ len = 0; -+ } else { -+ p += n; -+ len -= n; -+ } -+ } -+ *p = '\0'; -+ -+ if (_ctx.do_syslog && syslogp) -+ syslog(priority, "%s", syslogp); -+ -+ if (_ctx.do_stderr && priority <= _ctx.level) -+ fprintf(stderr, "%s\n", buf); -+} -+ -+/* -+ * Log a message at the given [priority] level specified by the printf-style -+ * format string [fmt]. -+ */ -+void -+zed_log_msg(int priority, const char *fmt, ...) -+{ -+ va_list vargs; -+ -+ if (fmt) { -+ va_start(vargs, fmt); -+ _zed_log_aux(priority, fmt, vargs); -+ va_end(vargs); -+ } -+} -+ -+/* -+ * Log a fatal error message specified by the printf-style format string [fmt]. -+ */ -+void -+zed_log_die(const char *fmt, ...) -+{ -+ va_list vargs; -+ -+ if (fmt) { -+ va_start(vargs, fmt); -+ _zed_log_aux(LOG_ERR, fmt, vargs); -+ va_end(vargs); -+ } -+ exit(EXIT_FAILURE); -+} -diff --git a/cmd/zed/zed_log.h b/cmd/zed/zed_log.h -new file mode 100644 -index 0000000..7ae4549 ---- /dev/null -+++ b/cmd/zed/zed_log.h -@@ -0,0 +1,48 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. 
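A note on intended use of the logging module above: zed_log_init() records the program identity (the basename of argv[0]), zed_log_stderr_open() and zed_log_syslog_open() select the output sinks, and each zed_log_msg()/zed_log_die() call then goes to every open sink, with zed_log_die() also exiting with EXIT_FAILURE. The wiring below is an assumption about typical use (the real call sequence is in zed.c, outside this hunk); it compiles against the zed_log.h whose declarations follow just below and links with zed_log.c.

#include <stdlib.h>
#include <syslog.h>
#include "zed_log.h"    /* from this patch; link with zed_log.c */

int
main(int argc, char **argv)
{
        zed_log_init(argv[0]);          /* basename becomes the syslog ident */
        zed_log_stderr_open(LOG_INFO);  /* echo LOG_INFO and more severe */
        zed_log_syslog_open(LOG_DAEMON);

        zed_log_msg(LOG_NOTICE, "started with %d arg(s)", argc - 1);
        if (argc > 2)
                zed_log_die("too many arguments");      /* logs, then exits */

        zed_log_syslog_close();
        zed_log_stderr_close();
        zed_log_fini();
        return (EXIT_SUCCESS);
}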
-+ */ -+ -+#ifndef ZED_LOG_H -+#define ZED_LOG_H -+ -+#include -+ -+void zed_log_init(const char *identity); -+ -+void zed_log_fini(void); -+ -+void zed_log_stderr_open(int level); -+ -+void zed_log_stderr_close(void); -+ -+void zed_log_syslog_open(int facility); -+ -+void zed_log_syslog_close(void); -+ -+void zed_log_msg(int priority, const char *fmt, ...); -+ -+void zed_log_die(const char *fmt, ...); -+ -+#endif /* !ZED_LOG_H */ -diff --git a/cmd/zed/zed_strings.c b/cmd/zed/zed_strings.c -new file mode 100644 -index 0000000..05a3740 ---- /dev/null -+++ b/cmd/zed/zed_strings.c -@@ -0,0 +1,200 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed_strings.h" -+ -+struct zed_strings { -+ avl_tree_t tree; -+ avl_node_t *iteratorp; -+}; -+ -+struct zed_strings_node { -+ avl_node_t node; -+ char string[]; -+}; -+ -+typedef struct zed_strings_node zed_strings_node_t; -+ -+/* -+ * Compare zed_strings_node_t nodes [x1] and [x2]. -+ * As required for the AVL tree, return exactly -+ * -1 for <, 0 for ==, and +1 for >. -+ */ -+static int -+_zed_strings_node_compare(const void *x1, const void *x2) -+{ -+ const char *s1; -+ const char *s2; -+ int rv; -+ -+ assert(x1 != NULL); -+ assert(x2 != NULL); -+ -+ s1 = ((const zed_strings_node_t *) x1)->string; -+ assert(s1 != NULL); -+ s2 = ((const zed_strings_node_t *) x2)->string; -+ assert(s2 != NULL); -+ rv = strcmp(s1, s2); -+ -+ if (rv < 0) -+ return (-1); -+ -+ if (rv > 0) -+ return (1); -+ -+ return (0); -+} -+ -+/* -+ * Return a new string container, or NULL on error. -+ */ -+zed_strings_t * -+zed_strings_create(void) -+{ -+ zed_strings_t *zsp; -+ -+ zsp = malloc(sizeof (*zsp)); -+ if (!zsp) -+ return (NULL); -+ -+ memset(zsp, 0, sizeof (*zsp)); -+ avl_create(&zsp->tree, _zed_strings_node_compare, -+ sizeof (zed_strings_node_t), offsetof(zed_strings_node_t, node)); -+ -+ zsp->iteratorp = NULL; -+ return (zsp); -+} -+ -+/* -+ * Destroy the string container [zsp] and all strings within. -+ */ -+void -+zed_strings_destroy(zed_strings_t *zsp) -+{ -+ void *cookie; -+ zed_strings_node_t *np; -+ -+ if (!zsp) -+ return; -+ -+ cookie = NULL; -+ while ((np = avl_destroy_nodes(&zsp->tree, &cookie))) -+ free(np); -+ -+ avl_destroy(&zsp->tree); -+ free(zsp); -+} -+ -+/* -+ * Add a copy of the string [s] to the container [zsp]. -+ * Return 0 on success, or -1 on error. -+ * FIXME: Handle dup strings. 
-+ */ -+int -+zed_strings_add(zed_strings_t *zsp, const char *s) -+{ -+ size_t len; -+ zed_strings_node_t *np; -+ -+ if (!zsp || !s) { -+ errno = EINVAL; -+ return (-1); -+ } -+ len = sizeof (zed_strings_node_t) + strlen(s) + 1; -+ np = malloc(len); -+ if (!np) -+ return (-1); -+ -+ memset(np, 0, len); -+ assert((char *) np->string + strlen(s) < (char *) np + len); -+ (void) strcpy(np->string, s); -+ avl_add(&zsp->tree, np); -+ return (0); -+} -+ -+/* -+ * Return the first string in container [zsp]. -+ * Return NULL if there are no strings, or on error. -+ * This can be called multiple times to re-traverse [zsp]. -+ * XXX: Not thread-safe. -+ */ -+const char * -+zed_strings_first(zed_strings_t *zsp) -+{ -+ if (!zsp) { -+ errno = EINVAL; -+ return (NULL); -+ } -+ zsp->iteratorp = avl_first(&zsp->tree); -+ if (!zsp->iteratorp) -+ return (NULL); -+ -+ return (((zed_strings_node_t *) zsp->iteratorp)->string); -+ -+} -+ -+/* -+ * Return the next string in container [zsp]. -+ * Return NULL after the last string, or on error. -+ * This must be called after zed_strings_first(). -+ * XXX: Not thread-safe. -+ */ -+const char * -+zed_strings_next(zed_strings_t *zsp) -+{ -+ if (!zsp) { -+ errno = EINVAL; -+ return (NULL); -+ } -+ if (!zsp->iteratorp) -+ return (NULL); -+ -+ zsp->iteratorp = AVL_NEXT(&zsp->tree, zsp->iteratorp); -+ if (!zsp->iteratorp) -+ return (NULL); -+ -+ return (((zed_strings_node_t *)zsp->iteratorp)->string); -+} -+ -+/* -+ * Return the number of strings in container [zsp], or -1 on error. -+ */ -+int -+zed_strings_count(zed_strings_t *zsp) -+{ -+ if (!zsp) { -+ errno = EINVAL; -+ return (-1); -+ } -+ return (avl_numnodes(&zsp->tree)); -+} -diff --git a/cmd/zed/zed_strings.h b/cmd/zed/zed_strings.h -new file mode 100644 -index 0000000..c1ea804 ---- /dev/null -+++ b/cmd/zed/zed_strings.h -@@ -0,0 +1,44 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. 
-+ */ -+ -+#ifndef ZED_STRINGS_H -+#define ZED_STRINGS_H -+ -+typedef struct zed_strings zed_strings_t; -+ -+zed_strings_t * zed_strings_create(void); -+ -+void zed_strings_destroy(zed_strings_t *zsp); -+ -+int zed_strings_add(zed_strings_t *zsp, const char *s); -+ -+const char * zed_strings_first(zed_strings_t *zsp); -+ -+const char * zed_strings_next(zed_strings_t *zsp); -+ -+int zed_strings_count(zed_strings_t *zsp); -+ -+#endif /* !ZED_STRINGS_H */ -diff --git a/cmd/zfs/Makefile.am b/cmd/zfs/Makefile.am -index 8f381f1..08580c9 100644 ---- a/cmd/zfs/Makefile.am -+++ b/cmd/zfs/Makefile.am -@@ -18,4 +18,6 @@ zfs_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --zfs_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+zfs_LDADD += $(ZLIB) -+zfs_LDFLAGS = -pthread -diff --git a/cmd/zfs/zfs_iter.c b/cmd/zfs/zfs_iter.c -index 6239a8f..8892d91 100644 ---- a/cmd/zfs/zfs_iter.c -+++ b/cmd/zfs/zfs_iter.c -@@ -20,2 +20,3 @@ - */ -+ - /* -@@ -23,2 +24,3 @@ - * Copyright (c) 2012 Pawel Jakub Dawidek . -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -110,3 +112,4 @@ zfs_callback(zfs_handle_t *zhp, void *data) - if (zfs_expand_proplist(zhp, cb->cb_proplist, -- (cb->cb_flags & ZFS_ITER_RECVD_PROPS)) -+ (cb->cb_flags & ZFS_ITER_RECVD_PROPS), -+ (cb->cb_flags & ZFS_ITER_LITERAL_PROPS)) - != 0) { -@@ -312,4 +315,4 @@ zfs_sort(const void *larg, const void *rarg, void *data) - -- (void) strlcpy(lbuf, zfs_get_name(l), sizeof(lbuf)); -- (void) strlcpy(rbuf, zfs_get_name(r), sizeof(rbuf)); -+ (void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf)); -+ (void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf)); - -diff --git a/cmd/zfs/zfs_iter.h b/cmd/zfs/zfs_iter.h -index 7f740e7..2697fbd 100644 ---- a/cmd/zfs/zfs_iter.h -+++ b/cmd/zfs/zfs_iter.h -@@ -20,2 +20,3 @@ - */ -+ - /* -@@ -23,2 +24,3 @@ - * Use is subject to license terms. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -45,3 +47,4 @@ typedef struct zfs_sort_column { - #define ZFS_ITER_RECVD_PROPS (1 << 4) --#define ZFS_ITER_SIMPLE (1 << 5) -+#define ZFS_ITER_LITERAL_PROPS (1 << 5) -+#define ZFS_ITER_SIMPLE (1 << 6) - -diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c -index 5753cce..d7c1a2a 100644 ---- a/cmd/zfs/zfs_main.c -+++ b/cmd/zfs/zfs_main.c -@@ -23,5 +23,6 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2012 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -57,2 +58,3 @@ - #include -+#include - #include -@@ -74,2 +76,3 @@ static FILE *mnttab_file; - static char history_str[HIS_MAX_RECORD_LEN]; -+static boolean_t log_history = B_TRUE; - -@@ -233,6 +236,5 @@ get_usage(zfs_help_t idx) - case HELP_LIST: -- return (gettext("\tlist [-rH][-d max] " -- "[-o property[,...]] [-t type[,...]] [-s property] ...\n" -- "\t [-S property] ... " -- "[filesystem|volume|snapshot|snap] ...\n")); -+ return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " -+ "[-s property]...\n\t [-S property]... 
[-t type[,...]] " -+ "[filesystem|volume|snapshot] ...\n")); - case HELP_MOUNT: -@@ -263,3 +265,3 @@ get_usage(zfs_help_t idx) - return (gettext("\tsnapshot|snap [-r] [-o property=value] ... " -- "\n")); -+ " ...\n")); - case HELP_UNMOUNT: -@@ -292,8 +294,8 @@ get_usage(zfs_help_t idx) - return (gettext("\tuserspace [-Hinp] [-o field[,...]] " -- "[-s field] ...\n\t[-S field] ... " -- "[-t type[,...]] \n")); -+ "[-s field]...\n\t [-S field]... [-t type[,...]] " -+ "\n")); - case HELP_GROUPSPACE: - return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " -- "[-s field] ...\n\t[-S field] ... " -- "[-t type[,...]] \n")); -+ "[-s field]...\n\t [-S field]... [-t type[,...]] " -+ "\n")); - case HELP_HOLD: -@@ -649,2 +651,7 @@ zfs_do_clone(int argc, char **argv) - -+ if (log_history) { -+ (void) zpool_log_history(g_zfs, history_str); -+ log_history = B_FALSE; -+ } -+ - clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET); -@@ -828,2 +835,7 @@ zfs_do_create(int argc, char **argv) - -+ if (log_history) { -+ (void) zpool_log_history(g_zfs, history_str); -+ log_history = B_FALSE; -+ } -+ - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) -@@ -892,7 +904,8 @@ typedef struct destroy_cbdata { - nvlist_t *cb_nvl; -+ nvlist_t *cb_batchedsnaps; - - /* first snap in contiguous run */ -- zfs_handle_t *cb_firstsnap; -+ char *cb_firstsnap; - /* previous snap in contiguous run */ -- zfs_handle_t *cb_prevsnap; -+ char *cb_prevsnap; - int64_t cb_snapused; -@@ -988,5 +1001,23 @@ destroy_callback(zfs_handle_t *zhp, void *data) - } -+ if (cb->cb_dryrun) { -+ zfs_close(zhp); -+ return (0); -+ } - -- if (!cb->cb_dryrun) { -- if (zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 || -+ /* -+ * We batch up all contiguous snapshots (even of different -+ * filesystems) and destroy them with one ioctl. We can't -+ * simply do all snap deletions and then all fs deletions, -+ * because we must delete a clone before its origin. -+ */ -+ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { -+ fnvlist_add_boolean(cb->cb_batchedsnaps, name); -+ } else { -+ int error = zfs_destroy_snaps_nvl(g_zfs, -+ cb->cb_batchedsnaps, B_FALSE); -+ fnvlist_free(cb->cb_batchedsnaps); -+ cb->cb_batchedsnaps = fnvlist_alloc(); -+ -+ if (error != 0 || -+ zfs_unmount(zhp, NULL, cb->cb_force ? 
MS_FORCE : 0) != 0 || - zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { -@@ -1010,7 +1041,9 @@ destroy_print_cb(zfs_handle_t *zhp, void *arg) - if (cb->cb_firstsnap == NULL) -- cb->cb_firstsnap = zfs_handle_dup(zhp); -+ cb->cb_firstsnap = strdup(name); - if (cb->cb_prevsnap != NULL) -- zfs_close(cb->cb_prevsnap); -+ free(cb->cb_prevsnap); - /* this snap continues the current range */ -- cb->cb_prevsnap = zfs_handle_dup(zhp); -+ cb->cb_prevsnap = strdup(name); -+ if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) -+ nomem(); - if (cb->cb_verbose) { -@@ -1029,8 +1062,8 @@ destroy_print_cb(zfs_handle_t *zhp, void *arg) - uint64_t used = 0; -- err = zfs_get_snapused_int(cb->cb_firstsnap, -+ err = lzc_snaprange_space(cb->cb_firstsnap, - cb->cb_prevsnap, &used); - cb->cb_snapused += used; -- zfs_close(cb->cb_firstsnap); -+ free(cb->cb_firstsnap); - cb->cb_firstsnap = NULL; -- zfs_close(cb->cb_prevsnap); -+ free(cb->cb_prevsnap); - cb->cb_prevsnap = NULL; -@@ -1051,3 +1084,3 @@ destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) - if (err == 0) { -- err = zfs_get_snapused_int(cb->cb_firstsnap, -+ err = lzc_snaprange_space(cb->cb_firstsnap, - cb->cb_prevsnap, &used); -@@ -1055,5 +1088,5 @@ destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) - cb->cb_snapused += used; -- zfs_close(cb->cb_firstsnap); -+ free(cb->cb_firstsnap); - cb->cb_firstsnap = NULL; -- zfs_close(cb->cb_prevsnap); -+ free(cb->cb_prevsnap); - cb->cb_prevsnap = NULL; -@@ -1144,4 +1177,6 @@ zfs_do_destroy(int argc, char **argv) - destroy_cbdata_t cb = { 0 }; -+ int rv = 0; -+ int err = 0; - int c; -- zfs_handle_t *zhp; -+ zfs_handle_t *zhp = NULL; - char *at; -@@ -1199,7 +1234,5 @@ zfs_do_destroy(int argc, char **argv) - if (at != NULL) { -- int err = 0; - - /* Build the list of snaps to destroy in cb_nvl. 
*/ -- if (nvlist_alloc(&cb.cb_nvl, NV_UNIQUE_NAME, 0) != 0) -- nomem(); -+ cb.cb_nvl = fnvlist_alloc(); - -@@ -1214,5 +1247,4 @@ zfs_do_destroy(int argc, char **argv) - cb.cb_error) { -- zfs_close(zhp); -- nvlist_free(cb.cb_nvl); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1222,5 +1254,4 @@ zfs_do_destroy(int argc, char **argv) - "snapshots to destroy; check snapshot names.\n")); -- zfs_close(zhp); -- nvlist_free(cb.cb_nvl); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1243,6 +1274,16 @@ zfs_do_destroy(int argc, char **argv) - if (!cb.cb_dryrun) { -- if (cb.cb_doclones) -+ if (cb.cb_doclones) { -+ cb.cb_batchedsnaps = fnvlist_alloc(); - err = destroy_clones(&cb); -+ if (err == 0) { -+ err = zfs_destroy_snaps_nvl(g_zfs, -+ cb.cb_batchedsnaps, B_FALSE); -+ } -+ if (err != 0) { -+ rv = 1; -+ goto out; -+ } -+ } - if (err == 0) { -- err = zfs_destroy_snaps_nvl(zhp, cb.cb_nvl, -+ err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, - cb.cb_defer_destroy); -@@ -1251,6 +1292,4 @@ zfs_do_destroy(int argc, char **argv) - -- zfs_close(zhp); -- nvlist_free(cb.cb_nvl); - if (err != 0) -- return (1); -+ rv = 1; - } else { -@@ -1275,4 +1314,4 @@ zfs_do_destroy(int argc, char **argv) - "to destroy the pool itself\n"), zfs_get_name(zhp)); -- zfs_close(zhp); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1286,4 +1325,4 @@ zfs_do_destroy(int argc, char **argv) - &cb) != 0) { -- zfs_close(zhp); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1291,10 +1330,11 @@ zfs_do_destroy(int argc, char **argv) - if (cb.cb_error) { -- zfs_close(zhp); -- return (1); -+ rv = 1; -+ goto out; - } - -+ cb.cb_batchedsnaps = fnvlist_alloc(); - if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, - &cb) != 0) { -- zfs_close(zhp); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1305,7 +1345,18 @@ zfs_do_destroy(int argc, char **argv) - */ -- if (destroy_callback(zhp, &cb) != 0) -- return (1); -+ err = destroy_callback(zhp, &cb); -+ zhp = NULL; -+ if (err == 0) { -+ err = zfs_destroy_snaps_nvl(g_zfs, -+ cb.cb_batchedsnaps, cb.cb_defer_destroy); -+ } -+ if (err != 0) -+ rv = 1; - } - -- return (0); -+out: -+ fnvlist_free(cb.cb_batchedsnaps); -+ fnvlist_free(cb.cb_nvl); -+ if (zhp != NULL) -+ zfs_close(zhp); -+ return (rv); - } -@@ -1910,5 +1961,7 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data) - * be doing ioctls to different pools. We need -- * to log this history once to each pool. -+ * to log this history once to each pool, and bypass -+ * the normal history logging that happens in main(). - */ -- verify(zpool_stage_history(g_zfs, history_str) == 0); -+ (void) zpool_log_history(g_zfs, history_str); -+ log_history = B_FALSE; - } -@@ -2056,3 +2109,3 @@ zfs_do_upgrade(int argc, char **argv) - * -o Control which fields to display. -- * -p Use exact (parseable) numeric output. -+ * -p Use exact (parsable) numeric output. - * -s Specify sort columns, descending order. -@@ -2090,3 +2143,3 @@ static int us_type_bits[] = { - }; --static char *us_type_names[] = { "posixgroup", "posxiuser", "smbgroup", -+static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup", - "smbuser", "all" }; -@@ -2746,15 +2799,15 @@ zfs_do_userspace(int argc, char **argv) - /* -- * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...] -- * [-s property [-s property]...] [-S property [-S property]...] -- * ... -+ * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] -+ * [-t type[,...]] [filesystem|volume|snapshot] ... 
- * -+ * -H Scripted mode; elide headers and separate columns by tabs -+ * -p Display values in parsable (literal) format. - * -r Recurse over all children - * -d Limit recursion by depth. -- * -H Scripted mode; elide headers and separate columns by tabs - * -o Control which fields to display. -- * -t Control which object types to display. - * -s Specify sort columns, descending order. - * -S Specify sort columns, ascending order. -+ * -t Control which object types to display. - * -- * When given no arguments, lists all filesystems in the system. -+ * When given no arguments, list all filesystems in the system. - * Otherwise, list the specified datasets, optionally recursing down them if -@@ -2764,2 +2817,3 @@ typedef struct list_cbdata { - boolean_t cb_first; -+ boolean_t cb_literal; - boolean_t cb_scripted; -@@ -2772,4 +2826,5 @@ typedef struct list_cbdata { - static void --print_header(zprop_list_t *pl) -+print_header(list_cbdata_t *cb) - { -+ zprop_list_t *pl = cb->cb_proplist; - char headerbuf[ZFS_MAXPROPLEN]; -@@ -2814,4 +2869,5 @@ print_header(zprop_list_t *pl) - static void --print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) -+print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) - { -+ zprop_list_t *pl = cb->cb_proplist; - boolean_t first = B_TRUE; -@@ -2822,3 +2878,2 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - boolean_t right_justify; -- int width; - -@@ -2826,3 +2881,3 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - if (!first) { -- if (scripted) -+ if (cb->cb_scripted) - (void) printf("\t"); -@@ -2836,3 +2891,3 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - (void) strlcpy(property, zfs_get_name(zhp), -- sizeof(property)); -+ sizeof (property)); - propstr = property; -@@ -2841,3 +2896,4 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - if (zfs_prop_get(zhp, pl->pl_prop, property, -- sizeof (property), NULL, NULL, 0, B_FALSE) != 0) -+ sizeof (property), NULL, NULL, 0, -+ cb->cb_literal) != 0) - propstr = "-"; -@@ -2845,3 +2901,2 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - propstr = property; -- - right_justify = zfs_prop_align_right(pl->pl_prop); -@@ -2849,3 +2904,3 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, -- property, sizeof (property), B_FALSE) != 0) -+ property, sizeof (property), cb->cb_literal) != 0) - propstr = "-"; -@@ -2856,3 +2911,3 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - if (zfs_prop_get_written(zhp, pl->pl_user_prop, -- property, sizeof (property), B_FALSE) != 0) -+ property, sizeof (property), cb->cb_literal) != 0) - propstr = "-"; -@@ -2871,4 +2926,2 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - -- width = pl->pl_width; -- - /* -@@ -2878,8 +2931,8 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - */ -- if (scripted || (pl->pl_next == NULL && !right_justify)) -+ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) - (void) printf("%s", propstr); - else if (right_justify) -- (void) printf("%*s", width, propstr); -+ (void) printf("%*s", (int)pl->pl_width, propstr); - else -- (void) printf("%-*s", width, propstr); -+ (void) printf("%-*s", (int)pl->pl_width, propstr); - } -@@ -2899,3 +2952,3 @@ list_callback(zfs_handle_t *zhp, void *data) - if (!cbp->cb_scripted) -- print_header(cbp->cb_proplist); -+ print_header(cbp); - 
cbp->cb_first = B_FALSE; -@@ -2903,3 +2956,3 @@ list_callback(zfs_handle_t *zhp, void *data) - -- print_dataset(zhp, cbp->cb_proplist, cbp->cb_scripted); -+ print_dataset(zhp, cbp); - -@@ -2912,3 +2965,2 @@ zfs_do_list(int argc, char **argv) - int c; -- boolean_t scripted = B_FALSE; - static char default_fields[] = -@@ -2926,3 +2978,3 @@ zfs_do_list(int argc, char **argv) - /* check options */ -- while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) { -+ while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { - switch (c) { -@@ -2931,2 +2983,6 @@ zfs_do_list(int argc, char **argv) - break; -+ case 'p': -+ cb.cb_literal = B_TRUE; -+ flags |= ZFS_ITER_LITERAL_PROPS; -+ break; - case 'd': -@@ -2938,3 +2994,3 @@ zfs_do_list(int argc, char **argv) - case 'H': -- scripted = B_TRUE; -+ cb.cb_scripted = B_TRUE; - break; -@@ -3028,3 +3084,2 @@ zfs_do_list(int argc, char **argv) - -- cb.cb_scripted = scripted; - cb.cb_first = B_TRUE; -@@ -3424,2 +3479,34 @@ zfs_do_set(int argc, char **argv) - -+typedef struct snap_cbdata { -+ nvlist_t *sd_nvl; -+ boolean_t sd_recursive; -+ const char *sd_snapname; -+} snap_cbdata_t; -+ -+static int -+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) -+{ -+ snap_cbdata_t *sd = arg; -+ char *name; -+ int rv = 0; -+ int error; -+ -+ if (sd->sd_recursive && -+ zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { -+ zfs_close(zhp); -+ return (0); -+ } -+ -+ error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); -+ if (error == -1) -+ nomem(); -+ fnvlist_add_boolean(sd->sd_nvl, name); -+ free(name); -+ -+ if (sd->sd_recursive) -+ rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); -+ zfs_close(zhp); -+ return (rv); -+} -+ - /* -@@ -3433,3 +3520,2 @@ zfs_do_snapshot(int argc, char **argv) - { -- boolean_t recursive = B_FALSE; - int ret = 0; -@@ -3437,2 +3523,4 @@ zfs_do_snapshot(int argc, char **argv) - nvlist_t *props; -+ snap_cbdata_t sd = { 0 }; -+ boolean_t multiple_snaps = B_FALSE; - -@@ -3440,2 +3528,4 @@ zfs_do_snapshot(int argc, char **argv) - nomem(); -+ if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) -+ nomem(); - -@@ -3449,3 +3539,4 @@ zfs_do_snapshot(int argc, char **argv) - case 'r': -- recursive = B_TRUE; -+ sd.sd_recursive = B_TRUE; -+ multiple_snaps = B_TRUE; - break; -@@ -3466,10 +3557,26 @@ zfs_do_snapshot(int argc, char **argv) - } -- if (argc > 1) { -- (void) fprintf(stderr, gettext("too many arguments\n")); -- goto usage; -+ -+ if (argc > 1) -+ multiple_snaps = B_TRUE; -+ for (; argc > 0; argc--, argv++) { -+ char *atp; -+ zfs_handle_t *zhp; -+ -+ atp = strchr(argv[0], '@'); -+ if (atp == NULL) -+ goto usage; -+ *atp = '\0'; -+ sd.sd_snapname = atp + 1; -+ zhp = zfs_open(g_zfs, argv[0], -+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); -+ if (zhp == NULL) -+ goto usage; -+ if (zfs_snapshot_cb(zhp, &sd) != 0) -+ goto usage; - } - -- ret = zfs_snapshot(g_zfs, argv[0], recursive, props); -+ ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); -+ nvlist_free(sd.sd_nvl); - nvlist_free(props); -- if (ret && recursive) -+ if (ret != 0 && multiple_snaps) - (void) fprintf(stderr, gettext("no snapshots were created\n")); -@@ -3478,2 +3585,3 @@ zfs_do_snapshot(int argc, char **argv) - usage: -+ nvlist_free(sd.sd_nvl); - nvlist_free(props); -@@ -5030,10 +5138,2 @@ cleanup2: - --/* -- * zfs allow [-r] [-t] ... -- * -- * -r Recursively hold -- * -t Temporary hold (hidden option) -- * -- * Apply a user-hold with the given tag to the list of snapshots. 
-- */ - static int -@@ -5044,10 +5144,2 @@ zfs_do_allow(int argc, char **argv) - --/* -- * zfs unallow [-r] [-t] ... -- * -- * -r Recursively hold -- * -t Temporary hold (hidden option) -- * -- * Apply a user-hold with the given tag to the list of snapshots. -- */ - static int -@@ -5065,3 +5157,2 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) - boolean_t recursive = B_FALSE; -- boolean_t temphold = B_FALSE; - const char *opts = holding ? "rt" : "r"; -@@ -5075,5 +5166,2 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) - break; -- case 't': -- temphold = B_TRUE; -- break; - case '?': -@@ -5125,4 +5213,3 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) - if (holding) { -- if (zfs_hold(zhp, delim+1, tag, recursive, -- temphold, B_FALSE, -1, 0, 0) != 0) -+ if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) - ++errors; -@@ -5142,3 +5229,2 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) - * -r Recursively hold -- * -t Temporary hold (hidden option) - * -@@ -5778,3 +5864,7 @@ share_mount(int op, int argc, char **argv) - */ -- rewind(mnttab_file); -+ -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", mnttab_file) == NULL) -+ return (ENOENT); -+ - while (getmntent(mnttab_file, &entry) == 0) { -@@ -5881,3 +5971,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) - */ -- rewind(mnttab_file); -+ -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", mnttab_file) == NULL) -+ return (ENOENT); -+ - while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) { -@@ -6035,3 +6129,6 @@ unshare_unmount(int op, int argc, char **argv) - -- rewind(mnttab_file); -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", mnttab_file) == NULL) -+ return (ENOENT); -+ - while (getmntent(mnttab_file, &entry) == 0) { -@@ -6386,4 +6483,3 @@ main(int argc, char **argv) - -- zpool_set_history_str("zfs", argc, argv, history_str); -- verify(zpool_stage_history(g_zfs, history_str) == 0); -+ zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); - -@@ -6394,3 +6490,3 @@ main(int argc, char **argv) - */ -- libzfs_mnttab_cache(g_zfs, B_FALSE); -+ libzfs_mnttab_cache(g_zfs, B_TRUE); - if (find_command_idx(cmdname, &i) == 0) { -@@ -6408,2 +6504,6 @@ main(int argc, char **argv) - } -+ -+ if (ret == 0 && log_history) -+ (void) zpool_log_history(g_zfs, history_str); -+ - libzfs_fini(g_zfs); -diff --git a/cmd/zhack/Makefile.am b/cmd/zhack/Makefile.am -index 47da245..922aef9 100644 ---- a/cmd/zhack/Makefile.am -+++ b/cmd/zhack/Makefile.am -@@ -15,4 +15,5 @@ zhack_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --zhack_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+zhack_LDADD += $(ZLIB) -diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c -index b2cf815..64ab8ed 100644 ---- a/cmd/zhack/zhack.c -+++ b/cmd/zhack/zhack.c -@@ -23,2 +23,3 @@ - * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. 
- */ -@@ -48,2 +49,3 @@ - #include -+#include - #undef ZFS_MAXNAMELEN -@@ -125,3 +127,3 @@ import_pool(const char *target, boolean_t readonly) - nvlist_t *props; -- const char *name; -+ char *name; - -@@ -153,3 +155,3 @@ import_pool(const char *target, boolean_t readonly) - -- if (pools == NULL || nvlist_next_nvpair(pools, NULL) == NULL) { -+ if (nvlist_empty(pools)) { - if (!g_importargs.can_be_active) { -@@ -275,8 +277,11 @@ zhack_do_feature_stat(int argc, char **argv) - static void --feature_enable_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+feature_enable_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - - spa_feature_enable(spa, feature, tx); -+ spa_history_log_internal(spa, "zhack enable feature", tx, -+ "name=%s can_readonly=%u", -+ feature->fi_guid, feature->fi_can_readonly); - } -@@ -343,4 +348,4 @@ zhack_do_feature_enable(int argc, char **argv) - -- VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL, -- feature_enable_sync, spa, &feature, 5)); -+ VERIFY0(dsl_sync_task(spa_name(spa), NULL, -+ feature_enable_sync, &feature, 5)); - -@@ -352,8 +357,10 @@ zhack_do_feature_enable(int argc, char **argv) - static void --feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+feature_incr_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - - spa_feature_incr(spa, feature, tx); -+ spa_history_log_internal(spa, "zhack feature incr", tx, -+ "name=%s", feature->fi_guid); - } -@@ -361,8 +368,10 @@ feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx) - static void --feature_decr_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+feature_decr_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - - spa_feature_decr(spa, feature, tx); -+ spa_history_log_internal(spa, "zhack feature decr", tx, -+ "name=%s", feature->fi_guid); - } -@@ -437,4 +446,4 @@ zhack_do_feature_ref(int argc, char **argv) - -- VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL, -- decr ? feature_decr_sync : feature_incr_sync, spa, &feature, 5)); -+ VERIFY0(dsl_sync_task(spa_name(spa), NULL, -+ decr ? 
feature_decr_sync : feature_incr_sync, &feature, 5)); - -diff --git a/cmd/zinject/Makefile.am b/cmd/zinject/Makefile.am -index d1d32d5..4adef11 100644 ---- a/cmd/zinject/Makefile.am -+++ b/cmd/zinject/Makefile.am -@@ -17,4 +17,3 @@ zinject_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -- --zinject_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la -diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c -index b2ccb67..5cc9d9f 100644 ---- a/cmd/zinject/translate.c -+++ b/cmd/zinject/translate.c -@@ -469,3 +469,3 @@ translate_device(const char *pool, const char *device, err_type_t label_type, - -- record->zi_guid = strtoull(device, &end, 16); -+ record->zi_guid = strtoull(device, &end, 0); - if (record->zi_guid == 0 || *end != '\0') { -diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c -index 13d067d..f6c8915 100644 ---- a/cmd/zinject/zinject.c -+++ b/cmd/zinject/zinject.c -@@ -297,7 +297,5 @@ iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - int ret; - -- zc.zc_guid = 0; -- - while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) -@@ -424,3 +422,3 @@ cancel_one_handler(int id, const char *pool, zinject_record_t *record, - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - -@@ -457,3 +455,3 @@ cancel_handler(int id) - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - -@@ -479,3 +477,3 @@ register_handler(const char *pool, int flags, zinject_record_t *record, - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - -@@ -536,3 +534,3 @@ perform_action(const char *pool, zinject_record_t *record, int cmd) - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - -diff --git a/cmd/zpios/zpios.h b/cmd/zpios/zpios.h -index 23c3237..92d96fc 100644 ---- a/cmd/zpios/zpios.h -+++ b/cmd/zpios/zpios.h -@@ -1,2 +1,2 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. -@@ -31,6 +31,6 @@ - * with ZPIOS. If not, see . --\*****************************************************************************/ -+ */ - - #ifndef _ZPIOS_H --#define _ZPIOS_H -+#define _ZPIOS_H - -@@ -38,27 +38,28 @@ - --#define VERSION_SIZE 64 -+#define VERSION_SIZE 64 - - /* Regular expressions */ --#define REGEX_NUMBERS "^[0-9]*[0-9]$" --#define REGEX_NUMBERS_COMMA "^([0-9]+,)*[0-9]+$" --#define REGEX_SIZE "^[0-9][0-9]*[kmgt]$" --#define REGEX_SIZE_COMMA "^([0-9][0-9]*[kmgt]+,)*[0-9][0-9]*[kmgt]$" -+#define REGEX_NUMBERS "^[0-9]*[0-9]$" -+#define REGEX_NUMBERS_COMMA "^([0-9]+,)*[0-9]+$" -+#define REGEX_SIZE "^[0-9][0-9]*[kmgt]$" -+#define REGEX_SIZE_COMMA "^([0-9][0-9]*[kmgt]+,)*[0-9][0-9]*[kmgt]$" - - /* Flags for low, high, incr */ --#define FLAG_SET 0x01 --#define FLAG_LOW 0x02 --#define FLAG_HIGH 0x04 --#define FLAG_INCR 0x08 -+#define FLAG_SET 0x01 -+#define FLAG_LOW 0x02 -+#define FLAG_HIGH 0x04 -+#define FLAG_INCR 0x08 - --#define TRUE 1 --#define FALSE 0 -+#define TRUE 1 -+#define FALSE 0 - --#define KB (1024) --#define MB (KB * 1024) --#define GB (MB * 1024) --#define TB (GB * 1024) -+#define KB (1024) -+#define MB (KB * 1024) -+#define GB (MB * 1024) -+#define TB (GB * 1024) - --#define KMGT_SIZE 16 -+#define KMGT_SIZE 16 - --/* All offsets, sizes and counts can be passed to the application in -+/* -+ * All offsets, sizes and counts can be passed to the application in - * multiple ways. 
-@@ -69,4 +70,4 @@ - typedef struct pios_range_repeat { -- uint64_t val[32]; /* Comma sep array, or low, high, inc */ -- uint64_t val_count; /* Num of values */ -+ uint64_t val[32]; /* Comma sep array, or low, high, inc */ -+ uint64_t val_count; /* Num of values */ - uint64_t val_low; -@@ -74,3 +75,3 @@ typedef struct pios_range_repeat { - uint64_t val_inc_perc; -- uint64_t next_val; /* Used for multiple runs in get_next() */ -+ uint64_t next_val; /* For multiple runs in get_next() */ - } range_repeat_t; -@@ -78,22 +79,22 @@ typedef struct pios_range_repeat { - typedef struct cmd_args { -- range_repeat_t T; /* Thread count */ -- range_repeat_t N; /* Region count */ -- range_repeat_t O; /* Offset count */ -- range_repeat_t C; /* Chunksize */ -- range_repeat_t S; /* Regionsize */ -- -- const char *pool; /* Pool */ -- const char *name; /* Name */ -- uint32_t flags; /* Flags */ -- uint32_t io_type; /* DMUIO only */ -- uint32_t verbose; /* Verbose */ -- uint32_t human_readable; /* Human readable output */ -- -- uint64_t regionnoise; /* Region noise */ -- uint64_t chunknoise; /* Chunk noise */ -- uint64_t thread_delay; /* Thread delay */ -- -- char pre[ZPIOS_PATH_SIZE]; /* Pre-exec hook */ -- char post[ZPIOS_PATH_SIZE]; /* Post-exec hook */ -- char log[ZPIOS_PATH_SIZE]; /* Requested log dir */ -+ range_repeat_t T; /* Thread count */ -+ range_repeat_t N; /* Region count */ -+ range_repeat_t O; /* Offset count */ -+ range_repeat_t C; /* Chunksize */ -+ range_repeat_t S; /* Regionsize */ -+ -+ const char *pool; /* Pool */ -+ const char *name; /* Name */ -+ uint32_t flags; /* Flags */ -+ uint32_t io_type; /* DMUIO only */ -+ uint32_t verbose; /* Verbose */ -+ uint32_t human_readable; /* Human readable output */ -+ -+ uint64_t regionnoise; /* Region noise */ -+ uint64_t chunknoise; /* Chunk noise */ -+ uint64_t thread_delay; /* Thread delay */ -+ -+ char pre[ZPIOS_PATH_SIZE]; /* Pre-exec hook */ -+ char post[ZPIOS_PATH_SIZE]; /* Post-exec hook */ -+ char log[ZPIOS_PATH_SIZE]; /* Requested log dir */ - -@@ -111,5 +112,5 @@ typedef struct cmd_args { - int set_count(char *pattern1, char *pattern2, range_repeat_t *range, -- char *optarg, uint32_t *flags, char *arg); -+ char *optarg, uint32_t *flags, char *arg); - int set_lhi(char *pattern, range_repeat_t *range, char *optarg, -- int flag, uint32_t *flag_thread, char *arg); -+ int flag, uint32_t *flag_thread, char *arg); - int set_noise(uint64_t *noise, char *optarg, char *arg); -diff --git a/cmd/zpios/zpios_main.c b/cmd/zpios/zpios_main.c -index 1c01d9a..971a886 100644 ---- a/cmd/zpios/zpios_main.c -+++ b/cmd/zpios/zpios_main.c -@@ -1,5 +1,5 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. - * It is designed to have the test code running in the Linux kernel -- * against ZFS while still being flexibly controled from user space. -+ * against ZFS while still being flexibly controlled from user space. - * -@@ -31,3 +31,3 @@ - * with ZPIOS. If not, see . 
--\*****************************************************************************/ -+ */ - -@@ -44,43 +44,44 @@ - --static const char short_opt[] = "t:l:h:e:n:i:j:k:o:m:q:r:c:a:b:g:s:A:B:C:" -- "L:p:M:xP:R:G:I:N:T:VzOfHv?"; -+static const char short_opt[] = -+ "t:l:h:e:n:i:j:k:o:m:q:r:c:a:b:g:s:A:B:C:" -+ "L:p:M:xP:R:G:I:N:T:VzOfHv?"; - static const struct option long_opt[] = { -- {"threadcount", required_argument, 0, 't' }, -- {"threadcount_low", required_argument, 0, 'l' }, -- {"threadcount_high", required_argument, 0, 'h' }, -- {"threadcount_incr", required_argument, 0, 'e' }, -- {"regioncount", required_argument, 0, 'n' }, -- {"regioncount_low", required_argument, 0, 'i' }, -- {"regioncount_high", required_argument, 0, 'j' }, -- {"regioncount_incr", required_argument, 0, 'k' }, -- {"offset", required_argument, 0, 'o' }, -- {"offset_low", required_argument, 0, 'm' }, -- {"offset_high", required_argument, 0, 'q' }, -- {"offset_incr", required_argument, 0, 'r' }, -- {"chunksize", required_argument, 0, 'c' }, -- {"chunksize_low", required_argument, 0, 'a' }, -- {"chunksize_high", required_argument, 0, 'b' }, -- {"chunksize_incr", required_argument, 0, 'g' }, -- {"regionsize", required_argument, 0, 's' }, -- {"regionsize_low", required_argument, 0, 'A' }, -- {"regionsize_high", required_argument, 0, 'B' }, -- {"regionsize_incr", required_argument, 0, 'C' }, -- {"load", required_argument, 0, 'L' }, -- {"pool", required_argument, 0, 'p' }, -- {"name", required_argument, 0, 'M' }, -- {"cleanup", no_argument, 0, 'x' }, -- {"prerun", required_argument, 0, 'P' }, -- {"postrun", required_argument, 0, 'R' }, -- {"log", required_argument, 0, 'G' }, -- {"regionnoise", required_argument, 0, 'I' }, -- {"chunknoise", required_argument, 0, 'N' }, -- {"threaddelay", required_argument, 0, 'T' }, -- {"verify", no_argument, 0, 'V' }, -- {"zerocopy", no_argument, 0, 'z' }, -- {"nowait", no_argument, 0, 'O' }, -- {"noprefetch", no_argument, 0, 'f' }, -- {"human-readable", no_argument, 0, 'H' }, -- {"verbose", no_argument, 0, 'v' }, -- {"help", no_argument, 0, '?' 
}, -- { 0, 0, 0, 0 }, -+ {"threadcount", required_argument, 0, 't' }, -+ {"threadcount_low", required_argument, 0, 'l' }, -+ {"threadcount_high", required_argument, 0, 'h' }, -+ {"threadcount_incr", required_argument, 0, 'e' }, -+ {"regioncount", required_argument, 0, 'n' }, -+ {"regioncount_low", required_argument, 0, 'i' }, -+ {"regioncount_high", required_argument, 0, 'j' }, -+ {"regioncount_incr", required_argument, 0, 'k' }, -+ {"offset", required_argument, 0, 'o' }, -+ {"offset_low", required_argument, 0, 'm' }, -+ {"offset_high", required_argument, 0, 'q' }, -+ {"offset_incr", required_argument, 0, 'r' }, -+ {"chunksize", required_argument, 0, 'c' }, -+ {"chunksize_low", required_argument, 0, 'a' }, -+ {"chunksize_high", required_argument, 0, 'b' }, -+ {"chunksize_incr", required_argument, 0, 'g' }, -+ {"regionsize", required_argument, 0, 's' }, -+ {"regionsize_low", required_argument, 0, 'A' }, -+ {"regionsize_high", required_argument, 0, 'B' }, -+ {"regionsize_incr", required_argument, 0, 'C' }, -+ {"load", required_argument, 0, 'L' }, -+ {"pool", required_argument, 0, 'p' }, -+ {"name", required_argument, 0, 'M' }, -+ {"cleanup", no_argument, 0, 'x' }, -+ {"prerun", required_argument, 0, 'P' }, -+ {"postrun", required_argument, 0, 'R' }, -+ {"log", required_argument, 0, 'G' }, -+ {"regionnoise", required_argument, 0, 'I' }, -+ {"chunknoise", required_argument, 0, 'N' }, -+ {"threaddelay", required_argument, 0, 'T' }, -+ {"verify", no_argument, 0, 'V' }, -+ {"zerocopy", no_argument, 0, 'z' }, -+ {"nowait", no_argument, 0, 'O' }, -+ {"noprefetch", no_argument, 0, 'f' }, -+ {"human-readable", no_argument, 0, 'H' }, -+ {"verbose", no_argument, 0, 'v' }, -+ {"help", no_argument, 0, '?' }, -+ { 0, 0, 0, 0 }, - }; -@@ -97,41 +98,41 @@ usage(void) - fprintf(stderr, -- " --threadcount -t =values\n" -- " --threadcount_low -l =value\n" -- " --threadcount_high -h =value\n" -- " --threadcount_incr -e =value\n" -- " --regioncount -n =values\n" -- " --regioncount_low -i =value\n" -- " --regioncount_high -j =value\n" -- " --regioncount_incr -k =value\n" -- " --offset -o =values\n" -- " --offset_low -m =value\n" -- " --offset_high -q =value\n" -- " --offset_incr -r =value\n" -- " --chunksize -c =values\n" -- " --chunksize_low -a =value\n" -- " --chunksize_high -b =value\n" -- " --chunksize_incr -g =value\n" -- " --regionsize -s =values\n" -- " --regionsize_low -A =value\n" -- " --regionsize_high -B =value\n" -- " --regionsize_incr -C =value\n" -- " --load -L =dmuio|ssf|fpp\n" -- " --pool -p =pool name\n" -+ " --threadcount -t =values\n" -+ " --threadcount_low -l =value\n" -+ " --threadcount_high -h =value\n" -+ " --threadcount_incr -e =value\n" -+ " --regioncount -n =values\n" -+ " --regioncount_low -i =value\n" -+ " --regioncount_high -j =value\n" -+ " --regioncount_incr -k =value\n" -+ " --offset -o =values\n" -+ " --offset_low -m =value\n" -+ " --offset_high -q =value\n" -+ " --offset_incr -r =value\n" -+ " --chunksize -c =values\n" -+ " --chunksize_low -a =value\n" -+ " --chunksize_high -b =value\n" -+ " --chunksize_incr -g =value\n" -+ " --regionsize -s =values\n" -+ " --regionsize_low -A =value\n" -+ " --regionsize_high -B =value\n" -+ " --regionsize_incr -C =value\n" -+ " --load -L =dmuio|ssf|fpp\n" -+ " --pool -p =pool name\n" - " --name -M =test name\n" -- " --cleanup -x\n" -- " --prerun -P =pre-command\n" -- " --postrun -R =post-command\n" -- " --log -G =log directory\n" -- " --regionnoise -I =shift\n" -- " --chunknoise -N =bytes\n" -- " --threaddelay -T =jiffies\n" -- " --verify -V\n" 
-- " --zerocopy -z\n" -- " --nowait -O\n" -+ " --cleanup -x\n" -+ " --prerun -P =pre-command\n" -+ " --postrun -R =post-command\n" -+ " --log -G =log directory\n" -+ " --regionnoise -I =shift\n" -+ " --chunknoise -N =bytes\n" -+ " --threaddelay -T =jiffies\n" -+ " --verify -V\n" -+ " --zerocopy -z\n" -+ " --nowait -O\n" - " --noprefetch -f\n" -- " --human-readable -H\n" -- " --verbose -v =increase verbosity\n" -- " --help -? =this help\n\n"); -+ " --human-readable -H\n" -+ " --verbose -v =increase verbosity\n" -+ " --help -? =this help\n\n"); - -- return 0; -+ return (0); - } -@@ -157,3 +158,3 @@ args_init(int argc, char **argv) - usage(); -- return (cmd_args_t *)NULL; -+ return ((cmd_args_t *)NULL); - } -@@ -161,9 +162,9 @@ args_init(int argc, char **argv) - /* Configure and populate the args structures */ -- args = malloc(sizeof(*args)); -+ args = malloc(sizeof (*args)); - if (args == NULL) -- return NULL; -+ return (NULL); - -- memset(args, 0, sizeof(*args)); -+ memset(args, 0, sizeof (*args)); - -- while ((c=getopt_long(argc, argv, short_opt, long_opt, NULL)) != -1) { -+ while ((c = getopt_long(argc, argv, short_opt, long_opt, NULL)) != -1) { - rc = 0; -@@ -172,4 +173,4 @@ args_init(int argc, char **argv) - case 't': /* --thread count */ -- rc = set_count(REGEX_NUMBERS, REGEX_NUMBERS_COMMA, -- &args->T, optarg, &fl_th, "threadcount"); -+ rc = set_count(REGEX_NUMBERS, REGEX_NUMBERS_COMMA, -+ &args->T, optarg, &fl_th, "threadcount"); - break; -@@ -177,3 +178,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->T, optarg, -- FLAG_LOW, &fl_th, "threadcount_low"); -+ FLAG_LOW, &fl_th, "threadcount_low"); - break; -@@ -181,3 +182,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->T, optarg, -- FLAG_HIGH, &fl_th, "threadcount_high"); -+ FLAG_HIGH, &fl_th, "threadcount_high"); - break; -@@ -185,3 +186,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->T, optarg, -- FLAG_INCR, &fl_th, "threadcount_incr"); -+ FLAG_INCR, &fl_th, "threadcount_incr"); - break; -@@ -189,3 +190,3 @@ args_init(int argc, char **argv) - rc = set_count(REGEX_NUMBERS, REGEX_NUMBERS_COMMA, -- &args->N, optarg, &fl_rc, "regioncount"); -+ &args->N, optarg, &fl_rc, "regioncount"); - break; -@@ -193,3 +194,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->N, optarg, -- FLAG_LOW, &fl_rc, "regioncount_low"); -+ FLAG_LOW, &fl_rc, "regioncount_low"); - break; -@@ -197,3 +198,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->N, optarg, -- FLAG_HIGH, &fl_rc, "regioncount_high"); -+ FLAG_HIGH, &fl_rc, "regioncount_high"); - break; -@@ -201,3 +202,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->N, optarg, -- FLAG_INCR, &fl_rc, "regioncount_incr"); -+ FLAG_INCR, &fl_rc, "regioncount_incr"); - break; -@@ -205,3 +206,3 @@ args_init(int argc, char **argv) - rc = set_count(REGEX_SIZE, REGEX_SIZE_COMMA, -- &args->O, optarg, &fl_of, "offset"); -+ &args->O, optarg, &fl_of, "offset"); - break; -@@ -209,3 +210,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->O, optarg, -- FLAG_LOW, &fl_of, "offset_low"); -+ FLAG_LOW, &fl_of, "offset_low"); - break; -@@ -213,3 +214,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->O, optarg, -- FLAG_HIGH, &fl_of, "offset_high"); -+ FLAG_HIGH, &fl_of, "offset_high"); - break; -@@ -217,3 +218,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->O, optarg, -- FLAG_INCR, &fl_of, "offset_incr"); -+ 
FLAG_INCR, &fl_of, "offset_incr"); - break; -@@ -221,3 +222,3 @@ args_init(int argc, char **argv) - rc = set_count(REGEX_SIZE, REGEX_SIZE_COMMA, -- &args->C, optarg, &fl_cs, "chunksize"); -+ &args->C, optarg, &fl_cs, "chunksize"); - break; -@@ -225,3 +226,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->C, optarg, -- FLAG_LOW, &fl_cs, "chunksize_low"); -+ FLAG_LOW, &fl_cs, "chunksize_low"); - break; -@@ -229,3 +230,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->C, optarg, -- FLAG_HIGH, &fl_cs, "chunksize_high"); -+ FLAG_HIGH, &fl_cs, "chunksize_high"); - break; -@@ -233,3 +234,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->C, optarg, -- FLAG_INCR, &fl_cs, "chunksize_incr"); -+ FLAG_INCR, &fl_cs, "chunksize_incr"); - break; -@@ -237,3 +238,3 @@ args_init(int argc, char **argv) - rc = set_count(REGEX_SIZE, REGEX_SIZE_COMMA, -- &args->S, optarg, &fl_rs, "regionsize"); -+ &args->S, optarg, &fl_rs, "regionsize"); - break; -@@ -241,3 +242,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->S, optarg, -- FLAG_LOW, &fl_rs, "regionsize_low"); -+ FLAG_LOW, &fl_rs, "regionsize_low"); - break; -@@ -245,3 +246,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->S, optarg, -- FLAG_HIGH, &fl_rs, "regionsize_high"); -+ FLAG_HIGH, &fl_rs, "regionsize_high"); - break; -@@ -249,3 +250,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->S, optarg, -- FLAG_INCR, &fl_rs, "regionsize_incr"); -+ FLAG_INCR, &fl_rs, "regionsize_incr"); - break; -@@ -273,3 +274,4 @@ args_init(int argc, char **argv) - case 'I': /* --regionnoise */ -- rc = set_noise(&args->regionnoise, optarg, "regionnoise"); -+ rc = set_noise(&args->regionnoise, optarg, -+ "regionnoise"); - break; -@@ -279,3 +281,4 @@ args_init(int argc, char **argv) - case 'T': /* --threaddelay */ -- rc = set_noise(&args->thread_delay, optarg, "threaddelay"); -+ rc = set_noise(&args->thread_delay, optarg, -+ "threaddelay"); - break; -@@ -303,3 +306,4 @@ args_init(int argc, char **argv) - default: -- fprintf(stderr,"Unknown option '%s'\n",argv[optind-1]); -+ fprintf(stderr, "Unknown option '%s'\n", -+ argv[optind - 1]); - rc = EINVAL; -@@ -311,3 +315,3 @@ args_init(int argc, char **argv) - args_fini(args); -- return NULL; -+ return (NULL); - } -@@ -325,3 +329,3 @@ args_init(int argc, char **argv) - args_fini(args); -- return NULL; -+ return (NULL); - } -@@ -330,10 +334,10 @@ args_init(int argc, char **argv) - (args->flags & DMU_VERIFY)) { -- fprintf(stderr, "Error, --zerocopy incompatible --verify, " -- "used for performance analysis only\n"); -+ fprintf(stderr, "Error, --zerocopy incompatible --verify, " -+ "used for performance analysis only\n"); - usage(); - args_fini(args); -- return NULL; -+ return (NULL); - } - -- return args; -+ return (args); - } -@@ -346,5 +350,5 @@ dev_clear(void) - -- memset(&cfg, 0, sizeof(cfg)); -+ memset(&cfg, 0, sizeof (cfg)); - cfg.cfg_magic = ZPIOS_CFG_MAGIC; -- cfg.cfg_cmd = ZPIOS_CFG_BUFFER_CLEAR; -+ cfg.cfg_cmd = ZPIOS_CFG_BUFFER_CLEAR; - cfg.cfg_arg1 = 0; -@@ -354,3 +358,3 @@ dev_clear(void) - fprintf(stderr, "Ioctl() error %lu / %d: %d\n", -- (unsigned long) ZPIOS_CFG, cfg.cfg_cmd, errno); -+ (unsigned long) ZPIOS_CFG, cfg.cfg_cmd, errno); - -@@ -358,3 +362,3 @@ dev_clear(void) - -- return rc; -+ return (rc); - } -@@ -368,5 +372,5 @@ dev_size(int size) - -- memset(&cfg, 0, sizeof(cfg)); -+ memset(&cfg, 0, sizeof (cfg)); - cfg.cfg_magic = ZPIOS_CFG_MAGIC; -- cfg.cfg_cmd = 
ZPIOS_CFG_BUFFER_SIZE; -+ cfg.cfg_cmd = ZPIOS_CFG_BUFFER_SIZE; - cfg.cfg_arg1 = size; -@@ -376,7 +380,7 @@ dev_size(int size) - fprintf(stderr, "Ioctl() error %lu / %d: %d\n", -- (unsigned long) ZPIOS_CFG, cfg.cfg_cmd, errno); -- return rc; -+ (unsigned long) ZPIOS_CFG, cfg.cfg_cmd, errno); -+ return (rc); - } - -- return cfg.cfg_rc1; -+ return (cfg.cfg_rc1); - } -@@ -392,3 +396,3 @@ dev_fini(void) - fprintf(stderr, "Unable to close %s: %d\n", -- ZPIOS_DEV, errno); -+ ZPIOS_DEV, errno); - } -@@ -405,3 +409,3 @@ dev_init(void) - fprintf(stderr, "Unable to open %s: %d\n" -- "Is the zpios module loaded?\n", ZPIOS_DEV, errno); -+ "Is the zpios module loaded?\n", ZPIOS_DEV, errno); - rc = errno; -@@ -424,3 +428,3 @@ dev_init(void) - memset(zpios_buffer, 0, zpios_buffer_size); -- return 0; -+ return (0); - error: -@@ -429,3 +433,3 @@ error: - fprintf(stderr, "Unable to close %s: %d\n", -- ZPIOS_DEV, errno); -+ ZPIOS_DEV, errno); - } -@@ -433,3 +437,3 @@ error: - -- return rc; -+ return (rc); - } -@@ -442,6 +446,6 @@ get_next(uint64_t *val, range_repeat_t *range) - *val = (range->val_low) + -- (range->val_low * range->next_val / 100); -+ (range->val_low * range->next_val / 100); - - if (*val > range->val_high) -- return 0; /* No more values, limit exceeded */ -+ return (0); /* No more values, limit exceeded */ - -@@ -450,5 +454,5 @@ get_next(uint64_t *val, range_repeat_t *range) - else -- range->next_val = range->next_val+range->val_inc_perc; -+ range->next_val = range->next_val + range->val_inc_perc; - -- return 1; /* more values to come */ -+ return (1); /* more values to come */ - -@@ -457,3 +461,3 @@ get_next(uint64_t *val, range_repeat_t *range) - if (range->next_val) -- return 0; /* No more values, we only have one */ -+ return (0); /* No more values, we only have one */ - -@@ -461,3 +465,3 @@ get_next(uint64_t *val, range_repeat_t *range) - range->next_val = 1; -- return 1; /* more values to come */ -+ return (1); /* more values to come */ - -@@ -466,3 +470,3 @@ get_next(uint64_t *val, range_repeat_t *range) - if (range->next_val > range->val_count - 1) -- return 0; /* No more values, limit exceeded */ -+ return (0); /* No more values, limit exceeded */ - -@@ -470,6 +474,6 @@ get_next(uint64_t *val, range_repeat_t *range) - range->next_val++; -- return 1; /* more values to come */ -+ return (1); /* more values to come */ - } - -- return 0; -+ return (0); - } -@@ -478,16 +482,18 @@ static int - run_one(cmd_args_t *args, uint32_t id, uint32_t T, uint32_t N, -- uint64_t C, uint64_t S, uint64_t O) -+ uint64_t C, uint64_t S, uint64_t O) - { - zpios_cmd_t *cmd; -- int rc, rc2, cmd_size; -+ int rc, rc2, cmd_size; - -- dev_clear(); -+ dev_clear(); - -- cmd_size = sizeof(zpios_cmd_t) + ((T + N + 1) * sizeof(zpios_stats_t)); -- cmd = (zpios_cmd_t *)malloc(cmd_size); -- if (cmd == NULL) -- return ENOMEM; -+ cmd_size = -+ sizeof (zpios_cmd_t) -+ + ((T + N + 1) * sizeof (zpios_stats_t)); -+ cmd = (zpios_cmd_t *)malloc(cmd_size); -+ if (cmd == NULL) -+ return (ENOMEM); - -- memset(cmd, 0, cmd_size); -- cmd->cmd_magic = ZPIOS_CMD_MAGIC; -+ memset(cmd, 0, cmd_size); -+ cmd->cmd_magic = ZPIOS_CMD_MAGIC; - strncpy(cmd->cmd_pool, args->pool, ZPIOS_NAME_SIZE - 1); -@@ -496,15 +502,15 @@ run_one(cmd_args_t *args, uint32_t id, uint32_t T, uint32_t N, - strncpy(cmd->cmd_log, args->log, ZPIOS_PATH_SIZE - 1); -- cmd->cmd_id = id; -- cmd->cmd_chunk_size = C; -+ cmd->cmd_id = id; -+ cmd->cmd_chunk_size = C; - cmd->cmd_thread_count = T; - cmd->cmd_region_count = N; -- cmd->cmd_region_size = S; -- cmd->cmd_offset = O; 
-+ cmd->cmd_region_size = S; -+ cmd->cmd_offset = O; - cmd->cmd_region_noise = args->regionnoise; -- cmd->cmd_chunk_noise = args->chunknoise; -+ cmd->cmd_chunk_noise = args->chunknoise; - cmd->cmd_thread_delay = args->thread_delay; -- cmd->cmd_flags = args->flags; -- cmd->cmd_data_size = (T + N + 1) * sizeof(zpios_stats_t); -+ cmd->cmd_flags = args->flags; -+ cmd->cmd_data_size = (T + N + 1) * sizeof (zpios_stats_t); - -- rc = ioctl(zpiosctl_fd, ZPIOS_CMD, cmd); -+ rc = ioctl(zpiosctl_fd, ZPIOS_CMD, cmd); - if (rc) -@@ -514,15 +520,15 @@ run_one(cmd_args_t *args, uint32_t id, uint32_t T, uint32_t N, - -- if (args->verbose) { -- rc2 = read(zpiosctl_fd, zpios_buffer, zpios_buffer_size - 1); -- if (rc2 < 0) { -- fprintf(stdout, "Error reading results: %d\n", rc2); -- } else if ((rc2 > 0) && (strlen(zpios_buffer) > 0)) { -- fprintf(stdout, "\n%s\n", zpios_buffer); -- fflush(stdout); -- } -- } -+ if (args->verbose) { -+ rc2 = read(zpiosctl_fd, zpios_buffer, zpios_buffer_size - 1); -+ if (rc2 < 0) { -+ fprintf(stdout, "Error reading results: %d\n", rc2); -+ } else if ((rc2 > 0) && (strlen(zpios_buffer) > 0)) { -+ fprintf(stdout, "\n%s\n", zpios_buffer); -+ fflush(stdout); -+ } -+ } - -- free(cmd); -+ free(cmd); - -- return rc; -+ return (rc); - } -@@ -536,4 +542,4 @@ run_offsets(cmd_args_t *args) - rc = run_one(args, args->current_id, -- args->current_T, args->current_N, args->current_C, -- args->current_S, args->current_O); -+ args->current_T, args->current_N, args->current_C, -+ args->current_S, args->current_O); - args->current_id++; -@@ -542,3 +548,3 @@ run_offsets(cmd_args_t *args) - args->O.next_val = 0; -- return rc; -+ return (rc); - } -@@ -551,6 +557,6 @@ run_region_counts(cmd_args_t *args) - while (rc == 0 && get_next((uint64_t *)&args->current_N, &args->N)) -- rc = run_offsets(args); -+ rc = run_offsets(args); - - args->N.next_val = 0; -- return rc; -+ return (rc); - } -@@ -564,5 +570,5 @@ run_region_sizes(cmd_args_t *args) - if (args->current_S < args->current_C) { -- fprintf(stderr, "Error: in any run chunksize can " -- "not be smaller than regionsize.\n"); -- return EINVAL; -+ fprintf(stderr, "Error: in any run chunksize must " -+ "be strictly smaller than regionsize.\n"); -+ return (EINVAL); - } -@@ -573,3 +579,3 @@ run_region_sizes(cmd_args_t *args) - args->S.next_val = 0; -- return rc; -+ return (rc); - } -@@ -582,3 +588,3 @@ run_chunk_sizes(cmd_args_t *args) - while (rc == 0 && get_next(&args->current_C, &args->C)) { -- rc = run_region_sizes(args); -+ rc = run_region_sizes(args); - } -@@ -586,3 +592,3 @@ run_chunk_sizes(cmd_args_t *args) - args->C.next_val = 0; -- return rc; -+ return (rc); - } -@@ -597,3 +603,3 @@ run_thread_counts(cmd_args_t *args) - -- return rc; -+ return (rc); - } -@@ -627,3 +633,3 @@ out: - dev_fini(); -- return rc; -+ return (rc); - } -diff --git a/cmd/zpios/zpios_util.c b/cmd/zpios/zpios_util.c -index 9b06655..b226322 100644 ---- a/cmd/zpios/zpios_util.c -+++ b/cmd/zpios/zpios_util.c -@@ -1,2 +1,2 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. -@@ -31,3 +31,3 @@ - * with ZPIOS. If not, see . 
--\*****************************************************************************/ -+ */ - -@@ -51,3 +51,3 @@ kmgt_to_uint64(const char *str, uint64_t *val) - if ((str == endptr) && (*val == 0)) -- return EINVAL; -+ return (EINVAL); - -@@ -72,3 +72,3 @@ kmgt_to_uint64(const char *str, uint64_t *val) - -- return rc; -+ return (rc); - } -@@ -87,8 +87,8 @@ uint64_to_kmgt(char *str, uint64_t val) - if (i >= 4) -- (void)snprintf(str, KMGT_SIZE-1, "inf"); -+ (void) snprintf(str, KMGT_SIZE-1, "inf"); - else -- (void)snprintf(str, KMGT_SIZE-1, "%lu%c", (unsigned long)val, -- (i == -1) ? '\0' : postfix[i]); -+ (void) snprintf(str, KMGT_SIZE-1, "%lu%c", (unsigned long)val, -+ (i == -1) ? '\0' : postfix[i]); - -- return str; -+ return (str); - } -@@ -108,8 +108,8 @@ kmgt_per_sec(char *str, uint64_t v, double t) - if (i >= 4) -- (void)snprintf(str, KMGT_SIZE-1, "inf"); -+ (void) snprintf(str, KMGT_SIZE-1, "inf"); - else -- (void)snprintf(str, KMGT_SIZE-1, "%.2f%c", val, -- (i == -1) ? '\0' : postfix[i]); -+ (void) snprintf(str, KMGT_SIZE-1, "%.2f%c", val, -+ (i == -1) ? '\0' : postfix[i]); - -- return str; -+ return (str); - } -@@ -128,3 +128,3 @@ print_flags(char *str, uint32_t flags) - -- return str; -+ return (str); - } -@@ -140,3 +140,3 @@ regex_match(const char *string, char *pattern) - fprintf(stderr, "Error: Couldn't do regcomp, %d\n", rc); -- return rc; -+ return (rc); - } -@@ -146,3 +146,3 @@ regex_match(const char *string, char *pattern) - -- return rc; -+ return (rc); - } -@@ -158,3 +158,3 @@ split_string(const char *optarg, char *pattern, range_repeat_t *range) - if ((rc = regex_match(optarg, pattern))) -- return rc; -+ return (rc); - -@@ -162,6 +162,7 @@ split_string(const char *optarg, char *pattern, range_repeat_t *range) - if (cp == NULL) -- return ENOMEM; -+ return (ENOMEM); - - do { -- /* STRTOK(3) Each subsequent call, with a null pointer as the -+ /* -+ * STRTOK(3) Each subsequent call, with a null pointer as the - * value of the * first argument, starts searching from the -@@ -179,3 +180,3 @@ split_string(const char *optarg, char *pattern, range_repeat_t *range) - free(cp); -- return 0; -+ return (0); - } -@@ -184,3 +185,3 @@ int - set_count(char *pattern1, char *pattern2, range_repeat_t *range, -- char *optarg, uint32_t *flags, char *arg) -+ char *optarg, uint32_t *flags, char *arg) - { -@@ -196,14 +197,16 @@ set_count(char *pattern1, char *pattern2, range_repeat_t *range, - fprintf(stderr, "Error: Incorrect pattern for %s, '%s'\n", -- arg, optarg); -- return EINVAL; -+ arg, optarg); -+ return (EINVAL); - } - -- return 0; -+ return (0); - } - --/* validates the value with regular expression and sets low, high, incr -- * according to value at which flag will be set. Sets the flag after. */ -+/* -+ * Validates the value with regular expression and sets low, high, incr -+ * according to value at which flag will be set. Sets the flag after. 
-+ */ - int - set_lhi(char *pattern, range_repeat_t *range, char *optarg, -- int flag, uint32_t *flag_thread, char *arg) -+ int flag, uint32_t *flag_thread, char *arg) - { -@@ -214,3 +217,3 @@ set_lhi(char *pattern, range_repeat_t *range, char *optarg, - arg, optarg); -- return rc; -+ return (rc); - } -@@ -233,3 +236,3 @@ set_lhi(char *pattern, range_repeat_t *range, char *optarg, - -- return 0; -+ return (0); - } -@@ -243,6 +246,6 @@ set_noise(uint64_t *noise, char *optarg, char *arg) - fprintf(stderr, "Error: Incorrect pattern for %s\n", arg); -- return EINVAL; -+ return (EINVAL); - } - -- return 0; -+ return (0); - } -@@ -257,3 +260,3 @@ set_load_params(cmd_args_t *args, char *optarg) - if (search == NULL) -- return ENOMEM; -+ return (ENOMEM); - -@@ -277,3 +280,3 @@ set_load_params(cmd_args_t *args, char *optarg) - -- return rc; -+ return (rc); - } -@@ -281,5 +284,7 @@ set_load_params(cmd_args_t *args, char *optarg) - --/* checks the low, high, increment values against the single value for -+/* -+ * Checks the low, high, increment values against the single value for - * mutual exclusion, for e.g threadcount is mutually exclusive to -- * threadcount_low, ..._high, ..._incr */ -+ * threadcount_low, ..._high, ..._incr -+ */ - int -@@ -289,12 +294,12 @@ check_mutual_exclusive_command_lines(uint32_t flag, char *arg) - fprintf(stderr, "Error: --%s can not be given with --%s_low, " -- "--%s_high or --%s_incr.\n", arg, arg, arg, arg); -- return 0; -+ "--%s_high or --%s_incr.\n", arg, arg, arg, arg); -+ return (0); - } - -- if ((flag & (FLAG_LOW | FLAG_HIGH | FLAG_INCR)) && !(flag & FLAG_SET)){ -+ if ((flag & (FLAG_LOW | FLAG_HIGH | FLAG_INCR)) && !(flag & FLAG_SET)) { - if (flag != (FLAG_LOW | FLAG_HIGH | FLAG_INCR)) { - fprintf(stderr, "Error: One or more values missing " -- "from --%s_low, --%s_high, --%s_incr.\n", -- arg, arg, arg); -- return 0; -+ "from --%s_low, --%s_high, --%s_incr.\n", -+ arg, arg, arg); -+ return (0); - } -@@ -302,3 +307,3 @@ check_mutual_exclusive_command_lines(uint32_t flag, char *arg) - -- return 1; -+ return (1); - } -@@ -309,16 +314,20 @@ print_stats_header(cmd_args_t *args) - if (args->verbose) { -- printf("status name id\tth-cnt\trg-cnt\trg-sz\t" -- "ch-sz\toffset\trg-no\tch-no\tth-dly\tflags\ttime\t" -- "cr-time\trm-time\twr-time\trd-time\twr-data\twr-ch\t" -- "wr-bw\trd-data\trd-ch\trd-bw\n"); -- printf("------------------------------------------------" -- "------------------------------------------------" -- "------------------------------------------------" -- "----------------------------------------------\n"); -+ printf( -+ "status name id\tth-cnt\trg-cnt\trg-sz\t" -+ "ch-sz\toffset\trg-no\tch-no\tth-dly\tflags\ttime\t" -+ "cr-time\trm-time\twr-time\trd-time\twr-data\twr-ch\t" -+ "wr-bw\trd-data\trd-ch\trd-bw\n"); -+ printf( -+ "------------------------------------------------" -+ "------------------------------------------------" -+ "------------------------------------------------" -+ "----------------------------------------------\n"); - } else { -- printf("status name id\t" -- "wr-data\twr-ch\twr-bw\t" -- "rd-data\trd-ch\trd-bw\n"); -- printf("-----------------------------------------" -- "--------------------------------------\n"); -+ printf( -+ "status name id\t" -+ "wr-data\twr-ch\twr-bw\t" -+ "rd-data\trd-ch\trd-bw\n"); -+ printf( -+ "-----------------------------------------" -+ "--------------------------------------\n"); - } -@@ -339,13 +348,13 @@ print_stats_human_readable(cmd_args_t *args, zpios_cmd_t *cmd) - printf("%-12s", args->name ? 
args->name : ZPIOS_NAME); -- printf("%2u\t", cmd->cmd_id); -+ printf("%2u\t", cmd->cmd_id); - - if (args->verbose) { -- printf("%u\t", cmd->cmd_thread_count); -- printf("%u\t", cmd->cmd_region_count); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_size)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_size)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_offset)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_noise)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_noise)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_thread_delay)); -+ printf("%u\t", cmd->cmd_thread_count); -+ printf("%u\t", cmd->cmd_region_count); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_size)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_size)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_offset)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_noise)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_noise)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_thread_delay)); - printf("%s\t", print_flags(str, cmd->cmd_flags)); -@@ -373,8 +382,8 @@ print_stats_human_readable(cmd_args_t *args, zpios_cmd_t *cmd) - -- printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_data)); -- printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_chunks)); -+ printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_data)); -+ printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_chunks)); - printf("%s\t", kmgt_per_sec(str, summary_stats->wr_data, wr_time)); - -- printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_data)); -- printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_chunks)); -+ printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_data)); -+ printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_chunks)); - printf("%s\n", kmgt_per_sec(str, summary_stats->rd_data, rd_time)); -@@ -395,13 +404,13 @@ print_stats_table(cmd_args_t *args, zpios_cmd_t *cmd) - printf("%-12s", args->name ? 
args->name : ZPIOS_NAME); -- printf("%2u\t", cmd->cmd_id); -+ printf("%2u\t", cmd->cmd_id); - - if (args->verbose) { -- printf("%u\t", cmd->cmd_thread_count); -- printf("%u\t", cmd->cmd_region_count); -- printf("%llu\t", (long long unsigned)cmd->cmd_region_size); -- printf("%llu\t", (long long unsigned)cmd->cmd_chunk_size); -- printf("%llu\t", (long long unsigned)cmd->cmd_offset); -- printf("%u\t", cmd->cmd_region_noise); -- printf("%u\t", cmd->cmd_chunk_noise); -- printf("%u\t", cmd->cmd_thread_delay); -+ printf("%u\t", cmd->cmd_thread_count); -+ printf("%u\t", cmd->cmd_region_count); -+ printf("%llu\t", (long long unsigned)cmd->cmd_region_size); -+ printf("%llu\t", (long long unsigned)cmd->cmd_chunk_size); -+ printf("%llu\t", (long long unsigned)cmd->cmd_offset); -+ printf("%u\t", cmd->cmd_region_noise); -+ printf("%u\t", cmd->cmd_chunk_noise); -+ printf("%u\t", cmd->cmd_thread_delay); - printf("0x%x\t", cmd->cmd_flags); -@@ -420,24 +429,24 @@ print_stats_table(cmd_args_t *args, zpios_cmd_t *cmd) - printf("%ld.%02ld\t", -- (long)summary_stats->total_time.delta.ts_sec, -- (long)summary_stats->total_time.delta.ts_nsec); -+ (long)summary_stats->total_time.delta.ts_sec, -+ (long)summary_stats->total_time.delta.ts_nsec); - printf("%ld.%02ld\t", -- (long)summary_stats->cr_time.delta.ts_sec, -- (long)summary_stats->cr_time.delta.ts_nsec); -+ (long)summary_stats->cr_time.delta.ts_sec, -+ (long)summary_stats->cr_time.delta.ts_nsec); - printf("%ld.%02ld\t", -- (long)summary_stats->rm_time.delta.ts_sec, -- (long)summary_stats->rm_time.delta.ts_nsec); -+ (long)summary_stats->rm_time.delta.ts_sec, -+ (long)summary_stats->rm_time.delta.ts_nsec); - printf("%ld.%02ld\t", -- (long)summary_stats->wr_time.delta.ts_sec, -- (long)summary_stats->wr_time.delta.ts_nsec); -+ (long)summary_stats->wr_time.delta.ts_sec, -+ (long)summary_stats->wr_time.delta.ts_nsec); - printf("%ld.%02ld\t", -- (long)summary_stats->rd_time.delta.ts_sec, -- (long)summary_stats->rd_time.delta.ts_nsec); -+ (long)summary_stats->rd_time.delta.ts_sec, -+ (long)summary_stats->rd_time.delta.ts_nsec); - } - -- printf("%lld\t", (long long unsigned)summary_stats->wr_data); -- printf("%lld\t", (long long unsigned)summary_stats->wr_chunks); -+ printf("%lld\t", (long long unsigned)summary_stats->wr_data); -+ printf("%lld\t", (long long unsigned)summary_stats->wr_chunks); - printf("%.4f\t", (double)summary_stats->wr_data / wr_time); - -- printf("%lld\t", (long long unsigned)summary_stats->rd_data); -- printf("%lld\t", (long long unsigned)summary_stats->rd_chunks); -+ printf("%lld\t", (long long unsigned)summary_stats->rd_data); -+ printf("%lld\t", (long long unsigned)summary_stats->rd_chunks); - printf("%.4f\n", (double)summary_stats->rd_data / rd_time); -diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am -index 2ce8efc..a39a240 100644 ---- a/cmd/zpool/Makefile.am -+++ b/cmd/zpool/Makefile.am -@@ -19,4 +19,4 @@ zpool_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -- --zpool_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la \ -+ $(LIBBLKID) -diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c -index b96fbe4..e38213c 100644 ---- a/cmd/zpool/zpool_main.c -+++ b/cmd/zpool/zpool_main.c -@@ -51,2 +51,3 @@ - #include -+#include - -@@ -198,5 +199,5 @@ static zpool_command_t command_table[] = { - --zpool_command_t *current_command; -+static zpool_command_t *current_command; - static char 
history_str[HIS_MAX_RECORD_LEN]; -- -+static boolean_t log_history = B_TRUE; - static uint_t timestamp_fmt = NODATE; -@@ -258,3 +259,3 @@ get_usage(zpool_help_t idx) { - case HELP_STATUS: -- return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval " -+ return (gettext("\tstatus [-vxD] [-T d|u] [pool] ... [interval " - "[count]]\n")); -@@ -267,3 +268,3 @@ get_usage(zpool_help_t idx) { - case HELP_GET: -- return (gettext("\tget <\"all\" | property[,...]> " -+ return (gettext("\tget [-p] <\"all\" | property[,...]> " - " ...\n")); -@@ -833,2 +834,3 @@ zpool_do_create(int argc, char **argv) - case 'm': -+ /* Equivalent to -O mountpoint=optarg */ - mountpoint = optarg; -@@ -871,4 +873,14 @@ zpool_do_create(int argc, char **argv) - -- if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) -+ /* -+ * Mountpoints are checked and then added later. -+ * Uniquely among properties, they can be specified -+ * more than once, to avoid conflict with -m. -+ */ -+ if (0 == strcmp(optarg, -+ zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { -+ mountpoint = propval; -+ } else if (add_prop_list(optarg, propval, &fsprops, -+ B_FALSE)) { - goto errout; -+ } - break; -@@ -989,2 +1001,14 @@ zpool_do_create(int argc, char **argv) - -+ /* -+ * Now that the mountpoint's validity has been checked, ensure that -+ * the property is set appropriately prior to creating the pool. -+ */ -+ if (mountpoint != NULL) { -+ ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), -+ mountpoint, &fsprops, B_FALSE); -+ if (ret != 0) -+ goto errout; -+ } -+ -+ ret = 1; - if (dryrun) { -@@ -1023,4 +1047,5 @@ zpool_do_create(int argc, char **argv) - -- if (add_prop_list(propname, ZFS_FEATURE_ENABLED, -- &props, B_TRUE) != 0) -+ ret = add_prop_list(propname, -+ ZFS_FEATURE_ENABLED, &props, B_TRUE); -+ if (ret != 0) - goto errout; -@@ -1028,2 +1053,4 @@ zpool_do_create(int argc, char **argv) - } -+ -+ ret = 1; - if (zpool_create(g_zfs, poolname, -@@ -1033,7 +1060,2 @@ zpool_do_create(int argc, char **argv) - if (pool != NULL) { -- if (mountpoint != NULL) -- verify(zfs_prop_set(pool, -- zfs_prop_to_name( -- ZFS_PROP_MOUNTPOINT), -- mountpoint) == 0); - if (zfs_mount(pool, NULL, 0) == 0) -@@ -1121,3 +1143,6 @@ zpool_do_destroy(int argc, char **argv) - -- ret = (zpool_destroy(zhp) != 0); -+ /* The history must be logged as part of the export */ -+ log_history = B_FALSE; -+ -+ ret = (zpool_destroy(zhp, history_str) != 0); - -@@ -1185,6 +1210,9 @@ zpool_do_export(int argc, char **argv) - -+ /* The history must be logged as part of the export */ -+ log_history = B_FALSE; -+ - if (hardforce) { -- if (zpool_export_force(zhp) != 0) -+ if (zpool_export_force(zhp, history_str) != 0) - ret = 1; -- } else if (zpool_export(zhp, force) != 0) { -+ } else if (zpool_export(zhp, force, history_str) != 0) { - ret = 1; -@@ -1583,3 +1611,4 @@ show_import(nvlist_t *config) - nvlist_t *nvroot; -- int reason; -+ zpool_status_t reason; -+ zpool_errata_t errata; - const char *health; -@@ -1602,3 +1631,3 @@ show_import(nvlist_t *config) - -- reason = zpool_import_status(config, &msgid); -+ reason = zpool_import_status(config, &msgid, &errata); - -@@ -1690,2 +1719,7 @@ show_import(nvlist_t *config) - -+ case ZPOOL_STATUS_ERRATA: -+ (void) printf(gettext(" status: Errata #%d detected.\n"), -+ errata); -+ break; -+ - default: -@@ -1711,2 +1745,30 @@ show_import(nvlist_t *config) - "identifier and\n\tthe '-f' flag.\n")); -+ } else if (reason == ZPOOL_STATUS_ERRATA) { -+ switch (errata) { -+ case ZPOOL_ERRATA_NONE: -+ break; -+ -+ case ZPOOL_ERRATA_ZOL_2094_SCRUB: -+ 
(void) printf(gettext(" action: The pool can " -+ "be imported using its name or numeric " -+ "identifier,\n\thowever there is a compat" -+ "ibility issue which should be corrected" -+ "\n\tby running 'zpool scrub'\n")); -+ break; -+ -+ case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: -+ (void) printf(gettext(" action: The pool can" -+ "not be imported with this version of ZFS " -+ "due to\n\tan active asynchronous destroy. " -+ "Revert to an earlier version\n\tand " -+ "allow the destroy to complete before " -+ "updating.\n")); -+ break; -+ -+ default: -+ /* -+ * All errata must contain an action message. -+ */ -+ assert(0); -+ } - } else { -@@ -1954,3 +2016,3 @@ zpool_do_import(int argc, char **argv) - /* check options */ -- while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:rR:T:VX")) != -1) { -+ while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:tT:VX")) != -1) { - switch (c) { -@@ -2016,2 +2078,6 @@ zpool_do_import(int argc, char **argv) - break; -+ case 't': -+ flags |= ZFS_IMPORT_TEMP_NAME; -+ break; -+ - case 'T': -@@ -2543,3 +2609,3 @@ get_columns(void) - -- return columns; -+ return (columns); - } -@@ -4100,3 +4166,4 @@ status_callback(zpool_handle_t *zhp, void *data) - char *msgid; -- int reason; -+ zpool_status_t reason; -+ zpool_errata_t errata; - const char *health; -@@ -4106,3 +4173,3 @@ status_callback(zpool_handle_t *zhp, void *data) - config = zpool_get_config(zhp, NULL); -- reason = zpool_get_status(zhp, &msgid); -+ reason = zpool_get_status(zhp, &msgid, &errata); - -@@ -4324,2 +4391,24 @@ status_callback(zpool_handle_t *zhp, void *data) - -+ case ZPOOL_STATUS_ERRATA: -+ (void) printf(gettext("status: Errata #%d detected.\n"), -+ errata); -+ -+ switch (errata) { -+ case ZPOOL_ERRATA_NONE: -+ break; -+ -+ case ZPOOL_ERRATA_ZOL_2094_SCRUB: -+ (void) printf(gettext("action: To correct the issue " -+ "run 'zpool scrub'.\n")); -+ break; -+ -+ default: -+ /* -+ * All errata which allow the pool to be imported -+ * must contain an action message. -+ */ -+ assert(0); -+ } -+ break; -+ - default: -@@ -4587,9 +4676,2 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) - --#if 0 -- /* -- * XXX: This code can be enabled when Illumos commit -- * 4445fffbbb1ea25fd0e9ea68b9380dd7a6709025 is merged. -- * It reworks the history logging among other things. -- */ -- - /* -@@ -4602,3 +4684,2 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) - log_history = B_FALSE; --#endif - } -@@ -4702,2 +4783,10 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) - } -+ /* -+ * If they did "zpool upgrade -a", then we could -+ * be doing ioctls to different pools. We need -+ * to log this history once to each pool, and bypass -+ * the normal history logging that happens in main(). 
-+ */ -+ (void) zpool_log_history(g_zfs, history_str); -+ log_history = B_FALSE; - } -@@ -4957,4 +5046,4 @@ typedef struct hist_cbdata { - boolean_t first; -- int longfmt; -- int internal; -+ boolean_t longfmt; -+ boolean_t internal; - } hist_cbdata_t; -@@ -4970,17 +5059,4 @@ get_history_one(zpool_handle_t *zhp, void *data) - uint_t numrecords; -- char *cmdstr; -- char *pathstr; -- uint64_t dst_time; -- time_t tsec; -- struct tm t; -- char tbuf[30]; - int ret, i; -- uint64_t who; -- struct passwd *pwd; -- char *hostname; -- char *zonename; -- char internalstr[MAXPATHLEN]; - hist_cbdata_t *cb = (hist_cbdata_t *)data; -- uint64_t txg; -- uint64_t ievent; - -@@ -4996,32 +5072,71 @@ get_history_one(zpool_handle_t *zhp, void *data) - for (i = 0; i < numrecords; i++) { -- if (nvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME, -- &dst_time) != 0) -- continue; -+ nvlist_t *rec = records[i]; -+ char tbuf[30] = ""; - -- /* is it an internal event or a standard event? */ -- if (nvlist_lookup_string(records[i], ZPOOL_HIST_CMD, -- &cmdstr) != 0) { -- if (cb->internal == 0) -- continue; -+ if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { -+ time_t tsec; -+ struct tm t; -+ -+ tsec = fnvlist_lookup_uint64(records[i], -+ ZPOOL_HIST_TIME); -+ (void) localtime_r(&tsec, &t); -+ (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); -+ } - -- if (nvlist_lookup_uint64(records[i], -- ZPOOL_HIST_INT_EVENT, &ievent) != 0) -+ if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { -+ (void) printf("%s %s", tbuf, -+ fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); -+ } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { -+ int ievent = -+ fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); -+ if (!cb->internal) -+ continue; -+ if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { -+ (void) printf("%s unrecognized record:\n", -+ tbuf); -+ dump_nvlist(rec, 4); -+ continue; -+ } -+ (void) printf("%s [internal %s txg:%lld] %s", tbuf, -+ zfs_history_event_names[ievent], -+ (longlong_t) fnvlist_lookup_uint64( -+ rec, ZPOOL_HIST_TXG), -+ fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); -+ } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { -+ if (!cb->internal) - continue; -- verify(nvlist_lookup_uint64(records[i], -- ZPOOL_HIST_TXG, &txg) == 0); -- verify(nvlist_lookup_string(records[i], -- ZPOOL_HIST_INT_STR, &pathstr) == 0); -- if (ievent >= LOG_END) -+ (void) printf("%s [txg:%lld] %s", tbuf, -+ (longlong_t) fnvlist_lookup_uint64( -+ rec, ZPOOL_HIST_TXG), -+ fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); -+ if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { -+ (void) printf(" %s (%llu)", -+ fnvlist_lookup_string(rec, -+ ZPOOL_HIST_DSNAME), -+ (u_longlong_t)fnvlist_lookup_uint64(rec, -+ ZPOOL_HIST_DSID)); -+ } -+ (void) printf(" %s", fnvlist_lookup_string(rec, -+ ZPOOL_HIST_INT_STR)); -+ } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { -+ if (!cb->internal) -+ continue; -+ (void) printf("%s ioctl %s\n", tbuf, -+ fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); -+ if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { -+ (void) printf(" input:\n"); -+ dump_nvlist(fnvlist_lookup_nvlist(rec, -+ ZPOOL_HIST_INPUT_NVL), 8); -+ } -+ if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { -+ (void) printf(" output:\n"); -+ dump_nvlist(fnvlist_lookup_nvlist(rec, -+ ZPOOL_HIST_OUTPUT_NVL), 8); -+ } -+ } else { -+ if (!cb->internal) - continue; -- (void) snprintf(internalstr, -- sizeof (internalstr), -- "[internal %s txg:%llu] %s", -- zfs_history_event_names[ievent], (u_longlong_t)txg, -- pathstr); -- cmdstr = internalstr; -+ (void) printf("%s unrecognized record:\n", tbuf); -+ 
dump_nvlist(rec, 4); - } -- tsec = dst_time; -- (void) localtime_r(&tsec, &t); -- (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); -- (void) printf("%s %s", tbuf, cmdstr); - -@@ -5032,22 +5147,16 @@ get_history_one(zpool_handle_t *zhp, void *data) - (void) printf(" ["); -- if (nvlist_lookup_uint64(records[i], -- ZPOOL_HIST_WHO, &who) == 0) { -- pwd = getpwuid((uid_t)who); -- if (pwd) -- (void) printf("user %s on", -- pwd->pw_name); -- else -- (void) printf("user %d on", -- (int)who); -- } else { -- (void) printf(gettext("no info]\n")); -- continue; -+ if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { -+ uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); -+ struct passwd *pwd = getpwuid(who); -+ (void) printf("user %d ", (int)who); -+ if (pwd != NULL) -+ (void) printf("(%s) ", pwd->pw_name); - } -- if (nvlist_lookup_string(records[i], -- ZPOOL_HIST_HOST, &hostname) == 0) { -- (void) printf(" %s", hostname); -+ if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { -+ (void) printf("on %s", -+ fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); - } -- if (nvlist_lookup_string(records[i], -- ZPOOL_HIST_ZONE, &zonename) == 0) { -- (void) printf(":%s", zonename); -+ if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { -+ (void) printf(":%s", -+ fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); - } -@@ -5068,4 +5177,2 @@ get_history_one(zpool_handle_t *zhp, void *data) - */ -- -- - int -@@ -5082,6 +5189,6 @@ zpool_do_history(int argc, char **argv) - case 'l': -- cbdata.longfmt = 1; -+ cbdata.longfmt = B_TRUE; - break; - case 'i': -- cbdata.internal = 1; -+ cbdata.internal = B_TRUE; - break; -@@ -5124,6 +5231,6 @@ zpool_do_events_short(nvlist_t *nvl) - (void) ctime_r((const time_t *)&tv[0], ctime_str); -- (void) strncpy(str, ctime_str+4, 6); /* 'Jun 30' */ -- (void) strncpy(str+7, ctime_str+20, 4); /* '1993' */ -- (void) strncpy(str+12, ctime_str+11, 8); /* '21:49:08' */ -- (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]);/* '.123456789' */ -+ (void) strncpy(str, ctime_str+4, 6); /* 'Jun 30' */ -+ (void) strncpy(str+7, ctime_str+20, 4); /* '1993' */ -+ (void) strncpy(str+12, ctime_str+11, 8); /* '21:49:08' */ -+ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ - (void) printf(gettext("%s "), str); -@@ -5235,6 +5342,6 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) - printf(gettext("%*s%s[%d] = %s\n"), -- depth, "", name, i, "(embedded nvlist)"); -+ depth, "", name, i, "(embedded nvlist)"); - zpool_do_events_nvprint(val[i], depth + 8); - printf(gettext("%*s(end %s[%i])\n"), -- depth, "", name, i); -+ depth, "", name, i); - } -@@ -5316,3 +5423,4 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) - for (i = 0; i < nelem; i++) -- printf(gettext("0x%llx "), (u_longlong_t)val[i]); -+ printf(gettext("0x%llx "), -+ (u_longlong_t)val[i]); - -@@ -5327,3 +5435,16 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) - for (i = 0; i < nelem; i++) -- printf(gettext("0x%llx "), (u_longlong_t)val[i]); -+ printf(gettext("0x%llx "), -+ (u_longlong_t)val[i]); -+ -+ break; -+ } -+ -+ case DATA_TYPE_STRING_ARRAY: { -+ char **str; -+ uint_t i, nelem; -+ -+ (void) nvpair_value_string_array(nvp, &str, &nelem); -+ for (i = 0; i < nelem; i++) -+ printf(gettext("\"%s\" "), -+ str[i] ? 
str[i] : ""); - -@@ -5332,3 +5453,2 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) - -- case DATA_TYPE_STRING_ARRAY: - case DATA_TYPE_BOOLEAN_ARRAY: -@@ -5349,6 +5469,6 @@ zpool_do_events_next(ev_opts_t *opts) - nvlist_t *nvl; -- int cleanup_fd, ret, dropped; -+ int zevent_fd, ret, dropped; - -- cleanup_fd = open(ZFS_DEV, O_RDWR); -- VERIFY(cleanup_fd >= 0); -+ zevent_fd = open(ZFS_DEV, O_RDWR); -+ VERIFY(zevent_fd >= 0); - -@@ -5359,3 +5479,3 @@ zpool_do_events_next(ev_opts_t *opts) - ret = zpool_events_next(g_zfs, &nvl, &dropped, -- !!opts->follow, cleanup_fd); -+ (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); - if (ret || nvl == NULL) -@@ -5377,3 +5497,3 @@ zpool_do_events_next(ev_opts_t *opts) - -- VERIFY(0 == close(cleanup_fd)); -+ VERIFY(0 == close(zevent_fd)); - -@@ -5435,3 +5555,3 @@ zpool_do_events(int argc, char **argv) - -- return ret; -+ return (ret); - } -@@ -5468,4 +5588,4 @@ get_callback(zpool_handle_t *zhp, void *data) - } else { -- if (zpool_get_prop(zhp, pl->pl_prop, value, -- sizeof (value), &srctype) != 0) -+ if (zpool_get_prop_literal(zhp, pl->pl_prop, value, -+ sizeof (value), &srctype, cbp->cb_literal) != 0) - continue; -@@ -5485,5 +5605,22 @@ zpool_do_get(int argc, char **argv) - zprop_list_t fake_name = { 0 }; -- int ret; -+ int c, ret; - -- if (argc < 2) { -+ /* check options */ -+ while ((c = getopt(argc, argv, "p")) != -1) { -+ switch (c) { -+ case 'p': -+ cb.cb_literal = B_TRUE; -+ break; -+ -+ case '?': -+ (void) fprintf(stderr, gettext("invalid option '%c'\n"), -+ optopt); -+ usage(B_FALSE); -+ } -+ } -+ -+ argc -= optind; -+ argv += optind; -+ -+ if (argc < 1) { - (void) fprintf(stderr, gettext("missing property " -@@ -5501,6 +5638,8 @@ zpool_do_get(int argc, char **argv) - -- if (zprop_get_list(g_zfs, argv[1], &cb.cb_proplist, -- ZFS_TYPE_POOL) != 0) -+ if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) - usage(B_FALSE); - -+ argc--; -+ argv++; -+ - if (cb.cb_proplist != NULL) { -@@ -5512,3 +5651,3 @@ zpool_do_get(int argc, char **argv) - -- ret = for_each_pool(argc - 2, argv + 2, B_TRUE, &cb.cb_proplist, -+ ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, - get_callback, &cb); -@@ -5630,4 +5769,3 @@ main(int argc, char **argv) - */ -- if ((strcmp(cmdname, "-?") == 0) || -- strcmp(cmdname, "--help") == 0) -+ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) - usage(B_TRUE); -@@ -5639,4 +5777,3 @@ main(int argc, char **argv) - -- zpool_set_history_str("zpool", argc, argv, history_str); -- verify(zpool_stage_history(g_zfs, history_str) == 0); -+ zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); - -@@ -5668,2 +5805,5 @@ main(int argc, char **argv) - -+ if (ret == 0 && log_history) -+ (void) zpool_log_history(g_zfs, history_str); -+ - libzfs_fini(g_zfs); -diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h -index b67ff8b..1b4ce51 100644 ---- a/cmd/zpool/zpool_util.h -+++ b/cmd/zpool/zpool_util.h -@@ -46,3 +46,4 @@ uint_t num_logs(nvlist_t *nv); - nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, -- int check_rep, boolean_t replacing, boolean_t dryrun, int argc, char **argv); -+ int check_rep, boolean_t replacing, boolean_t dryrun, int argc, -+ char **argv); - nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, -diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c -index 723e10b..316e291 100644 ---- a/cmd/zpool/zpool_vdev.c -+++ b/cmd/zpool/zpool_vdev.c -@@ -82,3 +82,3 @@ - #else --#define blkid_cache void * -+#define 
blkid_cache void * - #endif /* HAVE_LIBBLKID */ -@@ -108,8 +108,47 @@ typedef struct vdev_disk_db_entry - static vdev_disk_db_entry_t vdev_disk_database[] = { -+ {"ATA ADATA SSD S396 3", 8192}, -+ {"ATA APPLE SSD SM128E", 8192}, -+ {"ATA APPLE SSD SM256E", 8192}, -+ {"ATA APPLE SSD SM512E", 8192}, -+ {"ATA APPLE SSD SM768E", 8192}, -+ {"ATA C400-MTFDDAC064M", 8192}, -+ {"ATA C400-MTFDDAC128M", 8192}, -+ {"ATA C400-MTFDDAC256M", 8192}, -+ {"ATA C400-MTFDDAC512M", 8192}, - {"ATA Corsair Force 3 ", 8192}, -+ {"ATA Corsair Force GS", 8192}, - {"ATA INTEL SSDSA2CT04", 8192}, -+ {"ATA INTEL SSDSA2BZ10", 8192}, -+ {"ATA INTEL SSDSA2BZ20", 8192}, -+ {"ATA INTEL SSDSA2BZ30", 8192}, -+ {"ATA INTEL SSDSA2CW04", 8192}, -+ {"ATA INTEL SSDSA2CW08", 8192}, -+ {"ATA INTEL SSDSA2CW12", 8192}, - {"ATA INTEL SSDSA2CW16", 8192}, -+ {"ATA INTEL SSDSA2CW30", 8192}, -+ {"ATA INTEL SSDSA2CW60", 8192}, -+ {"ATA INTEL SSDSC2BA10", 8192}, -+ {"ATA INTEL SSDSC2BA20", 8192}, -+ {"ATA INTEL SSDSC2BA40", 8192}, -+ {"ATA INTEL SSDSC2BA80", 8192}, -+ {"ATA INTEL SSDSC2BB08", 8192}, -+ {"ATA INTEL SSDSC2BB12", 8192}, -+ {"ATA INTEL SSDSC2BB16", 8192}, -+ {"ATA INTEL SSDSC2BB24", 8192}, -+ {"ATA INTEL SSDSC2BB30", 8192}, -+ {"ATA INTEL SSDSC2BB40", 8192}, -+ {"ATA INTEL SSDSC2BB48", 8192}, -+ {"ATA INTEL SSDSC2BB60", 8192}, -+ {"ATA INTEL SSDSC2BB80", 8192}, -+ {"ATA INTEL SSDSC2CT06", 8192}, -+ {"ATA INTEL SSDSC2CT12", 8192}, - {"ATA INTEL SSDSC2CT18", 8192}, -+ {"ATA INTEL SSDSC2CT24", 8192}, -+ {"ATA INTEL SSDSC2CW06", 8192}, - {"ATA INTEL SSDSC2CW12", 8192}, -+ {"ATA INTEL SSDSC2CW18", 8192}, -+ {"ATA INTEL SSDSC2CW24", 8192}, -+ {"ATA INTEL SSDSC2CW48", 8192}, - {"ATA KINGSTON SH100S3", 8192}, -+ {"ATA KINGSTON SH103S3", 8192}, - {"ATA M4-CT064M4SSD2 ", 8192}, -@@ -119,2 +158,3 @@ static vdev_disk_db_entry_t vdev_disk_database[] = { - {"ATA OCZ-AGILITY2 ", 8192}, -+ {"ATA OCZ-AGILITY3 ", 8192}, - {"ATA OCZ-VERTEX2 3.5 ", 8192}, -@@ -123,4 +163,18 @@ static vdev_disk_db_entry_t vdev_disk_database[] = { - {"ATA OCZ-VERTEX3 MI ", 8192}, -+ {"ATA OCZ-VERTEX4 ", 8192}, -+ {"ATA SAMSUNG MZ7WD120", 8192}, -+ {"ATA SAMSUNG MZ7WD240", 8192}, -+ {"ATA SAMSUNG MZ7WD480", 8192}, -+ {"ATA SAMSUNG MZ7WD960", 8192}, - {"ATA SAMSUNG SSD 830 ", 8192}, - {"ATA Samsung SSD 840 ", 8192}, -+ {"ATA SanDisk SSD U100", 8192}, -+ {"ATA TOSHIBA THNSNH06", 8192}, -+ {"ATA TOSHIBA THNSNH12", 8192}, -+ {"ATA TOSHIBA THNSNH25", 8192}, -+ {"ATA TOSHIBA THNSNH51", 8192}, -+ {"ATA APPLE SSD TS064C", 4096}, -+ {"ATA APPLE SSD TS128C", 4096}, -+ {"ATA APPLE SSD TS256C", 4096}, -+ {"ATA APPLE SSD TS512C", 4096}, - {"ATA INTEL SSDSA2M040", 4096}, -@@ -128,3 +182,10 @@ static vdev_disk_db_entry_t vdev_disk_database[] = { - {"ATA INTEL SSDSA2M160", 4096}, -- /* Imported from Open Solaris*/ -+ {"ATA INTEL SSDSC2MH12", 4096}, -+ {"ATA INTEL SSDSC2MH25", 4096}, -+ {"ATA OCZ CORE_SSD ", 4096}, -+ {"ATA OCZ-VERTEX ", 4096}, -+ {"ATA SAMSUNG MCCOE32G", 4096}, -+ {"ATA SAMSUNG MCCOE64G", 4096}, -+ {"ATA SAMSUNG SSD PM80", 4096}, -+ /* Imported from Open Solaris */ - {"ATA MARVELL SD88SA02", 4096}, -@@ -148,2 +209,4 @@ static vdev_disk_db_entry_t vdev_disk_database[] = { - {"OI COMSTAR ", 8192}, -+ {"SUN COMSTAR ", 8192}, -+ {"NETAPP LUN ", 8192}, - #endif -@@ -170,6 +233,6 @@ check_sector_size_database(char *path, int *sector_size) - /* Prepare INQUIRY command */ -- memset(&io_hdr, 0, sizeof(sg_io_hdr_t)); -+ memset(&io_hdr, 0, sizeof (sg_io_hdr_t)); - io_hdr.interface_id = 'S'; -- io_hdr.cmd_len = sizeof(inq_cmd_blk); -- io_hdr.mx_sb_len = sizeof(sense_buffer); 
-+ io_hdr.cmd_len = sizeof (inq_cmd_blk); -+ io_hdr.mx_sb_len = sizeof (sense_buffer); - io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; -@@ -179,3 +242,3 @@ check_sector_size_database(char *path, int *sector_size) - io_hdr.sbp = sense_buffer; -- io_hdr.timeout = 10; /* 10 milliseconds is ample time */ -+ io_hdr.timeout = 10; /* 10 milliseconds is ample time */ - -@@ -324,3 +387,3 @@ check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare) - vdev_error(gettext("%s contains a filesystem of " -- "type '%s'\n"), path, value); -+ "type '%s'\n"), path, value); - } -@@ -342,3 +405,3 @@ static int - check_disk(const char *path, blkid_cache cache, int force, -- boolean_t isspare, boolean_t iswholedisk) -+ boolean_t isspare, boolean_t iswholedisk) - { -@@ -351,3 +414,3 @@ check_disk(const char *path, blkid_cache cache, int force, - if (!iswholedisk) -- return check_slice(path, cache, force, isspare); -+ return (check_slice(path, cache, force, isspare)); - -@@ -363,3 +426,3 @@ check_disk(const char *path, blkid_cache cache, int force, - check_error(errno); -- return -1; -+ return (-1); - } -@@ -370,3 +433,3 @@ check_disk(const char *path, blkid_cache cache, int force, - if (force) { -- return 0; -+ return (0); - } else { -@@ -375,3 +438,3 @@ check_disk(const char *path, blkid_cache cache, int force, - "information in the MBR.\n"), path); -- return -1; -+ return (-1); - } -@@ -390,3 +453,3 @@ check_disk(const char *path, blkid_cache cache, int force, - /* Partitions will no be created using the backup */ -- return 0; -+ return (0); - } else { -@@ -394,3 +457,3 @@ check_disk(const char *path, blkid_cache cache, int force, - "EFI label.\n"), path); -- return -1; -+ return (-1); - } -@@ -425,3 +488,3 @@ static int - check_device(const char *path, boolean_t force, -- boolean_t isspare, boolean_t iswholedisk) -+ boolean_t isspare, boolean_t iswholedisk) - { -@@ -439,3 +502,3 @@ check_device(const char *path, boolean_t force, - check_error(err); -- return -1; -+ return (-1); - } -@@ -445,3 +508,3 @@ check_device(const char *path, boolean_t force, - check_error(err); -- return -1; -+ return (-1); - } -@@ -450,3 +513,3 @@ check_device(const char *path, boolean_t force, - -- return check_disk(path, cache, force, isspare, iswholedisk); -+ return (check_disk(path, cache, force, isspare, iswholedisk)); - } -@@ -465,3 +528,3 @@ is_whole_disk(const char *path) - struct dk_gpt *label; -- int fd; -+ int fd; - -@@ -486,3 +549,3 @@ static int - is_shorthand_path(const char *arg, char *path, -- struct stat64 *statbuf, boolean_t *wholedisk) -+ struct stat64 *statbuf, boolean_t *wholedisk) - { -@@ -497,4 +560,4 @@ is_shorthand_path(const char *arg, char *path, - -- strlcpy(path, arg, sizeof(path)); -- memset(statbuf, 0, sizeof(*statbuf)); -+ strlcpy(path, arg, sizeof (path)); -+ memset(statbuf, 0, sizeof (*statbuf)); - *wholedisk = B_FALSE; -@@ -1075,3 +1138,3 @@ zero_label(char *path) - -- return 0; -+ return (0); - } -@@ -1164,3 +1227,3 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) - if (!is_exclusive || !is_spare(NULL, udevpath)) { -- ret = strncmp(udevpath,UDISK_ROOT,strlen(UDISK_ROOT)); -+ ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); - if (ret == 0) { -@@ -1175,3 +1238,3 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) - -- ret = zpool_label_disk_wait(udevpath, 1000); -+ ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); - if (ret) { -@@ -1238,3 +1301,3 @@ check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, - verify(!nvlist_lookup_uint64(nv, -- ZPOOL_CONFIG_WHOLE_DISK, 
&wholedisk)); -+ ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); - -@@ -1441,4 +1504,4 @@ construct_spec(nvlist_t *props, int argc, char **argv) - zpool_no_memory(); -- if ((nv = make_leaf_vdev(props, argv[c], B_FALSE)) -- == NULL) -+ if ((nv = make_leaf_vdev(props, argv[c], -+ B_FALSE)) == NULL) - return (NULL); -@@ -1497,3 +1560,4 @@ construct_spec(nvlist_t *props, int argc, char **argv) - */ -- if ((nv = make_leaf_vdev(props, argv[0], is_log)) == NULL) -+ if ((nv = make_leaf_vdev(props, argv[0], -+ is_log)) == NULL) - return (NULL); -diff --git a/cmd/zstreamdump/Makefile.am b/cmd/zstreamdump/Makefile.am -index 3d7ec41..d6c64f5 100644 ---- a/cmd/zstreamdump/Makefile.am -+++ b/cmd/zstreamdump/Makefile.am -@@ -15,4 +15,5 @@ zstreamdump_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --zstreamdump_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+zstreamdump_LDADD += $(ZLIB) -diff --git a/cmd/ztest/Makefile.am b/cmd/ztest/Makefile.am -index 3989201..a3cd834 100644 ---- a/cmd/ztest/Makefile.am -+++ b/cmd/ztest/Makefile.am -@@ -17,4 +17,5 @@ ztest_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --ztest_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+ztest_LDADD += -lm -ldl -diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c -index 93a5f1e..d392a62 100644 ---- a/cmd/ztest/ztest.c -+++ b/cmd/ztest/ztest.c -@@ -22,4 +22,5 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ -@@ -108,2 +109,3 @@ - #include -+#include - #include -@@ -112,2 +114,3 @@ - #include -+#include - #include -@@ -207,2 +210,3 @@ enum ztest_io_type { - ZTEST_IO_SETATTR, -+ ZTEST_IO_REWRITE, - ZTEST_IO_TYPES -@@ -364,3 +368,3 @@ ztest_info_t ztest_info[] = { - { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, -- { ztest_reguid, 1, &zopt_sometimes }, -+ { ztest_reguid, 1, &zopt_rarely }, - { ztest_spa_rename, 1, &zopt_rarely }, -@@ -369,3 +373,3 @@ ztest_info_t ztest_info[] = { - { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, -- { ztest_vdev_attach_detach, 1, &zopt_rarely }, -+ { ztest_vdev_attach_detach, 1, &zopt_sometimes }, - { ztest_vdev_LUN_growth, 1, &zopt_rarely }, -@@ -787,2 +791,14 @@ ztest_kill(ztest_shared_t *zs) - zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); -+ -+ /* -+ * Before we kill off ztest, make sure that the config is updated. -+ * See comment above spa_config_sync(). -+ */ -+ mutex_enter(&spa_namespace_lock); -+ spa_config_sync(ztest_spa, B_FALSE, B_FALSE); -+ mutex_exit(&spa_namespace_lock); -+ -+ if (ztest_opts.zo_verbose >= 3) -+ zfs_dbgmsg_print(FTAG); -+ - (void) kill(getpid(), SIGKILL); -@@ -1033,5 +1049,4 @@ ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, - -- error = dsl_prop_set(osname, propname, -- (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), -- sizeof (value), 1, &value); -+ error = dsl_prop_set_int(osname, propname, -+ (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); - -@@ -1044,4 +1059,3 @@ ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, - setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); -- VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval), -- 1, &curval, setpoint), ==, 0); -+ VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); - -@@ -1901,2 +1915,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (error == 0) { -+ blkptr_t *obp = dmu_buf_get_blkptr(db); -+ if (obp) { -+ ASSERT(BP_IS_HOLE(bp)); -+ *bp = *obp; -+ } -+ - zgd->zgd_db = db; -@@ -2049,2 +2069,5 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) - -+ /* -+ * No object was found. -+ */ - if (od->od_object == 0) -@@ -2164,2 +2187,3 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) - { -+ int err; - ztest_block_tag_t wbt; -@@ -2218,2 +2242,21 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) - break; -+ -+ case ZTEST_IO_REWRITE: -+ (void) rw_enter(&ztest_name_lock, RW_READER); -+ err = ztest_dsl_prop_set_uint64(zd->zd_name, -+ ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), -+ B_FALSE); -+ VERIFY(err == 0 || err == ENOSPC); -+ err = ztest_dsl_prop_set_uint64(zd->zd_name, -+ ZFS_PROP_COMPRESSION, -+ ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), -+ B_FALSE); -+ VERIFY(err == 0 || err == ENOSPC); -+ (void) rw_exit(&ztest_name_lock); -+ -+ VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, -+ DMU_READ_NO_PREFETCH)); -+ -+ (void) ztest_write(zd, object, offset, blocksize, data); -+ break; - } -@@ -2305,2 +2348,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id) - -+ /* -+ * We grab the zd_dirobj_lock to ensure that no other thread is -+ * updating the zil (i.e. adding in-memory log records) and the -+ * zd_zilog_lock to block any I/O. -+ */ - mutex_enter(&zd->zd_dirobj_lock); -@@ -2336,3 +2384,3 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) - VERIFY3U(ENOENT, ==, -- spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); -+ spa_create("ztest_bad_file", nvroot, NULL, NULL)); - nvlist_free(nvroot); -@@ -2344,3 +2392,3 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) - VERIFY3U(ENOENT, ==, -- spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); -+ spa_create("ztest_bad_mirror", nvroot, NULL, NULL)); - nvlist_free(nvroot); -@@ -2353,3 +2401,3 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); -- VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); -+ VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL)); - nvlist_free(nvroot); -@@ -2411,3 +2459,3 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) - zpool_prop_to_name(ZPOOL_PROP_VERSION), version); -- VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0); -+ VERIFY3S(spa_create(name, nvroot, props, NULL), ==, 0); - fnvlist_free(nvroot); -@@ -2486,4 +2534,3 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) - mutex_enter(&ztest_vdev_lock); -- leaves = -- MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; -+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; - -@@ -2509,3 +2556,3 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) - * grab a reference on the dataset which may cause -- * dmu_objset_destroy() to fail with EBUSY thus -+ * dsl_destroy_head() to fail with EBUSY thus - * leaving the dataset in an inconsistent state. 
-@@ -2741,3 +2788,3 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) - uint64_t oldguid, pguid; -- size_t oldsize, newsize; -+ uint64_t oldsize, newsize; - char *oldpath, *newpath; -@@ -2902,4 +2949,4 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) - "returned %d, expected %d", -- oldpath, (longlong_t)oldsize, newpath, -- (longlong_t)newsize, replacing, error, expected_error); -+ oldpath, oldsize, newpath, -+ newsize, replacing, error, expected_error); - } -@@ -3198,3 +3245,3 @@ ztest_objset_destroy_cb(const char *name, void *arg) - */ -- VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os)); -+ VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os)); - error = dmu_object_info(os, ZTEST_DIROBJ, &doi); -@@ -3206,3 +3253,3 @@ ztest_objset_destroy_cb(const char *name, void *arg) - } -- dmu_objset_rele(os, FTAG); -+ dmu_objset_disown(os, FTAG); - -@@ -3211,3 +3258,7 @@ ztest_objset_destroy_cb(const char *name, void *arg) - */ -- VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE)); -+ if (strchr(name, '@') != NULL) { -+ VERIFY0(dsl_destroy_snapshot(name, B_FALSE)); -+ } else { -+ VERIFY0(dsl_destroy_head(name)); -+ } - return (0); -@@ -3221,7 +3272,5 @@ ztest_snapshot_create(char *osname, uint64_t id) - -- (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, -- (u_longlong_t)id); -+ (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); - -- error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, -- NULL, NULL, B_FALSE, B_FALSE, -1); -+ error = dmu_objset_snapshot_one(osname, snapname); - if (error == ENOSPC) { -@@ -3230,4 +3279,6 @@ ztest_snapshot_create(char *osname, uint64_t id) - } -- if (error != 0 && error != EEXIST) -- fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error); -+ if (error != 0 && error != EEXIST) { -+ fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, -+ snapname, error); -+ } - return (B_TRUE); -@@ -3244,3 +3295,3 @@ ztest_snapshot_destroy(char *osname, uint64_t id) - -- error = dmu_objset_destroy(snapname, B_FALSE); -+ error = dsl_destroy_snapshot(snapname, B_FALSE); - if (error != 0 && error != ENOENT) -@@ -3272,3 +3323,3 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) - * If this dataset exists from a previous run, process its replay log -- * half of the time. If we don't replay it, then dmu_objset_destroy() -+ * half of the time. If we don't replay it, then dsl_destroy_head() - * (invoked from ztest_objset_destroy_cb()) should just throw it away. 
-@@ -3294,3 +3345,4 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) - */ -- VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os)); -+ VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, -+ FTAG, &os)); - -@@ -3308,4 +3360,3 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) - -- VERIFY3U(0, ==, -- dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); -+ VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); - -@@ -3399,17 +3450,17 @@ ztest_dsl_dataset_cleanup(char *osname, uint64_t id) - -- error = dmu_objset_destroy(clone2name, B_FALSE); -+ error = dsl_destroy_head(clone2name); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error); -- error = dmu_objset_destroy(snap3name, B_FALSE); -+ fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); -+ error = dsl_destroy_snapshot(snap3name, B_FALSE); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error); -- error = dmu_objset_destroy(snap2name, B_FALSE); -+ fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); -+ error = dsl_destroy_snapshot(snap2name, B_FALSE); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error); -- error = dmu_objset_destroy(clone1name, B_FALSE); -+ fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); -+ error = dsl_destroy_head(clone1name); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error); -- error = dmu_objset_destroy(snap1name, B_FALSE); -+ fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); -+ error = dsl_destroy_snapshot(snap1name, B_FALSE); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error); -+ fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); - -@@ -3428,4 +3479,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - { -- objset_t *clone; -- dsl_dataset_t *ds; -+ objset_t *os; - char *snap1name; -@@ -3459,4 +3509,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1, -- NULL, NULL, B_FALSE, B_FALSE, -1); -+ error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); - if (error && error != EEXIST) { -@@ -3469,8 +3518,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_hold(snap1name, FTAG, &clone); -- if (error) -- fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error); -- -- error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0); -- dmu_objset_rele(clone, FTAG); -+ error = dmu_objset_clone(clone1name, snap1name); - if (error) { -@@ -3483,4 +3527,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1, -- NULL, NULL, B_FALSE, B_FALSE, -1); -+ error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); - if (error && error != EEXIST) { -@@ -3493,4 +3536,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1, -- NULL, NULL, B_FALSE, B_FALSE, -1); -+ error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); - if (error && error != EEXIST) { -@@ -3503,8 +3545,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_hold(snap3name, FTAG, &clone); -- if (error) -- fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); -- -- error = dmu_objset_clone(clone2name, 
dmu_objset_ds(clone), 0); -- dmu_objset_rele(clone, FTAG); -+ error = dmu_objset_clone(clone2name, snap3name); - if (error) { -@@ -3517,5 +3554,5 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds); -+ error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os); - if (error) -- fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error); -+ fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); - error = dsl_dataset_promote(clone2name, NULL); -@@ -3524,3 +3561,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - error); -- dsl_dataset_disown(ds, FTAG); -+ dmu_objset_disown(os, FTAG); - -@@ -3539,3 +3576,3 @@ out: - #undef OD_ARRAY_SIZE --#define OD_ARRAY_SIZE 4 -+#define OD_ARRAY_SIZE 4 - -@@ -3552,3 +3589,3 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) - -- size = sizeof(ztest_od_t) * OD_ARRAY_SIZE; -+ size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; - od = umem_alloc(size, UMEM_NOFAIL); -@@ -3574,3 +3611,3 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) - #undef OD_ARRAY_SIZE --#define OD_ARRAY_SIZE 2 -+#define OD_ARRAY_SIZE 2 - -@@ -3586,3 +3623,3 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) - objset_t *os = zd->zd_os; -- size = sizeof(ztest_od_t) * OD_ARRAY_SIZE; -+ size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; - od = umem_alloc(size, UMEM_NOFAIL); -@@ -3692,2 +3729,5 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) - -+ /* This accounts for setting the checksum/compression. */ -+ dmu_tx_hold_bonus(tx, bigobj); -+ - txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); -@@ -3850,3 +3890,3 @@ compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, - #undef OD_ARRAY_SIZE --#define OD_ARRAY_SIZE 2 -+#define OD_ARRAY_SIZE 2 - -@@ -3873,3 +3913,3 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) - -- size = sizeof(ztest_od_t) * OD_ARRAY_SIZE; -+ size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; - od = umem_alloc(size, UMEM_NOFAIL); -@@ -4094,3 +4134,3 @@ ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - uint64_t offset = (1ULL << (ztest_random(20) + 43)) + -@@ -4111,3 +4151,3 @@ ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) - -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4124,3 +4164,3 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - -@@ -4128,4 +4168,5 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) - -- if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ if (ztest_object_init(zd, od, sizeof (ztest_od_t), -+ !ztest_random(2)) != 0) { -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4134,3 +4175,3 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) - if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4152,3 +4193,3 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) - umem_free(data, blocksize); -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4177,3 +4218,3 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); -@@ 
-4300,3 +4341,3 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) - out: -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4314,3 +4355,3 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); -@@ -4347,3 +4388,3 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id) - out: -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4363,3 +4404,3 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0); -@@ -4367,3 +4408,3 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) - if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4398,3 +4439,3 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) - count = -1ULL; -- VERIFY(zap_count(os, object, &count) == 0); -+ VERIFY0(zap_count(os, object, &count)); - ASSERT(count != -1ULL); -@@ -4461,3 +4502,3 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) - -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4552,3 +4593,3 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); -@@ -4556,3 +4597,3 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) - if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4599,3 +4640,3 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) - -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4671,3 +4712,3 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) - -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4716,2 +4757,18 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) - -+static int -+user_release_one(const char *snapname, const char *holdname) -+{ -+ nvlist_t *snaps, *holds; -+ int error; -+ -+ snaps = fnvlist_alloc(); -+ holds = fnvlist_alloc(); -+ fnvlist_add_boolean(holds, holdname); -+ fnvlist_add_nvlist(snaps, snapname, holds); -+ fnvlist_free(holds); -+ error = dsl_dataset_user_release(snaps, NULL); -+ fnvlist_free(snaps); -+ return (error); -+} -+ - /* -@@ -4730,2 +4787,3 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - char osname[MAXNAMELEN]; -+ nvlist_t *holds; - -@@ -4735,6 +4793,8 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- (void) snprintf(snapname, 100, "sh1_%llu", (u_longlong_t)id); -- (void) snprintf(fullname, 100, "%s@%s", osname, snapname); -- (void) snprintf(clonename, 100, "%s/ch1_%llu",osname,(u_longlong_t)id); -- (void) snprintf(tag, 100, "tag_%llu", (u_longlong_t)id); -+ (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", -+ (u_longlong_t)id); -+ (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); -+ (void) snprintf(clonename, sizeof (clonename), -+ "%s/ch1_%llu", osname, (u_longlong_t)id); -+ (void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id); - -@@ -4743,5 +4803,11 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - */ -- (void) 
dmu_objset_destroy(clonename, B_FALSE); -- (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE); -- (void) dmu_objset_destroy(fullname, B_FALSE); -+ error = dsl_destroy_head(clonename); -+ if (error != ENOENT) -+ ASSERT0(error); -+ error = user_release_one(fullname, tag); -+ if (error != ESRCH && error != ENOENT) -+ ASSERT0(error); -+ error = dsl_destroy_snapshot(fullname, B_FALSE); -+ if (error != ENOENT) -+ ASSERT0(error); - -@@ -4751,4 +4817,3 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - */ -- error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, -- FALSE, -1); -+ error = dmu_objset_snapshot_one(osname, snapname); - if (error) { -@@ -4761,8 +4826,3 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_hold(fullname, FTAG, &origin); -- if (error) -- fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); -- -- error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0); -- dmu_objset_rele(origin, FTAG); -+ error = dmu_objset_clone(clonename, fullname); - if (error) { -@@ -4775,5 +4835,5 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_destroy(fullname, B_TRUE); -+ error = dsl_destroy_snapshot(fullname, B_TRUE); - if (error) { -- fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", -+ fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", - fullname, error); -@@ -4781,5 +4841,5 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_destroy(clonename, B_FALSE); -+ error = dsl_destroy_head(clonename); - if (error) -- fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error); -+ fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); - -@@ -4794,4 +4854,3 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - */ -- error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, -- FALSE, -1); -+ error = dmu_objset_snapshot_one(osname, snapname); - if (error) { -@@ -4804,4 +4863,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, -- B_TRUE, -1); -+ holds = fnvlist_alloc(); -+ fnvlist_add_string(holds, fullname, tag); -+ error = dsl_dataset_user_hold(holds, 0, NULL); -+ fnvlist_free(holds); -+ - if (error) -@@ -4809,5 +4871,5 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_destroy(fullname, B_FALSE); -+ error = dsl_destroy_snapshot(fullname, B_FALSE); - if (error != EBUSY) { -- fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d", -+ fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", - fullname, error); -@@ -4815,5 +4877,5 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_destroy(fullname, B_TRUE); -+ error = dsl_destroy_snapshot(fullname, B_TRUE); - if (error) { -- fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", -+ fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", - fullname, error); -@@ -4821,7 +4883,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE); -+ error = user_release_one(fullname, tag); - if (error) -- fatal(0, "dsl_dataset_user_release(%s)", fullname, tag); -+ fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); - -- VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT); -+ VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); - -@@ -4868,2 +4930,10 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - /* -+ * Grab the name lock as reader. 
There are some operations -+ * which don't like to have their vdevs changed while -+ * they are in progress (i.e. spa_change_guid). Those -+ * operations will have grabbed the name lock as writer. -+ */ -+ (void) rw_enter(&ztest_name_lock, RW_READER); -+ -+ /* - * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. -@@ -4896,3 +4966,10 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - -- if (vd0 != NULL && maxfaults != 1) { -+ /* -+ * If the top-level vdev needs to be resilvered -+ * then we only allow faults on the device that is -+ * resilvering. -+ */ -+ if (vd0 != NULL && maxfaults != 1 && -+ (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || -+ vd0->vdev_resilver_txg != 0)) { - /* -@@ -4927,2 +5004,3 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - spa_config_exit(spa, SCL_STATE, FTAG); -+ (void) rw_exit(&ztest_name_lock); - goto out; -@@ -4940,2 +5018,3 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - spa_config_exit(spa, SCL_STATE, FTAG); -+ (void) rw_exit(&ztest_name_lock); - -@@ -4955,3 +5034,3 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - * grab a reference on the dataset which may cause -- * dmu_objset_destroy() to fail with EBUSY thus -+ * dsl_destroy_head() to fail with EBUSY thus - * leaving the dataset in an inconsistent state. -@@ -5049,3 +5128,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); -@@ -5053,3 +5132,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -5068,3 +5147,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - (void) rw_exit(&ztest_name_lock); -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -5083,3 +5162,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - (void) rw_exit(&ztest_name_lock); -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -5092,4 +5171,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - uint64_t offset = i * blocksize; -- VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db, -- DMU_READ_NO_PREFETCH) == 0); -+ int error = dmu_buf_hold(os, object, offset, FTAG, &db, -+ DMU_READ_NO_PREFETCH); -+ if (error != 0) { -+ fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u", -+ os, (long long)object, (long long) offset, error); -+ } - ASSERT(db->db_offset == offset); -@@ -5109,4 +5192,4 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - */ -- VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db, -- DMU_READ_NO_PREFETCH) == 0); -+ VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, -+ DMU_READ_NO_PREFETCH)); - blk = *((dmu_buf_impl_t *)db)->db_blkptr; -@@ -5128,3 +5211,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - (void) rw_exit(&ztest_name_lock); -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -5308,2 +5391,3 @@ ztest_spa_import_export(char *oldname, char *newname) - spa_t *spa; -+ int error; - -@@ -5352,3 +5436,8 @@ ztest_spa_import_export(char *oldname, char *newname) - */ -- VERIFY3U(0, ==, spa_import(newname, config, NULL, 0)); -+ error = spa_import(newname, config, NULL, 0); -+ if (error != 0) { -+ dump_nvlist(config, 0); -+ fatal(B_FALSE, "couldn't import pool %s as %s: error %u", -+ oldname, newname, error); -+ } - -@@ -5408,3 +5497,3 @@ ztest_resume_thread(void 
*arg) - --#define GRACE 300 -+#define GRACE 300 - -@@ -5559,3 +5648,3 @@ ztest_dataset_open(int d) - -- VERIFY0(dmu_objset_hold(name, zd, &os)); -+ VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os)); - (void) rw_exit(&ztest_name_lock); -@@ -5600,3 +5689,3 @@ ztest_dataset_close(int d) - zil_close(zd->zd_zilog); -- dmu_objset_rele(zd->zd_os, zd); -+ dmu_objset_disown(zd->zd_os, zd); - -@@ -5646,3 +5735,3 @@ ztest_run(ztest_shared_t *zs) - kernel_init(FREAD | FWRITE); -- VERIFY(spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0); -+ VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); - spa->spa_debug = B_TRUE; -@@ -5650,5 +5739,6 @@ ztest_run(ztest_shared_t *zs) - -- VERIFY3U(0, ==, dmu_objset_hold(ztest_opts.zo_pool, FTAG, &os)); -+ VERIFY0(dmu_objset_own(ztest_opts.zo_pool, -+ DMU_OST_ANY, B_TRUE, FTAG, &os)); - zs->zs_guid = dmu_objset_fsid_guid(os); -- dmu_objset_rele(os, FTAG); -+ dmu_objset_disown(os, FTAG); - -@@ -5743,2 +5833,5 @@ ztest_run(ztest_shared_t *zs) - -+ if (ztest_opts.zo_verbose >= 3) -+ zfs_dbgmsg_print(FTAG); -+ - umem_free(tid, ztest_opts.zo_threads * sizeof (kt_did_t)); -@@ -5805,2 +5898,4 @@ ztest_freeze(void) - VERIFY3U(0, ==, ztest_dataset_open(0)); -+ spa->spa_debug = B_TRUE; -+ ztest_spa = spa; - -@@ -5940,5 +6035,5 @@ ztest_init(ztest_shared_t *zs) - } -- VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, -- NULL, NULL)); -+ VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL)); - nvlist_free(nvroot); -+ nvlist_free(props); - -@@ -5993,3 +6088,3 @@ setup_hdr(void) - PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); -- ASSERT(hdr != MAP_FAILED); -+ VERIFY3P(hdr, !=, MAP_FAILED); - -@@ -6020,3 +6115,3 @@ setup_data(void) - PROT_READ, MAP_SHARED, ztest_fd_data, 0); -- ASSERT(hdr != MAP_FAILED); -+ VERIFY3P(hdr, !=, MAP_FAILED); - -@@ -6027,3 +6122,3 @@ setup_data(void) - PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); -- ASSERT(hdr != MAP_FAILED); -+ VERIFY3P(hdr, !=, MAP_FAILED); - buf = (uint8_t *)hdr; -@@ -6155,2 +6250,4 @@ main(int argc, char **argv) - -+ dprintf_setup(&argc, argv); -+ - ztest_fd_rand = open("/dev/urandom", O_RDONLY); -@@ -6159,3 +6256,2 @@ main(int argc, char **argv) - if (!fd_data_str) { -- dprintf_setup(&argc, argv); - process_options(argc, argv); -diff --git a/cmd/zvol_id/zvol_id_main.c b/cmd/zvol_id/zvol_id_main.c -index 018bb66..d9c80b3 100644 ---- a/cmd/zvol_id/zvol_id_main.c -+++ b/cmd/zvol_id/zvol_id_main.c -@@ -35,3 +35,4 @@ - --int ioctl_get_msg(char *var, int fd) -+static int -+ioctl_get_msg(char *var, int fd) - { -@@ -49,3 +50,4 @@ int ioctl_get_msg(char *var, int fd) - --int main(int argc, char **argv) -+int -+main(int argc, char **argv) - { -diff --git a/config/Rules.am b/config/Rules.am -index e3fa5b5..4fb40c4 100644 ---- a/config/Rules.am -+++ b/config/Rules.am -@@ -3,4 +3,6 @@ DEFAULT_INCLUDES = -include ${top_builddir}/zfs_config.h - AM_LIBTOOLFLAGS = --silent --AM_CFLAGS = -Wall -Wstrict-prototypes --AM_CFLAGS += -fno-strict-aliasing ${NO_UNUSED_BUT_SET_VARIABLE} ${DEBUG_CFLAGS} -+AM_CFLAGS = ${DEBUG_CFLAGS} -Wall -Wstrict-prototypes -+AM_CFLAGS += ${NO_UNUSED_BUT_SET_VARIABLE} -+AM_CFLAGS += ${NO_AGGRESSIVE_LOOP_OPTIMIZATIONS} -+AM_CFLAGS += -fno-strict-aliasing - AM_CPPFLAGS = -D_GNU_SOURCE -D__EXTENSIONS__ -D_REENTRANT -@@ -8 +10,5 @@ AM_CPPFLAGS += -D_POSIX_PTHREAD_SEMANTICS -D_FILE_OFFSET_BITS=64 - AM_CPPFLAGS += -D_LARGEFILE64_SOURCE -DTEXT_DOMAIN=\"zfs-linux-user\" -+AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" -+AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" -+AM_CPPFLAGS 
+= -DSBINDIR=\"$(sbindir)\" -+AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" -diff --git a/config/always-no-aggressive-loop-optimizations.m4 b/config/always-no-aggressive-loop-optimizations.m4 -new file mode 100644 -index 0000000..0a5576d ---- /dev/null -+++ b/config/always-no-aggressive-loop-optimizations.m4 -@@ -0,0 +1,20 @@ -+dnl # -+dnl # Check if gcc supports -fno-aggressive-loop-optimizations -+dnl # -+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_NO_AGGRESSIVE_LOOP_OPTIMIZATIONS], [ -+ AC_MSG_CHECKING([for -fno-aggressive-loop-optimizations support]) -+ -+ saved_flags="$CFLAGS" -+ CFLAGS="$CFLAGS -fno-aggressive-loop-optimizations" -+ -+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ -+ NO_AGGRESSIVE_LOOP_OPTIMIZATIONS=-fno-aggressive-loop-optimizations -+ AC_MSG_RESULT([yes]) -+ ], [ -+ NO_AGGRESSIVE_LOOP_OPTIMIZATIONS= -+ AC_MSG_RESULT([no]) -+ ]) -+ -+ CFLAGS="$saved_flags" -+ AC_SUBST([NO_AGGRESSIVE_LOOP_OPTIMIZATIONS]) -+]) -diff --git a/config/always-no-unused-but-set-variable.m4 b/config/always-no-unused-but-set-variable.m4 -index 4a3ceb6..863c90a 100644 ---- a/config/always-no-unused-but-set-variable.m4 -+++ b/config/always-no-unused-but-set-variable.m4 -@@ -14,3 +14,3 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_NO_UNUSED_BUT_SET_VARIABLE], [ - -- AC_RUN_IFELSE([AC_LANG_PROGRAM([], [])], -+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], - [ -diff --git a/config/kernel-acl.m4 b/config/kernel-acl.m4 -new file mode 100644 -index 0000000..a03ee5b ---- /dev/null -+++ b/config/kernel-acl.m4 -@@ -0,0 +1,265 @@ -+dnl # -+dnl # Check if posix_acl_release can be used from a CDDL module, -+dnl # The is_owner_or_cap macro was replaced by -+dnl # inode_owner_or_capable -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_RELEASE], [ -+ AC_MSG_CHECKING([whether posix_acl_release() is available]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ #include -+ ],[ -+ struct posix_acl* tmp = posix_acl_alloc(1, 0); -+ posix_acl_release(tmp); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_RELEASE, 1, -+ [posix_acl_release() is available]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+ -+ AC_MSG_CHECKING([whether posix_acl_release() is GPL-only]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ #include -+ -+ MODULE_LICENSE("CDDL"); -+ ],[ -+ struct posix_acl* tmp = posix_acl_alloc(1, 0); -+ posix_acl_release(tmp); -+ ],[ -+ AC_MSG_RESULT(no) -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_RELEASE_GPL_ONLY, 1, -+ [posix_acl_release() is GPL-only]) -+ ]) -+]) -+ -+dnl # -+dnl # 3.1 API change, -+dnl # posix_acl_chmod_masq() is not exported anymore and posix_acl_chmod() -+dnl # was introduced to replace it. -+dnl # -+dnl # 3.14 API change, -+dnl # posix_acl_chmod() is changed to __posix_acl_chmod() -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_CHMOD], [ -+ AC_MSG_CHECKING([whether posix_acl_chmod exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ ],[ -+ posix_acl_chmod(NULL, 0, 0) -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_CHMOD, 1, [posix_acl_chmod() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+ -+ AC_MSG_CHECKING([whether __posix_acl_chmod exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ ],[ -+ __posix_acl_chmod(NULL, 0, 0) -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE___POSIX_ACL_CHMOD, 1, [__posix_acl_chmod() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.30 API change, -+dnl # caching of ACL into the inode was added in this version. 
-+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_CACHING], [ -+ AC_MSG_CHECKING([whether inode has i_acl and i_default_acl]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ struct inode ino; -+ ino.i_acl = NULL; -+ ino.i_default_acl = NULL; -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_CACHING, 1, -+ [inode contains i_acl and i_default_acl]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 3.1 API change, -+dnl # posix_acl_equiv_mode now wants an umode_t* instead of a mode_t* -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T], [ -+ AC_MSG_CHECKING([whether posix_acl_equiv_mode() wants umode_t]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ ],[ -+ umode_t tmp; -+ posix_acl_equiv_mode(NULL,&tmp); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T, 1, -+ [ posix_acl_equiv_mode wants umode_t*]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.27 API change, -+dnl # Check if inode_operations contains the function permission -+dnl # and expects the nameidata structure to have been removed. -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION], [ -+ AC_MSG_CHECKING([whether iops->permission() exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ int permission_fn(struct inode *inode, int mask) { return 0; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .permission = permission_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_PERMISSION, 1, [iops->permission() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.26 API change, -+dnl # Check if inode_operations contains the function permission -+dnl # and expects the nameidata structure to be passed. -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION_WITH_NAMEIDATA], [ -+ AC_MSG_CHECKING([whether iops->permission() wants nameidata]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ int permission_fn(struct inode *inode, int mask, -+ struct nameidata *nd) { return 0; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .permission = permission_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_PERMISSION, 1, [iops->permission() exists]) -+ AC_DEFINE(HAVE_PERMISSION_WITH_NAMEIDATA, 1, -+ [iops->permission() with nameidata exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.32 API change, -+dnl # Check if inode_operations contains the function check_acl -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL], [ -+ AC_MSG_CHECKING([whether iops->check_acl() exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ int check_acl_fn(struct inode *inode, int mask) { return 0; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .check_acl = check_acl_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_CHECK_ACL, 1, [iops->check_acl() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.38 API change, -+dnl # The function check_acl gained a new parameter: flags -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL_WITH_FLAGS], [ -+ AC_MSG_CHECKING([whether iops->check_acl() wants flags]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ int check_acl_fn(struct inode *inode, int mask, -+ unsigned int flags) { return 0; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .check_acl = check_acl_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_CHECK_ACL, 1, [iops->check_acl() exists]) -+ 
AC_DEFINE(HAVE_CHECK_ACL_WITH_FLAGS, 1, -+ [iops->check_acl() wants flags]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 3.1 API change, -+dnl # Check if inode_operations contains the function get_acl -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL], [ -+ AC_MSG_CHECKING([whether iops->get_acl() exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ struct posix_acl *get_acl_fn(struct inode *inode, int type) -+ { return NULL; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .get_acl = get_acl_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_GET_ACL, 1, [iops->get_acl() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.30 API change, -+dnl # current_umask exists only since this version. -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_CURRENT_UMASK], [ -+ AC_MSG_CHECKING([whether current_umask exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ current_umask(); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_CURRENT_UMASK, 1, [current_umask() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -diff --git a/config/kernel-bdi-setup-and-register.m4 b/config/kernel-bdi-setup-and-register.m4 -index 4196091..6369409 100644 ---- a/config/kernel-bdi-setup-and-register.m4 -+++ b/config/kernel-bdi-setup-and-register.m4 -@@ -2,3 +2,3 @@ dnl # - dnl # 2.6.34 API change --dnl # The bdi_setup_and_register() helper function is avilable and -+dnl # The bdi_setup_and_register() helper function is avaliable and - dnl # exported by the kernel. This is a trivial helper function but -@@ -12,3 +12,4 @@ AC_DEFUN([ZFS_AC_KERNEL_BDI_SETUP_AND_REGISTER], - ], [ -- bdi_setup_and_register(NULL, NULL, 0); -+ int r = bdi_setup_and_register(NULL, NULL, 0); -+ r = *(&r); - ], [bdi_setup_and_register], [mm/backing-dev.c], [ -diff --git a/config/kernel-bdi.m4 b/config/kernel-bdi.m4 -index 34ffaab..00bd375 100644 ---- a/config/kernel-bdi.m4 -+++ b/config/kernel-bdi.m4 -@@ -8,5 +8,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BDI], [ - #include -+ -+ static const struct super_block -+ sb __attribute__ ((unused)) = { -+ .s_bdi = NULL, -+ }; - ],[ -- struct super_block sb __attribute__ ((unused)); -- sb.s_bdi = NULL; - ],[ -diff --git a/config/kernel-bio-bvec-iter.m4 b/config/kernel-bio-bvec-iter.m4 -new file mode 100644 -index 0000000..64c9893 ---- /dev/null -+++ b/config/kernel-bio-bvec-iter.m4 -@@ -0,0 +1,20 @@ -+dnl # -+dnl # 3.14 API change, -+dnl # Immutable biovecs. A number of fields of struct bio are moved to -+dnl # struct bvec_iter. 
-+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_BIO_BVEC_ITER], [ -+ AC_MSG_CHECKING([whether bio has bi_iter]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ struct bio bio; -+ bio.bi_iter.bi_sector = 0; -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_BIO_BVEC_ITER, 1, [bio has bi_iter]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -diff --git a/config/kernel-rq-for-each_segment.m4 b/config/kernel-rq-for-each_segment.m4 -index 449168d..84ce7d1 100644 ---- a/config/kernel-rq-for-each_segment.m4 -+++ b/config/kernel-rq-for-each_segment.m4 -@@ -3,6 +3,9 @@ dnl # 2.6.x API change - dnl # -+dnl # 3.14 API change -+dnl # - AC_DEFUN([ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT], [ -- AC_MSG_CHECKING([whether rq_for_each_segment() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" -+ -+ AC_MSG_CHECKING([whether rq_for_each_segment() wants bio_vec *]) - ZFS_LINUX_TRY_COMPILE([ -@@ -18,2 +21,22 @@ AC_DEFUN([ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT], [ - [rq_for_each_segment() is available]) -+ AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT_BVP, 1, -+ [rq_for_each_segment() wants bio_vec *]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+ -+ AC_MSG_CHECKING([whether rq_for_each_segment() wants bio_vec]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ struct bio_vec bv; -+ struct req_iterator iter; -+ struct request *req = NULL; -+ rq_for_each_segment(bv, req, iter) { } -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT, 1, -+ [rq_for_each_segment() is available]) -+ AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT_BV, 1, -+ [rq_for_each_segment() wants bio_vec]) - ],[ -@@ -21,2 +44,3 @@ AC_DEFUN([ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT], [ - ]) -+ - EXTRA_KCFLAGS="$tmp_flags" -diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 -index 325c960..2ba2fcb 100644 ---- a/config/kernel-xattr-handler.m4 -+++ b/config/kernel-xattr-handler.m4 -@@ -20,6 +20,7 @@ AC_DEFUN([ZFS_AC_KERNEL_CONST_XATTR_HANDLER], - }; -- ],[ -- struct super_block sb __attribute__ ((unused)); - -- sb.s_xattr = xattr_handlers; -+ const struct super_block sb __attribute__ ((unused)) = { -+ .s_xattr = xattr_handlers, -+ }; -+ ],[ - ],[ -@@ -42,8 +43,10 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ - #include -- ],[ -- int (*get)(struct dentry *dentry, const char *name, -- void *buffer, size_t size, int handler_flags) = NULL; -- struct xattr_handler xops __attribute__ ((unused)); - -- xops.get = get; -+ int get(struct dentry *dentry, const char *name, -+ void *buffer, size_t size, int handler_flags) { return 0; } -+ static const struct xattr_handler -+ xops __attribute__ ((unused)) = { -+ .get = get, -+ }; -+ ],[ - ],[ -@@ -66,9 +69,11 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ - #include -- ],[ -- int (*set)(struct dentry *dentry, const char *name, -- const void *buffer, size_t size, int flags, -- int handler_flags) = NULL; -- struct xattr_handler xops __attribute__ ((unused)); - -- xops.set = set; -+ int set(struct dentry *dentry, const char *name, -+ const void *buffer, size_t size, int flags, -+ int handler_flags) { return 0; } -+ static const struct xattr_handler -+ xops __attribute__ ((unused)) = { -+ .set = set, -+ }; -+ ],[ - ],[ -@@ -81 +86,70 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ - ]) -+ -+dnl # -+dnl # 2.6.33 API change, -+dnl # The xattr_hander->list() callback was changed to take a dentry -+dnl # instead of an inode, and a handler_flags argument was added. 
-+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [ -+ AC_MSG_CHECKING([whether xattr_handler->list() wants dentry]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ size_t list(struct dentry *dentry, char *list, size_t list_size, -+ const char *name, size_t name_len, int handler_flags) -+ { return 0; } -+ static const struct xattr_handler -+ xops __attribute__ ((unused)) = { -+ .list = list, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_DENTRY_XATTR_LIST, 1, -+ [xattr_handler->list() wants dentry]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 3.7 API change, -+dnl # The posix_acl_{from,to}_xattr functions gained a new -+dnl # parameter: user_ns -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS], [ -+ AC_MSG_CHECKING([whether posix_acl_from_xattr() needs user_ns]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ #include -+ ],[ -+ posix_acl_from_xattr(&init_user_ns, NULL, 0); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_FROM_XATTR_USERNS, 1, -+ [posix_acl_from_xattr() needs user_ns]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.39 API change, -+dnl # The is_owner_or_cap() macro was replaced by inode_owner_or_capable(), -+dnl # this is used for permission checks in the xattr call paths. -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE], [ -+ AC_MSG_CHECKING([whether inode_owner_or_capable() exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ inode_owner_or_capable(NULL); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_INODE_OWNER_OR_CAPABLE, 1, -+ [inode_owner_or_capable() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -diff --git a/config/kernel.m4 b/config/kernel.m4 -index 74ce22c..2557033 100644 ---- a/config/kernel.m4 -+++ b/config/kernel.m4 -@@ -19,2 +19,3 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ - ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE -+ ZFS_AC_KERNEL_BIO_BVEC_ITER - ZFS_AC_KERNEL_BIO_FAILFAST -@@ -47,2 +48,15 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ - ZFS_AC_KERNEL_XATTR_HANDLER_SET -+ ZFS_AC_KERNEL_XATTR_HANDLER_LIST -+ ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE -+ ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS -+ ZFS_AC_KERNEL_POSIX_ACL_RELEASE -+ ZFS_AC_KERNEL_POSIX_ACL_CHMOD -+ ZFS_AC_KERNEL_POSIX_ACL_CACHING -+ ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T -+ ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION -+ ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION_WITH_NAMEIDATA -+ ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL -+ ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL_WITH_FLAGS -+ ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL -+ ZFS_AC_KERNEL_CURRENT_UMASK - ZFS_AC_KERNEL_SHOW_OPTIONS -@@ -93,2 +107,3 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ - KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_UNUSED_BUT_SET_VARIABLE" -+ KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_AGGRESSIVE_LOOP_OPTIMIZATIONS" - KERNELCPPFLAGS="$KERNELCPPFLAGS -DHAVE_SPL -D_KERNEL" -diff --git a/config/user-frame-larger-than.m4 b/config/user-frame-larger-than.m4 -index 7ad8622..e0828ec 100644 ---- a/config/user-frame-larger-than.m4 -+++ b/config/user-frame-larger-than.m4 -@@ -9,3 +9,3 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_FRAME_LARGER_THAN], [ - -- AC_RUN_IFELSE([AC_LANG_PROGRAM([], [])], -+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], - [ -diff --git a/config/user-libblkid.m4 b/config/user-libblkid.m4 -index 276587f..2dd2623 100644 ---- a/config/user-libblkid.m4 -+++ b/config/user-libblkid.m4 -@@ -24,3 +24,10 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - LIBBLKID= -- AS_IF([test "x$with_blkid" != xno], -+ AS_IF([test "x$with_blkid" = xyes], -+ [ -+ AC_SUBST([LIBBLKID], ["-lblkid"]) -+ 
AC_DEFINE([HAVE_LIBBLKID], 1, -+ [Define if you have libblkid]) -+ ]) -+ -+ AS_IF([test "x$with_blkid" = xcheck], - [ -@@ -31,3 +38,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - ZFS_DEV=`mktemp` -- dd if=/dev/zero of=$ZFS_DEV bs=1024k count=8 \ -+ truncate -s 64M $ZFS_DEV -+ echo -en "\x0c\xb1\xba\0\0\0\0\0" | \ -+ dd of=$ZFS_DEV bs=1k count=8 \ -+ seek=128 conv=notrunc &>/dev/null \ - >/dev/null 2>/dev/null -@@ -37,5 +47,13 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - >/dev/null 2>/dev/null -+ echo -en "\x0c\xb1\xba\0\0\0\0\0" | \ -+ dd of=$ZFS_DEV bs=1k count=8 \ -+ seek=136 conv=notrunc &>/dev/null \ -+ >/dev/null 2>/dev/null -+ echo -en "\x0c\xb1\xba\0\0\0\0\0" | \ -+ dd of=$ZFS_DEV bs=1k count=8 \ -+ seek=140 conv=notrunc &>/dev/null \ -+ >/dev/null 2>/dev/null - -- saved_LDFLAGS="$LDFLAGS" -- LDFLAGS="-lblkid" -+ saved_LIBS="$LIBS" -+ LIBS="-lblkid" - -@@ -44,2 +62,3 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - #include -+ #include - #include -@@ -60,6 +79,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - -- if (strcmp(value, "zfs")) { -+ if (strcmp(value, "zfs_member")) { - free(value); - blkid_put_cache(cache); -- return 3; -+ return 0; - } -@@ -84,3 +103,3 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - -- LDFLAGS="$saved_LDFLAGS" -+ LIBS="$saved_LIBS" - ], -diff --git a/config/user-runstatedir.m4 b/config/user-runstatedir.m4 -new file mode 100644 -index 0000000..ded1362 ---- /dev/null -+++ b/config/user-runstatedir.m4 -@@ -0,0 +1,6 @@ -+dnl For backwards compatibility; runstatedir added in autoconf 2.70. -+AC_DEFUN([ZFS_AC_CONFIG_USER_RUNSTATEDIR], [ -+ if test "x$runstatedir" = x; then -+ AC_SUBST([runstatedir], ['${localstatedir}/run']) -+ fi -+]) -diff --git a/config/user-selinux.m4 b/config/user-selinux.m4 -deleted file mode 100644 -index 84df6ce..0000000 ---- a/config/user-selinux.m4 -+++ /dev/null -@@ -1,36 +0,0 @@ --dnl # --dnl # Check to see if the selinux libraries are available. If they --dnl # are then they will be consulted during mount to determine if --dnl # selinux is enabled or disabled. 
--dnl # --AC_DEFUN([ZFS_AC_CONFIG_USER_LIBSELINUX], [ -- AC_ARG_WITH([selinux], -- [AS_HELP_STRING([--with-selinux], -- [support selinux @<:@default=check@:>@])], -- [], -- [with_selinux=check]) -- -- LIBSELINUX= -- AS_IF([test "x$with_selinux" != xno], [ -- AC_CHECK_HEADER([selinux/selinux.h], [ -- AC_CHECK_LIB([selinux], [is_selinux_enabled], [ -- AC_SUBST([LIBSELINUX], ["-lselinux"]) -- AC_DEFINE([HAVE_LIBSELINUX], 1, -- [Define if you have selinux]) -- ], [ -- AS_IF([test "x$with_selinux" != xcheck], -- [AC_MSG_FAILURE( -- [--with-selinux given but unavailable]) -- ]) -- ]) -- ], [ -- AS_IF([test "x$with_selinux" != xcheck], -- [AC_MSG_FAILURE( -- [--with-selinux given but unavailable]) -- ]) -- ]) -- ], [ -- AC_MSG_CHECKING([for selinux support]) -- AC_MSG_RESULT([no]) -- ]) --]) -diff --git a/config/user-systemd.m4 b/config/user-systemd.m4 -new file mode 100644 -index 0000000..5988945 ---- /dev/null -+++ b/config/user-systemd.m4 -@@ -0,0 +1,29 @@ -+AC_DEFUN([ZFS_AC_CONFIG_USER_SYSTEMD], [ -+ AC_ARG_ENABLE(systemd, -+ AC_HELP_STRING([--enable-systemd], -+ [install systemd unit/preset files [[default: yes]]]), -+ [],enable_systemd=yes) -+ -+ AC_ARG_WITH(systemdunitdir, -+ AC_HELP_STRING([--with-systemdunitdir=DIR], -+ [install systemd unit files in dir [[/usr/lib/systemd/system]]]), -+ systemdunitdir=$withval,systemdunitdir=/usr/lib/systemd/system) -+ -+ AC_ARG_WITH(systemdpresetdir, -+ AC_HELP_STRING([--with-systemdpresetdir=DIR], -+ [install systemd preset files in dir [[/usr/lib/systemd/system-preset]]]), -+ systemdpresetdir=$withval,systemdpresetdir=/usr/lib/systemd/system-preset) -+ -+ AS_IF([test "x$enable_systemd" = xyes], -+ [ -+ ZFS_INIT_SYSTEMD=systemd -+ ZFS_MODULE_LOAD=modules-load.d -+ modulesloaddir=/usr/lib/modules-load.d -+ ]) -+ -+ AC_SUBST(ZFS_INIT_SYSTEMD) -+ AC_SUBST(ZFS_MODULE_LOAD) -+ AC_SUBST(systemdunitdir) -+ AC_SUBST(systemdpresetdir) -+ AC_SUBST(modulesloaddir) -+]) -diff --git a/config/user-sysvinit.m4 b/config/user-sysvinit.m4 -new file mode 100644 -index 0000000..65dcc38 ---- /dev/null -+++ b/config/user-sysvinit.m4 -@@ -0,0 +1,11 @@ -+AC_DEFUN([ZFS_AC_CONFIG_USER_SYSVINIT], [ -+ AC_ARG_ENABLE(sysvinit, -+ AC_HELP_STRING([--enable-sysvinit], -+ [install SysV init scripts [default: yes]]), -+ [],enable_sysvinit=yes) -+ -+ AS_IF([test "x$enable_sysvinit" = xyes], -+ [ZFS_INIT_SYSV=init.d]) -+ -+ AC_SUBST(ZFS_INIT_SYSV) -+]) -diff --git a/config/user.m4 b/config/user.m4 -index 6925e56..3802437 100644 ---- a/config/user.m4 -+++ b/config/user.m4 -@@ -5,2 +5,4 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ - ZFS_AC_CONFIG_USER_UDEV -+ ZFS_AC_CONFIG_USER_SYSTEMD -+ ZFS_AC_CONFIG_USER_SYSVINIT - ZFS_AC_CONFIG_USER_DRACUT -@@ -11,4 +13,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ - ZFS_AC_CONFIG_USER_LIBBLKID -- ZFS_AC_CONFIG_USER_LIBSELINUX - ZFS_AC_CONFIG_USER_FRAME_LARGER_THAN -+ ZFS_AC_CONFIG_USER_RUNSTATEDIR -+dnl # -+dnl # Checks for library functions -+ AC_CHECK_FUNCS([mlockall]) - ]) -diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 -index 005185b..477b916 100644 ---- a/config/zfs-build.m4 -+++ b/config/zfs-build.m4 -@@ -64,2 +64,3 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ - ZFS_AC_CONFIG_ALWAYS_NO_UNUSED_BUT_SET_VARIABLE -+ ZFS_AC_CONFIG_ALWAYS_NO_AGGRESSIVE_LOOP_OPTIMIZATIONS - ]) -diff --git a/configure.ac b/configure.ac -index 58e2158..66272fd 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -66,2 +66,5 @@ AC_CONFIG_FILES([ - etc/zfs/Makefile -+ etc/systemd/Makefile -+ etc/systemd/system/Makefile -+ etc/modules-load.d/Makefile - man/Makefile -@@ -90,2 +93,3 @@ 
AC_CONFIG_FILES([ - lib/libzfs/Makefile -+ lib/libzfs_core/Makefile - lib/libshare/Makefile -@@ -105,2 +109,4 @@ AC_CONFIG_FILES([ - cmd/arcstat/Makefile -+ cmd/dbufstat/Makefile -+ cmd/zed/Makefile - module/Makefile -diff --git a/etc/Makefile.am b/etc/Makefile.am -index 65882b5..a62678b 100644 ---- a/etc/Makefile.am -+++ b/etc/Makefile.am -@@ -1 +1,2 @@ --SUBDIRS = init.d zfs -+SUBDIRS = zfs $(ZFS_INIT_SYSTEMD) $(ZFS_INIT_SYSV) $(ZFS_MODULE_LOAD) -+DIST_SUBDIRS = init.d zfs systemd modules-load.d -diff --git a/etc/init.d/zfs.fedora.in b/etc/init.d/zfs.fedora.in -index 3cece9b..1786378 100644 ---- a/etc/init.d/zfs.fedora.in -+++ b/etc/init.d/zfs.fedora.in -@@ -29,3 +29,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -diff --git a/etc/init.d/zfs.gentoo.in b/etc/init.d/zfs.gentoo.in -index 0034e02..07fce01 100644 ---- a/etc/init.d/zfs.gentoo.in -+++ b/etc/init.d/zfs.gentoo.in -@@ -7,3 +7,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -@@ -22,2 +22,3 @@ depend() - before bootmisc logger -+ use mtab - keyword -lxc -openvz -prefix -vserver -diff --git a/etc/init.d/zfs.lsb.in b/etc/init.d/zfs.lsb.in -index 0d0ffb4..e626f79 100644 ---- a/etc/init.d/zfs.lsb.in -+++ b/etc/init.d/zfs.lsb.in -@@ -31,2 +31,6 @@ ZPOOL="@sbindir@/zpool" - ZPOOL_CACHE="@sysconfdir@/zfs/zpool.cache" -+USE_DISK_BY_ID=0 -+VERBOSE_MOUNT=0 -+DO_OVERLAY_MOUNTS=0 -+MOUNT_EXTRA_OPTIONS="" - -@@ -40,3 +44,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -@@ -71,9 +75,27 @@ start() - # all filesystem based on their properties. -- if [ -f "$ZPOOL_CACHE" ] ; then -+ if [ "$USE_DISK_BY_ID" -eq 1 ]; then -+ log_begin_msg "Importing ZFS pools" -+ "$ZPOOL" import -d /dev/disk/by-id -aN 2>/dev/null -+ ret=$? -+ log_end_msg $ret -+ [ "$ret" -eq 0 ] && POOL_IMPORTED=1 -+ elif [ -f "$ZPOOL_CACHE" ] ; then - log_begin_msg "Importing ZFS pools" - "$ZPOOL" import -c "$ZPOOL_CACHE" -aN 2>/dev/null -- log_end_msg $? -+ ret=$? -+ log_end_msg $ret -+ [ "$ret" -eq 0 ] && POOL_IMPORTED=1 -+ fi -+ -+ if [ -n "$POOL_IMPORTED" ]; then -+ if [ "$VERBOSE_MOUNT" -eq 1 ]; then -+ verbose=v -+ fi -+ -+ if [ "$DO_OVERLAY_MOUNTS" -eq 1 ]; then -+ overlay=O -+ fi - - log_begin_msg "Mounting ZFS filesystems" -- "$ZFS" mount -a -+ "$ZFS" mount -a$verbose$overlay$MOUNT_EXTRA_OPTIONS - log_end_msg $? -@@ -92,2 +114,6 @@ stop() - -+ log_begin_msg "Unsharing ZFS filesystems" -+ "$ZFS" unshare -a -+ log_end_msg $? -+ - log_begin_msg "Unmounting ZFS filesystems" -@@ -96,2 +122,9 @@ stop() - -+ log_begin_msg "Exporting ZFS pools" -+ "$ZPOOL" list -H -o name | \ -+ while read pool; do -+ "$ZPOOL" export $pool -+ done -+ log_end_msg $? 
-+ - rm -f "$LOCKFILE" -diff --git a/etc/init.d/zfs.lunar.in b/etc/init.d/zfs.lunar.in -index 3cf79ce..7a51104 100644 ---- a/etc/init.d/zfs.lunar.in -+++ b/etc/init.d/zfs.lunar.in -@@ -18,3 +18,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -diff --git a/etc/init.d/zfs.redhat.in b/etc/init.d/zfs.redhat.in -index fb5187f..227787d 100644 ---- a/etc/init.d/zfs.redhat.in -+++ b/etc/init.d/zfs.redhat.in -@@ -29,3 +29,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -diff --git a/etc/modules-load.d/.gitignore b/etc/modules-load.d/.gitignore -new file mode 100644 -index 0000000..fee9217 ---- /dev/null -+++ b/etc/modules-load.d/.gitignore -@@ -0,0 +1 @@ -+*.conf -diff --git a/etc/modules-load.d/Makefile.am b/etc/modules-load.d/Makefile.am -new file mode 100644 -index 0000000..980cb85 ---- /dev/null -+++ b/etc/modules-load.d/Makefile.am -@@ -0,0 +1,13 @@ -+modulesload_DATA = \ -+ $(top_srcdir)/etc/modules-load.d/zfs.conf -+ -+EXTRA_DIST = \ -+ $(top_srcdir)/etc/modules-load.d/zfs.conf.in -+ -+$(modulesload_DATA): -+ -$(SED) \ -+ -e '' \ -+ '$@.in' >'$@' -+ -+distclean-local:: -+ -$(RM) $(modulesload_DATA) -diff --git a/etc/modules-load.d/zfs.conf.in b/etc/modules-load.d/zfs.conf.in -new file mode 100644 -index 0000000..73304bc ---- /dev/null -+++ b/etc/modules-load.d/zfs.conf.in -@@ -0,0 +1 @@ -+zfs -diff --git a/etc/systemd/Makefile.am b/etc/systemd/Makefile.am -new file mode 100644 -index 0000000..d4008c0 ---- /dev/null -+++ b/etc/systemd/Makefile.am -@@ -0,0 +1 @@ -+SUBDIRS = system -diff --git a/etc/systemd/system/.gitignore b/etc/systemd/system/.gitignore -new file mode 100644 -index 0000000..efada54 ---- /dev/null -+++ b/etc/systemd/system/.gitignore -@@ -0,0 +1,3 @@ -+*.service -+*.target -+*.preset -diff --git a/etc/systemd/system/50-zfs.preset.in b/etc/systemd/system/50-zfs.preset.in -new file mode 100644 -index 0000000..4efdd72 ---- /dev/null -+++ b/etc/systemd/system/50-zfs.preset.in -@@ -0,0 +1,2 @@ -+# ZFS is enabled by default -+enable zfs.* -diff --git a/etc/systemd/system/Makefile.am b/etc/systemd/system/Makefile.am -new file mode 100644 -index 0000000..b7a8db2 ---- /dev/null -+++ b/etc/systemd/system/Makefile.am -@@ -0,0 +1,35 @@ -+systemdpreset_DATA = \ -+ $(top_srcdir)/etc/systemd/system/50-zfs.preset -+systemdunit_DATA = \ -+ $(top_srcdir)/etc/systemd/system/zed.service \ -+ $(top_srcdir)/etc/systemd/system/zfs-import-cache.service \ -+ $(top_srcdir)/etc/systemd/system/zfs-import-scan.service \ -+ $(top_srcdir)/etc/systemd/system/zfs-mount.service \ -+ $(top_srcdir)/etc/systemd/system/zfs-share.service \ -+ $(top_srcdir)/etc/systemd/system/zfs.target -+ -+EXTRA_DIST = \ -+ $(top_srcdir)/etc/systemd/system/zed.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs-import-cache.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs-import-scan.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs-mount.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs-share.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs.target.in \ -+ $(top_srcdir)/etc/systemd/system/50-zfs.preset.in -+ -+$(systemdunit_DATA): -+ -$(SED) -e 's,@bindir\@,$(bindir),g' \ -+ -e 's,@runstatedir\@,$(runstatedir),g' \ -+ -e 's,@sbindir\@,$(sbindir),g' \ -+ -e 's,@sysconfdir\@,$(sysconfdir),g' \ -+ '$@.in' >'$@' -+ -+$(systemdpreset_DATA): -+ -$(SED) -e 
's,@bindir\@,$(bindir),g' \ -+ -e 's,@runstatedir\@,$(runstatedir),g' \ -+ -e 's,@sbindir\@,$(sbindir),g' \ -+ -e 's,@sysconfdir\@,$(sysconfdir),g' \ -+ '$@.in' >'$@' -+ -+distclean-local:: -+ -$(RM) $(systemdunit_DATA) $(systemdpreset_DATA) -diff --git a/etc/systemd/system/zed.service.in b/etc/systemd/system/zed.service.in -new file mode 100644 -index 0000000..78988ab ---- /dev/null -+++ b/etc/systemd/system/zed.service.in -@@ -0,0 +1,13 @@ -+[Unit] -+Description=ZFS Event Daemon (zed) -+Documentation=man:zed(8) -+After=zfs-import-cache.service -+After=zfs-import-scan.service -+ -+[Service] -+Type=forking -+ExecStart=@sbindir@/zed -+PIDFile=@runstatedir@/zed.pid -+User=root -+Group=root -+Restart=on-abort -diff --git a/etc/systemd/system/zfs-import-cache.service.in b/etc/systemd/system/zfs-import-cache.service.in -new file mode 100644 -index 0000000..918a258 ---- /dev/null -+++ b/etc/systemd/system/zfs-import-cache.service.in -@@ -0,0 +1,11 @@ -+[Unit] -+Description=Import ZFS pools by cache file -+DefaultDependencies=no -+Requires=systemd-udev-settle.service -+After=systemd-udev-settle.service -+ConditionPathExists=@sysconfdir@/zfs/zpool.cache -+ -+[Service] -+Type=oneshot -+RemainAfterExit=yes -+ExecStart=@sbindir@/zpool import -c @sysconfdir@/zfs/zpool.cache -aN -diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in -new file mode 100644 -index 0000000..ab1b0f6 ---- /dev/null -+++ b/etc/systemd/system/zfs-import-scan.service.in -@@ -0,0 +1,11 @@ -+[Unit] -+Description=Import ZFS pools by device scanning -+DefaultDependencies=no -+Requires=systemd-udev-settle.service -+After=systemd-udev-settle.service -+ConditionPathExists=!@sysconfdir@/zfs/zpool.cache -+ -+[Service] -+Type=oneshot -+RemainAfterExit=yes -+ExecStart=@sbindir@/zpool import -d /dev/disk/by-id -aN -diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in -new file mode 100644 -index 0000000..f1056af ---- /dev/null -+++ b/etc/systemd/system/zfs-mount.service.in -@@ -0,0 +1,15 @@ -+[Unit] -+Description=Mount ZFS filesystems -+DefaultDependencies=no -+Wants=zfs-import-cache.service -+Wants=zfs-import-scan.service -+Requires=systemd-udev-settle.service -+After=systemd-udev-settle.service -+After=zfs-import-cache.service -+After=zfs-import-scan.service -+Before=local-fs.target -+ -+[Service] -+Type=oneshot -+RemainAfterExit=yes -+ExecStart=@sbindir@/zfs mount -a -diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in -new file mode 100644 -index 0000000..a21c9c6 ---- /dev/null -+++ b/etc/systemd/system/zfs-share.service.in -@@ -0,0 +1,11 @@ -+[Unit] -+Description=ZFS file system shares -+After=nfs-server.service -+After=smb.service -+PartOf=nfs-server.service -+PartOf=smb.service -+ -+[Service] -+Type=oneshot -+RemainAfterExit=yes -+ExecStart=@sbindir@/zfs share -a -diff --git a/etc/systemd/system/zfs.target.in b/etc/systemd/system/zfs.target.in -new file mode 100644 -index 0000000..3541533 ---- /dev/null -+++ b/etc/systemd/system/zfs.target.in -@@ -0,0 +1,8 @@ -+[Unit] -+Description=ZFS startup target -+Requires=zfs-mount.service -+Requires=zfs-share.service -+Wants=zed.service -+ -+[Install] -+WantedBy=multi-user.target -diff --git a/etc/zfs/vdev_id.conf.sas_direct.example b/etc/zfs/vdev_id.conf.sas_direct.example -index a0c43a7..115ebd8 100644 ---- a/etc/zfs/vdev_id.conf.sas_direct.example -+++ b/etc/zfs/vdev_id.conf.sas_direct.example -@@ -10,13 +10,16 @@ channel 86:00.0 0 D - -+ -+# Custom 
mapping for Channel A -+ - # Linux Mapped --# Slot Slot --slot 1 7 --slot 2 10 --slot 3 3 --slot 4 6 --slot 5 2 --slot 6 8 --slot 7 1 --slot 8 4 --slot 9 9 --slot 10 5 -+# Slot Slot Channel -+slot 1 7 A -+slot 2 10 A -+slot 3 3 A -+slot 4 6 A -+ -+# Default mapping for B, C, and D -+slot 1 4 -+slot 2 2 -+slot 3 1 -+slot 4 3 -diff --git a/include/Makefile.am b/include/Makefile.am -index 64141d9..2e1c31a 100644 ---- a/include/Makefile.am -+++ b/include/Makefile.am -@@ -20,2 +20,3 @@ USER_H = \ - $(top_srcdir)/include/libzfs.h \ -+ $(top_srcdir)/include/libzfs_core.h \ - $(top_srcdir)/include/libzfs_impl.h -diff --git a/include/libzfs.h b/include/libzfs.h -index 3472b76..5bc8b03 100644 ---- a/include/libzfs.h -+++ b/include/libzfs.h -@@ -23,5 +23,6 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -59,2 +60,7 @@ extern "C" { - -+/* -+ * Default wait time for a device name to be created. -+ */ -+#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ -+ - #define DEFAULT_IMPORT_PATH_SIZE 7 -@@ -65,3 +71,4 @@ extern char *zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE]; - */ --enum { -+typedef enum zfs_error { -+ EZFS_SUCCESS = 0, /* no error -- success */ - EZFS_NOMEM = 2000, /* out of memory */ -@@ -137,3 +144,3 @@ enum { - EZFS_UNKNOWN --}; -+} zfs_error_t; - -@@ -193,2 +200,5 @@ extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); - -+extern void zfs_save_arguments(int argc, char **, char *, int); -+extern int zpool_log_history(libzfs_handle_t *, const char *); -+ - extern int libzfs_errno(libzfs_handle_t *); -@@ -228,3 +238,3 @@ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, - nvlist_t *, nvlist_t *); --extern int zpool_destroy(zpool_handle_t *); -+extern int zpool_destroy(zpool_handle_t *, const char *); - extern int zpool_add(zpool_handle_t *, nvlist_t *); -@@ -274,2 +284,4 @@ extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, - size_t proplen, zprop_source_t *); -+extern int zpool_get_prop_literal(zpool_handle_t *, zpool_prop_t, char *, -+ size_t proplen, zprop_source_t *, boolean_t literal); - extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, -@@ -302,2 +314,3 @@ typedef enum { - ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ -+ ZPOOL_STATUS_ERRATA, /* informational errata available */ - -@@ -337,4 +350,6 @@ typedef enum { - --extern zpool_status_t zpool_get_status(zpool_handle_t *, char **); --extern zpool_status_t zpool_import_status(nvlist_t *, char **); -+extern zpool_status_t zpool_get_status(zpool_handle_t *, char **, -+ zpool_errata_t *); -+extern zpool_status_t zpool_import_status(nvlist_t *, char **, -+ zpool_errata_t *); - extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); -@@ -352,4 +367,4 @@ extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); - */ --extern int zpool_export(zpool_handle_t *, boolean_t); --extern int zpool_export_force(zpool_handle_t *); -+extern int zpool_export(zpool_handle_t *, boolean_t, const char *); -+extern int zpool_export_force(zpool_handle_t *, const char *); - extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, -@@ -387,3 +402,3 @@ struct zfs_cmd; - --extern const char 
*zfs_history_event_names[LOG_END]; -+extern const char *zfs_history_event_names[]; - -@@ -395,7 +410,6 @@ extern int zpool_history_unpack(char *, uint64_t, uint64_t *, - nvlist_t ***, uint_t *); --extern void zpool_set_history_str(const char *subcommand, int argc, -- char **argv, char *history_str); --extern int zpool_stage_history(libzfs_handle_t *, const char *); --extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, int, int); -+extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, -+ int); - extern int zpool_events_clear(libzfs_handle_t *, int *); -+extern int zpool_events_seek(libzfs_handle_t *, uint64_t, int); - extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, -@@ -452,4 +466,2 @@ extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, - char *buf, size_t len); --extern int zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, -- uint64_t *usedp); - extern uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **); -@@ -473,3 +485,4 @@ typedef struct zprop_list { - --extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t); -+extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, -+ boolean_t); - extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); -@@ -566,5 +579,7 @@ extern int zfs_destroy(zfs_handle_t *, boolean_t); - extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); --extern int zfs_destroy_snaps_nvl(zfs_handle_t *, nvlist_t *, boolean_t); -+extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); - extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); - extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); -+extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, -+ nvlist_t *props); - extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); -@@ -607,4 +622,5 @@ extern int zfs_send(zfs_handle_t *, const char *, const char *, - extern int zfs_promote(zfs_handle_t *); --extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, -- boolean_t, boolean_t, int, uint64_t, uint64_t); -+extern int zfs_hold(zfs_handle_t *, const char *, const char *, -+ boolean_t, int); -+extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); - extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); -diff --git a/include/libzfs_core.h b/include/libzfs_core.h -new file mode 100644 -index 0000000..3642dc7 ---- /dev/null -+++ b/include/libzfs_core.h -@@ -0,0 +1,67 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
-+ */ -+ -+#ifndef _LIBZFS_CORE_H -+#define _LIBZFS_CORE_H -+ -+#include -+#include -+#include -+#include -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+int libzfs_core_init(void); -+void libzfs_core_fini(void); -+ -+int lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist); -+int lzc_create(const char *fsname, dmu_objset_type_t type, nvlist_t *props); -+int lzc_clone(const char *fsname, const char *origin, nvlist_t *props); -+int lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist); -+ -+int lzc_snaprange_space(const char *firstsnap, const char *lastsnap, -+ uint64_t *usedp); -+ -+int lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist); -+int lzc_release(nvlist_t *holds, nvlist_t **errlist); -+int lzc_get_holds(const char *snapname, nvlist_t **holdsp); -+ -+int lzc_send(const char *snapname, const char *fromsnap, int fd); -+int lzc_receive(const char *snapname, nvlist_t *props, const char *origin, -+ boolean_t force, int fd); -+int lzc_send_space(const char *snapname, const char *fromsnap, -+ uint64_t *result); -+ -+boolean_t lzc_exists(const char *dataset); -+ -+int lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* _LIBZFS_CORE_H */ -diff --git a/include/libzfs_impl.h b/include/libzfs_impl.h -index fabcb11..5502455 100644 ---- a/include/libzfs_impl.h -+++ b/include/libzfs_impl.h -@@ -23,7 +23,7 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ - --#ifndef _LIBFS_IMPL_H --#define _LIBFS_IMPL_H -+#ifndef _LIBZFS_IMPL_H -+#define _LIBZFS_IMPL_H - -@@ -38,2 +38,3 @@ - #include -+#include - -@@ -71,3 +72,2 @@ struct libzfs_handle { - char libzfs_desc[1024]; -- char *libzfs_log_str; - int libzfs_printerr; -@@ -195,4 +195,2 @@ int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); - --int zvol_create_link(libzfs_handle_t *, const char *); --int zvol_remove_link(libzfs_handle_t *, const char *); - boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); -@@ -221,2 +219,2 @@ extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t); - --#endif /* _LIBFS_IMPL_H */ -+#endif /* _LIBZFS_IMPL_H */ -diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h -index ec9926f..8566033 100644 ---- a/include/linux/blkdev_compat.h -+++ b/include/linux/blkdev_compat.h -@@ -29,3 +29,3 @@ - #ifndef _ZFS_BLKDEV_H --#define _ZFS_BLKDEV_H -+#define _ZFS_BLKDEV_H - -@@ -48,3 +48,3 @@ blk_fetch_request(struct request_queue *q) - -- return req; -+ return (req); - } -@@ -81,3 +81,3 @@ __blk_end_request(struct request *req, int error, unsigned int nr_bytes) - -- return 0; -+ return (0); - } -@@ -94,6 +94,6 @@ blk_end_request(struct request *req, int error, unsigned int nr_bytes) - -- return rc; -+ return (rc); - } - #else --# ifdef HAVE_BLK_END_REQUEST_GPL_ONLY -+#ifdef HAVE_BLK_END_REQUEST_GPL_ONLY - /* -@@ -103,4 +103,4 @@ blk_end_request(struct request *req, int error, unsigned int nr_bytes) - */ --# define __blk_end_request __blk_end_request_x --# define blk_end_request blk_end_request_x -+#define __blk_end_request __blk_end_request_x -+#define blk_end_request blk_end_request_x - -@@ -117,3 +117,3 @@ __blk_end_request_x(struct request *req, int error, unsigned int nr_bytes) - -- return 0; -+ return (0); - } -@@ -129,5 +129,5 @@ blk_end_request_x(struct request *req, int error, unsigned int 
nr_bytes) - -- return rc; -+ return (rc); - } --# endif /* HAVE_BLK_END_REQUEST_GPL_ONLY */ -+#endif /* HAVE_BLK_END_REQUEST_GPL_ONLY */ - #endif /* HAVE_BLK_END_REQUEST */ -@@ -143,3 +143,3 @@ blk_end_request_x(struct request *req, int error, unsigned int nr_bytes) - #if defined(HAVE_BLK_QUEUE_FLUSH) && defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY) --#define blk_queue_flush __blk_queue_flush -+#define blk_queue_flush __blk_queue_flush - static inline void -@@ -155,3 +155,3 @@ blk_rq_pos(struct request *req) - { -- return req->sector; -+ return (req->sector); - } -@@ -163,3 +163,3 @@ blk_rq_sectors(struct request *req) - { -- return req->nr_sectors; -+ return (req->nr_sectors); - } -@@ -173,3 +173,3 @@ blk_rq_sectors(struct request *req) - */ --#define blk_rq_bytes __blk_rq_bytes -+#define blk_rq_bytes __blk_rq_bytes - static inline unsigned int -@@ -177,3 +177,3 @@ __blk_rq_bytes(struct request *req) - { -- return blk_rq_sectors(req) << 9; -+ return (blk_rq_sectors(req) << 9); - } -@@ -188,3 +188,3 @@ __blk_rq_bytes(struct request *req) - #ifndef blk_fs_request --#define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) -+#define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) - #endif -@@ -199,3 +199,3 @@ __blk_rq_bytes(struct request *req) - #ifndef blk_queue_stackable --#define blk_queue_stackable(q) ((q)->request_fn == NULL) -+#define blk_queue_stackable(q) ((q)->request_fn == NULL) - #endif -@@ -207,3 +207,3 @@ __blk_rq_bytes(struct request *req) - #ifndef HAVE_BLK_QUEUE_MAX_HW_SECTORS --#define blk_queue_max_hw_sectors __blk_queue_max_hw_sectors -+#define blk_queue_max_hw_sectors __blk_queue_max_hw_sectors - static inline void -@@ -221,3 +221,3 @@ __blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) - #ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS --#define blk_queue_max_segments __blk_queue_max_segments -+#define blk_queue_max_segments __blk_queue_max_segments - static inline void -@@ -237,3 +237,3 @@ __blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) - #ifndef HAVE_BLK_QUEUE_PHYSICAL_BLOCK_SIZE --#define blk_queue_physical_block_size(q, x) ((void)(0)) -+#define blk_queue_physical_block_size(q, x) ((void)(0)) - #endif -@@ -246,3 +246,3 @@ __blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) - #ifndef HAVE_BLK_QUEUE_IO_OPT --#define blk_queue_io_opt(q, x) ((void)(0)) -+#define blk_queue_io_opt(q, x) ((void)(0)) - #endif -@@ -258,3 +258,3 @@ get_disk_ro(struct gendisk *disk) - -- return policy; -+ return (policy); - } -@@ -276,6 +276,6 @@ struct req_iterator { - --# define for_each_bio(_bio) \ -+#define for_each_bio(_bio) \ - for (; _bio; _bio = _bio->bi_next) - --# define __rq_for_each_bio(_bio, rq) \ -+#define __rq_for_each_bio(_bio, rq) \ - if ((rq->bio)) \ -@@ -283,5 +283,7 @@ struct req_iterator { - --# define rq_for_each_segment(bvl, _rq, _iter) \ -+#define rq_for_each_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bio_for_each_segment(bvl, _iter.bio, _iter.i) -+ -+#define HAVE_RQ_FOR_EACH_SEGMENT_BVP 1 - #endif /* HAVE_RQ_FOR_EACH_SEGMENT */ -@@ -289,2 +291,34 @@ struct req_iterator { - /* -+ * 3.14 API change -+ * rq_for_each_segment changed from taking bio_vec * to taking bio_vec. -+ * We provide rq_for_each_segment4 which takes both. -+ * You should not modify the fields in @bv and @bvp. -+ * -+ * Note: the if-else is just to inject the assignment before the loop body. 
-+ */ -+#ifdef HAVE_RQ_FOR_EACH_SEGMENT_BVP -+#define rq_for_each_segment4(bv, bvp, rq, iter) \ -+ rq_for_each_segment(bvp, rq, iter) \ -+ if ((bv = *bvp), 0) \ -+ ; \ -+ else -+#else -+#define rq_for_each_segment4(bv, bvp, rq, iter) \ -+ rq_for_each_segment(bv, rq, iter) \ -+ if ((bvp = &bv), 0) \ -+ ; \ -+ else -+#endif -+ -+#ifdef HAVE_BIO_BVEC_ITER -+#define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector -+#define BIO_BI_SIZE(bio) (bio)->bi_iter.bi_size -+#define BIO_BI_IDX(bio) (bio)->bi_iter.bi_idx -+#else -+#define BIO_BI_SECTOR(bio) (bio)->bi_sector -+#define BIO_BI_SIZE(bio) (bio)->bi_size -+#define BIO_BI_IDX(bio) (bio)->bi_idx -+#endif -+ -+/* - * Portable helper for correctly setting the FAILFAST flags. The -@@ -317,17 +351,19 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - /* BIO_RW_FAILFAST_* preferred interface from 2.6.28 - 2.6.35 */ -- *flags |= -- ((1 << BIO_RW_FAILFAST_DEV) | -- (1 << BIO_RW_FAILFAST_TRANSPORT) | -- (1 << BIO_RW_FAILFAST_DRIVER)); -+ *flags |= ( -+ (1 << BIO_RW_FAILFAST_DEV) | -+ (1 << BIO_RW_FAILFAST_TRANSPORT) | -+ (1 << BIO_RW_FAILFAST_DRIVER)); - #else --# ifdef HAVE_BIO_RW_FAILFAST -+#ifdef HAVE_BIO_RW_FAILFAST - /* BIO_RW_FAILFAST preferred interface from 2.6.12 - 2.6.27 */ - *flags |= (1 << BIO_RW_FAILFAST); --# else --# ifdef HAVE_REQ_FAILFAST_MASK -- /* REQ_FAILFAST_* preferred interface from 2.6.36 - 2.6.xx, -- * the BIO_* and REQ_* flags were unified under REQ_* flags. */ -+#else -+#ifdef HAVE_REQ_FAILFAST_MASK -+ /* -+ * REQ_FAILFAST_* preferred interface from 2.6.36 - 2.6.xx, -+ * the BIO_* and REQ_* flags were unified under REQ_* flags. -+ */ - *flags |= REQ_FAILFAST_MASK; --# endif /* HAVE_REQ_FAILFAST_MASK */ --# endif /* HAVE_BIO_RW_FAILFAST */ -+#endif /* HAVE_REQ_FAILFAST_MASK */ -+#endif /* HAVE_BIO_RW_FAILFAST */ - #endif /* HAVE_BIO_RW_FAILFAST_DTD */ -@@ -339,3 +375,3 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifndef DISK_NAME_LEN --#define DISK_NAME_LEN 32 -+#define DISK_NAME_LEN 32 - #endif /* DISK_NAME_LEN */ -@@ -348,8 +384,10 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef HAVE_2ARGS_BIO_END_IO_T --# define BIO_END_IO_PROTO(fn, x, y, z) static void fn(struct bio *x, int z) --# define BIO_END_IO_RETURN(rc) return -+#define BIO_END_IO_PROTO(fn, x, y, z) static void fn(struct bio *x, int z) -+#define BIO_END_IO_RETURN(rc) return - #else --# define BIO_END_IO_PROTO(fn, x, y, z) static int fn(struct bio *x, \ -- unsigned int y, int z) --# define BIO_END_IO_RETURN(rc) return rc -+#define BIO_END_IO_PROTO(fn, x, y, z) static int fn( \ -+ struct bio *x, \ -+ unsigned int y, \ -+ int z) -+#define BIO_END_IO_RETURN(rc) return rc - #endif /* HAVE_2ARGS_BIO_END_IO_T */ -@@ -372,11 +410,11 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #if defined(HAVE_BLKDEV_GET_BY_PATH) --# define vdev_bdev_open(path, md, hld) blkdev_get_by_path(path, \ -+#define vdev_bdev_open(path, md, hld) blkdev_get_by_path(path, \ - (md) | FMODE_EXCL, hld) --# define vdev_bdev_close(bdev, md) blkdev_put(bdev, (md) | FMODE_EXCL) -+#define vdev_bdev_close(bdev, md) blkdev_put(bdev, (md) | FMODE_EXCL) - #elif defined(HAVE_OPEN_BDEV_EXCLUSIVE) --# define vdev_bdev_open(path, md, hld) open_bdev_exclusive(path, md, hld) --# define vdev_bdev_close(bdev, md) close_bdev_exclusive(bdev, md) -+#define vdev_bdev_open(path, md, hld) open_bdev_exclusive(path, md, hld) -+#define vdev_bdev_close(bdev, md) close_bdev_exclusive(bdev, md) - #else --# define vdev_bdev_open(path, md, hld) 
open_bdev_excl(path, md, hld) --# define vdev_bdev_close(bdev, md) close_bdev_excl(bdev) -+#define vdev_bdev_open(path, md, hld) open_bdev_excl(path, md, hld) -+#define vdev_bdev_close(bdev, md) close_bdev_excl(bdev) - #endif /* HAVE_BLKDEV_GET_BY_PATH | HAVE_OPEN_BDEV_EXCLUSIVE */ -@@ -389,5 +427,5 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef HAVE_1ARG_INVALIDATE_BDEV --# define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev) -+#define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev) - #else --# define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev, 1) -+#define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev, 1) - #endif /* HAVE_1ARG_INVALIDATE_BDEV */ -@@ -400,3 +438,3 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifndef HAVE_LOOKUP_BDEV --# define lookup_bdev(path) ERR_PTR(-ENOTSUP) -+#define lookup_bdev(path) ERR_PTR(-ENOTSUP) - #endif -@@ -418,9 +456,9 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef HAVE_BDEV_PHYSICAL_BLOCK_SIZE --# define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev) -+#define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev) -+#else -+#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE -+#define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) - #else --# ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE --# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) --# else --# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) --# endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */ -+#define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) -+#endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */ - #endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */ -@@ -440,9 +478,9 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef WRITE_FLUSH_FUA --# define VDEV_WRITE_FLUSH_FUA WRITE_FLUSH_FUA --# define VDEV_REQ_FLUSH REQ_FLUSH --# define VDEV_REQ_FUA REQ_FUA -+#define VDEV_WRITE_FLUSH_FUA WRITE_FLUSH_FUA -+#define VDEV_REQ_FLUSH REQ_FLUSH -+#define VDEV_REQ_FUA REQ_FUA - #else --# define VDEV_WRITE_FLUSH_FUA WRITE_BARRIER --# define VDEV_REQ_FLUSH REQ_HARDBARRIER --# define VDEV_REQ_FUA REQ_HARDBARRIER -+#define VDEV_WRITE_FLUSH_FUA WRITE_BARRIER -+#define VDEV_REQ_FLUSH REQ_HARDBARRIER -+#define VDEV_REQ_FUA REQ_HARDBARRIER - #endif -@@ -454,3 +492,3 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef REQ_DISCARD --# define VDEV_REQ_DISCARD REQ_DISCARD -+#define VDEV_REQ_DISCARD REQ_DISCARD - #endif -@@ -469,3 +507,3 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) - #else --#define blk_queue_discard_granularity(x, dg) ((void)0) -+#define blk_queue_discard_granularity(x, dg) ((void)0) - #endif /* HAVE_DISCARD_GRANULARITY */ -@@ -487,3 +525,3 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) - */ --#define VDEV_HOLDER ((void *)0x2401de7) -+#define VDEV_HOLDER ((void *)0x2401de7) - -diff --git a/include/linux/dcache_compat.h b/include/linux/dcache_compat.h -index 2b9e5c1..bdaa5db 100644 ---- a/include/linux/dcache_compat.h -+++ b/include/linux/dcache_compat.h -@@ -26,3 +26,3 @@ - #ifndef _ZFS_DCACHE_H --#define _ZFS_DCACHE_H -+#define _ZFS_DCACHE_H - -@@ -30,7 +30,7 @@ - --#define dname(dentry) ((char *)((dentry)->d_name.name)) --#define dlen(dentry) ((int)((dentry)->d_name.len)) -+#define dname(dentry) ((char *)((dentry)->d_name.name)) -+#define dlen(dentry) ((int)((dentry)->d_name.len)) - - #ifndef HAVE_D_MAKE_ROOT --#define d_make_root(inode) d_alloc_root(inode) -+#define d_make_root(inode) d_alloc_root(inode) - #endif /* 
HAVE_D_MAKE_ROOT */ -@@ -76,5 +76,5 @@ d_clear_d_op(struct dentry *dentry) - dentry->d_op = NULL; -- dentry->d_flags &= -- ~(DCACHE_OP_HASH | DCACHE_OP_COMPARE | -- DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); -+ dentry->d_flags &= ~( -+ DCACHE_OP_HASH | DCACHE_OP_COMPARE | -+ DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); - #endif /* HAVE_D_SET_D_OP */ -diff --git a/include/linux/vfs_compat.h b/include/linux/vfs_compat.h -index 17fa3ff..4358cd2 100644 ---- a/include/linux/vfs_compat.h -+++ b/include/linux/vfs_compat.h -@@ -26,3 +26,3 @@ - #ifndef _ZFS_VFS_H --#define _ZFS_VFS_H -+#define _ZFS_VFS_H - -@@ -73,3 +73,6 @@ extern atomic_long_t zfs_bdi_seq; - static inline int --bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap) -+bdi_setup_and_register( -+ struct backing_dev_info *bdi, -+ char *name, -+ unsigned int cap) - { -@@ -101,3 +104,3 @@ bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap) - #ifndef LOOKUP_RCU --#define LOOKUP_RCU 0x0 -+#define LOOKUP_RCU 0x0 - #endif /* LOOKUP_RCU */ -@@ -138,3 +141,3 @@ typedef int zpl_umode_t; - #if defined(HAVE_EVICT_INODE) && !defined(HAVE_CLEAR_INODE) --#define clear_inode(ip) end_writeback(ip) -+#define clear_inode(ip) end_writeback(ip) - #endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */ -@@ -146,9 +149,9 @@ typedef int zpl_umode_t; - #ifdef HAVE_5ARG_SGET --#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, fl, mtd) -+#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, fl, mtd) - #else --#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, mtd) -+#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, mtd) - #endif /* HAVE_5ARG_SGET */ - --#define ZFS_IOC_GETFLAGS FS_IOC_GETFLAGS --#define ZFS_IOC_SETFLAGS FS_IOC_SETFLAGS -+#define ZFS_IOC_GETFLAGS FS_IOC_GETFLAGS -+#define ZFS_IOC_SETFLAGS FS_IOC_SETFLAGS - -@@ -156,4 +159,7 @@ typedef int zpl_umode_t; - static inline loff_t --lseek_execute(struct file *filp, struct inode *inode, -- loff_t offset, loff_t maxsize) -+lseek_execute( -+ struct file *filp, -+ struct inode *inode, -+ loff_t offset, -+ loff_t maxsize) - { -@@ -176,2 +182,151 @@ lseek_execute(struct file *filp, struct inode *inode, - -+#if defined(CONFIG_FS_POSIX_ACL) -+/* -+ * These functions safely approximates the behavior of posix_acl_release() -+ * which cannot be used because it calls the GPL-only symbol kfree_rcu(). -+ * The in-kernel version, which can access the RCU, frees the ACLs after -+ * the grace period expires. Because we're unsure how long that grace -+ * period may be this implementation conservatively delays for 60 seconds. -+ * This is several orders of magnitude larger than expected grace period. -+ * At 60 seconds the kernel will also begin issuing RCU stall warnings. 
-+ */ -+#include -+#ifndef HAVE_POSIX_ACL_CACHING -+#define ACL_NOT_CACHED ((void *)(-1)) -+#endif /* HAVE_POSIX_ACL_CACHING */ -+ -+#if defined(HAVE_POSIX_ACL_RELEASE) && !defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) -+ -+#define zpl_posix_acl_release(arg) posix_acl_release(arg) -+#define zpl_set_cached_acl(ip, ty, n) set_cached_acl(ip, ty, n) -+#define zpl_forget_cached_acl(ip, ty) forget_cached_acl(ip, ty) -+ -+#else -+ -+static inline void -+zpl_posix_acl_free(void *arg) { -+ kfree(arg); -+} -+ -+static inline void -+zpl_posix_acl_release(struct posix_acl *acl) -+{ -+ if ((acl == NULL) || (acl == ACL_NOT_CACHED)) -+ return; -+ -+ if (atomic_dec_and_test(&acl->a_refcount)) { -+ taskq_dispatch_delay(system_taskq, zpl_posix_acl_free, acl, -+ TQ_SLEEP, ddi_get_lbolt() + 60*HZ); -+ } -+} -+ -+static inline void -+zpl_set_cached_acl(struct inode *ip, int type, struct posix_acl *newer) { -+#ifdef HAVE_POSIX_ACL_CACHING -+ struct posix_acl *older = NULL; -+ -+ spin_lock(&ip->i_lock); -+ -+ if ((newer != ACL_NOT_CACHED) && (newer != NULL)) -+ posix_acl_dup(newer); -+ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ older = ip->i_acl; -+ rcu_assign_pointer(ip->i_acl, newer); -+ break; -+ case ACL_TYPE_DEFAULT: -+ older = ip->i_default_acl; -+ rcu_assign_pointer(ip->i_default_acl, newer); -+ break; -+ } -+ -+ spin_unlock(&ip->i_lock); -+ -+ zpl_posix_acl_release(older); -+#endif /* HAVE_POSIX_ACL_CACHING */ -+} -+ -+static inline void -+zpl_forget_cached_acl(struct inode *ip, int type) { -+ zpl_set_cached_acl(ip, type, (struct posix_acl *)ACL_NOT_CACHED); -+} -+#endif /* HAVE_POSIX_ACL_RELEASE */ -+ -+/* -+ * 2.6.38 API change, -+ * The is_owner_or_cap() function was renamed to inode_owner_or_capable(). -+ */ -+#ifdef HAVE_INODE_OWNER_OR_CAPABLE -+#define zpl_inode_owner_or_capable(ip) inode_owner_or_capable(ip) -+#else -+#define zpl_inode_owner_or_capable(ip) is_owner_or_cap(ip) -+#endif /* HAVE_INODE_OWNER_OR_CAPABLE */ -+ -+#ifndef HAVE___POSIX_ACL_CHMOD -+#ifdef HAVE_POSIX_ACL_CHMOD -+#define __posix_acl_chmod(acl, gfp, mode) posix_acl_chmod(acl, gfp, mode) -+#define __posix_acl_create(acl, gfp, mode) posix_acl_create(acl, gfp, mode) -+#else -+static inline int -+__posix_acl_chmod(struct posix_acl **acl, int flags, umode_t umode) { -+ struct posix_acl *oldacl = *acl; -+ mode_t mode = umode; -+ int error; -+ -+ *acl = posix_acl_clone(*acl, flags); -+ zpl_posix_acl_release(oldacl); -+ -+ if (!(*acl)) -+ return (-ENOMEM); -+ -+ error = posix_acl_chmod_masq(*acl, mode); -+ if (error) { -+ zpl_posix_acl_release(*acl); -+ *acl = NULL; -+ } -+ -+ return (error); -+} -+ -+static inline int -+__posix_acl_create(struct posix_acl **acl, int flags, umode_t *umodep) { -+ struct posix_acl *oldacl = *acl; -+ mode_t mode = *umodep; -+ int error; -+ -+ *acl = posix_acl_clone(*acl, flags); -+ zpl_posix_acl_release(oldacl); -+ -+ if (!(*acl)) -+ return (-ENOMEM); -+ -+ error = posix_acl_create_masq(*acl, &mode); -+ *umodep = mode; -+ -+ if (error < 0) { -+ zpl_posix_acl_release(*acl); -+ *acl = NULL; -+ } -+ -+ return (error); -+} -+#endif /* HAVE_POSIX_ACL_CHMOD */ -+#endif /* HAVE___POSIX_ACL_CHMOD */ -+ -+#ifndef HAVE_CURRENT_UMASK -+static inline int -+current_umask(void) -+{ -+ return (current->fs->umask); -+} -+#endif /* HAVE_CURRENT_UMASK */ -+ -+#ifdef HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T -+typedef umode_t zpl_equivmode_t; -+#else -+typedef mode_t zpl_equivmode_t; -+#endif /* HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T */ -+#endif /* CONFIG_FS_POSIX_ACL */ -+ - #endif /* _ZFS_VFS_H */ -diff --git 
a/include/linux/xattr_compat.h b/include/linux/xattr_compat.h -index 84d8fde..a7371f9 100644 ---- a/include/linux/xattr_compat.h -+++ b/include/linux/xattr_compat.h -@@ -26,3 +26,5 @@ - #ifndef _ZFS_XATTR_H --#define _ZFS_XATTR_H -+#define _ZFS_XATTR_H -+ -+#include - -@@ -47,3 +49,3 @@ typedef struct xattr_handler xattr_handler_t; - #ifdef HAVE_DENTRY_XATTR_GET --#define ZPL_XATTR_GET_WRAPPER(fn) \ -+#define ZPL_XATTR_GET_WRAPPER(fn) \ - static int \ -@@ -52,6 +54,6 @@ fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \ - { \ -- return __ ## fn(dentry->d_inode, name, buffer, size); \ -+ return (__ ## fn(dentry->d_inode, name, buffer, size)); \ - } - #else --#define ZPL_XATTR_GET_WRAPPER(fn) \ -+#define ZPL_XATTR_GET_WRAPPER(fn) \ - static int \ -@@ -59,3 +61,3 @@ fn(struct inode *ip, const char *name, void *buffer, size_t size) \ - { \ -- return __ ## fn(ip, name, buffer, size); \ -+ return (__ ## fn(ip, name, buffer, size)); \ - } -@@ -69,3 +71,3 @@ fn(struct inode *ip, const char *name, void *buffer, size_t size) \ - #ifdef HAVE_DENTRY_XATTR_SET --#define ZPL_XATTR_SET_WRAPPER(fn) \ -+#define ZPL_XATTR_SET_WRAPPER(fn) \ - static int \ -@@ -74,6 +76,6 @@ fn(struct dentry *dentry, const char *name, const void *buffer, \ - { \ -- return __ ## fn(dentry->d_inode, name, buffer, size, flags); \ -+ return (__ ## fn(dentry->d_inode, name, buffer, size, flags)); \ - } - #else --#define ZPL_XATTR_SET_WRAPPER(fn) \ -+#define ZPL_XATTR_SET_WRAPPER(fn) \ - static int \ -@@ -82,3 +84,3 @@ fn(struct inode *ip, const char *name, const void *buffer, \ - { \ -- return __ ## fn(ip, name, buffer, size, flags); \ -+ return (__ ## fn(ip, name, buffer, size, flags)); \ - } -@@ -87,6 +89,6 @@ fn(struct inode *ip, const char *name, const void *buffer, \ - #ifdef HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY --#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ -+#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ - security_inode_init_security(ip, dip, qstr, nm, val, len) - #else --#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ -+#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ - security_inode_init_security(ip, dip, nm, val, len) -@@ -94,2 +96,35 @@ fn(struct inode *ip, const char *name, const void *buffer, \ - -+/* -+ * Linux 3.7 API change. posix_acl_{from,to}_xattr gained the user_ns -+ * parameter. For the HAVE_POSIX_ACL_FROM_XATTR_USERNS version the -+ * userns _may_ not be correct because it's used outside the RCU. 
-+ */ -+#ifdef HAVE_POSIX_ACL_FROM_XATTR_USERNS -+static inline struct posix_acl * -+zpl_acl_from_xattr(const void *value, int size) -+{ -+ return (posix_acl_from_xattr(CRED()->user_ns, value, size)); -+} -+ -+static inline int -+zpl_acl_to_xattr(struct posix_acl *acl, void *value, int size) -+{ -+ return (posix_acl_to_xattr(CRED()->user_ns, acl, value, size)); -+} -+ -+#else -+ -+static inline struct posix_acl * -+zpl_acl_from_xattr(const void *value, int size) -+{ -+ return (posix_acl_from_xattr(value, size)); -+} -+ -+static inline int -+zpl_acl_to_xattr(struct posix_acl *acl, void *value, int size) -+{ -+ return (posix_acl_to_xattr(acl, value, size)); -+} -+#endif /* HAVE_POSIX_ACL_FROM_XATTR_USERNS */ -+ - #endif /* _ZFS_XATTR_H */ -diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am -index 2245ff4..9d77566 100644 ---- a/include/sys/Makefile.am -+++ b/include/sys/Makefile.am -@@ -14,2 +14,3 @@ COMMON_H = \ - $(top_srcdir)/include/sys/dmu_objset.h \ -+ $(top_srcdir)/include/sys/dmu_send.h \ - $(top_srcdir)/include/sys/dmu_traverse.h \ -@@ -21,2 +22,3 @@ COMMON_H = \ - $(top_srcdir)/include/sys/dsl_deleg.h \ -+ $(top_srcdir)/include/sys/dsl_destroy.h \ - $(top_srcdir)/include/sys/dsl_dir.h \ -@@ -26,2 +28,3 @@ COMMON_H = \ - $(top_srcdir)/include/sys/dsl_synctask.h \ -+ $(top_srcdir)/include/sys/dsl_userhold.h \ - $(top_srcdir)/include/sys/efi_partition.h \ -@@ -61,2 +64,3 @@ COMMON_H = \ - $(top_srcdir)/include/sys/zfs_debug.h \ -+ $(top_srcdir)/include/sys/zfs_delay.h \ - $(top_srcdir)/include/sys/zfs_dir.h \ -@@ -67,4 +71,4 @@ COMMON_H = \ - $(top_srcdir)/include/sys/zfs_vfsops.h \ -- $(top_srcdir)/include/sys/zfs_znode.h \ - $(top_srcdir)/include/sys/zfs_vnops.h \ -+ $(top_srcdir)/include/sys/zfs_znode.h \ - $(top_srcdir)/include/sys/zil.h \ -diff --git a/include/sys/arc.h b/include/sys/arc.h -index 6788219..005d071 100644 ---- a/include/sys/arc.h -+++ b/include/sys/arc.h -@@ -88,2 +88,3 @@ typedef enum arc_space_type { - ARC_SPACE_DATA, -+ ARC_SPACE_META, - ARC_SPACE_HDRS, -@@ -94,2 +95,32 @@ typedef enum arc_space_type { - -+typedef enum arc_state_type { -+ ARC_STATE_ANON, -+ ARC_STATE_MRU, -+ ARC_STATE_MRU_GHOST, -+ ARC_STATE_MFU, -+ ARC_STATE_MFU_GHOST, -+ ARC_STATE_L2C_ONLY, -+ ARC_STATE_NUMTYPES -+} arc_state_type_t; -+ -+typedef struct arc_buf_info { -+ arc_state_type_t abi_state_type; -+ arc_buf_contents_t abi_state_contents; -+ uint64_t abi_state_index; -+ uint32_t abi_flags; -+ uint32_t abi_datacnt; -+ uint64_t abi_size; -+ uint64_t abi_spa; -+ uint64_t abi_access; -+ uint32_t abi_mru_hits; -+ uint32_t abi_mru_ghost_hits; -+ uint32_t abi_mfu_hits; -+ uint32_t abi_mfu_ghost_hits; -+ uint32_t abi_l2arc_hits; -+ uint32_t abi_holds; -+ uint64_t abi_l2arc_dattr; -+ uint64_t abi_l2arc_asize; -+ enum zio_compress abi_l2arc_compress; -+} arc_buf_info_t; -+ - void arc_space_consume(uint64_t space, arc_space_type_t type); -@@ -102,3 +133,4 @@ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); - void arc_buf_add_ref(arc_buf_t *buf, void *tag); --int arc_buf_remove_ref(arc_buf_t *buf, void *tag); -+boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); -+void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); - int arc_buf_size(arc_buf_t *buf); -@@ -107,2 +139,3 @@ int arc_released(arc_buf_t *buf); - int arc_has_callback(arc_buf_t *buf); -+void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); - void arc_buf_freeze(arc_buf_t *buf); -@@ -115,3 +148,3 @@ int arc_referenced(arc_buf_t *buf); - int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, -- 
arc_done_func_t *done, void *private, int priority, int flags, -+ arc_done_func_t *done, void *private, zio_priority_t priority, int flags, - uint32_t *arc_flags, const zbookmark_t *zb); -@@ -119,4 +152,5 @@ zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, -- const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, -- void *private, int priority, int zio_flags, const zbookmark_t *zb); -+ const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, -+ arc_done_func_t *done, void *private, zio_priority_t priority, -+ int zio_flags, const zbookmark_t *zb); - -@@ -129,3 +163,2 @@ int arc_buf_evict(arc_buf_t *buf); - --void arc_adjust_meta(int64_t adjustment, boolean_t may_prune); - void arc_flush(spa_t *spa); -@@ -149,6 +182,5 @@ void l2arc_stop(void); - --/* Global tunings */ --extern int zfs_write_limit_shift; --extern unsigned long zfs_write_limit_max; --extern kmutex_t zfs_write_limit_lock; -+#ifndef _KERNEL -+extern boolean_t arc_watch; -+#endif - -diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h -index 394fdfb..23b919b 100644 ---- a/include/sys/dbuf.h -+++ b/include/sys/dbuf.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -113,2 +114,5 @@ typedef struct dbuf_dirty_record { - -+ /* How much space was changed to dsl_pool_dirty_space() for this? */ -+ unsigned int dr_accounted; -+ - union dirty_types { -@@ -133,2 +137,3 @@ typedef struct dbuf_dirty_record { - uint8_t dr_copies; -+ boolean_t dr_nopwrite; - } dl; -@@ -252,3 +257,3 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, - --void dbuf_prefetch(struct dnode *dn, uint64_t blkid); -+void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio); - -@@ -284,2 +289,5 @@ void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); - -+void dbuf_stats_init(dbuf_hash_table_t *hash); -+void dbuf_stats_destroy(void); -+ - #define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) -@@ -309,7 +317,4 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); - --#define DBUF_IS_METADATA(_db) \ -- (dbuf_is_metadata(_db)) -- - #define DBUF_GET_BUFC_TYPE(_db) \ -- (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) -+ (dbuf_is_metadata(_db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) - -@@ -317,3 +322,3 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); - ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ -- (DBUF_IS_METADATA(_db) && \ -+ (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -@@ -322,3 +327,3 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ -- (DBUF_IS_METADATA(_db) && \ -+ (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) -diff --git a/include/sys/ddt.h b/include/sys/ddt.h -index 6943259..3befcb8 100644 ---- a/include/sys/ddt.h -+++ b/include/sys/ddt.h -@@ -65,12 +65,11 @@ typedef struct ddt_key { - zio_cksum_t ddk_cksum; /* 256-bit block checksum */ -- uint64_t ddk_prop; /* LSIZE, PSIZE, compression */ -+ /* -+ * Encoded with logical & physical size, and compression, as follows: -+ * +-------+-------+-------+-------+-------+-------+-------+-------+ -+ * | 0 | 0 | 0 | comp | PSIZE | LSIZE | -+ * +-------+-------+-------+-------+-------+-------+-------+-------+ -+ */ -+ uint64_t ddk_prop; - } ddt_key_t; - --/* -- * ddk_prop layout: -- * -- * +-------+-------+-------+-------+-------+-------+-------+-------+ -- * | 0 | 0 | 0 | comp | PSIZE | LSIZE | -- * +-------+-------+-------+-------+-------+-------+-------+-------+ -- */ - #define DDK_GET_LSIZE(ddk) \ -@@ -219,2 +218,4 @@ extern void ddt_enter(ddt_t *ddt); - extern void ddt_exit(ddt_t *ddt); -+extern void ddt_init(void); -+extern void ddt_fini(void); - extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); -diff --git a/include/sys/dmu.h b/include/sys/dmu.h -index adaab4c..1314c1e 100644 ---- a/include/sys/dmu.h -+++ b/include/sys/dmu.h -@@ -22,3 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -@@ -44,2 +45,3 @@ - #include -+#include - #include -@@ -215,11 +217,7 @@ typedef enum dmu_object_type { - --typedef enum dmu_objset_type { -- DMU_OST_NONE, -- DMU_OST_META, -- DMU_OST_ZFS, -- DMU_OST_ZVOL, -- DMU_OST_OTHER, /* For testing only! */ -- DMU_OST_ANY, /* Be careful! 
*/ -- DMU_OST_NUMTYPES --} dmu_objset_type_t; -+typedef enum txg_how { -+ TXG_WAIT = 1, -+ TXG_NOWAIT, -+ TXG_WAITED, -+} txg_how_t; - -@@ -263,13 +261,10 @@ int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); - --int dmu_objset_evict_dbufs(objset_t *os); -+void dmu_objset_evict_dbufs(objset_t *os); - int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); --int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, -- uint64_t flags); --int dmu_objset_destroy(const char *name, boolean_t defer); --int dmu_snapshots_destroy_nvl(struct nvlist *snaps, boolean_t defer, char *); --int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, -- struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); --int dmu_objset_rename(const char *name, const char *newname, -- boolean_t recursive); -+int dmu_objset_clone(const char *name, const char *origin); -+int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, -+ struct nvlist *errlist); -+int dmu_objset_snapshot_one(const char *fsname, const char *snapname); -+int dmu_objset_snapshot_tmp(const char *, const char *, int); - int dmu_objset_find(char *name, int func(const char *, void *), void *arg, -@@ -277,2 +272,4 @@ int dmu_objset_find(char *name, int func(const char *, void *), void *arg, - void dmu_objset_byteswap(void *buf, size_t size); -+int dsl_dataset_rename_snapshot(const char *fsname, -+ const char *oldsnapname, const char *newsnapname, boolean_t recursive); - -@@ -413,2 +410,4 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, - * buffer as well. You must release what you hold with dmu_buf_rele(). -+ * -+ * Returns ENOENT, EIO, or 0. - */ -@@ -502,2 +501,7 @@ void *dmu_buf_get_user(dmu_buf_t *db); - /* -+ * Returns the blkptr associated with this dbuf, or NULL if not set. -+ */ -+struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); -+ -+/* - * Indicate that you are going to modify the buffer's data (db_data). -@@ -546,3 +550,3 @@ void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); - void dmu_tx_abort(dmu_tx_t *tx); --int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -+int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); - void dmu_tx_wait(dmu_tx_t *tx); -@@ -578,3 +582,3 @@ int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size); --int dmu_free_object(objset_t *os, uint64_t object); -+int dmu_free_long_object(objset_t *os, uint64_t object); - -@@ -667,4 +671,11 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; - int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); -+void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); -+/* Like dmu_object_info, but faster if you have a held dnode in hand. */ - void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); -+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */ - void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); -+/* -+ * Like dmu_object_info_from_db, but faster still when you only care about -+ * the size. This is specifically optimized for zfs_getattr(). 
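An aside on the dmu.h changes above: dmu_tx_assign() now takes the new txg_how_t enum instead of a bare uint64_t. A minimal consumer sketch, assuming an in-kernel caller that already has the objset open; the function name and hold sizes here are invented, while the dmu_tx_* calls are the ones declared in this header:

	#include <sys/dmu.h>

	static int
	example_dirty_range(objset_t *os, uint64_t object, uint64_t off, int len)
	{
		dmu_tx_t *tx = dmu_tx_create(os);
		int err;

		dmu_tx_hold_write(tx, object, off, len);

		/* TXG_WAIT sleeps until the tx can be assigned, or fails hard (e.g. ENOSPC). */
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err != 0) {
			dmu_tx_abort(tx);	/* an unassigned tx is aborted, never committed */
			return (err);
		}

		/* ... dirty the object's buffers here, inside the open txg ... */

		dmu_tx_commit(tx);
		return (0);
	}

Callers that must not block use TXG_NOWAIT and, on ERESTART, call dmu_tx_wait() then dmu_tx_abort() and retry; the new TXG_WAITED value, together with the tx_waited flag added further down in the dmu_tx.h hunk, appears intended to let such a retry skip being delayed a second time.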
-+ */ - void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, -@@ -794,33 +805,4 @@ void dmu_traverse_objset(objset_t *os, uint64_t txg_start, - --int dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, -- int outfd, struct vnode *vp, offset_t *off); --int dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorign, -- uint64_t *sizep); -- --typedef struct dmu_recv_cookie { -- /* -- * This structure is opaque! -- * -- * If logical and real are different, we are recving the stream -- * into the "real" temporary clone, and then switching it with -- * the "logical" target. -- */ -- struct dsl_dataset *drc_logical_ds; -- struct dsl_dataset *drc_real_ds; -- struct drr_begin *drc_drrb; -- char *drc_tosnap; -- char *drc_top_ds; -- boolean_t drc_newfs; -- boolean_t drc_force; -- struct avl_tree *drc_guid_to_ds_map; --} dmu_recv_cookie_t; -- --int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, -- boolean_t force, objset_t *origin, dmu_recv_cookie_t *); --int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, -- int cleanup_fd, uint64_t *action_handlep); --int dmu_recv_end(dmu_recv_cookie_t *drc); -- --int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, -- offset_t *off); -+int dmu_diff(const char *tosnap_name, const char *fromsnap_name, -+ struct vnode *vp, offset_t *offp); - -diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h -index f13a2a3..bbff15d 100644 ---- a/include/sys/dmu_impl.h -+++ b/include/sys/dmu_impl.h -@@ -23,3 +23,6 @@ - * Use is subject to license terms. -+ */ -+/* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -267,2 +270,5 @@ typedef struct dmu_sendarg { - dmu_pendop_t dsa_pending_op; -+ boolean_t dsa_incremental; -+ uint64_t dsa_last_data_object; -+ uint64_t dsa_last_data_offset; - } dmu_sendarg_t; -diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h -index 507f732..edf362f 100644 ---- a/include/sys/dmu_objset.h -+++ b/include/sys/dmu_objset.h -@@ -23,2 +23,3 @@ - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. 
- */ -@@ -45,2 +46,3 @@ extern krwlock_t os_lock; - -+struct dsl_pool; - struct dsl_dataset; -@@ -116,4 +118,2 @@ struct objset { - void *os_user_ptr; -- -- /* SA layout/attribute registration */ - sa_os_t *os_sa; -@@ -138,2 +138,3 @@ int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp); -+void dmu_objset_refresh_ownership(objset_t *os, void *tag); - void dmu_objset_rele(objset_t *os, void *tag); -@@ -142,9 +143,2 @@ int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); - --int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, -- void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); --int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, -- uint64_t flags); --int dmu_objset_destroy(const char *name, boolean_t defer); --int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, -- struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); - void dmu_objset_stats(objset_t *os, nvlist_t *nv); -@@ -154,9 +148,6 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t dmu_objset_fsid_guid(objset_t *os); --int dmu_objset_find(char *name, int func(const char *, void *), void *arg, -- int flags); --int dmu_objset_find_spa(spa_t *spa, const char *name, -- int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); --int dmu_objset_prefetch(const char *name, void *arg); --void dmu_objset_byteswap(void *buf, size_t size); --int dmu_objset_evict_dbufs(objset_t *os); -+int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, -+ int func(struct dsl_pool *, struct dsl_dataset *, void *), -+ void *arg, int flags); -+void dmu_objset_evict_dbufs(objset_t *os); - timestruc_t dmu_objset_snap_cmtime(objset_t *os); -@@ -176,2 +167,3 @@ int dmu_objset_userspace_upgrade(objset_t *os); - boolean_t dmu_objset_userspace_present(objset_t *os); -+int dmu_fsname(const char *snapname, char *buf); - -diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h -new file mode 100644 -index 0000000..65514b7 ---- /dev/null -+++ b/include/sys/dmu_send.h -@@ -0,0 +1,68 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -+ * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
-+ */ -+ -+#ifndef _DMU_SEND_H -+#define _DMU_SEND_H -+ -+#include -+#include -+ -+struct vnode; -+struct dsl_dataset; -+struct drr_begin; -+struct avl_tree; -+ -+int dmu_send(const char *tosnap, const char *fromsnap, int outfd, -+ struct vnode *vp, offset_t *off); -+int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, -+ uint64_t *sizep); -+int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, -+ int outfd, struct vnode *vp, offset_t *off); -+ -+typedef struct dmu_recv_cookie { -+ struct dsl_dataset *drc_ds; -+ struct drr_begin *drc_drrb; -+ const char *drc_tofs; -+ const char *drc_tosnap; -+ boolean_t drc_newfs; -+ boolean_t drc_byteswap; -+ boolean_t drc_force; -+ struct avl_tree *drc_guid_to_ds_map; -+ zio_cksum_t drc_cksum; -+ uint64_t drc_newsnapobj; -+ void *drc_owner; -+} dmu_recv_cookie_t; -+ -+int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, -+ boolean_t force, char *origin, dmu_recv_cookie_t *drc); -+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, -+ int cleanup_fd, uint64_t *action_handlep); -+int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); -+boolean_t dmu_objset_is_receiving(objset_t *os); -+ -+#endif /* _DMU_SEND_H */ -diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h -index 40c1ded..c70c97d 100644 ---- a/include/sys/dmu_tx.h -+++ b/include/sys/dmu_tx.h -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ */ - -@@ -59,4 +62,18 @@ struct dmu_tx { - struct dmu_tx_hold *tx_needassign_txh; -- list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */ -- uint8_t tx_anyobj; -+ -+ /* list of dmu_tx_callback_t on this dmu_tx */ -+ list_t tx_callbacks; -+ -+ /* placeholder for syncing context, doesn't need specific holds */ -+ boolean_t tx_anyobj; -+ -+ /* has this transaction already been delayed? 
*/ -+ boolean_t tx_waited; -+ -+ /* time this transaction was created */ -+ hrtime_t tx_start; -+ -+ /* need to wait for sufficient dirty space */ -+ boolean_t tx_wait_dirty; -+ - int tx_err; -@@ -115,8 +132,7 @@ typedef struct dmu_tx_stats { - kstat_named_t dmu_tx_group; -- kstat_named_t dmu_tx_how; - kstat_named_t dmu_tx_memory_reserve; - kstat_named_t dmu_tx_memory_reclaim; -- kstat_named_t dmu_tx_memory_inflight; - kstat_named_t dmu_tx_dirty_throttle; -- kstat_named_t dmu_tx_write_limit; -+ kstat_named_t dmu_tx_dirty_delay; -+ kstat_named_t dmu_tx_dirty_over_max; - kstat_named_t dmu_tx_quota; -@@ -126,5 +142,5 @@ extern dmu_tx_stats_t dmu_tx_stats; - --#define DMU_TX_STAT_INCR(stat, val) \ -+#define DMU_TX_STAT_INCR(stat, val) \ - atomic_add_64(&dmu_tx_stats.stat.value.ui64, (val)); --#define DMU_TX_STAT_BUMP(stat) \ -+#define DMU_TX_STAT_BUMP(stat) \ - DMU_TX_STAT_INCR(stat, 1); -@@ -135,3 +151,3 @@ extern dmu_tx_stats_t dmu_tx_stats; - dmu_tx_t *dmu_tx_create(objset_t *dd); --int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -+int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how); - void dmu_tx_commit(dmu_tx_t *tx); -@@ -139,2 +155,3 @@ void dmu_tx_abort(dmu_tx_t *tx); - uint64_t dmu_tx_get_txg(dmu_tx_t *tx); -+struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx); - void dmu_tx_wait(dmu_tx_t *tx); -diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h -index 442ab15..38ed1d8 100644 ---- a/include/sys/dmu_zfetch.h -+++ b/include/sys/dmu_zfetch.h -@@ -52,3 +52,3 @@ typedef struct zstream { - clock_t zst_last; /* lbolt of last prefetch */ -- avl_node_t zst_node; /* embed avl node here */ -+ list_node_t zst_node; /* next zstream here */ - } zstream_t; -diff --git a/include/sys/dnode.h b/include/sys/dnode.h -index 9f9134d..55b87bc 100644 ---- a/include/sys/dnode.h -+++ b/include/sys/dnode.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -147,5 +147,4 @@ typedef struct dnode { - /* -- * dn_struct_rwlock protects the structure of the dnode, -- * including the number of levels of indirection (dn_nlevels), -- * dn_maxblkid, and dn_next_* -+ * Protects the structure of the dnode, including the number of levels -+ * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* - */ -@@ -191,2 +190,4 @@ typedef struct dnode { - uint32_t dn_dbufs_count; /* count of dn_dbufs */ -+ /* There are no level-0 blocks of this blkid or higher in dn_dbufs */ -+ uint64_t dn_unlisted_l0_blkid; - -diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h -index afcf2b7..f6449c6 100644 ---- a/include/sys/dsl_dataset.h -+++ b/include/sys/dsl_dataset.h -@@ -22,4 +22,5 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ -@@ -37,2 +38,3 @@ - #include -+#include - -@@ -50,6 +52,4 @@ struct dsl_pool; - /* -- * NB: nopromote can not yet be set, but we want support for it in this -- * on-disk version, so that we don't need to upgrade for it later. It -- * will be needed when we implement 'zfs split' (where the split off -- * clone should not be promoted). 
-+ * Note: nopromote can not yet be set, but we want support for it in this -+ * on-disk version, so that we don't need to upgrade for it later. - */ -@@ -78,2 +78,4 @@ struct dsl_pool; - -+#define DS_CREATE_FLAG_NODIRTY (1ULL<<24) -+ - typedef struct dsl_dataset_phys { -@@ -127,5 +129,2 @@ typedef struct dsl_dataset { - -- /* to protect against multiple concurrent incremental recv */ -- kmutex_t ds_recvlock; -- - /* protected by lock on pool's dp_dirty_datasets list */ -@@ -141,9 +140,11 @@ typedef struct dsl_dataset { - uint64_t ds_userrefs; -+ void *ds_owner; - - /* -- * ds_owner is protected by the ds_rwlock and the ds_lock -+ * Long holds prevent the ds from being destroyed; they allow the -+ * ds to remain held even after dropping the dp_config_rwlock. -+ * Owning counts as a long hold. See the comments above -+ * dsl_pool_hold() for details. - */ -- krwlock_t ds_rwlock; -- kcondvar_t ds_exclusive_cv; -- void *ds_owner; -+ refcount_t ds_longholds; - -@@ -165,11 +166,2 @@ typedef struct dsl_dataset { - --struct dsl_ds_destroyarg { -- dsl_dataset_t *ds; /* ds to destroy */ -- dsl_dataset_t *rm_origin; /* also remove our origin? */ -- boolean_t is_origin_rm; /* set if removing origin snap */ -- boolean_t defer; /* destroy -d requested? */ -- boolean_t releasing; /* destroying due to release? */ -- boolean_t need_prep; /* do we need to retry due to EBUSY? */ --}; -- - /* -@@ -180,12 +172,2 @@ struct dsl_ds_destroyarg { - --struct dsl_ds_holdarg { -- dsl_sync_task_group_t *dstg; -- char *htag; -- char *snapname; -- boolean_t recursive; -- boolean_t gotone; -- boolean_t temphold; -- char failed[MAXPATHLEN]; --}; -- - #define dsl_dataset_is_snapshot(ds) \ -@@ -196,18 +178,14 @@ struct dsl_ds_holdarg { - --int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); --int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, -- void *tag, dsl_dataset_t **); --int dsl_dataset_own(const char *name, boolean_t inconsistentok, -+int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, -+ dsl_dataset_t **dsp); -+int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, -+ dsl_dataset_t **); -+void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -+int dsl_dataset_own(struct dsl_pool *dp, const char *name, - void *tag, dsl_dataset_t **dsp); - int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, -- boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp); --void dsl_dataset_name(dsl_dataset_t *ds, char *name); --void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -+ void *tag, dsl_dataset_t **dsp); - void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); --void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); --boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, -- void *tag); --void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag); --void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, -- minor_t minor); -+void dsl_dataset_name(dsl_dataset_t *ds, char *name); -+boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); - uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, -@@ -216,22 +194,8 @@ uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - uint64_t flags, dmu_tx_t *tx); --int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer); --int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); --dsl_checkfunc_t dsl_dataset_destroy_check; --dsl_syncfunc_t dsl_dataset_destroy_sync; --dsl_checkfunc_t 
dsl_dataset_snapshot_check; --dsl_syncfunc_t dsl_dataset_snapshot_sync; --dsl_syncfunc_t dsl_dataset_user_hold_sync; --int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); -+int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); - int dsl_dataset_promote(const char *name, char *conflsnap); --int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, -- boolean_t force); --int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, -- boolean_t recursive, boolean_t temphold, int cleanup_fd); --int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, -- boolean_t temphold); --int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, -- boolean_t recursive); --int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, -- char *htag, boolean_t retry); --int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); -+int dsl_dataset_rename_snapshot(const char *fsname, -+ const char *oldsnapname, const char *newsnapname, boolean_t recursive); -+int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, -+ minor_t cleanup_minor, const char *htag); - -@@ -242,3 +206,4 @@ spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); - --boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); -+boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, -+ dsl_dataset_t *snap); - -@@ -274,9 +239,31 @@ int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - uint64_t *ref_rsrv); --int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, -+int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, - uint64_t quota); --dsl_syncfunc_t dsl_dataset_set_quota_sync; --int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, -+int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, - uint64_t reservation); - --int dsl_destroy_inconsistent(const char *dsname, void *arg); -+boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier); -+void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag); -+void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); -+boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); -+ -+int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, -+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); -+void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, -+ dsl_dataset_t *origin_head, dmu_tx_t *tx); -+int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, -+ dmu_tx_t *tx, boolean_t recv); -+void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, -+ dmu_tx_t *tx); -+ -+void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, -+ dmu_tx_t *tx); -+void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); -+int dsl_dataset_get_snapname(dsl_dataset_t *ds); -+int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, -+ uint64_t *value); -+int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx); -+void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, -+ zprop_source_t source, uint64_t value, dmu_tx_t *tx); -+int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result); - -diff --git a/include/sys/dsl_deleg.h b/include/sys/dsl_deleg.h -index 9db6d07..5842639 100644 ---- a/include/sys/dsl_deleg.h -+++ b/include/sys/dsl_deleg.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
-- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -67,4 +67,3 @@ int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); - int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); --int dsl_deleg_access_impl(struct dsl_dataset *ds, boolean_t descendent, -- const char *perm, cred_t *cr); -+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr); - void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); -diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h -new file mode 100644 -index 0000000..3f63864 ---- /dev/null -+++ b/include/sys/dsl_destroy.h -@@ -0,0 +1,53 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ */ -+ -+#ifndef _SYS_DSL_DESTROY_H -+#define _SYS_DSL_DESTROY_H -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+struct nvlist; -+struct dsl_dataset; -+struct dmu_tx; -+ -+int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t, -+ struct nvlist *); -+int dsl_destroy_snapshot(const char *, boolean_t); -+int dsl_destroy_head(const char *); -+int dsl_destroy_head_check_impl(struct dsl_dataset *, int); -+void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *); -+int dsl_destroy_inconsistent(const char *, void *); -+int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t); -+void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *, -+ boolean_t, struct dmu_tx *); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* _SYS_DSL_DESTROY_H */ -diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h -index 65ad202..d69d476 100644 ---- a/include/sys/dsl_dir.h -+++ b/include/sys/dsl_dir.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -103,7 +104,6 @@ struct dsl_dir { - --void dsl_dir_close(dsl_dir_t *dd, void *tag); --int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); --int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, -- const char **tailp); --int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, -+void dsl_dir_rele(dsl_dir_t *dd, void *tag); -+int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, -+ dsl_dir_t **, const char **tail); -+int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **); -@@ -113,4 +113,2 @@ uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, - const char *name, dmu_tx_t *tx); --dsl_checkfunc_t dsl_dir_destroy_check; --dsl_syncfunc_t dsl_dir_destroy_sync; - void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); -@@ -133,5 +131,4 @@ int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, - uint64_t reservation); --int dsl_dir_rename(dsl_dir_t *dd, const char *newname); -+int dsl_dir_rename(const char *oldname, const char *newname); - int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); --int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); - boolean_t dsl_dir_is_clone(dsl_dir_t *dd); -@@ -141,2 +138,4 @@ void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); - timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); -+void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, -+ dmu_tx_t *tx); - -diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h -index 4a4bf76..d5bad8d 100644 ---- a/include/sys/dsl_pool.h -+++ b/include/sys/dsl_pool.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -38,2 +38,3 @@ - #include -+#include - -@@ -52,2 +53,10 @@ struct dsl_scan; - -+extern unsigned long zfs_dirty_data_max; -+extern unsigned long zfs_dirty_data_max_max; -+extern unsigned long zfs_dirty_data_sync; -+extern int zfs_dirty_data_max_percent; -+extern int zfs_dirty_data_max_max_percent; -+extern int zfs_delay_min_dirty_percent; -+extern unsigned long zfs_delay_scale; -+ - /* These macros are for indexing into the zfs_all_blkstats_t. */ -@@ -72,9 +81,2 @@ typedef struct zfs_all_blkstats { - --typedef struct txg_history { -- kstat_txg_t th_kstat; -- vdev_stat_t th_vs1; -- vdev_stat_t th_vs2; -- kmutex_t th_lock; -- list_node_t th_link; --} txg_history_t; - -@@ -90,4 +92,2 @@ typedef struct dsl_pool { - struct taskq *dp_iput_taskq; -- kstat_t *dp_txg_kstat; -- kstat_t *dp_tx_assign_kstat; - -@@ -95,5 +95,2 @@ typedef struct dsl_pool { - blkptr_t dp_meta_rootbp; -- hrtime_t dp_read_overhead; -- uint64_t dp_throughput; /* bytes per millisec */ -- uint64_t dp_write_limit; - uint64_t dp_tmp_userrefs_obj; -@@ -107,4 +104,5 @@ typedef struct dsl_pool { - kmutex_t dp_lock; -- uint64_t dp_space_towrite[TXG_SIZE]; -- uint64_t dp_tempreserved[TXG_SIZE]; -+ kcondvar_t dp_spaceavail_cv; -+ uint64_t dp_dirty_pertxg[TXG_SIZE]; -+ uint64_t dp_dirty_total; - uint64_t dp_mos_used_delta; -@@ -112,7 +110,8 @@ typedef struct dsl_pool { - uint64_t dp_mos_uncompressed_delta; -- uint64_t dp_txg_history_size; -- list_t dp_txg_history; -- uint64_t dp_tx_assign_size; -- kstat_named_t *dp_tx_assign_buckets; - -+ /* -+ * Time of most recently scheduled (furthest in the future) -+ * wakeup for delayed transactions. 
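Another aside, on the dirty-data tunables this dsl_pool.h hunk introduces (zfs_dirty_data_max, zfs_delay_min_dirty_percent, zfs_delay_scale): the old per-txg write limit gives way to a throttle that starts delaying writers once dirty data crosses a percentage of the maximum. The sketch below is only an approximation for intuition; the real logic sits behind dsl_pool_need_dirty_delay() and the transaction-delay code, and the clamping is mine:

	#include <sys/types.h>

	/* Approximate per-write delay, in nanoseconds. */
	static uint64_t
	example_tx_delay_ns(uint64_t dirty, uint64_t dirty_max,
	    uint64_t delay_scale, int min_dirty_pct)
	{
		uint64_t min_dirty = dirty_max * min_dirty_pct / 100;

		if (dirty <= min_dirty)
			return (0);		/* below the threshold: no delay */
		if (dirty >= dirty_max)
			dirty = dirty_max - 1;	/* keep the sketch finite */

		/*
		 * Hyperbolic ramp: tiny just past the threshold, growing
		 * without bound as dirty data approaches the maximum.
		 */
		return (delay_scale * (dirty - min_dirty) / (dirty_max - dirty));
	}

With illustrative numbers (not necessarily the defaults) of a 1 GiB dirty_max, a 60% threshold and a 500000 ns scale, a pool sitting at 900 MiB of dirty data would delay each write by a little over a millisecond.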
-+ */ -+ hrtime_t dp_last_wakeup; - -@@ -127,2 +126,3 @@ typedef struct dsl_pool { - * Protects administrative changes (properties, namespace) -+ * - * It is only held for write in syncing context. Therefore -@@ -131,3 +131,3 @@ typedef struct dsl_pool { - */ -- krwlock_t dp_config_rwlock; -+ rrwlock_t dp_config_rwlock; - -@@ -145,6 +145,4 @@ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); - uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree); --int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); --void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); --void dsl_pool_memory_pressure(dsl_pool_t *dp); --void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -+void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -+void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); - void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); -@@ -157,2 +155,6 @@ void dsl_pool_mos_diduse_space(dsl_pool_t *dp, - int64_t used, int64_t comp, int64_t uncomp); -+boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp); -+void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); -+void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); -+boolean_t dsl_pool_config_held(dsl_pool_t *dp); - -@@ -160,14 +162,10 @@ taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp); - --extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, -- const char *tag, uint64_t *now, dmu_tx_t *tx); --extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, -+int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, -+ const char *tag, uint64_t now, dmu_tx_t *tx); -+int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, dmu_tx_t *tx); --extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); -+void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); - int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); -- --void dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs); -- --txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg); --txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg); --void dsl_pool_txg_history_put(txg_history_t *th); -+int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); -+void dsl_pool_rele(dsl_pool_t *dp, void *tag); - -diff --git a/include/sys/dsl_prop.h b/include/sys/dsl_prop.h -index a636ad3..5fe18d6 100644 ---- a/include/sys/dsl_prop.h -+++ b/include/sys/dsl_prop.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -55,16 +56,2 @@ typedef struct dsl_props_arg { - --typedef struct dsl_prop_set_arg { -- const char *psa_name; -- zprop_source_t psa_source; -- int psa_intsz; -- int psa_numints; -- const void *psa_value; -- -- /* -- * Used to handle the special requirements of the quota and reservation -- * properties. 
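One more aside, tying together the dsl_dataset.h and dsl_pool.h hunks: datasets are now held against an explicitly held pool, and long holds replace the old ds_rwlock/owner arrangement. A minimal read-only sketch of the calling pattern the declarations above suggest; the function and its use are invented, and FTAG is the usual ZFS per-function tag macro:

	#include <sys/dsl_pool.h>
	#include <sys/dsl_dataset.h>

	static int
	example_inspect_dataset(const char *name)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds;
		int err;

		err = dsl_pool_hold(name, FTAG, &dp);	/* pins the pool */
		if (err != 0)
			return (err);

		err = dsl_dataset_hold(dp, name, FTAG, &ds);
		if (err != 0) {
			dsl_pool_rele(dp, FTAG);
			return (err);
		}

		/* ... read-only inspection of the dataset ... */

		dsl_dataset_rele(ds, FTAG);	/* release in reverse order */
		dsl_pool_rele(dp, FTAG);
		return (0);
	}

Presumably the pool hold is what takes the now re-entrant dp_config_rwlock as reader, which fits the comment above about it only being held for write in syncing context.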
-- */ -- uint64_t psa_effective_value; --} dsl_prop_setarg_t; -- - int dsl_prop_register(struct dsl_dataset *ds, const char *propname, -@@ -73,3 +60,4 @@ int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg); --int dsl_prop_numcb(struct dsl_dataset *ds); -+void dsl_prop_notify_all(struct dsl_dir *dd); -+boolean_t dsl_prop_hascb(struct dsl_dataset *ds); - -@@ -80,5 +68,7 @@ int dsl_prop_get_integer(const char *ddname, const char *propname, - int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); --int dsl_prop_get_received(objset_t *os, nvlist_t **nvp); -+int dsl_prop_get_received(const char *dsname, nvlist_t **nvp); - int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, - int intsz, int numints, void *buf, char *setpoint); -+int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname, -+ uint64_t *valuep); - int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, -@@ -87,24 +77,22 @@ int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, - --dsl_syncfunc_t dsl_props_set_sync; --int dsl_prop_set(const char *ddname, const char *propname, -- zprop_source_t source, int intsz, int numints, const void *buf); --int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); --void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, -+void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source, -+ nvlist_t *props, dmu_tx_t *tx); -+void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname, -+ zprop_source_t source, int intsz, int numints, const void *value, - dmu_tx_t *tx); -+int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); -+int dsl_prop_set_int(const char *dsname, const char *propname, -+ zprop_source_t source, uint64_t value); -+int dsl_prop_set_string(const char *dsname, const char *propname, -+ zprop_source_t source, const char *value); -+int dsl_prop_inherit(const char *dsname, const char *propname, -+ zprop_source_t source); - --void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, -- zprop_source_t source, uint64_t *value); --int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa); --#ifdef ZFS_DEBUG --void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa); --#define DSL_PROP_CHECK_PREDICTION(dd, psa) \ -- dsl_prop_check_prediction((dd), (psa)) --#else --#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */ --#endif -+int dsl_prop_predict(dsl_dir_t *dd, const char *propname, -+ zprop_source_t source, uint64_t value, uint64_t *newvalp); - - /* flag first receive on or after SPA_VERSION_RECVD_PROPS */ --boolean_t dsl_prop_get_hasrecvd(objset_t *os); --void dsl_prop_set_hasrecvd(objset_t *os); --void dsl_prop_unset_hasrecvd(objset_t *os); -+boolean_t dsl_prop_get_hasrecvd(const char *dsname); -+int dsl_prop_set_hasrecvd(const char *dsname); -+void dsl_prop_unset_hasrecvd(const char *dsname); - -diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h -index 5691f4d..bcb85d6 100644 ---- a/include/sys/dsl_scan.h -+++ b/include/sys/dsl_scan.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -74,2 +74,34 @@ typedef enum dsl_scan_flags { - -+#define DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN) -+ -+/* -+ * Every pool will have one dsl_scan_t and this structure will contain -+ * in-memory information about the scan and a pointer to the on-disk -+ * representation (i.e. dsl_scan_phys_t). Most of the state of the scan -+ * is contained on-disk to allow the scan to resume in the event of a reboot -+ * or panic. This structure maintains information about the behavior of a -+ * running scan, some caching information, and how it should traverse the pool. -+ * -+ * The following members of this structure direct the behavior of the scan: -+ * -+ * scn_pausing - a scan that cannot be completed in a single txg or -+ * has exceeded its allotted time will need to pause. -+ * When this flag is set the scanner will stop traversing -+ * the pool and write out the current state to disk. -+ * -+ * scn_restart_txg - directs the scanner to either restart or start a -+ * a scan at the specified txg value. -+ * -+ * scn_done_txg - when a scan completes its traversal it will set -+ * the completion txg to the next txg. This is necessary -+ * to ensure that any blocks that were freed during -+ * the scan but have not yet been processed (i.e deferred -+ * frees) are accounted for. -+ * -+ * This structure also maintains information about deferred frees which are -+ * a special kind of traversal. Deferred free can exist in either a bptree or -+ * a bpobj structure. The scn_is_bptree flag will indicate the type of -+ * deferred free that is in progress. If the deferred free is part of an -+ * asynchronous destroy then the scn_async_destroying flag will be set. -+ */ - typedef struct dsl_scan { -@@ -79,2 +111,3 @@ typedef struct dsl_scan { - uint64_t scn_restart_txg; -+ uint64_t scn_done_txg; - uint64_t scn_sync_start_time; -@@ -84,2 +117,3 @@ typedef struct dsl_scan { - boolean_t scn_is_bptree; -+ boolean_t scn_async_destroying; - -diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h -index 9126290..ef86fb6 100644 ---- a/include/sys/dsl_synctask.h -+++ b/include/sys/dsl_synctask.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. 
- */ -@@ -36,39 +37,22 @@ struct dsl_pool; - --typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); --typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); -+typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *); -+typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *); - - typedef struct dsl_sync_task { -- list_node_t dst_node; -+ txg_node_t dst_node; -+ struct dsl_pool *dst_pool; -+ uint64_t dst_txg; -+ int dst_space; - dsl_checkfunc_t *dst_checkfunc; - dsl_syncfunc_t *dst_syncfunc; -- void *dst_arg1; -- void *dst_arg2; -- int dst_err; -+ void *dst_arg; -+ int dst_error; -+ boolean_t dst_nowaiter; - } dsl_sync_task_t; - --typedef struct dsl_sync_task_group { -- txg_node_t dstg_node; -- list_t dstg_tasks; -- struct dsl_pool *dstg_pool; -- uint64_t dstg_txg; -- int dstg_err; -- int dstg_space; -- boolean_t dstg_nowaiter; --} dsl_sync_task_group_t; -- --dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); --void dsl_sync_task_create(dsl_sync_task_group_t *dstg, -- dsl_checkfunc_t *, dsl_syncfunc_t *, -- void *arg1, void *arg2, int blocks_modified); --int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); --void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); --void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); --void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); -- --int dsl_sync_task_do(struct dsl_pool *dp, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified); --void dsl_sync_task_do_nowait(struct dsl_pool *dp, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx); -+void dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx); -+int dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, -+ dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified); -+void dsl_sync_task_nowait(struct dsl_pool *dp, dsl_syncfunc_t *syncfunc, -+ void *arg, int blocks_modified, dmu_tx_t *tx); - -diff --git a/include/sys/dsl_userhold.h b/include/sys/dsl_userhold.h -new file mode 100644 -index 0000000..071aeb8 ---- /dev/null -+++ b/include/sys/dsl_userhold.h -@@ -0,0 +1,57 @@ -+ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. 
-+ */ -+ -+#ifndef _SYS_DSL_USERHOLD_H -+#define _SYS_DSL_USERHOLD_H -+ -+#include -+#include -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+struct dsl_pool; -+struct dsl_dataset; -+struct dmu_tx; -+ -+int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, -+ nvlist_t *errlist); -+int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist); -+int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl); -+void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds); -+int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag, -+ boolean_t temphold, struct dmu_tx *tx); -+void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag, -+ minor_t minor, uint64_t now, struct dmu_tx *tx); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* _SYS_DSL_USERHOLD_H */ -diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h -index 741b99e..d541b07 100644 ---- a/include/sys/fm/fs/zfs.h -+++ b/include/sys/fm/fs/zfs.h -@@ -41,3 +41,3 @@ extern "C" { - #define FM_EREPORT_ZFS_POOL_DESTROY "zpool.destroy" --#define FM_EREPORT_ZFS_POOL_REGUID "zpool.reguid" -+#define FM_EREPORT_ZFS_POOL_REGUID "zpool.reguid" - #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" -@@ -77,2 +77,7 @@ extern "C" { - #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS "vdev_spare_paths" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS "vdev_spare_guids" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors" - #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" -diff --git a/include/sys/fm/protocol.h b/include/sys/fm/protocol.h -index 1ee2212..de05bb2 100644 ---- a/include/sys/fm/protocol.h -+++ b/include/sys/fm/protocol.h -@@ -72,2 +72,3 @@ extern "C" { - #define FM_EREPORT_TIME "time" -+#define FM_EREPORT_EID "eid" - -diff --git a/include/sys/fm/util.h b/include/sys/fm/util.h -index a3a8c3f..18fe490 100644 ---- a/include/sys/fm/util.h -+++ b/include/sys/fm/util.h -@@ -73,3 +73,3 @@ typedef struct erpt_dump { - --#define ZEVENT_SHUTDOWN 0x1 -+#define ZEVENT_SHUTDOWN 0x1 - -@@ -78,7 +78,8 @@ typedef void zevent_cb_t(nvlist_t *, nvlist_t *); - typedef struct zevent_s { -- nvlist_t *ev_nvl; /* protected by the zevent_lock */ -- nvlist_t *ev_detector; /* " */ -- list_t ev_ze_list; /* " */ -- list_node_t ev_node; /* " */ -- zevent_cb_t *ev_cb; /* " */ -+ nvlist_t *ev_nvl; /* protected by the zevent_lock */ -+ nvlist_t *ev_detector; /* " */ -+ list_t ev_ze_list; /* " */ -+ list_node_t ev_node; /* " */ -+ zevent_cb_t *ev_cb; /* " */ -+ uint64_t ev_eid; - } zevent_t; -@@ -86,5 +87,5 @@ typedef struct zevent_s { - typedef struct zfs_zevent { -- zevent_t *ze_zevent; /* protected by the zevent_lock */ -- list_node_t ze_node; /* " */ -- uint64_t ze_dropped; /* " */ -+ zevent_t *ze_zevent; /* protected by the zevent_lock */ -+ list_node_t ze_node; /* " */ -+ uint64_t ze_dropped; /* " */ - } zfs_zevent_t; -@@ -100,2 +101,3 @@ extern int zfs_zevent_next(zfs_zevent_t *, nvlist_t **, uint64_t *, uint64_t *); - extern int zfs_zevent_wait(zfs_zevent_t *); -+extern int zfs_zevent_seek(zfs_zevent_t *, uint64_t); - extern void zfs_zevent_init(zfs_zevent_t **); -diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h -index 26c24fc..ae72f83 100644 ---- a/include/sys/fs/zfs.h -+++ b/include/sys/fs/zfs.h -@@ -23,3 +23,3 @@ - * Copyright (c) 2005, 2010, Oracle 
and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -@@ -54,2 +54,12 @@ typedef enum { - -+typedef enum dmu_objset_type { -+ DMU_OST_NONE, -+ DMU_OST_META, -+ DMU_OST_ZFS, -+ DMU_OST_ZVOL, -+ DMU_OST_OTHER, /* For testing only! */ -+ DMU_OST_ANY, /* Be careful! */ -+ DMU_OST_NUMTYPES -+} dmu_objset_type_t; -+ - #define ZFS_TYPE_DATASET \ -@@ -130,3 +140,12 @@ typedef enum { - ZFS_PROP_CLONES, -+ ZFS_PROP_LOGICALUSED, -+ ZFS_PROP_LOGICALREFERENCED, -+ ZFS_PROP_INCONSISTENT, /* not exposed to the user */ - ZFS_PROP_SNAPDEV, -+ ZFS_PROP_ACLTYPE, -+ ZFS_PROP_SELINUX_CONTEXT, -+ ZFS_PROP_SELINUX_FSCONTEXT, -+ ZFS_PROP_SELINUX_DEFCONTEXT, -+ ZFS_PROP_SELINUX_ROOTCONTEXT, -+ ZFS_PROP_RELATIME, - ZFS_NUM_PROPS -@@ -518,3 +537,3 @@ typedef struct zpool_rewind_policy { - #define ZPOOL_CONFIG_REMOVING "removing" --#define ZPOOL_CONFIG_RESILVERING "resilvering" -+#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" - #define ZPOOL_CONFIG_COMMENT "comment" -@@ -531,2 +550,3 @@ typedef struct zpool_rewind_policy { - #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ -+#define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ - /* -@@ -687,2 +707,13 @@ typedef enum dsl_scan_state { - -+/* -+ * Errata described by http://zfsonlinux.org/msg/ZFS-8000-ER. The ordering -+ * of this enum must be maintained to ensure the errata identifiers map to -+ * the correct documentation. New errata may only be appended to the list -+ * and must contain corresponding documentation at the above link. -+ */ -+typedef enum zpool_errata { -+ ZPOOL_ERRATA_NONE, -+ ZPOOL_ERRATA_ZOL_2094_SCRUB, -+ ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, -+} zpool_errata_t; - -@@ -755,6 +786,9 @@ typedef struct ddt_histogram { - */ --#define ZFS_IOC ('Z' << 8) -- - typedef enum zfs_ioc { -- ZFS_IOC_POOL_CREATE = ZFS_IOC, -+ /* -+ * Illumos - 69/128 numbers reserved. -+ */ -+ ZFS_IOC_FIRST = ('Z' << 8), -+ ZFS_IOC = ZFS_IOC_FIRST, -+ ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, - ZFS_IOC_POOL_DESTROY, -@@ -781,4 +815,2 @@ typedef enum zfs_ioc { - ZFS_IOC_SET_PROP, -- ZFS_IOC_CREATE_MINOR, -- ZFS_IOC_REMOVE_MINOR, - ZFS_IOC_CREATE, -@@ -795,3 +827,2 @@ typedef enum zfs_ioc { - ZFS_IOC_PROMOTE, -- ZFS_IOC_DESTROY_SNAPS_NVL, - ZFS_IOC_SNAPSHOT, -@@ -818,9 +849,27 @@ typedef enum zfs_ioc { - ZFS_IOC_OBJ_TO_STATS, -- ZFS_IOC_EVENTS_NEXT, -- ZFS_IOC_EVENTS_CLEAR, -- ZFS_IOC_POOL_REGUID, - ZFS_IOC_SPACE_WRITTEN, - ZFS_IOC_SPACE_SNAPS, -+ ZFS_IOC_DESTROY_SNAPS, -+ ZFS_IOC_POOL_REGUID, - ZFS_IOC_POOL_REOPEN, - ZFS_IOC_SEND_PROGRESS, -+ ZFS_IOC_LOG_HISTORY, -+ ZFS_IOC_SEND_NEW, -+ ZFS_IOC_SEND_SPACE, -+ ZFS_IOC_CLONE, -+ -+ /* -+ * Linux - 3/64 numbers reserved. -+ */ -+ ZFS_IOC_LINUX = ('Z' << 8) + 0x80, -+ ZFS_IOC_EVENTS_NEXT, -+ ZFS_IOC_EVENTS_CLEAR, -+ ZFS_IOC_EVENTS_SEEK, -+ -+ /* -+ * FreeBSD - 1/64 numbers reserved. 
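A last aside, on the zfs_ioc_t renumbering spelled out above (the FreeBSD base follows immediately below): with 'Z' being 0x5A, the reserved platform ranges land at fixed bases, presumably so that the Linux-only ioctls stay clear of numbers illumos assigns later. The arithmetic is easy to restate in a standalone check; this deliberately repeats the constants rather than including sys/fs/zfs.h:

	#include <assert.h>
	#include <stdio.h>

	int
	main(void)
	{
		/* Illumos-compatible range: 0x5a00-0x5a7f (128 slots, 69 used) */
		assert(('Z' << 8) == 0x5a00);
		/* Linux range:              0x5a80-0x5abf (64 slots, 3 used) */
		assert((('Z' << 8) + 0x80) == 0x5a80);
		/* FreeBSD range:            0x5ac0-0x5aff (64 slots) */
		assert((('Z' << 8) + 0xc0) == 0x5ac0);

		(void) printf("zfs_ioc_t bases: 0x5a00 / 0x5a80 / 0x5ac0\n");
		return (0);
	}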
-+ */ -+ ZFS_IOC_FREEBSD = ('Z' << 8) + 0xC0, -+ -+ ZFS_IOC_LAST - } zfs_ioc_t; -@@ -830,3 +879,3 @@ typedef enum zfs_ioc { - */ --#define BLKZNAME _IOR(0x12,125,char[ZFS_MAXNAMELEN]) -+#define BLKZNAME _IOR(0x12, 125, char[ZFS_MAXNAMELEN]) - -@@ -866,2 +915,8 @@ typedef enum { - #define ZPOOL_HIST_INT_STR "history internal str" -+#define ZPOOL_HIST_INT_NAME "internal_name" -+#define ZPOOL_HIST_IOCTL "ioctl" -+#define ZPOOL_HIST_INPUT_NVL "in_nvl" -+#define ZPOOL_HIST_OUTPUT_NVL "out_nvl" -+#define ZPOOL_HIST_DSNAME "dsname" -+#define ZPOOL_HIST_DSID "dsid" - -@@ -884,2 +939,3 @@ typedef enum { - #define ZFS_IMPORT_ONLY 0x8 -+#define ZFS_IMPORT_TEMP_NAME 0x10 - -@@ -911,53 +967,2 @@ typedef enum { - --/* -- * Note: This is encoded on-disk, so new events must be added to the -- * end, and unused events can not be removed. Be sure to edit -- * libzfs_pool.c: hist_event_table[]. -- */ --typedef enum history_internal_events { -- LOG_NO_EVENT = 0, -- LOG_POOL_CREATE, -- LOG_POOL_VDEV_ADD, -- LOG_POOL_REMOVE, -- LOG_POOL_DESTROY, -- LOG_POOL_EXPORT, -- LOG_POOL_IMPORT, -- LOG_POOL_VDEV_ATTACH, -- LOG_POOL_VDEV_REPLACE, -- LOG_POOL_VDEV_DETACH, -- LOG_POOL_VDEV_ONLINE, -- LOG_POOL_VDEV_OFFLINE, -- LOG_POOL_UPGRADE, -- LOG_POOL_CLEAR, -- LOG_POOL_SCAN, -- LOG_POOL_PROPSET, -- LOG_DS_CREATE, -- LOG_DS_CLONE, -- LOG_DS_DESTROY, -- LOG_DS_DESTROY_BEGIN, -- LOG_DS_INHERIT, -- LOG_DS_PROPSET, -- LOG_DS_QUOTA, -- LOG_DS_PERM_UPDATE, -- LOG_DS_PERM_REMOVE, -- LOG_DS_PERM_WHO_REMOVE, -- LOG_DS_PROMOTE, -- LOG_DS_RECEIVE, -- LOG_DS_RENAME, -- LOG_DS_RESERVATION, -- LOG_DS_REPLAY_INC_SYNC, -- LOG_DS_REPLAY_FULL_SYNC, -- LOG_DS_ROLLBACK, -- LOG_DS_SNAPSHOT, -- LOG_DS_UPGRADE, -- LOG_DS_REFQUOTA, -- LOG_DS_REFRESERV, -- LOG_POOL_SCAN_DONE, -- LOG_DS_USER_HOLD, -- LOG_DS_USER_RELEASE, -- LOG_POOL_SPLIT, -- LOG_POOL_GUID_CHANGE, -- LOG_END --} history_internal_events_t; -- - #ifdef __cplusplus -diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h -index 9991242..70f7af0 100644 ---- a/include/sys/metaslab.h -+++ b/include/sys/metaslab.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -59,2 +59,3 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, - extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -+extern void metaslab_check_free(spa_t *spa, const blkptr_t *bp); - extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp); -diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h -index a36baed..36aa60d 100644 ---- a/include/sys/metaslab_impl.h -+++ b/include/sys/metaslab_impl.h -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -47,2 +47,3 @@ struct metaslab_class { - uint64_t mc_aliquot; -+ uint64_t mc_alloc_groups; /* # of allocatable groups */ - uint64_t mc_alloc; /* total allocated space */ -@@ -60,2 +61,4 @@ struct metaslab_group { - uint64_t mg_alloc_failures; -+ boolean_t mg_allocatable; /* can we allocate? 
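The reworked zfs_ioc_t above carves the 'Z' ioctl space into per-platform ranges: Illumos from ZFS_IOC_FIRST, Linux from ZFS_IOC_FIRST + 0x80, FreeBSD from ZFS_IOC_FIRST + 0xC0. A standalone check of the resulting base values:

#include <stdio.h>

int
main(void)
{
	/* Mirrors the enum above; 'Z' is 0x5a. */
	unsigned zfs_ioc_first = 'Z' << 8;		/* 0x5a00, Illumos range */
	unsigned zfs_ioc_linux = ('Z' << 8) + 0x80;	/* 0x5a80, Linux range   */
	unsigned zfs_ioc_freebsd = ('Z' << 8) + 0xC0;	/* 0x5ac0, FreeBSD range */

	printf("first=0x%x linux=0x%x freebsd=0x%x\n",
	    zfs_ioc_first, zfs_ioc_linux, zfs_ioc_freebsd);
	return (0);
}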
*/ -+ uint64_t mg_free_capacity; /* percentage free */ - int64_t mg_bias; -diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h -index c502568..7a67870 100644 ---- a/include/sys/nvpair.h -+++ b/include/sys/nvpair.h -@@ -287,2 +287,3 @@ nvlist_t *fnvlist_dup(nvlist_t *); - void fnvlist_merge(nvlist_t *, nvlist_t *); -+size_t fnvlist_num_pairs(nvlist_t *); - -diff --git a/include/sys/refcount.h b/include/sys/refcount.h -index 1752c64..e767a23 100644 ---- a/include/sys/refcount.h -+++ b/include/sys/refcount.h -@@ -52,2 +52,3 @@ typedef struct refcount { - kmutex_t rc_mtx; -+ boolean_t rc_tracked; - list_t rc_list; -@@ -58,5 +59,6 @@ typedef struct refcount { - --/* Note: refcount_t must be initialized with refcount_create() */ -+/* Note: refcount_t must be initialized with refcount_create[_untracked]() */ - - void refcount_create(refcount_t *rc); -+void refcount_create_untracked(refcount_t *rc); - void refcount_destroy(refcount_t *rc); -@@ -81,2 +83,3 @@ typedef struct refcount { - #define refcount_create(rc) ((rc)->rc_count = 0) -+#define refcount_create_untracked(rc) ((rc)->rc_count = 0) - #define refcount_destroy(rc) ((rc)->rc_count = 0) -diff --git a/include/sys/rrwlock.h b/include/sys/rrwlock.h -index 798a015..25c8a52 100644 ---- a/include/sys/rrwlock.h -+++ b/include/sys/rrwlock.h -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ */ - -@@ -59,2 +62,3 @@ typedef struct rrwlock { - boolean_t rr_writer_wanted; -+ boolean_t rr_track_all; - } rrwlock_t; -@@ -66,7 +70,10 @@ typedef struct rrwlock { - */ --void rrw_init(rrwlock_t *rrl); -+void rrw_init(rrwlock_t *rrl, boolean_t track_all); - void rrw_destroy(rrwlock_t *rrl); - void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); -+void rrw_enter_read(rrwlock_t *rrl, void *tag); -+void rrw_enter_write(rrwlock_t *rrl); - void rrw_exit(rrwlock_t *rrl, void *tag); - boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); -+void rrw_tsd_destroy(void *arg); - -@@ -74,2 +81,4 @@ boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); - #define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) -+#define RRW_LOCK_HELD(x) \ -+ (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER)) - -diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h -index 8ae05ce..fcbd8eb 100644 ---- a/include/sys/sa_impl.h -+++ b/include/sys/sa_impl.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -152,5 +152,6 @@ struct sa_os { - * header for all bonus and spill buffers. -+ * - * The header has a fixed portion with a variable number - * of "lengths" depending on the number of variable sized -- * attribues which are determined by the "layout number" -+ * attributes which are determined by the "layout number" - */ -@@ -160,3 +161,19 @@ typedef struct sa_hdr_phys { - uint32_t sa_magic; -- uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */ -+ /* -+ * Encoded with hdrsize and layout number as follows: -+ * 16 10 0 -+ * +--------+-------+ -+ * | hdrsz |layout | -+ * +--------+-------+ -+ * -+ * Bits 0-10 are the layout number -+ * Bits 11-16 are the size of the header. -+ * The hdrsize is the number * 8 -+ * -+ * For example. 
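The rrwlock changes above add dedicated reader/writer entry points and an explicit track_all flag at init time. A minimal sketch of the new calling convention, assuming the ZFS kernel headers and the usual FTAG convention; whether track_all forces per-holder reference tracking is inferred from the parameter name:

#include <sys/rrwlock.h>

/* Sketch only; rrwlock_t comes from the ZFS headers and is not defined here. */
static void
rrwlock_sketch(rrwlock_t *rrl)
{
	rrw_init(rrl, B_FALSE);		/* B_FALSE: no per-holder tracking (assumed) */

	rrw_enter_read(rrl, FTAG);	/* replaces rrw_enter(rrl, RW_READER, FTAG) */
	/* RRW_LOCK_HELD(rrl) is now true for either lock mode */
	rrw_exit(rrl, FTAG);

	rrw_enter_write(rrl);		/* writer entry point takes no tag */
	rrw_exit(rrl, FTAG);

	rrw_destroy(rrl);
}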
-+ * hdrsz of 1 ==> 8 byte header -+ * 2 ==> 16 byte header -+ * -+ */ -+ uint16_t sa_layout_info; - uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ -@@ -165,20 +182,2 @@ typedef struct sa_hdr_phys { - --/* -- * sa_hdr_phys -> sa_layout_info -- * -- * 16 10 0 -- * +--------+-------+ -- * | hdrsz |layout | -- * +--------+-------+ -- * -- * Bits 0-10 are the layout number -- * Bits 11-16 are the size of the header. -- * The hdrsize is the number * 8 -- * -- * For example. -- * hdrsz of 1 ==> 8 byte header -- * 2 ==> 16 byte header -- * -- */ -- - #define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) -diff --git a/include/sys/spa.h b/include/sys/spa.h -index 8f2af8a..d5a91c2 100644 ---- a/include/sys/spa.h -+++ b/include/sys/spa.h -@@ -53,3 +53,6 @@ typedef struct ddt ddt_t; - typedef struct ddt_entry ddt_entry_t; -+typedef struct zbookmark zbookmark_t; -+ - struct dsl_pool; -+struct dsl_dataset; - -@@ -421,5 +424,5 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, - extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, -- const char *history_str, nvlist_t *zplprops); -+ nvlist_t *zplprops); - extern int spa_import_rootpool(char *devpath, char *devid); --extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, -+extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, - uint64_t flags); -@@ -535,2 +538,39 @@ extern boolean_t spa_refcount_zero(spa_t *spa); - -+/* Historical pool statistics */ -+typedef struct spa_stats_history { -+ kmutex_t lock; -+ uint64_t count; -+ uint64_t size; -+ kstat_t *kstat; -+ void *private; -+ list_t list; -+} spa_stats_history_t; -+ -+typedef struct spa_stats { -+ spa_stats_history_t read_history; -+ spa_stats_history_t txg_history; -+ spa_stats_history_t tx_assign_histogram; -+ spa_stats_history_t io_history; -+} spa_stats_t; -+ -+typedef enum txg_state { -+ TXG_STATE_BIRTH = 0, -+ TXG_STATE_OPEN = 1, -+ TXG_STATE_QUIESCED = 2, -+ TXG_STATE_WAIT_FOR_SYNC = 3, -+ TXG_STATE_SYNCED = 4, -+ TXG_STATE_COMMITTED = 5, -+} txg_state_t; -+ -+extern void spa_stats_init(spa_t *spa); -+extern void spa_stats_destroy(spa_t *spa); -+extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, -+ uint32_t aflags); -+extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); -+extern int spa_txg_history_set(spa_t *spa, uint64_t txg, -+ txg_state_t completed_state, hrtime_t completed_time); -+extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, -+ uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty); -+extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); -+ - /* Pool configuration locks */ -@@ -631,17 +671,2 @@ extern uint64_t strtonum(const char *str, char **nptr); - --/* history logging */ --typedef enum history_log_type { -- LOG_CMD_POOL_CREATE, -- LOG_CMD_NORMAL, -- LOG_INTERNAL --} history_log_type_t; -- --typedef struct history_arg { -- char *ha_history_str; -- history_log_type_t ha_log_type; -- history_internal_events_t ha_event; -- char *ha_zone; -- uid_t ha_uid; --} history_arg_t; -- - extern char *spa_his_ievent_table[]; -@@ -651,7 +676,11 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, - char *his_buf); --extern int spa_history_log(spa_t *spa, const char *his_buf, -- history_log_type_t what); --extern void spa_history_log_internal(history_internal_events_t event, -- spa_t *spa, dmu_tx_t *tx, const char *fmt, ...); --extern void 
spa_history_log_version(spa_t *spa, history_internal_events_t evt); -+extern int spa_history_log(spa_t *spa, const char *his_buf); -+extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); -+extern void spa_history_log_version(spa_t *spa, const char *operation); -+extern void spa_history_log_internal(spa_t *spa, const char *operation, -+ dmu_tx_t *tx, const char *fmt, ...); -+extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, -+ dmu_tx_t *tx, const char *fmt, ...); -+extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, -+ dmu_tx_t *tx, const char *fmt, ...); - -diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h -index 47dfe43..90a32d3 100644 ---- a/include/sys/spa_impl.h -+++ b/include/sys/spa_impl.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -@@ -236,6 +236,9 @@ struct spa { - uint64_t spa_deadman_calls; /* number of deadman calls */ -- uint64_t spa_sync_starttime; /* starting time fo spa_sync */ -+ hrtime_t spa_sync_starttime; /* starting time of spa_sync */ - uint64_t spa_deadman_synctime; /* deadman expiration timer */ -+ uint64_t spa_errata; /* errata issues detected */ -+ spa_stats_t spa_stats; /* assorted spa statistics */ -+ - /* -- * spa_refcnt & spa_config_lock must be the last elements -+ * spa_refcount & spa_config_lock must be the last elements - * because refcount_t changes size based on compilation options. -diff --git a/include/sys/space_map.h b/include/sys/space_map.h -index 2da80d2..588feb8 100644 ---- a/include/sys/space_map.h -+++ b/include/sys/space_map.h -@@ -96,3 +96,2 @@ struct space_map_ops { - * -- * - * non-debug entry -@@ -151,2 +150,4 @@ extern boolean_t space_map_contains(space_map_t *sm, - uint64_t start, uint64_t size); -+extern space_seg_t *space_map_find(space_map_t *sm, uint64_t start, -+ uint64_t size, avl_index_t *wherep); - extern void space_map_swap(space_map_t **msrc, space_map_t **mdest); -diff --git a/include/sys/txg.h b/include/sys/txg.h -index f9d6dd4..1bb6bac 100644 ---- a/include/sys/txg.h -+++ b/include/sys/txg.h -@@ -25,3 +25,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -47,5 +47,2 @@ extern "C" { - --#define TXG_WAIT 1ULL --#define TXG_NOWAIT 2ULL -- - typedef struct tx_cpu tx_cpu_t; -@@ -79,9 +76,5 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); - --/* -- * Delay the caller by the specified number of ticks or until -- * the txg closes (whichever comes first). This is intended -- * to be used to throttle writers when the system nears its -- * capacity. 
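With the history_internal_events enum removed, internal history records above are keyed by an operation string plus a printf-style detail message. A hypothetical call site, assuming the ZFS kernel headers; the operation name and format are illustrative only:

#include <sys/spa.h>

/* Sketch only; spa and tx come from the surrounding sync context. */
static void
history_sketch(spa_t *spa, dmu_tx_t *tx)
{
	/* operation string replaces the old LOG_* event identifiers */
	spa_history_log_internal(spa, "set", tx, "%s=%llu", "recordsize",
	    (u_longlong_t)131072);
}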
-- */ --extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks); -+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, -+ hrtime_t resolution); -+extern void txg_kick(struct dsl_pool *dp); - -@@ -127,7 +120,7 @@ extern void txg_list_destroy(txg_list_t *tl); - extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); --extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); --extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); -+extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg); -+extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); - extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); - extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); --extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); -+extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg); - extern void *txg_list_head(txg_list_t *tl, uint64_t txg); -diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h -index 7b356ea..e583d61 100644 ---- a/include/sys/txg_impl.h -+++ b/include/sys/txg_impl.h -@@ -20,2 +20,3 @@ - */ -+ - /* -@@ -25,2 +26,6 @@ - -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ */ -+ - #ifndef _SYS_TXG_IMPL_H -@@ -35,13 +40,55 @@ extern "C" { - -+/* -+ * The tx_cpu structure is a per-cpu structure that is used to track -+ * the number of active transaction holds (tc_count). As transactions -+ * are assigned into a transaction group the appropriate tc_count is -+ * incremented to indicate that there are pending changes that have yet -+ * to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement -+ * the tc_count. A transaction group is not considered quiesced until all -+ * tx_cpu structures have reached a tc_count of zero. -+ * -+ * This structure is a per-cpu structure by design. Updates to this structure -+ * are frequent and concurrent. Having a single structure would result in -+ * heavy lock contention so a per-cpu design was implemented. With the fanned -+ * out mutex design, consumers only need to lock the mutex associated with -+ * thread's cpu. -+ * -+ * The tx_cpu contains two locks, the tc_lock and tc_open_lock. -+ * The tc_lock is used to protect all members of the tx_cpu structure with -+ * the exception of the tc_open_lock. This lock should only be held for a -+ * short period of time, typically when updating the value of tc_count. -+ * -+ * The tc_open_lock protects the tx_open_txg member of the tx_state structure. -+ * This lock is used to ensure that transactions are only assigned into -+ * the current open transaction group. In order to move the current open -+ * transaction group to the quiesce phase, the txg_quiesce thread must -+ * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. -+ * The tc_open_lock is held until the transaction is assigned into the -+ * transaction group. Typically, this is a short operation but if throttling -+ * is occuring it may be held for longer periods of time. 
-+ */ - struct tx_cpu { -- kmutex_t tc_lock; -+ kmutex_t tc_open_lock; /* protects tx_open_txg */ -+ kmutex_t tc_lock; /* protects the rest of this struct */ - kcondvar_t tc_cv[TXG_SIZE]; -- uint64_t tc_count[TXG_SIZE]; -+ uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ - list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ -- char tc_pad[16]; -+ char tc_pad[8]; /* pad to fill 3 cache lines */ - }; - -+/* -+ * The tx_state structure maintains the state information about the different -+ * stages of the pool's transcation groups. A per pool tx_state structure -+ * is used to track this information. The tx_state structure also points to -+ * an array of tx_cpu structures (described above). Although the tx_sync_lock -+ * is used to protect the members of this structure, it is not used to -+ * protect the tx_open_txg. Instead a special lock in the tx_cpu structure -+ * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock. -+ * Any thread wishing to update tx_open_txg must grab the tc_open_lock on -+ * every cpu (see txg_quiesce()). -+ */ - typedef struct tx_state { -- tx_cpu_t *tx_cpu; /* protects right to enter txg */ -- kmutex_t tx_sync_lock; /* protects tx_state_t */ -+ tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ -+ kmutex_t tx_sync_lock; /* protects the rest of this struct */ -+ - uint64_t tx_open_txg; /* currently open txg id */ -@@ -51,2 +98,4 @@ typedef struct tx_state { - -+ hrtime_t tx_open_time; /* start time of tx_open_txg */ -+ - uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ -diff --git a/include/sys/unique.h b/include/sys/unique.h -index d971752..d4ba32e 100644 ---- a/include/sys/unique.h -+++ b/include/sys/unique.h -@@ -28,4 +28,2 @@ - -- -- - #include -@@ -44,3 +42,3 @@ void unique_fini(void); - * Return a new unique value (which will not be uniquified against until -- * it is unique_insert()-ed. -+ * it is unique_insert()-ed). - */ -diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h -index daefed7..d5a1889 100644 ---- a/include/sys/vdev_disk.h -+++ b/include/sys/vdev_disk.h -@@ -28,3 +28,3 @@ - #ifndef _SYS_VDEV_DISK_H --#define _SYS_VDEV_DISK_H -+#define _SYS_VDEV_DISK_H - -diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h -index e0669cc..4b465d2 100644 ---- a/include/sys/vdev_impl.h -+++ b/include/sys/vdev_impl.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -102,8 +102,18 @@ struct vdev_cache { - -+typedef struct vdev_queue_class { -+ uint32_t vqc_active; -+ -+ /* -+ * Sorted by offset or timestamp, depending on if the queue is -+ * LBA-ordered vs FIFO. 
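A userland analogy of the fanned-out per-CPU hold counters described in the tx_cpu comment above, using plain pthread mutexes. This is illustrative only and not ZFS code, but it shows why per-bucket locks avoid a single contended counter and why a group is quiesced only once every bucket drains to zero:

#include <pthread.h>
#include <stdbool.h>

#define	NBUCKETS	4	/* stands in for the per-CPU tx_cpu array */

static struct bucket {
	pthread_mutex_t	lock;	/* analogue of tc_lock */
	unsigned long	count;	/* analogue of tc_count[txg & TXG_MASK] */
} buckets[NBUCKETS] = {
	{ PTHREAD_MUTEX_INITIALIZER, 0 }, { PTHREAD_MUTEX_INITIALIZER, 0 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 }, { PTHREAD_MUTEX_INITIALIZER, 0 },
};

static void
hold(int cpu)			/* analogue of assigning a tx: bump one bucket */
{
	pthread_mutex_lock(&buckets[cpu % NBUCKETS].lock);
	buckets[cpu % NBUCKETS].count++;
	pthread_mutex_unlock(&buckets[cpu % NBUCKETS].lock);
}

static bool
quiesced(void)			/* quiesced only when every bucket is empty */
{
	for (int i = 0; i < NBUCKETS; i++) {
		pthread_mutex_lock(&buckets[i].lock);
		bool empty = (buckets[i].count == 0);
		pthread_mutex_unlock(&buckets[i].lock);
		if (!empty)
			return (false);
	}
	return (true);
}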
-+ */ -+ avl_tree_t vqc_queued_tree; -+} vdev_queue_class_t; -+ - struct vdev_queue { -- avl_tree_t vq_deadline_tree; -- avl_tree_t vq_read_tree; -- avl_tree_t vq_write_tree; -- avl_tree_t vq_pending_tree; -- hrtime_t vq_io_complete_ts; -+ vdev_t *vq_vdev; -+ vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; -+ avl_tree_t vq_active_tree; -+ uint64_t vq_last_offset; -+ hrtime_t vq_io_complete_ts; /* time last i/o completed */ - hrtime_t vq_io_delta_ts; -@@ -184,3 +194,3 @@ struct vdev { - uint64_t vdev_removed; /* persistent removed state */ -- uint64_t vdev_resilvering; /* persistent resilvering state */ -+ uint64_t vdev_resilver_txg; /* persistent resilvering state */ - uint64_t vdev_nparity; /* number of parity devices for raidz */ -@@ -256,8 +266,9 @@ typedef struct vdev_label { - -+/* Offset of embedded boot loader region on each label */ -+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) - /* -- * Size and offset of embedded boot loader region on each label. -+ * Size of embedded boot loader region on each label. - * The total size of the first two labels plus the boot area is 4MB. - */ --#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) --#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ -+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ - -@@ -328,4 +339,5 @@ extern void vdev_set_min_asize(vdev_t *vd); - /* -- * zdb uses this tunable, so it must be declared here to make lint happy. -+ * Global variables - */ -+/* zdb uses this tunable, so it must be declared here to make lint happy. */ - extern int zfs_vdev_cache_size; -diff --git a/include/sys/zap.h b/include/sys/zap.h -index 092669c..aabfca7 100644 ---- a/include/sys/zap.h -+++ b/include/sys/zap.h -@@ -88,9 +88,3 @@ extern "C" { - /* -- * The matchtype specifies which entry will be accessed. -- * MT_EXACT: only find an exact match (non-normalized) -- * MT_FIRST: find the "first" normalized (case and Unicode -- * form) match; the designated "first" match will not change as long -- * as the set of entries with this normalization doesn't change -- * MT_BEST: if there is an exact match, find that, otherwise find the -- * first normalized match -+ * Specifies matching criteria for ZAP lookups. - */ -@@ -98,4 +92,14 @@ typedef enum matchtype - { -+ /* Only find an exact match (non-normalized) */ - MT_EXACT, -+ /* -+ * If there is an exact match, find that, otherwise find the -+ * first normalized match. -+ */ - MT_BEST, -+ /* -+ * Find the "first" normalized (case and Unicode form) match; -+ * the designated "first" match will not change as long as the -+ * set of entries with this normalization doesn't change. -+ */ - MT_FIRST -@@ -176,5 +180,6 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); - * If 'integer_size' is equal to or larger than the attribute's integer -- * size, the call will succeed and return 0. * When converting to a -- * larger integer size, the integers will be treated as unsigned (ie. no -- * sign-extension will be performed). -+ * size, the call will succeed and return 0. -+ * -+ * When converting to a larger integer size, the integers will be treated as -+ * unsigned (ie. no sign-extension will be performed). - * -@@ -185,3 +190,7 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); - * transferred, the call will return EOVERFLOW. 
-- * -+ */ -+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, -+ uint64_t integer_size, uint64_t num_integers, void *buf); -+ -+/* - * If rn_len is nonzero, realname will be set to the name of the found -@@ -193,4 +202,2 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); - */ --int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, -- uint64_t integer_size, uint64_t num_integers, void *buf); - int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, -diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h -index 3a33636..f6947a7 100644 ---- a/include/sys/zap_leaf.h -+++ b/include/sys/zap_leaf.h -@@ -103,2 +103,3 @@ typedef struct zap_leaf_phys { - struct zap_leaf_header { -+ /* Public to ZAP */ - uint64_t lh_block_type; /* ZBT_LEAF */ -@@ -111,4 +112,3 @@ typedef struct zap_leaf_phys { - --/* above is accessable to zap, below is zap_leaf private */ -- -+ /* Private to zap_leaf */ - uint16_t lh_freelist; /* chunk head of free list */ -@@ -163,3 +163,3 @@ typedef struct zap_leaf { - typedef struct zap_entry_handle { -- /* below is set by zap_leaf.c and is public to zap.c */ -+ /* Set by zap_leaf and public to ZAP */ - uint64_t zeh_num_integers; -@@ -169,3 +169,3 @@ typedef struct zap_entry_handle { - -- /* below is private to zap_leaf.c */ -+ /* Private to zap_leaf */ - uint16_t zeh_fakechunk; -@@ -204,3 +204,3 @@ extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, - * -- * zap_entry_update may fail if it runs out of space (ENOSPC). -+ * May fail if it runs out of space (ENOSPC). - */ -@@ -223,6 +223,3 @@ extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, - --/* -- * Return true if there are additional entries with the same normalized -- * form. -- */ -+/* Determine whether there is another entry with the same normalized form. */ - extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, -diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h -index 481e85b..1a081e4 100644 ---- a/include/sys/zfeature.h -+++ b/include/sys/zfeature.h -@@ -28,3 +28,2 @@ - --#include - #include -@@ -36,10 +35,14 @@ extern "C" { - --extern boolean_t feature_is_supported(objset_t *os, uint64_t obj, -+struct spa; -+struct dmu_tx; -+struct objset; -+ -+extern boolean_t feature_is_supported(struct objset *os, uint64_t obj, - uint64_t desc_obj, nvlist_t *unsup_feat, nvlist_t *enabled_feat); - --struct spa; --extern void spa_feature_create_zap_objects(struct spa *, dmu_tx_t *); --extern void spa_feature_enable(struct spa *, zfeature_info_t *, dmu_tx_t *); --extern void spa_feature_incr(struct spa *, zfeature_info_t *, dmu_tx_t *); --extern void spa_feature_decr(struct spa *, zfeature_info_t *, dmu_tx_t *); -+extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *); -+extern void spa_feature_enable(struct spa *, zfeature_info_t *, -+ struct dmu_tx *); -+extern void spa_feature_incr(struct spa *, zfeature_info_t *, struct dmu_tx *); -+extern void spa_feature_decr(struct spa *, zfeature_info_t *, struct dmu_tx *); - extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *); -diff --git a/include/sys/zfs_acl.h b/include/sys/zfs_acl.h -index 11fc335..2c51f09 100644 ---- a/include/sys/zfs_acl.h -+++ b/include/sys/zfs_acl.h -@@ -49,3 +49,4 @@ struct znode_phys; - /* -- * ZFS ACLs are store in various forms. -+ * ZFS ACLs (Access Control Lists) are stored in various forms. 
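Restating the zap_lookup() contract above as a call: asking for 8-byte integers succeeds whenever the stored attribute uses integers of that size or smaller (smaller integers are widened as unsigned), and EOVERFLOW signals that the attribute holds more integers than were requested. A sketch assuming the ZFS kernel headers; the attribute name and object are hypothetical:

#include <sys/zap.h>

static int
zap_lookup_sketch(objset_t *os, uint64_t zapobj)
{
	uint64_t value;

	/* integer_size = 8, num_integers = 1; "example-attr" is made up */
	return (zap_lookup(os, zapobj, "example-attr", sizeof (uint64_t), 1,
	    &value));
}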
-+ * - * Files created with ACL version ZFS_ACL_VERSION_INITIAL -@@ -139,4 +140,4 @@ typedef struct acl_ops { - int (*ace_mask_off)(void); /* off of access mask in ace */ -+ /* ptr to data if any */ - int (*ace_data)(void *acep, void **datap); -- /* ptr to data if any */ - } acl_ops_t; -diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h -index 4b34259..fa12cea 100644 ---- a/include/sys/zfs_context.h -+++ b/include/sys/zfs_context.h -@@ -27,3 +27,3 @@ - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -61,3 +61,5 @@ - #include -+#include - #include -+#include - #include -@@ -98,2 +100,4 @@ - #include -+#include -+#include - #include -@@ -119,3 +123,3 @@ - --#define noinline __attribute__((noinline)) -+#define noinline __attribute__((noinline)) - -@@ -151,2 +155,3 @@ extern void vpanic(const char *, __va_list); - -+#ifdef __sun - /* -@@ -159,4 +164,5 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE --#define DTRACE_PROBE(a) ((void)0) - #endif /* DTRACE_PROBE */ -+#define DTRACE_PROBE(a) \ -+ ZFS_PROBE0(#a) - -@@ -164,4 +170,5 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE1 --#define DTRACE_PROBE1(a, b, c) ((void)0) - #endif /* DTRACE_PROBE1 */ -+#define DTRACE_PROBE1(a, b, c) \ -+ ZFS_PROBE1(#a, (unsigned long)c) - -@@ -169,4 +176,5 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE2 --#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) - #endif /* DTRACE_PROBE2 */ -+#define DTRACE_PROBE2(a, b, c, d, e) \ -+ ZFS_PROBE2(#a, (unsigned long)c, (unsigned long)e) - -@@ -174,4 +182,5 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE3 --#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) - #endif /* DTRACE_PROBE3 */ -+#define DTRACE_PROBE3(a, b, c, d, e, f, g) \ -+ ZFS_PROBE3(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g) - -@@ -179,6 +188,19 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE4 --#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) - #endif /* DTRACE_PROBE4 */ -+#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) \ -+ ZFS_PROBE4(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g, \ -+ (unsigned long)i) - - /* -+ * We use the comma operator so that this macro can be used without much -+ * additional code. For example, "return (EINVAL);" becomes -+ * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated -+ * twice, so it should not have side effects (e.g. something like: -+ * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice). 
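A standalone toy of the comma-operator pattern the SET_ERROR() comment above describes: the side effect (a counter here, a static probe in ZFS) fires first, and the error code itself remains the value of the expression, so "return (EINVAL)" can become "return (SET_ERROR(EINVAL))" without changing what is returned:

#include <errno.h>
#include <stdio.h>

static unsigned long error_hits;	/* stand-in for the ZFS_SET_ERROR probe */

#define	TOY_SET_ERROR(err)	(error_hits++, (err))

static int
do_something(int fail)
{
	if (fail)
		return (TOY_SET_ERROR(EINVAL));	/* counts, then yields EINVAL */
	return (0);
}

int
main(void)
{
	(void) do_something(1);
	printf("errors flagged: %lu\n", error_hits);
	return (0);
}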
-+ */ -+#define SET_ERROR(err) (ZFS_SET_ERROR(err), err) -+#else -+#define SET_ERROR(err) (err) -+#endif -+/* - * Threads -@@ -205,2 +227,4 @@ typedef pthread_t kt_did_t; - -+#define kpreempt(x) ((void)0) -+ - typedef struct kthread { -@@ -211,5 +235,4 @@ typedef struct kthread { - --#define tsd_get(key) pthread_getspecific(key) --#define tsd_set(key, val) pthread_setspecific(key, val) - #define curthread zk_thread_current() -+#define getcomm() "unknown" - #define thread_exit zk_thread_exit -@@ -217,5 +240,5 @@ typedef struct kthread { - zk_thread_create(stk, stksize, (thread_func_t)func, arg, \ -- len, NULL, state, pri, PTHREAD_CREATE_DETACHED) -+ len, NULL, state, pri, PTHREAD_CREATE_DETACHED) - #define thread_join(t) zk_thread_join(t) --#define newproc(f,a,cid,pri,ctp,pid) (ENOSYS) -+#define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) - -@@ -250,3 +273,3 @@ typedef struct kmutex { - #define MUTEX_DEFAULT 0 --#define MUTEX_HELD(m) ((m)->m_owner == curthread) -+#define MUTEX_HELD(m) ((m)->m_owner == curthread) - #define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m)) -@@ -280,3 +303,3 @@ typedef int krw_t; - #define RW_WRITER 1 --#define RW_DEFAULT RW_READER -+#define RW_DEFAULT RW_READER - -@@ -286,2 +309,8 @@ typedef int krw_t; - -+#undef RW_LOCK_HELD -+#define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) -+ -+#undef RW_LOCK_HELD -+#define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) -+ - extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); -@@ -295,2 +324,3 @@ extern void rw_exit(krwlock_t *rwlp); - extern uid_t crgetuid(cred_t *cr); -+extern uid_t crgetruid(cred_t *cr); - extern gid_t crgetgid(cred_t *cr); -@@ -302,3 +332,3 @@ extern gid_t *crgetgroups(cred_t *cr); - */ --#define CV_MAGIC 0xd31ea9a83b1b30c4ull -+#define CV_MAGIC 0xd31ea9a83b1b30c4ull - -@@ -315,7 +345,25 @@ extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); - extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); -+extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, -+ hrtime_t res, int flag); - extern void cv_signal(kcondvar_t *cv); - extern void cv_broadcast(kcondvar_t *cv); --#define cv_timedwait_interruptible(cv, mp, at) cv_timedwait(cv, mp, at) --#define cv_wait_interruptible(cv, mp) cv_wait(cv, mp) --#define cv_wait_io(cv, mp) cv_wait(cv, mp) -+#define cv_timedwait_interruptible(cv, mp, at) cv_timedwait(cv, mp, at) -+#define cv_wait_interruptible(cv, mp) cv_wait(cv, mp) -+#define cv_wait_io(cv, mp) cv_wait(cv, mp) -+ -+/* -+ * Thread-specific data -+ */ -+#define tsd_get(k) pthread_getspecific(k) -+#define tsd_set(k, v) pthread_setspecific(k, v) -+#define tsd_create(kp, d) pthread_key_create(kp, d) -+#define tsd_destroy(kp) /* nothing */ -+ -+/* -+ * Thread-specific data -+ */ -+#define tsd_get(k) pthread_getspecific(k) -+#define tsd_set(k, v) pthread_setspecific(k, v) -+#define tsd_create(kp, d) pthread_key_create(kp, d) -+#define tsd_destroy(kp) /* nothing */ - -@@ -324,6 +372,16 @@ extern void cv_broadcast(kcondvar_t *cv); - */ --extern kstat_t *kstat_create(char *, int, -- char *, char *, uchar_t, ulong_t, uchar_t); -+extern kstat_t *kstat_create(const char *, int, -+ const char *, const char *, uchar_t, ulong_t, uchar_t); - extern void kstat_install(kstat_t *); - extern void kstat_delete(kstat_t *); -+extern void kstat_waitq_enter(kstat_io_t *); -+extern void kstat_waitq_exit(kstat_io_t *); -+extern void kstat_runq_enter(kstat_io_t *); -+extern void kstat_runq_exit(kstat_io_t *); -+extern void kstat_waitq_to_runq(kstat_io_t *); -+extern 
void kstat_runq_back_to_waitq(kstat_io_t *); -+extern void kstat_set_raw_ops(kstat_t *ksp, -+ int (*headers)(char *buf, size_t size), -+ int (*data)(char *buf, size_t size, void *data), -+ void *(*addr)(kstat_t *ksp, loff_t index)); - -@@ -593,3 +651,3 @@ extern char *kmem_vasprintf(const char *fmt, va_list adx); - extern char *kmem_asprintf(const char *fmt, ...); --#define strfree(str) kmem_free((str), strlen(str)+1) -+#define strfree(str) kmem_free((str), strlen(str) + 1) - -@@ -655,2 +713,11 @@ void ksiddomain_rele(ksiddomain_t *); - -+#define zfs_sleep_until(wakeup) \ -+ do { \ -+ hrtime_t delta = wakeup - gethrtime(); \ -+ struct timespec ts; \ -+ ts.tv_sec = delta / NANOSEC; \ -+ ts.tv_nsec = delta % NANOSEC; \ -+ (void) nanosleep(&ts, NULL); \ -+ } while (0) -+ - #endif /* _KERNEL */ -diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h -index 591d0df..e512079 100644 ---- a/include/sys/zfs_debug.h -+++ b/include/sys/zfs_debug.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -27,2 +28,6 @@ - -+#ifdef __cplusplus -+extern "C" { -+#endif -+ - #ifndef TRUE -@@ -38,2 +43,3 @@ - */ -+ - #if !defined(ZFS_DEBUG) && !defined(_KERNEL) -@@ -45,7 +51,9 @@ extern int zfs_recover; - --#define ZFS_DEBUG_DPRINTF 0x0001 --#define ZFS_DEBUG_DBUF_VERIFY 0x0002 --#define ZFS_DEBUG_DNODE_VERIFY 0x0004 --#define ZFS_DEBUG_SNAPNAMES 0x0008 --#define ZFS_DEBUG_MODIFY 0x0010 -+#define ZFS_DEBUG_DPRINTF (1<<0) -+#define ZFS_DEBUG_DBUF_VERIFY (1<<1) -+#define ZFS_DEBUG_DNODE_VERIFY (1<<2) -+#define ZFS_DEBUG_SNAPNAMES (1<<3) -+#define ZFS_DEBUG_MODIFY (1<<4) -+#define ZFS_DEBUG_SPA (1<<5) -+#define ZFS_DEBUG_ZIO_FREE (1<<6) - -@@ -69,3 +77,3 @@ extern int zfs_recover; - #else --#define dprintf(...) \ -+#define dprintf(...) \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) \ -@@ -75,6 +83,26 @@ extern int zfs_recover; - --void zfs_panic_recover(const char *fmt, ...); --#define zfs_dbgmsg(...) dprintf(__VA_ARGS__) --void zfs_dbgmsg_init(void); --void zfs_dbgmsg_fini(void); -+extern void zfs_panic_recover(const char *fmt, ...); -+ -+typedef struct zfs_dbgmsg { -+ list_node_t zdm_node; -+ time_t zdm_timestamp; -+ char zdm_msg[1]; /* variable length allocation */ -+} zfs_dbgmsg_t; -+ -+extern void zfs_dbgmsg_init(void); -+extern void zfs_dbgmsg_fini(void); -+#if defined(_KERNEL) && defined(__linux__) -+#define zfs_dbgmsg(...) dprintf(__VA_ARGS__) -+#else -+extern void zfs_dbgmsg(const char *fmt, ...); -+extern void zfs_dbgmsg_print(const char *tag); -+#endif -+ -+#ifndef _KERNEL -+extern int dprintf_find_string(const char *string); -+#endif -+ -+#ifdef __cplusplus -+} -+#endif - -diff --git a/include/sys/zfs_delay.h b/include/sys/zfs_delay.h -new file mode 100644 -index 0000000..b56a7da ---- /dev/null -+++ b/include/sys/zfs_delay.h -@@ -0,0 +1,41 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+#ifndef _SYS_FS_ZFS_DELAY_H -+#define _SYS_FS_ZFS_DELAY_H -+ -+#include -+ -+/* -+ * Generic wrapper to sleep until a given time. -+ */ -+#define zfs_sleep_until(wakeup) \ -+ do { \ -+ hrtime_t delta = wakeup - gethrtime(); \ -+ \ -+ if (delta > 0) { \ -+ unsigned long delta_us; \ -+ delta_us = delta / (NANOSEC / MICROSEC); \ -+ usleep_range(delta_us, delta_us + 100); \ -+ } \ -+ } while (0) -+ -+#endif /* _SYS_FS_ZFS_DELAY_H */ -diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h -index c0cb470..0ab095c 100644 ---- a/include/sys/zfs_ioctl.h -+++ b/include/sys/zfs_ioctl.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -43,2 +44,11 @@ extern "C" { - /* -+ * The structures in this file are passed between userland and the -+ * kernel. Userland may be running a 32-bit process, while the kernel -+ * is 64-bit. Therefore, these structures need to compile the same in -+ * 32-bit and 64-bit. This means not using type "long", and adding -+ * explicit padding so that the 32-bit structure will not be packed more -+ * tightly than the 64-bit structure (which requires 64-bit alignment). -+ */ -+ -+/* - * Property values for snapdir -@@ -53,2 +63,7 @@ extern "C" { - #define ZFS_SNAPDEV_VISIBLE 1 -+/* -+ * Property values for acltype -+ */ -+#define ZFS_ACLTYPE_OFF 0 -+#define ZFS_ACLTYPE_POSIXACL 1 - -@@ -246,2 +261,3 @@ typedef struct zinject_record { - -+#define ZEVENT_NONE 0x0 - #define ZEVENT_NONBLOCK 0x1 -@@ -249,2 +265,5 @@ typedef struct zinject_record { - -+#define ZEVENT_SEEK_START 0 -+#define ZEVENT_SEEK_END UINT64_MAX -+ - typedef enum zinject_type { -@@ -279,6 +298,17 @@ typedef enum zfs_case { - typedef struct zfs_cmd { -- char zc_name[MAXPATHLEN]; -+ char zc_name[MAXPATHLEN]; /* name of pool or dataset */ -+ uint64_t zc_nvlist_src; /* really (char *) */ -+ uint64_t zc_nvlist_src_size; -+ uint64_t zc_nvlist_dst; /* really (char *) */ -+ uint64_t zc_nvlist_dst_size; -+ boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ -+ int zc_pad2; -+ -+ /* -+ * The following members are for legacy ioctls which haven't been -+ * converted to the new method. 
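The zc_pad2 member above exists for the 32-/64-bit layout rule spelled out in the zfs_cmd comment: a 4-byte member followed by a uint64_t would be padded implicitly on 64-bit ABIs but not on common 32-bit ones, so the padding is made explicit. A standalone check of the pattern; the struct and member names here are illustrative:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct padded {
	uint64_t	a;
	int32_t		filled;	/* e.g. zc_nvlist_dst_filled */
	int32_t		pad;	/* e.g. zc_pad2: keeps 'b' at offset 16 everywhere */
	uint64_t	b;
};

int
main(void)
{
	/* 16 and 24 on both 32-bit and 64-bit ABIs thanks to the explicit pad */
	printf("offsetof(b)=%zu sizeof=%zu\n",
	    offsetof(struct padded, b), sizeof (struct padded));
	return (0);
}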
-+ */ -+ uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; -- char zc_top_ds[MAXPATHLEN]; - uint64_t zc_guid; -@@ -286,6 +316,2 @@ typedef struct zfs_cmd { - uint64_t zc_nvlist_conf_size; -- uint64_t zc_nvlist_src; /* really (char *) */ -- uint64_t zc_nvlist_src_size; -- uint64_t zc_nvlist_dst; /* really (char *) */ -- uint64_t zc_nvlist_dst_size; - uint64_t zc_cookie; -@@ -293,4 +319,3 @@ typedef struct zfs_cmd { - uint64_t zc_perm_action; -- uint64_t zc_history; /* really (char *) */ -- uint64_t zc_history_len; -+ uint64_t zc_history_len; - uint64_t zc_history_offset; -@@ -337,3 +362,6 @@ extern int zfs_secpolicy_rename_perms(const char *from, - extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); --extern int zfs_unmount_snap(const char *, void *); -+extern int zfs_unmount_snap(const char *); -+extern void zfs_destroy_unmount_origin(const char *); -+ -+extern boolean_t dataset_name_hidden(const char *name); - -@@ -346,3 +374,3 @@ enum zfsdev_state_type { - typedef struct zfsdev_state { -- list_node_t zs_next; /* next zfsdev_state_t link */ -+ list_node_t zs_next; /* next zfsdev_state_t link */ - struct file *zs_file; /* associated file struct */ -diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h -index da18b1f..ea5e403 100644 ---- a/include/sys/zfs_rlock.h -+++ b/include/sys/zfs_rlock.h -@@ -28,4 +28,2 @@ - -- -- - #ifdef __cplusplus -@@ -60,6 +58,6 @@ typedef struct rl { - /* -- * Lock a range (offset, length) as either shared (READER) -- * or exclusive (WRITER or APPEND). APPEND is a special type that -- * is converted to WRITER that specified to lock from the start of the -- * end of file. zfs_range_lock() returns the range lock structure. -+ * Lock a range (offset, length) as either shared (RL_READER) -+ * or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that -+ * is converted to RL_WRITER that specified to lock from the start of the -+ * end of file. Returns the range lock structure. - */ -@@ -67,5 +65,3 @@ rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); - --/* -- * Unlock range and destroy range lock structure. -- */ -+/* Unlock range and destroy range lock structure. */ - void zfs_range_unlock(rl_t *rl); -@@ -79,3 +75,4 @@ void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); - /* -- * AVL comparison function used to compare range locks -+ * AVL comparison function used to order range locks -+ * Locks are ordered on the start offset of the range. 
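A minimal sketch of the range-lock interface documented above, assuming the ZFS kernel headers; the offsets are placeholders:

#include <sys/zfs_rlock.h>

static void
range_lock_sketch(znode_t *zp, uint64_t off, uint64_t len)
{
	/* shared lock over [off, off + len); RL_WRITER/RL_APPEND are exclusive */
	rl_t *rl = zfs_range_lock(zp, off, len, RL_READER);

	/* ... read the locked range ... */

	zfs_range_unlock(rl);	/* unlocks and frees the range lock structure */
}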
- */ -diff --git a/include/sys/zfs_sa.h b/include/sys/zfs_sa.h -index 0bac780..735d4b3 100644 ---- a/include/sys/zfs_sa.h -+++ b/include/sys/zfs_sa.h -@@ -130,4 +130,4 @@ typedef struct znode_phys { - --#define DXATTR_MAX_ENTRY_SIZE (32768) --#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1) -+#define DXATTR_MAX_ENTRY_SIZE (32768) -+#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1) - -diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h -index f685c12..eeeffbe 100644 ---- a/include/sys/zfs_vfsops.h -+++ b/include/sys/zfs_vfsops.h -@@ -62,2 +62,3 @@ typedef struct zfs_sb { - uint_t z_acl_inherit; /* acl inheritance behavior */ -+ uint_t z_acl_type; /* type of ACL usable on this FS */ - zfs_case_t z_case; /* case-sense */ -@@ -66,2 +67,3 @@ typedef struct zfs_sb { - boolean_t z_atime; /* enable atimes mount option */ -+ boolean_t z_relatime; /* enable relatime mount option */ - boolean_t z_unmounted; /* unmounted */ -@@ -71,3 +73,3 @@ typedef struct zfs_sb { - uint64_t z_nr_znodes; /* number of znodes in the fs */ -- unsigned long z_rollback_time;/* last online rollback time */ -+ unsigned long z_rollback_time; /* last online rollback time */ - kmutex_t z_znodes_lock; /* lock for z_all_znodes */ -@@ -103,3 +105,3 @@ typedef struct zfs_sb { - */ --#define ZFS_LINK_MAX ((1U << 31) - 1U) -+#define ZFS_LINK_MAX ((1U << 31) - 1U) - -diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h -index bdddcc3..a020068 100644 ---- a/include/sys/zfs_znode.h -+++ b/include/sys/zfs_znode.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -140,4 +141,5 @@ extern "C" { - --/* Path component length */ - /* -+ * Path component length -+ * - * The generic fs code uses MAXNAMELEN to represent -@@ -205,4 +207,4 @@ typedef struct znode { - uint64_t z_gid; /* gid fuid (cached) */ -- mode_t z_mode; /* mode (cached) */ - uint32_t z_sync_cnt; /* synchronous open count */ -+ mode_t z_mode; /* mode (cached) */ - kmutex_t z_acl_lock; /* acl data lock */ -@@ -210,4 +212,4 @@ typedef struct znode { - krwlock_t z_xattr_lock; /* xattr data lock */ -- nvlist_t *z_xattr_cached;/* cached xattrs */ -- struct znode *z_xattr_parent;/* xattr parent znode */ -+ nvlist_t *z_xattr_cached; /* cached xattrs */ -+ struct znode *z_xattr_parent; /* xattr parent znode */ - list_node_t z_link_node; /* all znodes in fs link */ -@@ -249,10 +251,6 @@ typedef struct znode { - --/* -- * ZFS_ENTER() is called on entry to each ZFS inode and vfs operation. -- * ZFS_EXIT() must be called before exitting the vop. -- * ZFS_VERIFY_ZP() verifies the znode is valid. 
-- */ -+/* Called on entry to each ZFS vnode and vfs operation */ - #define ZFS_ENTER(zsb) \ - { \ -- rrw_enter(&(zsb)->z_teardown_lock, RW_READER, FTAG); \ -+ rrw_enter_read(&(zsb)->z_teardown_lock, FTAG); \ - if ((zsb)->z_unmounted) { \ -@@ -263,2 +261,3 @@ typedef struct znode { - -+/* Must be called before exiting the vop */ - #define ZFS_EXIT(zsb) \ -@@ -269,2 +268,3 @@ typedef struct znode { - -+/* Verifies the znode is valid */ - #define ZFS_VERIFY_ZP(zp) \ -@@ -290,5 +290,3 @@ typedef struct znode { - --/* -- * Macros to encode/decode ZFS stored time values from/to struct timespec -- */ -+/* Encode ZFS stored time values from a struct timespec */ - #define ZFS_TIME_ENCODE(tp, stmp) \ -@@ -299,2 +297,3 @@ typedef struct znode { - -+/* Decode ZFS stored time values to a struct timespec */ - #define ZFS_TIME_DECODE(tp, stmp) \ -@@ -356,3 +355,4 @@ extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, -- znode_t *zp, offset_t off, ssize_t len, int ioflag); -+ znode_t *zp, offset_t off, ssize_t len, int ioflag, -+ zil_callback_t callback, void *callback_data); - extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, -diff --git a/include/sys/zil.h b/include/sys/zil.h -index 589e28f..4000742 100644 ---- a/include/sys/zil.h -+++ b/include/sys/zil.h -@@ -244,2 +244,8 @@ typedef struct { - * would be zero. -+ * -+ * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's. -+ * If create is also setting xvattr's, then acl data follows xvattr. -+ * If ACE FUIDs are needed then they will follow the xvattr_t. Following -+ * the FUIDs will be the domain table information. The FUIDs for the owner -+ * and group will be in lr_create. Name follows ACL data. - */ -@@ -252,9 +258,2 @@ typedef struct { - uint64_t lr_acl_flags; /* ACL flags */ -- /* lr_acl_bytes number of variable sized ace's follows */ -- /* if create is also setting xvattr's, then acl data follows xvattr */ -- /* if ACE FUIDs are needed then they will follow the xvattr_t */ -- /* Following the FUIDs will be the domain table information. */ -- /* The FUIDs for the owner and group will be in the lr_create */ -- /* portion of the record. */ -- /* name follows ACL data */ - } lr_acl_create_t; -@@ -364,2 +363,4 @@ typedef enum { - -+typedef void (*zil_callback_t)(void *data); -+ - typedef struct itx { -@@ -369,2 +370,4 @@ typedef struct itx { - uint8_t itx_sync; /* synchronous transaction */ -+ zil_callback_t itx_callback; /* Called when the itx is persistent */ -+ void *itx_callback_data; /* User data for the callback */ - uint64_t itx_sod; /* record size on disk */ -@@ -429,5 +432,5 @@ extern zil_stats_t zil_stats; - --#define ZIL_STAT_INCR(stat, val) \ -+#define ZIL_STAT_INCR(stat, val) \ - atomic_add_64(&zil_stats.stat.value.ui64, (val)); --#define ZIL_STAT_BUMP(stat) \ -+#define ZIL_STAT_BUMP(stat) \ - ZIL_STAT_INCR(stat, 1); -@@ -472,4 +475,4 @@ extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); - --extern int zil_suspend(zilog_t *zilog); --extern void zil_resume(zilog_t *zilog); -+extern int zil_suspend(const char *osname, void **cookiep); -+extern void zil_resume(void *cookie); - -diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h -index f5b69b7..0db4b52 100644 ---- a/include/sys/zil_impl.h -+++ b/include/sys/zil_impl.h -@@ -43,3 +43,3 @@ typedef struct lwb { - blkptr_t lwb_blk; /* on disk address of this log blk */ -- boolean_t lwb_fastwrite; /* is blk marked for fastwrite? 
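The three macros above bracket every ZFS vnode/vfs operation. A sketch of the usual shape of such an operation, assuming the ZFS on Linux headers; the ITOZ()/ITOZSB() converters and the body are placeholders:

#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>

static int
zfs_op_sketch(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);		/* assumed inode-to-znode converter */
	zfs_sb_t *zsb = ITOZSB(ip);	/* assumed inode-to-zfs_sb converter */
	int error = 0;

	ZFS_ENTER(zsb);		/* reader on z_teardown_lock, bails if unmounted */
	ZFS_VERIFY_ZP(zp);	/* bail out if the znode has gone stale */

	/* ... the actual operation ... */

	ZFS_EXIT(zsb);
	return (error);
}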
*/ -+ boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ - int lwb_nused; /* # used bytes in buffer */ -diff --git a/include/sys/zio.h b/include/sys/zio.h -index 189966b..129e2bc 100644 ---- a/include/sys/zio.h -+++ b/include/sys/zio.h -@@ -24,3 +24,3 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -126,3 +126,3 @@ enum zio_compress { - */ --#define ZIO_DELAY_MAX (30 * MILLISEC) -+#define ZIO_DELAY_MAX (30 * MILLISEC) - -@@ -132,15 +132,12 @@ enum zio_compress { - --#define ZIO_PRIORITY_NOW (zio_priority_table[0]) --#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) --#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) --#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3]) --#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4]) --#define ZIO_PRIORITY_AGG (zio_priority_table[5]) --#define ZIO_PRIORITY_FREE (zio_priority_table[6]) --#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7]) --#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8]) --#define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) --#define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) --#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11]) --#define ZIO_PRIORITY_TABLE_SIZE 12 -+typedef enum zio_priority { -+ ZIO_PRIORITY_SYNC_READ, -+ ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ -+ ZIO_PRIORITY_ASYNC_READ, /* prefetch */ -+ ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ -+ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ -+ ZIO_PRIORITY_NUM_QUEUEABLE, -+ -+ ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ -+} zio_priority_t; - -@@ -198,3 +195,6 @@ enum zio_flag { - ZIO_FLAG_GODFATHER = 1 << 24, -- ZIO_FLAG_FASTWRITE = 1 << 25 -+ ZIO_FLAG_NOPWRITE = 1 << 25, -+ ZIO_FLAG_REEXECUTED = 1 << 26, -+ ZIO_FLAG_DELEGATED = 1 << 27, -+ ZIO_FLAG_FASTWRITE = 1 << 28 - }; -@@ -238,4 +238,3 @@ typedef void zio_done_func_t(zio_t *zio); - --extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; --extern char *zio_type_name[ZIO_TYPES]; -+extern const char *zio_type_name[ZIO_TYPES]; - -@@ -258,3 +257,3 @@ extern char *zio_type_name[ZIO_TYPES]; - */ --typedef struct zbookmark { -+struct zbookmark { - uint64_t zb_objset; -@@ -263,3 +262,3 @@ typedef struct zbookmark { - uint64_t zb_blkid; --} zbookmark_t; -+}; - -@@ -296,4 +295,5 @@ typedef struct zio_prop { - uint8_t zp_copies; -- uint8_t zp_dedup; -- uint8_t zp_dedup_verify; -+ boolean_t zp_dedup; -+ boolean_t zp_dedup_verify; -+ boolean_t zp_nopwrite; - } zio_prop_t; -@@ -378,3 +378,3 @@ struct zio { - int io_cmd; -- uint8_t io_priority; -+ zio_priority_t io_priority; - uint8_t io_reexecute; -@@ -393,3 +393,4 @@ struct zio { - /* Callback info */ -- zio_done_func_t *io_ready; -+ zio_done_func_t *io_ready; -+ zio_done_func_t *io_physdone; - zio_done_func_t *io_done; -@@ -411,3 +412,2 @@ struct zio { - uint64_t io_offset; -- uint64_t io_deadline; /* expires at timestamp + deadline */ - hrtime_t io_timestamp; /* submitted at */ -@@ -415,5 +415,3 @@ struct zio { - uint64_t io_delay; /* vdev disk service delta (ticks) */ -- avl_node_t io_offset_node; -- avl_node_t io_deadline_node; -- avl_tree_t *io_vdev_tree; -+ avl_node_t io_queue_node; - -@@ -430,2 +428,3 @@ struct zio { - uint64_t io_child_count; -+ uint64_t io_phys_children; - uint64_t io_parent_count; -@@ -455,3 +454,3 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, - uint64_t size, 
zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, const zbookmark_t *zb); -+ zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb); - -@@ -459,4 +458,5 @@ extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, -- zio_done_func_t *ready, zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, const zbookmark_t *zb); -+ zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, -+ void *private, -+ zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb); - -@@ -464,5 +464,6 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, zbookmark_t *zb); -+ zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb); - --extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies); -+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, -+ boolean_t nopwrite); - -@@ -475,3 +476,3 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, - extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, -- zio_done_func_t *done, void *private, int priority, enum zio_flag flags); -+ zio_done_func_t *done, void *private, enum zio_flag flags); - -@@ -479,4 +480,4 @@ extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, -- zio_done_func_t *done, void *private, int priority, enum zio_flag flags, -- boolean_t labels); -+ zio_done_func_t *done, void *private, zio_priority_t priority, -+ enum zio_flag flags, boolean_t labels); - -@@ -484,4 +485,4 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, -- zio_done_func_t *done, void *private, int priority, enum zio_flag flags, -- boolean_t labels); -+ zio_done_func_t *done, void *private, zio_priority_t priority, -+ enum zio_flag flags, boolean_t labels); - -@@ -516,7 +517,8 @@ extern void zio_resubmit_stage_async(void *); - extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, -- uint64_t offset, void *data, uint64_t size, int type, int priority, -- enum zio_flag flags, zio_done_func_t *done, void *private); -+ uint64_t offset, void *data, uint64_t size, int type, -+ zio_priority_t priority, enum zio_flag flags, -+ zio_done_func_t *done, void *private); - - extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, -- void *data, uint64_t size, int type, int priority, -+ void *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private); -diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h -index bd051f1..63863c7 100644 ---- a/include/sys/zio_compress.h -+++ b/include/sys/zio_compress.h -@@ -35,7 +35,6 @@ extern "C" { - --/* -- * Common signature for all zio compress/decompress functions. -- */ -+/* Common signature for all zio compress functions. */ - typedef size_t zio_compress_func_t(void *src, void *dst, - size_t s_len, size_t d_len, int); -+/* Common signature for all zio decompress functions. 
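With the priority table gone, callers above pass a zio_priority_t queue selector directly. A hypothetical read issued at synchronous priority, assuming the ZFS kernel headers; the flag choice is illustrative:

#include <sys/zio.h>

static zio_t *
read_sketch(spa_t *spa, const blkptr_t *bp, void *buf, uint64_t size,
    zio_done_func_t *done, void *priv, const zbookmark_t *zb)
{
	/* ZIO_PRIORITY_SYNC_READ selects the per-vdev sync-read queue */
	return (zio_read(NULL, spa, bp, buf, size, done, priv,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, zb));
}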
*/ - typedef int zio_decompress_func_t(void *src, void *dst, -@@ -76,5 +75,5 @@ extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); --extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len, -+extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, - int level); --extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len, -+extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, - int level); -diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h -index 2d062d0..08f8201 100644 ---- a/include/sys/zio_impl.h -+++ b/include/sys/zio_impl.h -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -40,2 +40,66 @@ extern "C" { - /* -+ * XXX -- Describe ZFS I/O pipeline here. Fill in as needed. -+ * -+ * The ZFS I/O pipeline is comprised of various stages which are defined -+ * in the zio_stage enum below. The individual stages are used to construct -+ * these basic I/O operations: Read, Write, Free, Claim, and Ioctl. -+ * -+ * I/O operations: (XXX - provide detail for each of the operations) -+ * -+ * Read: -+ * Write: -+ * Free: -+ * Claim: -+ * Ioctl: -+ * -+ * Although the most common pipeline are used by the basic I/O operations -+ * above, there are some helper pipelines (one could consider them -+ * sub-pipelines) which are used internally by the ZIO module and are -+ * explained below: -+ * -+ * Interlock Pipeline: -+ * The interlock pipeline is the most basic pipeline and is used by all -+ * of the I/O operations. The interlock pipeline does not perform any I/O -+ * and is used to coordinate the dependencies between I/Os that are being -+ * issued (i.e. the parent/child relationship). -+ * -+ * Vdev child Pipeline: -+ * The vdev child pipeline is responsible for performing the physical I/O. -+ * It is in this pipeline where the I/O are queued and possibly cached. -+ * -+ * In addition to performing I/O, the pipeline is also responsible for -+ * data transformations. The transformations performed are based on the -+ * specific properties that user may have selected and modify the -+ * behavior of the pipeline. Examples of supported transformations are -+ * compression, dedup, and nop writes. Transformations will either modify -+ * the data or the pipeline. This list below further describes each of -+ * the supported transformations: -+ * -+ * Compression: -+ * ZFS supports three different flavors of compression -- gzip, lzjb, and -+ * zle. Compression occurs as part of the write pipeline and is performed -+ * in the ZIO_STAGE_WRITE_BP_INIT stage. -+ * -+ * Dedup: -+ * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and -+ * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing -+ * read pipeline if the dedup bit is set on the block pointer. -+ * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage -+ * and added to a write pipeline if a user has enabled dedup on that -+ * particular dataset. -+ * -+ * NOP Write: -+ * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage -+ * and is added to an existing write pipeline if a crypographically -+ * secure checksum (i.e. SHA256) is enabled and compression is turned on. -+ * The NOP write stage will compare the checksums of the current data -+ * on-disk (level-0 blocks only) and the data that is currently being written. 
-+ * If the checksum values are identical then the pipeline is converted to
-+ * an interlock pipeline skipping block allocation and bypassing the
-+ * physical I/O. The nop write feature can handle writes in either
-+ * syncing or open context (i.e. zil writes) and as a result is mutually
-+ * exclusive with dedup.
-+ */
-+
-+/*
- * zio pipeline stage definitions
-@@ -52,23 +116,25 @@ enum zio_stage {
-
-- ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */
-- ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */
-- ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */
-- ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */
-+ ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */
-+
-+ ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */
-+ ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */
-+ ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */
-+ ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */
-
-- ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */
-- ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */
-+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */
-+ ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */
-
-- ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */
-- ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */
-- ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */
-+ ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */
-+ ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */
-+ ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */
-
-- ZIO_STAGE_READY = 1 << 15, /* RWFCI */
-+ ZIO_STAGE_READY = 1 << 16, /* RWFCI */
-
-- ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */
-- ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */
-- ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */
-+ ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RW--I */
-+ ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RW--I */
-+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RW--I */
-
-- ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */
-+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */
-
-- ZIO_STAGE_DONE = 1 << 20 /* RWFCI */
-+ ZIO_STAGE_DONE = 1 << 21 /* RWFCI */
- };
-@@ -149,3 +215,2 @@ enum zio_stage {
- ZIO_STAGE_FREE_BP_INIT | \
-- ZIO_STAGE_ISSUE_ASYNC | \
- ZIO_STAGE_DVA_FREE)
-diff --git a/include/sys/zpl.h b/include/sys/zpl.h
-index 89cf824..56bd9ae 100644
---- a/include/sys/zpl.h
-+++ b/include/sys/zpl.h
-@@ -34,2 +34,3 @@
- #include
-+#include
-
-@@ -73,2 +74,32 @@ extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip,
- const struct qstr *qstr);
-+#if defined(CONFIG_FS_POSIX_ACL)
-+extern int zpl_set_acl(struct inode *ip, int type, struct posix_acl *acl);
-+extern struct posix_acl *zpl_get_acl(struct inode *ip, int type);
-+#if !defined(HAVE_GET_ACL)
-+#if defined(HAVE_CHECK_ACL_WITH_FLAGS)
-+extern int zpl_check_acl(struct inode *inode, int mask, unsigned int flags);
-+#elif defined(HAVE_CHECK_ACL)
-+extern int zpl_check_acl(struct inode *inode, int mask);
-+#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA)
-+extern int zpl_permission(struct inode *ip, int mask, struct nameidata *nd);
-+#elif defined(HAVE_PERMISSION)
-+extern int zpl_permission(struct inode *ip, int mask);
-+#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */
-+#endif /* HAVE_GET_ACL */
-+
-+extern int zpl_init_acl(struct inode *ip, struct inode *dir);
-+extern int zpl_chmod_acl(struct inode *ip);
-+#else
-+static inline int
-+zpl_init_acl(struct inode *ip, struct inode *dir)
-+{
-+ return (0);
-+}
-+
-+static inline int
-+zpl_chmod_acl(struct inode *ip)
-+{
-+ return (0);
-+}
-+#endif /* CONFIG_FS_POSIX_ACL */
-
-@@ -93,3 +124,3 @@ extern const struct inode_operations zpl_ops_shares;
-
--#define DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \
-+#define DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \
- .actor = _actor, \
-@@ -106,3 +137,3 @@ typedef struct dir_context {
-
--#define DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \
-+#define DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \
- .dirent = _dirent, \
-@@ -116,3 +147,4 @@ dir_emit(struct dir_context *ctx, const char *name, int namelen,
- {
-- return ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type) == 0;
-+ return (ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type)
-+ == 0);
- }
-@@ -122,4 +154,4 @@ dir_emit_dot(struct file *file, struct dir_context *ctx)
- {
-- return ctx->actor(ctx->dirent, ".", 1, ctx->pos,
-- file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0;
-+ return (ctx->actor(ctx->dirent, ".", 1, ctx->pos,
-+ file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0);
- }
-@@ -129,4 +161,4 @@ dir_emit_dotdot(struct file *file, struct dir_context *ctx)
- {
-- return ctx->actor(ctx->dirent, "..", 2, ctx->pos,
-- parent_ino(file->f_path.dentry), DT_DIR) == 0;
-+ return (ctx->actor(ctx->dirent, "..", 2, ctx->pos,
-+ parent_ino(file->f_path.dentry), DT_DIR) == 0);
- }
-@@ -138,3 +170,3 @@ dir_emit_dots(struct file *file, struct dir_context *ctx)
- if (!dir_emit_dot(file, ctx))
-- return false;
-+ return (false);
- ctx->pos = 1;
-@@ -143,6 +175,6 @@ dir_emit_dots(struct file *file, struct dir_context *ctx)
- if (!dir_emit_dotdot(file, ctx))
-- return false;
-+ return (false);
- ctx->pos = 2;
- }
-- return true;
-+ return (true);
- }
-diff --git a/include/sys/zvol.h b/include/sys/zvol.h
-index c05f81a..04e0996 100644
---- a/include/sys/zvol.h
-+++ b/include/sys/zvol.h
-@@ -40,6 +40,7 @@ extern boolean_t zvol_is_zvol(const char *);
- extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
--extern int zvol_create_minor(const char *);
--extern int zvol_create_minors(const char *);
--extern int zvol_remove_minor(const char *);
--extern void zvol_remove_minors(const char *);
-+extern int zvol_create_minor(const char *name);
-+extern int zvol_create_minors(const char *name);
-+extern int zvol_remove_minor(const char *name);
-+extern void zvol_remove_minors(const char *name);
-+extern void zvol_rename_minors(const char *oldname, const char *newname);
- extern int zvol_set_volsize(const char *, uint64_t);
-diff --git a/include/zfs_comutil.h b/include/zfs_comutil.h
-index 61327f9..f890543 100644
---- a/include/zfs_comutil.h
-+++ b/include/zfs_comutil.h
-@@ -22,2 +22,3 @@
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
-+ * Copyright (c) 2012 by Delphix. All rights reserved.
- */
-@@ -39,3 +40,4 @@ extern int zfs_zpl_version_map(int spa_version);
- extern int zfs_spa_version_map(int zpl_version);
--extern const char *zfs_history_event_names[LOG_END];
-+#define ZFS_NUM_LEGACY_HISTORY_EVENTS 41
-+extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS];
-
-diff --git a/include/zpios-ctl.h b/include/zpios-ctl.h
-index 6744ae6..82a7fdf 100644
---- a/include/zpios-ctl.h
-+++ b/include/zpios-ctl.h
-@@ -1,2 +1,2 @@
--/*****************************************************************************\
-+/*
- * ZPIOS is a heavily modified version of the original PIOS test code.
-@@ -31,8 +31,9 @@
- * with ZPIOS. If not, see .
--\*****************************************************************************/
-+ */
-
- #ifndef _ZPIOS_CTL_H
--#define _ZPIOS_CTL_H
-+#define _ZPIOS_CTL_H
-
--/* Contains shared definitions which both the userspace
-+/*
-+ * Contains shared definitions which both the userspace
- * and kernelspace portions of zpios must agree on.
-@@ -43,32 +44,32 @@ - --#define ZPIOS_MAJOR 232 /* XXX - Arbitrary */ --#define ZPIOS_MINORS 1 --#define ZPIOS_NAME "zpios" --#define ZPIOS_DEV "/dev/zpios" -- --#define DMU_IO 0x01 -- --#define DMU_WRITE 0x0001 --#define DMU_READ 0x0002 --#define DMU_VERIFY 0x0004 --#define DMU_REMOVE 0x0008 --#define DMU_FPP 0x0010 --#define DMU_WRITE_ZC 0x0020 /* Incompatible w/DMU_VERIFY */ --#define DMU_READ_ZC 0x0040 /* Incompatible w/DMU_VERIFY */ --#define DMU_WRITE_NOWAIT 0x0080 --#define DMU_READ_NOPF 0x0100 -- --#define ZPIOS_NAME_SIZE 16 --#define ZPIOS_PATH_SIZE 128 -- --#define PHASE_PRE_RUN "pre-run" --#define PHASE_PRE_CREATE "pre-create" --#define PHASE_PRE_WRITE "pre-write" --#define PHASE_PRE_READ "pre-read" --#define PHASE_PRE_REMOVE "pre-remove" --#define PHASE_POST_RUN "post-run" --#define PHASE_POST_CREATE "post-create" --#define PHASE_POST_WRITE "post-write" --#define PHASE_POST_READ "post-read" --#define PHASE_POST_REMOVE "post-remove" -+#define ZPIOS_MAJOR 232 /* XXX - Arbitrary */ -+#define ZPIOS_MINORS 1 -+#define ZPIOS_NAME "zpios" -+#define ZPIOS_DEV "/dev/zpios" -+ -+#define DMU_IO 0x01 -+ -+#define DMU_WRITE 0x0001 -+#define DMU_READ 0x0002 -+#define DMU_VERIFY 0x0004 -+#define DMU_REMOVE 0x0008 -+#define DMU_FPP 0x0010 -+#define DMU_WRITE_ZC 0x0020 /* Incompatible w/DMU_VERIFY */ -+#define DMU_READ_ZC 0x0040 /* Incompatible w/DMU_VERIFY */ -+#define DMU_WRITE_NOWAIT 0x0080 -+#define DMU_READ_NOPF 0x0100 -+ -+#define ZPIOS_NAME_SIZE 16 -+#define ZPIOS_PATH_SIZE 128 -+ -+#define PHASE_PRE_RUN "pre-run" -+#define PHASE_PRE_CREATE "pre-create" -+#define PHASE_PRE_WRITE "pre-write" -+#define PHASE_PRE_READ "pre-read" -+#define PHASE_PRE_REMOVE "pre-remove" -+#define PHASE_POST_RUN "post-run" -+#define PHASE_POST_CREATE "post-create" -+#define PHASE_POST_WRITE "post-write" -+#define PHASE_POST_READ "post-read" -+#define PHASE_POST_REMOVE "post-remove" - -@@ -119,4 +120,4 @@ typedef struct zpios_cmd { - uint32_t cmd_flags; /* Test flags */ -- char cmd_pre[ZPIOS_PATH_SIZE]; /* Pre-exec hook */ -- char cmd_post[ZPIOS_PATH_SIZE]; /* Post-exec hook */ -+ char cmd_pre[ZPIOS_PATH_SIZE]; /* Pre-exec hook */ -+ char cmd_post[ZPIOS_PATH_SIZE]; /* Post-exec hook */ - char cmd_log[ZPIOS_PATH_SIZE]; /* Requested log dir */ -@@ -127,11 +128,11 @@ typedef struct zpios_cmd { - /* Valid ioctls */ --#define ZPIOS_CFG _IOWR('f', 101, zpios_cfg_t) --#define ZPIOS_CMD _IOWR('f', 102, zpios_cmd_t) -+#define ZPIOS_CFG _IOWR('f', 101, zpios_cfg_t) -+#define ZPIOS_CMD _IOWR('f', 102, zpios_cmd_t) - - /* Valid configuration commands */ --#define ZPIOS_CFG_BUFFER_CLEAR 0x001 /* Clear text buffer */ --#define ZPIOS_CFG_BUFFER_SIZE 0x002 /* Resize text buffer */ -+#define ZPIOS_CFG_BUFFER_CLEAR 0x001 /* Clear text buffer */ -+#define ZPIOS_CFG_BUFFER_SIZE 0x002 /* Resize text buffer */ - - #ifndef NSEC_PER_SEC --#define NSEC_PER_SEC 1000000000L -+#define NSEC_PER_SEC 1000000000L - #endif -@@ -139,3 +140,4 @@ typedef struct zpios_cmd { - static inline --void zpios_timespec_normalize(zpios_timespec_t *ts, uint32_t sec, uint32_t nsec) -+void -+zpios_timespec_normalize(zpios_timespec_t *ts, uint32_t sec, uint32_t nsec) - { -@@ -154,3 +156,4 @@ void zpios_timespec_normalize(zpios_timespec_t *ts, uint32_t sec, uint32_t nsec) - static inline --zpios_timespec_t zpios_timespec_add(zpios_timespec_t lhs, zpios_timespec_t rhs) -+zpios_timespec_t -+zpios_timespec_add(zpios_timespec_t lhs, zpios_timespec_t rhs) - { -@@ -158,4 +161,4 @@ zpios_timespec_t zpios_timespec_add(zpios_timespec_t lhs, zpios_timespec_t rhs) - 
zpios_timespec_normalize(&ts_delta, lhs.ts_sec + rhs.ts_sec, -- lhs.ts_nsec + rhs.ts_nsec); -- return ts_delta; -+ lhs.ts_nsec + rhs.ts_nsec); -+ return (ts_delta); - } -@@ -163,3 +166,4 @@ zpios_timespec_t zpios_timespec_add(zpios_timespec_t lhs, zpios_timespec_t rhs) - static inline --zpios_timespec_t zpios_timespec_sub(zpios_timespec_t lhs, zpios_timespec_t rhs) -+zpios_timespec_t -+zpios_timespec_sub(zpios_timespec_t lhs, zpios_timespec_t rhs) - { -@@ -167,4 +171,4 @@ zpios_timespec_t zpios_timespec_sub(zpios_timespec_t lhs, zpios_timespec_t rhs) - zpios_timespec_normalize(&ts_delta, lhs.ts_sec - rhs.ts_sec, -- lhs.ts_nsec - rhs.ts_nsec); -- return ts_delta; -+ lhs.ts_nsec - rhs.ts_nsec); -+ return (ts_delta); - } -@@ -174,3 +178,4 @@ zpios_timespec_t zpios_timespec_sub(zpios_timespec_t lhs, zpios_timespec_t rhs) - static inline --zpios_timespec_t zpios_timespec_now(void) -+zpios_timespec_t -+zpios_timespec_now(void) - { -@@ -183,3 +188,3 @@ zpios_timespec_t zpios_timespec_now(void) - -- return zts_now; -+ return (zts_now); - } -@@ -189,6 +194,8 @@ zpios_timespec_t zpios_timespec_now(void) - static inline --double zpios_timespec_to_double(zpios_timespec_t ts) -+double -+zpios_timespec_to_double(zpios_timespec_t ts) - { -- return ((double)(ts.ts_sec) + -- ((double)(ts.ts_nsec) / (double)(NSEC_PER_SEC))); -+ return -+ ((double)(ts.ts_sec) + -+ ((double)(ts.ts_nsec) / (double)(NSEC_PER_SEC))); - } -diff --git a/include/zpios-internal.h b/include/zpios-internal.h -index 24a2feb..4b99b4c 100644 ---- a/include/zpios-internal.h -+++ b/include/zpios-internal.h -@@ -1,2 +1,2 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. -@@ -31,6 +31,6 @@ - * with ZPIOS. If not, see . --\*****************************************************************************/ -+ */ - - #ifndef _ZPIOS_INTERNAL_H --#define _ZPIOS_INTERNAL_H -+#define _ZPIOS_INTERNAL_H - -@@ -38,3 +38,3 @@ - --#define OBJ_SIZE 64 -+#define OBJ_SIZE 64 - -@@ -53,3 +53,3 @@ typedef struct thread_data { - zpios_stats_t stats; -- kmutex_t lock; -+ kmutex_t lock; - } thread_data_t; -@@ -64,3 +64,3 @@ typedef struct zpios_region { - zpios_stats_t stats; -- kmutex_t lock; -+ kmutex_t lock; - } zpios_region_t; -@@ -87,5 +87,5 @@ typedef struct run_args { - objset_t *os; -- wait_queue_head_t waitq; -+ wait_queue_head_t waitq; - volatile uint64_t threads_done; -- kmutex_t lock_work; -+ kmutex_t lock_work; - kmutex_t lock_ctl; -@@ -101,38 +101,12 @@ typedef struct run_args { - --#define ZPIOS_INFO_BUFFER_SIZE 65536 --#define ZPIOS_INFO_BUFFER_REDZONE 1024 -+#define ZPIOS_INFO_BUFFER_SIZE 65536 -+#define ZPIOS_INFO_BUFFER_REDZONE 1024 - - typedef struct zpios_info { -- spinlock_t info_lock; -- int info_size; -- char *info_buffer; -- char *info_head; /* Internal kernel use only */ -+ spinlock_t info_lock; -+ int info_size; -+ char *info_buffer; -+ char *info_head; /* Internal kernel use only */ - } zpios_info_t; - --#define zpios_print(file, format, args...) 
\ --({ zpios_info_t *_info_ = (zpios_info_t *)file->private_data; \ -- int _rc_; \ -- \ -- ASSERT(_info_); \ -- ASSERT(_info_->info_buffer); \ -- \ -- spin_lock(&_info_->info_lock); \ -- \ -- /* Don't allow the kernel to start a write in the red zone */ \ -- if ((int)(_info_->info_head - _info_->info_buffer) > \ -- (_info_->info_size - ZPIOS_INFO_BUFFER_REDZONE)) { \ -- _rc_ = -EOVERFLOW; \ -- } else { \ -- _rc_ = sprintf(_info_->info_head, format, args); \ -- if (_rc_ >= 0) \ -- _info_->info_head += _rc_; \ -- } \ -- \ -- spin_unlock(&_info_->info_lock); \ -- _rc_; \ --}) -- --#define zpios_vprint(file, test, format, args...) \ -- zpios_print(file, "%*s: " format, ZPIOS_NAME_SIZE, test, args) -- - #endif /* _ZPIOS_INTERNAL_H */ -diff --git a/lib/Makefile.am b/lib/Makefile.am -index 09139d5..8e7caf2 100644 ---- a/lib/Makefile.am -+++ b/lib/Makefile.am -@@ -6,2 +6,2 @@ SUBDIRS = libspl libavl libefi libshare libunicode - # incorporate the five convenience libraries given above. --SUBDIRS += libuutil libnvpair libzpool libzfs -+SUBDIRS += libuutil libnvpair libzpool libzfs_core libzfs -diff --git a/lib/libefi/Makefile.am b/lib/libefi/Makefile.am -index aa57dba..55f7b11 100644 ---- a/lib/libefi/Makefile.am -+++ b/lib/libefi/Makefile.am -@@ -12 +12,3 @@ libefi_la_SOURCES = \ - $(top_srcdir)/lib/libefi/rdwr_efi.c -+ -+libefi_la_LIBADD = $(LIBUUID) $(ZLIB) -diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c -index f4cf417..19a573c 100644 ---- a/lib/libefi/rdwr_efi.c -+++ b/lib/libefi/rdwr_efi.c -@@ -89,3 +89,4 @@ struct dk_map2 default_vtoc_map[NDKMAP] = { - --#if defined(i386) || defined(__amd64) || defined(__arm) || defined(__powerpc) -+#if defined(i386) || defined(__amd64) || defined(__arm) || \ -+ defined(__powerpc) || defined(__sparc) - { V_BOOT, V_UNMNT }, /* i - 8 */ -@@ -134,4 +135,4 @@ read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize) - -- if (ioctl(fd, BLKSSZGET, §or_size) < 0) -- return (-1); -+ if (ioctl(fd, BLKSSZGET, §or_size) < 0) -+ return (-1); - -@@ -154,3 +155,3 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - -- memset(dki_info, 0, sizeof(*dki_info)); -+ memset(dki_info, 0, sizeof (*dki_info)); - -@@ -184,4 +185,4 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ dki_info->dki_dname, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/hd", 7) == 0)) { -@@ -190,4 +191,4 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ dki_info->dki_dname, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/md", 7) == 0)) { -@@ -195,5 +196,6 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - dki_info->dki_ctype = DKC_MD; -- rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ strcpy(dki_info->dki_dname, "md"); -+ rval = sscanf(dev_path, "/dev/md%[0-9]p%hu", -+ dki_info->dki_dname + 2, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/vd", 7) == 0)) { -@@ -202,4 +204,4 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ dki_info->dki_dname, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/dm-", 8) == 0)) { -@@ -207,5 +209,6 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - dki_info->dki_ctype = DKC_VBD; -- rval = sscanf(dev_path, 
"/dev/%[a-zA-Z0-9-]p%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ strcpy(dki_info->dki_dname, "dm-"); -+ rval = sscanf(dev_path, "/dev/dm-%[0-9]p%hu", -+ dki_info->dki_dname + 3, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/ram", 8) == 0)) { -@@ -213,5 +216,6 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - dki_info->dki_ctype = DKC_PCMCIA_MEM; -- rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ strcpy(dki_info->dki_dname, "ram"); -+ rval = sscanf(dev_path, "/dev/ram%[0-9]p%hu", -+ dki_info->dki_dname + 3, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/loop", 9) == 0)) { -@@ -219,5 +223,6 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - dki_info->dki_ctype = DKC_VBD; -- rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ strcpy(dki_info->dki_dname, "loop"); -+ rval = sscanf(dev_path, "/dev/loop%[0-9]p%hu", -+ dki_info->dki_dname + 4, -+ &dki_info->dki_partition); - } else { -@@ -397,6 +402,6 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - if (efi_debug) -- fprintf(stderr,"unable to read disk info: %d",errno); -+ fprintf(stderr, "unable to read disk info: %d", errno); - - errno = EIO; -- return -1; -+ return (-1); - } -@@ -408,3 +413,3 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCGETEFI assuming " -- "LBA %d bytes\n", DEV_BSIZE); -+ "LBA %d bytes\n", DEV_BSIZE); - -@@ -417,4 +422,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCGETEFI lseek " -- "error: %d\n", errno); -- return error; -+ "error: %d\n", errno); -+ return (error); - } -@@ -425,4 +430,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCGETEFI read " -- "error: %d\n", errno); -- return error; -+ "error: %d\n", errno); -+ return (error); - } -@@ -432,5 +437,5 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCGETEFI short " -- "read of %d bytes\n", error); -+ "read of %d bytes\n", error); - errno = EIO; -- return -1; -+ return (-1); - } -@@ -443,5 +448,5 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCSETEFI unknown " -- "LBA size\n"); -+ "LBA size\n"); - errno = EIO; -- return -1; -+ return (-1); - } -@@ -452,4 +457,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCSETEFI lseek " -- "error: %d\n", errno); -- return error; -+ "error: %d\n", errno); -+ return (error); - } -@@ -460,4 +465,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCSETEFI write " -- "error: %d\n", errno); -- return error; -+ "error: %d\n", errno); -+ return (error); - } -@@ -467,5 +472,5 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCSETEFI short " -- "write of %d bytes\n", error); -+ "write of %d bytes\n", error); - errno = EIO; -- return -1; -+ return (-1); - } -@@ -475,3 +480,3 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - if (error == -1) -- return error; -+ return (error); - -@@ -479,3 +484,3 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - if (ioctl(fd, BLKFLSBUF, 0) == -1) -- return error; -+ return (error); - -@@ -489,3 +494,3 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - errno = EIO; -- return -1; -+ return (-1); - } -@@ -499,3 +504,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - --int efi_rescan(int fd) -+int -+efi_rescan(int fd) - { -@@ -509,3 +515,3 @@ int efi_rescan(int fd) - (void) fprintf(stderr, "the kernel 
failed to rescan " -- "the partition table: %d\n", errno); -+ "the partition table: %d\n", errno); - return (-1); -@@ -550,3 +556,3 @@ check_label(int fd, dk_efi_t *dk_ioc) - -- if(headerSize < EFI_MIN_LABEL_SIZE || headerSize > EFI_LABEL_SIZE) { -+ if (headerSize < EFI_MIN_LABEL_SIZE || headerSize > EFI_LABEL_SIZE) { - if (efi_debug) -@@ -592,3 +598,3 @@ efi_read(int fd, struct dk_gpt *vtoc) - if ((rval = efi_get_info(fd, &dki_info)) != 0) -- return rval; -+ return (rval); - -@@ -610,4 +616,4 @@ efi_read(int fd, struct dk_gpt *vtoc) - (void) fprintf(stderr, -- "unable to read disk info: %d", -- errno); -+ "unable to read disk info: %d", -+ errno); - } -@@ -644,3 +650,3 @@ efi_read(int fd, struct dk_gpt *vtoc) - if (posix_memalign((void **)&dk_ioc.dki_data, -- disk_info.dki_lbsize, label_len)) -+ disk_info.dki_lbsize, label_len)) - return (VT_ERROR); -@@ -1119,3 +1125,3 @@ efi_write(int fd, struct dk_gpt *vtoc) - if ((rval = efi_get_info(fd, &dki_info)) != 0) -- return rval; -+ return (rval); - -@@ -1158,3 +1164,3 @@ efi_write(int fd, struct dk_gpt *vtoc) - if (posix_memalign((void **)&dk_ioc.dki_data, -- vtoc->efi_lbasize, dk_ioc.dki_length)) -+ vtoc->efi_lbasize, dk_ioc.dki_length)) - return (VT_ERROR); -diff --git a/lib/libshare/libshare.c b/lib/libshare/libshare.c -index 6b39ba8..ea59dcd 100644 ---- a/lib/libshare/libshare.c -+++ b/lib/libshare/libshare.c -@@ -66,3 +66,3 @@ register_fstype(const char *name, const sa_share_ops_t *ops) - if (fstype == NULL) -- return NULL; -+ return (NULL); - -@@ -77,3 +77,3 @@ register_fstype(const char *name, const sa_share_ops_t *ops) - -- return fstype; -+ return (fstype); - } -@@ -88,3 +88,3 @@ sa_init(int init_service) - if (impl_handle == NULL) -- return NULL; -+ return (NULL); - -@@ -107,10 +107,2 @@ libshare_init(void) - libshare_smb_init(); -- -- /* -- * This bit causes /etc/dfs/sharetab to be updated before libzfs gets a -- * chance to read that file; this is necessary because the sharetab file -- * might be out of sync with the NFS kernel exports (e.g. 
due to reboots -- * or users manually removing shares) -- */ -- sa_fini(sa_init(0)); - } -@@ -245,3 +237,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 1; -+ return (1); - } -@@ -250,3 +242,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 0; -+ return (0); - } -@@ -256,3 +248,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 0; -+ return (0); - } -@@ -263,3 +255,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 0; -+ return (0); - } -@@ -268,3 +260,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 0; -+ return (0); - } -@@ -289,3 +281,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - -- return 0; -+ return (0); - } -@@ -300,3 +292,3 @@ update_zfs_share(sa_share_impl_t impl_share, const char *proto) - if (impl_handle->zfs_libhandle == NULL) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -308,3 +300,3 @@ update_zfs_share(sa_share_impl_t impl_share, const char *proto) - if (zhp == NULL) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -314,3 +306,3 @@ update_zfs_share(sa_share_impl_t impl_share, const char *proto) - -- return SA_OK; -+ return (SA_OK); - } -@@ -323,3 +315,3 @@ update_zfs_shares(sa_handle_impl_t impl_handle, const char *proto) - if (impl_handle->zfs_libhandle == NULL) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -330,3 +322,3 @@ update_zfs_shares(sa_handle_impl_t impl_handle, const char *proto) - -- return SA_OK; -+ return (SA_OK); - } -@@ -353,3 +345,3 @@ process_share(sa_handle_impl_t impl_handle, sa_share_impl_t impl_share, - !S_ISDIR(statbuf.st_mode)) -- return SA_BAD_PATH; -+ return (SA_BAD_PATH); - -@@ -423,3 +415,3 @@ err: - -- return rc; -+ return (rc); - } -@@ -489,3 +481,3 @@ find_share(sa_handle_impl_t impl_handle, const char *sharepath) - -- return impl_share; -+ return (impl_share); - } -@@ -495,3 +487,3 @@ sa_find_share(sa_handle_t handle, char *sharepath) - { -- return (sa_share_t)find_share((sa_handle_impl_t)handle, sharepath); -+ return ((sa_share_t)find_share((sa_handle_impl_t)handle, sharepath)); - } -@@ -717,6 +709,6 @@ sa_parse_legacy_options(sa_group_t group, char *options, char *proto) - -- return fstype->ops->validate_shareopts(options); -+ return (fstype->ops->validate_shareopts(options)); - } - -- return SA_INVALID_PROTOCOL; -+ return (SA_INVALID_PROTOCOL); - } -@@ -726,3 +718,3 @@ sa_needs_refresh(sa_handle_t handle) - { -- return B_TRUE; -+ return (B_TRUE); - } -@@ -735,5 +727,5 @@ sa_get_zfs_handle(sa_handle_t handle) - if (impl_handle == NULL) -- return NULL; -+ return (NULL); - -- return impl_handle->zfs_libhandle; -+ return (impl_handle->zfs_libhandle); - } -@@ -748,3 +740,3 @@ alloc_share(const char *sharepath) - if (impl_share == NULL) -- return NULL; -+ return (NULL); - -@@ -754,3 +746,3 @@ alloc_share(const char *sharepath) - free(impl_share); -- return NULL; -+ return (NULL); - } -@@ -762,6 +754,6 @@ alloc_share(const char *sharepath) - free(impl_share); -- return NULL; -+ return (NULL); - } - -- return impl_share; -+ return (impl_share); - } -@@ -801,4 +793,4 @@ sa_zfs_process_share(sa_handle_t handle, sa_group_t group, sa_share_t share, - -- return process_share(impl_handle, impl_share, mountpoint, NULL, -- proto, shareopts, NULL, dataset, B_FALSE); -+ return (process_share(impl_handle, impl_share, mountpoint, NULL, -+ proto, shareopts, NULL, dataset, B_FALSE)); - } -diff --git 
a/lib/libshare/libshare_impl.h b/lib/libshare/libshare_impl.h -index dfcec2f..18d619b 100644 ---- a/lib/libshare/libshare_impl.h -+++ b/lib/libshare/libshare_impl.h -@@ -45,3 +45,3 @@ typedef struct sa_share_impl { - --#define FSINFO(impl_share, fstype) (&(impl_share->fsinfo[fstype->fsinfo_index])) -+#define FSINFO(impl_share, fstype) (&(impl_share->fsinfo[fstype->fsinfo_index])) - -diff --git a/lib/libshare/nfs.c b/lib/libshare/nfs.c -index 00ba0f6..d1b207e 100644 ---- a/lib/libshare/nfs.c -+++ b/lib/libshare/nfs.c -@@ -52,3 +52,3 @@ typedef int (*nfs_host_callback_t)(const char *sharepath, const char *host, - --/** -+/* - * Invokes the specified callback function for each Solaris share option -@@ -64,3 +64,3 @@ foreach_nfs_shareopt(const char *shareopts, - if (shareopts == NULL) -- return SA_OK; -+ return (SA_OK); - -@@ -69,3 +69,3 @@ foreach_nfs_shareopt(const char *shareopts, - if (shareopts_dup == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -97,3 +97,3 @@ foreach_nfs_shareopt(const char *shareopts, - free(shareopts_dup); -- return rc; -+ return (rc); - } -@@ -109,3 +109,3 @@ foreach_nfs_shareopt(const char *shareopts, - -- return 0; -+ return (0); - } -@@ -119,3 +119,3 @@ typedef struct nfs_host_cookie_s { - --/** -+/* - * Helper function for foreach_nfs_host. This function checks whether the -@@ -148,3 +148,3 @@ foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) - if (host_dup == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -165,3 +165,3 @@ foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) - -- return rc; -+ return (rc); - } -@@ -174,6 +174,6 @@ foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Invokes a callback function for all NFS hosts that are set for a share. -@@ -198,3 +198,3 @@ foreach_nfs_host(sa_share_impl_t impl_share, nfs_host_callback_t callback, - --/** -+/* - * Converts a Solaris NFS host specification to its Linux equivalent. -@@ -219,9 +219,9 @@ get_linux_hostspec(const char *solaris_hostspec, char **plinux_hostspec) - if (*plinux_hostspec == NULL) { -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - } - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Used internally by nfs_enable_share to enable sharing for a single host. -@@ -283,8 +283,8 @@ nfs_enable_share_one(const char *sharepath, const char *host, - if (rc < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - else -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Adds a Linux share option to an array of NFS options. 
-@@ -304,3 +304,3 @@ add_linux_shareopt(char **plinux_opts, const char *key, const char *value) - if (new_linux_opts == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -320,6 +320,6 @@ add_linux_shareopt(char **plinux_opts, const char *key, const char *value) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Validates and converts a single Solaris share option to its Linux -@@ -335,3 +335,3 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) - strcmp(key, "sec") == 0) -- return SA_OK; -+ return (SA_OK); - -@@ -340,6 +340,6 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) - -- if (strcmp(key, "root_mapping") == 0) { -- (void) add_linux_shareopt(plinux_opts, "root_squash", NULL); -- key = "anonuid"; -- } -+ if (strcmp(key, "root_mapping") == 0) { -+ (void) add_linux_shareopt(plinux_opts, "root_squash", NULL); -+ key = "anonuid"; -+ } - -@@ -366,3 +366,3 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) - strcmp(key, "anonuid") != 0 && strcmp(key, "anongid") != 0) { -- return SA_SYNTAX_ERR; -+ return (SA_SYNTAX_ERR); - } -@@ -371,6 +371,6 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Takes a string containing Solaris share options (e.g. "sync,no_acl") and -@@ -392,3 +392,4 @@ get_linux_shareopts(const char *shareopts, char **plinux_opts) - -- rc = foreach_nfs_shareopt(shareopts, get_linux_shareopts_cb, plinux_opts); -+ rc = foreach_nfs_shareopt(shareopts, get_linux_shareopts_cb, -+ plinux_opts); - -@@ -399,6 +400,6 @@ get_linux_shareopts(const char *shareopts, char **plinux_opts) - -- return rc; -+ return (rc); - } - --/** -+/* - * Enables NFS sharing for the specified share. -@@ -412,3 +413,3 @@ nfs_enable_share(sa_share_impl_t impl_share) - if (!nfs_available()) { -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - } -@@ -418,3 +419,3 @@ nfs_enable_share(sa_share_impl_t impl_share) - if (shareopts == NULL) -- return SA_OK; -+ return (SA_OK); - -@@ -423,3 +424,3 @@ nfs_enable_share(sa_share_impl_t impl_share) - if (rc != SA_OK) -- return rc; -+ return (rc); - -@@ -429,6 +430,6 @@ nfs_enable_share(sa_share_impl_t impl_share) - -- return rc; -+ return (rc); - } - --/** -+/* - * Used internally by nfs_disable_share to disable sharing for a single host. -@@ -473,8 +474,8 @@ nfs_disable_share_one(const char *sharepath, const char *host, - if (rc < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - else -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Disables NFS sharing for the specified share. -@@ -489,9 +490,9 @@ nfs_disable_share(sa_share_impl_t impl_share) - */ -- return SA_OK; -+ return (SA_OK); - } - -- return foreach_nfs_host(impl_share, nfs_disable_share_one, NULL); -+ return (foreach_nfs_host(impl_share, nfs_disable_share_one, NULL)); - } - --/** -+/* - * Checks whether the specified NFS share options are syntactically correct. -@@ -507,3 +508,3 @@ nfs_validate_shareopts(const char *shareopts) - if (rc != SA_OK) -- return rc; -+ return (rc); - -@@ -511,6 +512,6 @@ nfs_validate_shareopts(const char *shareopts) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Checks whether a share is currently active. 
-@@ -525,3 +526,3 @@ nfs_is_share_active(sa_share_impl_t impl_share) - if (!nfs_available()) -- return B_FALSE; -+ return (B_FALSE); - -@@ -532,6 +533,6 @@ nfs_is_share_active(sa_share_impl_t impl_share) - fclose(nfs_exportfs_temp_fp); -- return B_FALSE; -+ return (B_FALSE); - } - -- while (fgets(line, sizeof(line), nfs_exportfs_temp_fp) != NULL) { -+ while (fgets(line, sizeof (line), nfs_exportfs_temp_fp) != NULL) { - /* -@@ -566,3 +567,3 @@ nfs_is_share_active(sa_share_impl_t impl_share) - fclose(nfs_exportfs_temp_fp); -- return B_TRUE; -+ return (B_TRUE); - } -@@ -572,6 +573,6 @@ nfs_is_share_active(sa_share_impl_t impl_share) - -- return B_FALSE; -+ return (B_FALSE); - } - --/** -+/* - * Called to update a share's options. A share's options might be out of -@@ -606,3 +607,3 @@ nfs_update_shareopts(sa_share_impl_t impl_share, const char *resource, - if (shareopts_dup == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -616,6 +617,6 @@ nfs_update_shareopts(sa_share_impl_t impl_share, const char *resource, - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Clears a share's NFS options. Used by libshare to -@@ -668,3 +669,3 @@ nfs_check_exportfs(void) - if (nfs_exportfs_temp_fd < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -679,3 +680,3 @@ nfs_check_exportfs(void) - nfs_exportfs_temp_fd = -1; -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - } -@@ -683,4 +684,3 @@ nfs_check_exportfs(void) - if (pid > 0) { -- while ((rc = waitpid(pid, &status, 0)) <= 0 && errno == EINTR) -- ; /* empty loop body */ -+ while ((rc = waitpid(pid, &status, 0)) <= 0 && errno == EINTR); - -@@ -689,3 +689,3 @@ nfs_check_exportfs(void) - nfs_exportfs_temp_fd = -1; -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - } -@@ -695,6 +695,6 @@ nfs_check_exportfs(void) - nfs_exportfs_temp_fd = -1; -- return SA_CONFIG_ERR; -+ return (SA_CONFIG_ERR); - } - -- return SA_OK; -+ return (SA_OK); - } -@@ -726,6 +726,6 @@ nfs_available(void) - -- return (nfs_exportfs_temp_fd != -1) ? B_TRUE : B_FALSE; -+ return ((nfs_exportfs_temp_fd != -1) ? B_TRUE : B_FALSE); - } - --/** -+/* - * Initializes the NFS functionality of libshare. -diff --git a/lib/libshare/smb.c b/lib/libshare/smb.c -index a545bfb..1ac1a8d 100644 ---- a/lib/libshare/smb.c -+++ b/lib/libshare/smb.c -@@ -28,3 +28,3 @@ - * shares using the 'net share' command that comes with Samba. -- -+ * - * TESTING -@@ -66,3 +66,3 @@ static sa_fstype_t *smb_fstype; - --/** -+/* - * Retrieve the list of SMB shares. 
-@@ -85,3 +85,3 @@ smb_retrieve_shares(void) - if (shares_dir == NULL) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -93,3 +93,3 @@ smb_retrieve_shares(void) - snprintf(file_path, sizeof (file_path), -- "%s/%s", SHARE_DIR, directory->d_name); -+ "%s/%s", SHARE_DIR, directory->d_name); - -@@ -110,7 +110,7 @@ smb_retrieve_shares(void) - if (name == NULL) { -- rc = SA_NO_MEMORY; -- goto out; -+ rc = SA_NO_MEMORY; -+ goto out; - } - -- while (fgets(line, sizeof(line), share_file_fp)) { -+ while (fgets(line, sizeof (line), share_file_fp)) { - if (line[0] == '#') -@@ -120,3 +120,3 @@ smb_retrieve_shares(void) - while (line[strlen(line) - 1] == '\r' || -- line[strlen(line) - 1] == '\n') -+ line[strlen(line) - 1] == '\n') - line[strlen(line) - 1] = '\0'; -@@ -157,11 +157,12 @@ smb_retrieve_shares(void) - sizeof (shares->name)); -- shares->name [sizeof(shares->name)-1] = '\0'; -+ shares->name [sizeof (shares->name) - 1] = '\0'; - - strncpy(shares->path, path, -- sizeof (shares->path)); -- shares->path [sizeof(shares->path)-1] = '\0'; -+ sizeof (shares->path)); -+ shares->path [sizeof (shares->path) - 1] = '\0'; - - strncpy(shares->comment, comment, -- sizeof (shares->comment)); -- shares->comment[sizeof(shares->comment)-1]='\0'; -+ sizeof (shares->comment)); -+ shares->comment[sizeof (shares->comment)-1] = -+ '\0'; - -@@ -172,5 +173,5 @@ smb_retrieve_shares(void) - -- name = NULL; -- path = NULL; -- comment = NULL; -+ name = NULL; -+ path = NULL; -+ comment = NULL; - guest_ok = NULL; -@@ -192,6 +193,6 @@ out: - -- return rc; -+ return (rc); - } - --/** -+/* - * Used internally by smb_enable_share to enable sharing for a single host. -@@ -206,4 +207,4 @@ smb_enable_share_one(const char *sharename, const char *sharepath) - /* Support ZFS share name regexp '[[:alnum:]_-.: ]' */ -- strncpy(name, sharename, sizeof(name)); -- name [sizeof(name)-1] = '\0'; -+ strncpy(name, sharename, sizeof (name)); -+ name [sizeof (name)-1] = '\0'; - -@@ -222,14 +223,16 @@ smb_enable_share_one(const char *sharename, const char *sharepath) - -- /* CMD: net -S NET_CMD_ARG_HOST usershare add Test1 /share/Test1 \ -- * "Comment" "Everyone:F" */ -- snprintf(comment, sizeof(comment), "Comment: %s", sharepath); -- -- argv[0] = NET_CMD_PATH; -- argv[1] = (char*)"-S"; -- argv[2] = NET_CMD_ARG_HOST; -- argv[3] = (char*)"usershare"; -- argv[4] = (char*)"add"; -- argv[5] = (char*)name; -- argv[6] = (char*)sharepath; -- argv[7] = (char*)comment; -+ /* -+ * CMD: net -S NET_CMD_ARG_HOST usershare add Test1 /share/Test1 \ -+ * "Comment" "Everyone:F" -+ */ -+ snprintf(comment, sizeof (comment), "Comment: %s", sharepath); -+ -+ argv[0] = NET_CMD_PATH; -+ argv[1] = (char *)"-S"; -+ argv[2] = NET_CMD_ARG_HOST; -+ argv[3] = (char *)"usershare"; -+ argv[4] = (char *)"add"; -+ argv[5] = (char *)name; -+ argv[6] = (char *)sharepath; -+ argv[7] = (char *)comment; - argv[8] = "Everyone:F"; -@@ -239,3 +242,3 @@ smb_enable_share_one(const char *sharename, const char *sharepath) - if (rc < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -244,6 +247,6 @@ smb_enable_share_one(const char *sharename, const char *sharepath) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Enables SMB sharing for the specified share. 
-@@ -256,3 +259,3 @@ smb_enable_share(sa_share_impl_t impl_share) - if (!smb_available()) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -260,12 +263,13 @@ smb_enable_share(sa_share_impl_t impl_share) - if (shareopts == NULL) /* on/off */ -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - - if (strcmp(shareopts, "off") == 0) -- return SA_OK; -+ return (SA_OK); - - /* Magic: Enable (i.e., 'create new') share */ -- return smb_enable_share_one(impl_share->dataset, impl_share->sharepath); -+ return (smb_enable_share_one(impl_share->dataset, -+ impl_share->sharepath)); - } - --/** -+/* - * Used internally by smb_disable_share to disable sharing for a single host. -@@ -280,6 +284,6 @@ smb_disable_share_one(const char *sharename) - argv[0] = NET_CMD_PATH; -- argv[1] = (char*)"-S"; -+ argv[1] = (char *)"-S"; - argv[2] = NET_CMD_ARG_HOST; -- argv[3] = (char*)"usershare"; -- argv[4] = (char*)"delete"; -+ argv[3] = (char *)"usershare"; -+ argv[4] = (char *)"delete"; - argv[5] = strdup(sharename); -@@ -289,8 +293,8 @@ smb_disable_share_one(const char *sharename) - if (rc < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - else -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Disables SMB sharing for the specified share. -@@ -307,3 +311,3 @@ smb_disable_share(sa_share_impl_t impl_share) - */ -- return SA_OK; -+ return (SA_OK); - } -@@ -312,3 +316,3 @@ smb_disable_share(sa_share_impl_t impl_share) - if (strcmp(impl_share->sharepath, shares->path) == 0) -- return smb_disable_share_one(shares->name); -+ return (smb_disable_share_one(shares->name)); - -@@ -317,6 +321,6 @@ smb_disable_share(sa_share_impl_t impl_share) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Checks whether the specified SMB share options are syntactically correct. -@@ -328,8 +332,8 @@ smb_validate_shareopts(const char *shareopts) - if ((strcmp(shareopts, "off") == 0) || (strcmp(shareopts, "on") == 0)) -- return SA_OK; -+ return (SA_OK); - -- return SA_SYNTAX_ERR; -+ return (SA_SYNTAX_ERR); - } - --/** -+/* - * Checks whether a share is currently active. -@@ -340,3 +344,3 @@ smb_is_share_active(sa_share_impl_t impl_share) - if (!smb_available()) -- return B_FALSE; -+ return (B_FALSE); - -@@ -347,3 +351,3 @@ smb_is_share_active(sa_share_impl_t impl_share) - if (strcmp(impl_share->sharepath, smb_shares->path) == 0) -- return B_TRUE; -+ return (B_TRUE); - -@@ -352,6 +356,6 @@ smb_is_share_active(sa_share_impl_t impl_share) - -- return B_FALSE; -+ return (B_FALSE); - } - --/** -+/* - * Called to update a share's options. A share's options might be out of -@@ -369,4 +373,4 @@ smb_update_shareopts(sa_share_impl_t impl_share, const char *resource, - -- if(!impl_share) -- return SA_SYSTEM_ERR; -+ if (!impl_share) -+ return (SA_SYSTEM_ERR); - -@@ -386,3 +390,3 @@ smb_update_shareopts(sa_share_impl_t impl_share, const char *resource, - if (shareopts_dup == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -396,6 +400,6 @@ smb_update_shareopts(sa_share_impl_t impl_share, const char *resource, - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Clears a share's SMB options. Used by libshare to -@@ -429,11 +433,11 @@ smb_available(void) - !S_ISDIR(statbuf.st_mode)) -- return B_FALSE; -+ return (B_FALSE); - - if (access(NET_CMD_PATH, F_OK) != 0) -- return B_FALSE; -+ return (B_FALSE); - -- return B_TRUE; -+ return (B_TRUE); - } - --/** -+/* - * Initializes the SMB functionality of libshare. 
-diff --git a/lib/libshare/smb.h b/lib/libshare/smb.h -index f5ac83a..7a0c0fd 100644 ---- a/lib/libshare/smb.h -+++ b/lib/libshare/smb.h -@@ -30,8 +30,8 @@ - --#define SMB_NAME_MAX 255 --#define SMB_COMMENT_MAX 255 -+#define SMB_NAME_MAX 255 -+#define SMB_COMMENT_MAX 255 - --#define SHARE_DIR "/var/lib/samba/usershares" --#define NET_CMD_PATH "/usr/bin/net" --#define NET_CMD_ARG_HOST "127.0.0.1" -+#define SHARE_DIR "/var/lib/samba/usershares" -+#define NET_CMD_PATH "/usr/bin/net" -+#define NET_CMD_ARG_HOST "127.0.0.1" - -diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am -index 089056c..dbf85c4 100644 ---- a/lib/libspl/Makefile.am -+++ b/lib/libspl/Makefile.am -@@ -32,2 +32,2 @@ libspl_la_SOURCES = \ - --libspl_la_LDFLAGS = -lrt -+libspl_la_LIBADD = -lrt -diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/asm-generic/atomic.c -index a3223ea..f5eb4f3 100644 ---- a/lib/libspl/asm-generic/atomic.c -+++ b/lib/libspl/asm-generic/atomic.c -@@ -42,3 +42,3 @@ pthread_mutex_t atomic_lock = PTHREAD_MUTEX_INITIALIZER; - --#define ATOMIC_INC(name, type) \ -+#define ATOMIC_INC(name, type) \ - void atomic_inc_##name(volatile type *target) \ -@@ -61,3 +61,3 @@ ATOMIC_INC(64, uint64_t) - --#define ATOMIC_DEC(name, type) \ -+#define ATOMIC_DEC(name, type) \ - void atomic_dec_##name(volatile type *target) \ -@@ -80,3 +80,3 @@ ATOMIC_DEC(64, uint64_t) - --#define ATOMIC_ADD(name, type1, type2) \ -+#define ATOMIC_ADD(name, type1, type2) \ - void atomic_add_##name(volatile type1 *target, type2 bits) \ -@@ -97,3 +97,4 @@ ATOMIC_ADD(64, uint64_t, int64_t) - --void atomic_add_ptr(volatile void *target, ssize_t bits) -+void -+atomic_add_ptr(volatile void *target, ssize_t bits) - { -@@ -105,3 +106,3 @@ void atomic_add_ptr(volatile void *target, ssize_t bits) - --#define ATOMIC_SUB(name, type1, type2) \ -+#define ATOMIC_SUB(name, type1, type2) \ - void atomic_sub_##name(volatile type1 *target, type2 bits) \ -@@ -122,3 +123,4 @@ ATOMIC_SUB(64, uint64_t, int64_t) - --void atomic_sub_ptr(volatile void *target, ssize_t bits) -+void -+atomic_sub_ptr(volatile void *target, ssize_t bits) - { -@@ -130,3 +132,3 @@ void atomic_sub_ptr(volatile void *target, ssize_t bits) - --#define ATOMIC_OR(name, type) \ -+#define ATOMIC_OR(name, type) \ - void atomic_or_##name(volatile type *target, type bits) \ -@@ -148,3 +150,3 @@ ATOMIC_OR(64, uint64_t) - --#define ATOMIC_AND(name, type) \ -+#define ATOMIC_AND(name, type) \ - void atomic_and_##name(volatile type *target, type bits) \ -@@ -170,3 +172,3 @@ ATOMIC_AND(64, uint64_t) - --#define ATOMIC_INC_NV(name, type) \ -+#define ATOMIC_INC_NV(name, type) \ - type atomic_inc_##name##_nv(volatile type *target) \ -@@ -177,3 +179,3 @@ ATOMIC_AND(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -191,3 +193,3 @@ ATOMIC_INC_NV(64, uint64_t) - --#define ATOMIC_DEC_NV(name, type) \ -+#define ATOMIC_DEC_NV(name, type) \ - type atomic_dec_##name##_nv(volatile type *target) \ -@@ -198,3 +200,3 @@ ATOMIC_INC_NV(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -212,3 +214,3 @@ ATOMIC_DEC_NV(64, uint64_t) - --#define ATOMIC_ADD_NV(name, type1, type2) \ -+#define ATOMIC_ADD_NV(name, type1, type2) \ - type1 atomic_add_##name##_nv(volatile type1 *target, type2 bits)\ -@@ -219,3 +221,3 @@ ATOMIC_DEC_NV(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -231,3 +233,4 @@ ATOMIC_ADD_NV(64, uint64_t, 
int64_t) - --void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) -+void * -+atomic_add_ptr_nv(volatile void *target, ssize_t bits) - { -@@ -239,3 +242,3 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) - -- return ptr; -+ return (ptr); - } -@@ -243,3 +246,3 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) - --#define ATOMIC_SUB_NV(name, type1, type2) \ -+#define ATOMIC_SUB_NV(name, type1, type2) \ - type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits)\ -@@ -250,3 +253,3 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -262,3 +265,4 @@ ATOMIC_SUB_NV(64, uint64_t, int64_t) - --void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) -+void * -+atomic_sub_ptr_nv(volatile void *target, ssize_t bits) - { -@@ -270,3 +274,3 @@ void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) - -- return ptr; -+ return (ptr); - } -@@ -274,3 +278,3 @@ void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) - --#define ATOMIC_OR_NV(name, type) \ -+#define ATOMIC_OR_NV(name, type) \ - type atomic_or_##name##_nv(volatile type *target, type bits) \ -@@ -281,3 +285,3 @@ void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -295,3 +299,3 @@ ATOMIC_OR_NV(64, uint64_t) - --#define ATOMIC_AND_NV(name, type) \ -+#define ATOMIC_AND_NV(name, type) \ - type atomic_and_##name##_nv(volatile type *target, type bits) \ -@@ -302,3 +306,3 @@ ATOMIC_OR_NV(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -320,3 +324,3 @@ ATOMIC_AND_NV(64, uint64_t) - --#define ATOMIC_CAS(name, type) \ -+#define ATOMIC_CAS(name, type) \ - type atomic_cas_##name(volatile type *target, type arg1, type arg2) \ -@@ -329,3 +333,3 @@ ATOMIC_AND_NV(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return old; \ -+ return (old); \ - } -@@ -341,3 +345,4 @@ ATOMIC_CAS(64, uint64_t) - --void *atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) -+void * -+atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) - { -@@ -347,7 +352,7 @@ void *atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) - old = *(void **)target; -- if (old == arg1) -- *(void **)target = arg2; -+ if (old == arg1) -+ *(void **)target = arg2; - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - -- return old; -+ return (old); - } -@@ -359,3 +364,3 @@ void *atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) - --#define ATOMIC_SWAP(name, type) \ -+#define ATOMIC_SWAP(name, type) \ - type atomic_swap_##name(volatile type *target, type bits) \ -@@ -367,3 +372,3 @@ void *atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return old; \ -+ return (old); \ - } -@@ -379,3 +384,4 @@ ATOMIC_SWAP(64, uint64_t) - --void *atomic_swap_ptr(volatile void *target, void *bits) -+void * -+atomic_swap_ptr(volatile void *target, void *bits) - { -@@ -388,3 +394,3 @@ void *atomic_swap_ptr(volatile void *target, void *bits) - -- return old; -+ return (old); - } -@@ -392,3 +398,4 @@ void *atomic_swap_ptr(volatile void *target, void *bits) - --int atomic_set_long_excl(volatile ulong_t *target, uint_t value) -+int -+atomic_set_long_excl(volatile ulong_t *target, uint_t value) - { -@@ -400,3 +407,3 @@ int atomic_set_long_excl(volatile ulong_t 
*target, uint_t value) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); -- return -1; -+ return (-1); - } -@@ -405,6 +412,7 @@ int atomic_set_long_excl(volatile ulong_t *target, uint_t value) - -- return 0; -+ return (0); - } - --int atomic_clear_long_excl(volatile ulong_t *target, uint_t value) -+int -+atomic_clear_long_excl(volatile ulong_t *target, uint_t value) - { -@@ -416,3 +424,3 @@ int atomic_clear_long_excl(volatile ulong_t *target, uint_t value) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); -- return -1; -+ return (-1); - } -@@ -421,6 +429,7 @@ int atomic_clear_long_excl(volatile ulong_t *target, uint_t value) - -- return 0; -+ return (0); - } - --void membar_enter(void) -+void -+membar_enter(void) - { -@@ -429,3 +438,4 @@ void membar_enter(void) - --void membar_exit(void) -+void -+membar_exit(void) - { -@@ -434,3 +444,4 @@ void membar_exit(void) - --void membar_producer(void) -+void -+membar_producer(void) - { -@@ -439,3 +450,4 @@ void membar_producer(void) - --void membar_consumer(void) -+void -+membar_consumer(void) - { -@@ -446,35 +458,42 @@ void membar_consumer(void) - --uint8_t cas8(uint8_t *target, uint8_t arg1, uint8_t arg2) -+uint8_t -+cas8(uint8_t *target, uint8_t arg1, uint8_t arg2) - { -- return atomic_cas_8(target, arg1, arg2); -+ return (atomic_cas_8(target, arg1, arg2)); - } - --uint32_t cas32(uint32_t *target, uint32_t arg1, uint32_t arg2) -+uint32_t -+cas32(uint32_t *target, uint32_t arg1, uint32_t arg2) - { -- return atomic_cas_32(target, arg1, arg2); -+ return (atomic_cas_32(target, arg1, arg2)); - } - --uint64_t cas64(uint64_t *target, uint64_t arg1, uint64_t arg2) -+uint64_t -+cas64(uint64_t *target, uint64_t arg1, uint64_t arg2) - { -- return atomic_cas_64(target, arg1, arg2); -+ return (atomic_cas_64(target, arg1, arg2)); - } - --ulong_t caslong(ulong_t *target, ulong_t arg1, ulong_t arg2) -+ulong_t -+caslong(ulong_t *target, ulong_t arg1, ulong_t arg2) - { -- return atomic_cas_ulong(target, arg1, arg2); -+ return (atomic_cas_ulong(target, arg1, arg2)); - } - --void *casptr(void *target, void *arg1, void *arg2) -+void * -+casptr(void *target, void *arg1, void *arg2) - { -- return atomic_cas_ptr(target, arg1, arg2); -+ return (atomic_cas_ptr(target, arg1, arg2)); - } - --void atomic_and_long(ulong_t *target, ulong_t bits) -+void -+atomic_and_long(ulong_t *target, ulong_t bits) - { -- return atomic_and_ulong(target, bits); -+ return (atomic_and_ulong(target, bits)); - } - --void atomic_or_long(ulong_t *target, ulong_t bits) -+void -+atomic_or_long(ulong_t *target, ulong_t bits) - { -- return atomic_or_ulong(target, bits); -+ return (atomic_or_ulong(target, bits)); - } -diff --git a/lib/libspl/getexecname.c b/lib/libspl/getexecname.c -index c564eed..478351c 100644 ---- a/lib/libspl/getexecname.c -+++ b/lib/libspl/getexecname.c -@@ -43,3 +43,4 @@ getexecname(void) - if (strlen(execname) == 0) { -- rc = readlink("/proc/self/exe", execname, sizeof(execname) - 1); -+ rc = readlink("/proc/self/exe", -+ execname, sizeof (execname) - 1); - if (rc == -1) { -@@ -55,3 +56,3 @@ getexecname(void) - pthread_mutex_unlock(&mtx); -- return ptr; -+ return (ptr); - } -diff --git a/lib/libspl/gethrestime.c b/lib/libspl/gethrestime.c -index be163f8..d37cc2d 100644 ---- a/lib/libspl/gethrestime.c -+++ b/lib/libspl/gethrestime.c -@@ -32,7 +32,7 @@ gethrestime(timestruc_t *ts) - { -- struct timeval tv; -+ struct timeval tv; - -- gettimeofday(&tv, NULL); -- ts->tv_sec = tv.tv_sec; -- ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC; -+ gettimeofday(&tv, NULL); -+ ts->tv_sec = 
tv.tv_sec; -+ ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC; - } -diff --git a/lib/libspl/gethrtime.c b/lib/libspl/gethrtime.c -index c2fd5e0..95ceb18 100644 ---- a/lib/libspl/gethrtime.c -+++ b/lib/libspl/gethrtime.c -@@ -40,6 +40,6 @@ gethrtime(void) - fprintf(stderr, "Error: clock_gettime() = %d\n", rc); -- abort(); -+ abort(); - } - -- return (((u_int64_t)ts.tv_sec) * NANOSEC) + ts.tv_nsec; -+ return ((((u_int64_t)ts.tv_sec) * NANOSEC) + ts.tv_nsec); - } -diff --git a/lib/libspl/getmntany.c b/lib/libspl/getmntany.c -index f0b1cda..d78357a 100644 ---- a/lib/libspl/getmntany.c -+++ b/lib/libspl/getmntany.c -@@ -39,3 +39,3 @@ - --#define BUFSIZE (MNT_LINE_MAX + 2) -+#define BUFSIZE (MNT_LINE_MAX + 2) - -@@ -43,4 +43,5 @@ __thread char buf[BUFSIZE]; - --#define DIFF(xx) ((mrefp->xx != NULL) && \ -- (mgetp->xx == NULL || strcmp(mrefp->xx, mgetp->xx) != 0)) -+#define DIFF(xx) ( \ -+ (mrefp->xx != NULL) && \ -+ (mgetp->xx == NULL || strcmp(mrefp->xx, mgetp->xx) != 0)) - -@@ -51,7 +52,8 @@ getmntany(FILE *fp, struct mnttab *mgetp, struct mnttab *mrefp) - -- while (((ret = _sol_getmntent(fp, mgetp)) == 0) && -- (DIFF(mnt_special) || DIFF(mnt_mountp) || -- DIFF(mnt_fstype) || DIFF(mnt_mntopts))); -+ while ( -+ ((ret = _sol_getmntent(fp, mgetp)) == 0) && ( -+ DIFF(mnt_special) || DIFF(mnt_mountp) || -+ DIFF(mnt_fstype) || DIFF(mnt_mntopts))); - -- return ret; -+ return (ret); - } -@@ -71,3 +73,3 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp) - mgetp->mnt_mntopts = mntbuf.mnt_opts; -- return 0; -+ return (0); - } -@@ -75,5 +77,5 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp) - if (feof(fp)) -- return -1; -+ return (-1); - -- return MNT_TOOLONG; -+ return (MNT_TOOLONG); - } -@@ -91,3 +93,3 @@ getextmntent(FILE *fp, struct extmnttab *mp, int len) - mp->mnt_minor = 0; -- return ret; -+ return (ret); - } -@@ -97,3 +99,3 @@ getextmntent(FILE *fp, struct extmnttab *mp, int len) - -- return ret; -+ return (ret); - } -diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h -index 3704165..d749d1e 100644 ---- a/lib/libspl/include/assert.h -+++ b/lib/libspl/include/assert.h -@@ -29,3 +29,3 @@ - #ifndef _LIBSPL_ASSERT_H --#define _LIBSPL_ASSERT_H -+#define _LIBSPL_ASSERT_H - -diff --git a/lib/libspl/include/devid.h b/lib/libspl/include/devid.h -index 9dfdae8..5406c33 100644 ---- a/lib/libspl/include/devid.h -+++ b/lib/libspl/include/devid.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_DEVID_H --#define _LIBSPL_DEVID_H -+#define _LIBSPL_DEVID_H - -@@ -38,10 +38,70 @@ typedef struct devid_nmlist { - --static inline int devid_str_decode(char *devidstr, ddi_devid_t *retdevid, char **retminor_name) { abort(); } --static inline int devid_deviceid_to_nmlist(char *search_path, ddi_devid_t devid, char *minor_name, devid_nmlist_t **retlist) { abort(); } --static inline void devid_str_free(char *str) { abort(); } --static inline void devid_free(ddi_devid_t devid) { abort(); } --static inline void devid_free_nmlist(devid_nmlist_t *list) { abort(); } --static inline int devid_get(int fd, ddi_devid_t *retdevid) { return -1; } --static inline int devid_get_minor_name(int fd, char **retminor_name) { abort(); } --static inline char *devid_str_encode(ddi_devid_t devid, char *minor_name) { abort(); } -+static inline -+int -+devid_str_decode( -+ char *devidstr, -+ ddi_devid_t *retdevid, -+ char **retminor_name) -+{ -+ abort(); -+} -+ -+static inline -+int -+devid_deviceid_to_nmlist( -+ char *search_path, -+ ddi_devid_t devid, -+ char *minor_name, -+ devid_nmlist_t **retlist) -+{ -+ abort(); -+} -+ -+static inline -+void 
-+devid_str_free(char *str) -+{ -+ abort(); -+} -+ -+static inline -+void -+devid_free(ddi_devid_t devid) -+{ -+ abort(); -+} -+ -+static inline -+void -+devid_free_nmlist(devid_nmlist_t *list) -+{ -+ abort(); -+} -+ -+static inline -+int -+devid_get( -+ int fd, -+ ddi_devid_t *retdevid) -+{ -+ return (-1); -+} -+ -+static inline -+int -+devid_get_minor_name( -+ int fd, -+ char **retminor_name) -+{ -+ abort(); -+} -+ -+static inline -+char * -+devid_str_encode( -+ ddi_devid_t devid, -+ char *minor_name) -+{ -+ abort(); -+} - -diff --git a/lib/libspl/include/libdevinfo.h b/lib/libspl/include/libdevinfo.h -index f0f9d7e..be1d291 100644 ---- a/lib/libspl/include/libdevinfo.h -+++ b/lib/libspl/include/libdevinfo.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_LIBDEVINFO_H --#define _LIBSPL_LIBDEVINFO_H -+#define _LIBSPL_LIBDEVINFO_H - -diff --git a/lib/libspl/include/libgen.h b/lib/libspl/include/libgen.h -index 29e5400..7c03d81 100644 ---- a/lib/libspl/include/libgen.h -+++ b/lib/libspl/include/libgen.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_LIBGEN_H --#define _LIBSPL_LIBGEN_H -+#define _LIBSPL_LIBGEN_H - -diff --git a/lib/libspl/include/libshare.h b/lib/libspl/include/libshare.h -index a35bfac..4016ff0 100644 ---- a/lib/libspl/include/libshare.h -+++ b/lib/libspl/include/libshare.h -@@ -26,3 +26,3 @@ - #ifndef _LIBSPL_LIBSHARE_H --#define _LIBSPL_LIBSHARE_H -+#define _LIBSPL_LIBSHARE_H - -diff --git a/lib/libspl/include/limits.h b/lib/libspl/include/limits.h -index 341a2eb..1a42cfe 100644 ---- a/lib/libspl/include/limits.h -+++ b/lib/libspl/include/limits.h -@@ -29,11 +29,11 @@ - #ifndef _LIBSPL_LIMITS_H --#define _LIBSPL_LIMITS_H -+#define _LIBSPL_LIMITS_H - --#define DBL_DIG 15 --#define DBL_MAX 1.7976931348623157081452E+308 --#define DBL_MIN 2.2250738585072013830903E-308 -+#define DBL_DIG 15 -+#define DBL_MAX 1.7976931348623157081452E+308 -+#define DBL_MIN 2.2250738585072013830903E-308 - --#define FLT_DIG 6 --#define FLT_MAX 3.4028234663852885981170E+38F --#define FLT_MIN 1.1754943508222875079688E-38F -+#define FLT_DIG 6 -+#define FLT_MAX 3.4028234663852885981170E+38F -+#define FLT_MIN 1.1754943508222875079688E-38F - -diff --git a/lib/libspl/include/locale.h b/lib/libspl/include/locale.h -index 98ca330..6c74df7 100644 ---- a/lib/libspl/include/locale.h -+++ b/lib/libspl/include/locale.h -@@ -29,3 +29,3 @@ - #ifndef _LIBSPL_LOCALE_H --#define _LIBSPL_LOCALE_H -+#define _LIBSPL_LOCALE_H - -diff --git a/lib/libspl/include/note.h b/lib/libspl/include/note.h -index ed6b4ba..cb6b33e 100644 ---- a/lib/libspl/include/note.h -+++ b/lib/libspl/include/note.h -@@ -38,3 +38,3 @@ - #ifndef _NOTE_H --#define _NOTE_H -+#define _NOTE_H - -@@ -46,3 +46,3 @@ extern "C" { - --#define NOTE _NOTE -+#define NOTE _NOTE - -diff --git a/lib/libspl/include/priv.h b/lib/libspl/include/priv.h -index 3e8b138..15b76a4 100644 ---- a/lib/libspl/include/priv.h -+++ b/lib/libspl/include/priv.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_PRIV_H --#define _LIBSPL_PRIV_H -+#define _LIBSPL_PRIV_H - -@@ -32,3 +32,3 @@ - /* Couldn't find this definition in OpenGrok */ --#define PRIV_SYS_CONFIG "sys_config" -+#define PRIV_SYS_CONFIG "sys_config" - -diff --git a/lib/libspl/include/rpc/types.h b/lib/libspl/include/rpc/types.h -index 68c17f1..aa9901f 100644 ---- a/lib/libspl/include/rpc/types.h -+++ b/lib/libspl/include/rpc/types.h -@@ -26,3 +26,3 @@ - #ifndef LIBSPL_RPC_TYPES_H --#define LIBSPL_RPC_TYPES_H -+#define LIBSPL_RPC_TYPES_H - -diff --git a/lib/libspl/include/rpc/xdr.h b/lib/libspl/include/rpc/xdr.h -index cd6680f..99500d6 100644 ---- 
a/lib/libspl/include/rpc/xdr.h -+++ b/lib/libspl/include/rpc/xdr.h -@@ -32,3 +32,3 @@ - #ifndef LIBSPL_RPC_XDR_H --#define LIBSPL_RPC_XDR_H -+#define LIBSPL_RPC_XDR_H - -@@ -57,6 +57,6 @@ typedef struct xdr_bytesrec { - */ --#define XDR_PEEK 2 --#define XDR_SKIPBYTES 3 --#define XDR_RDMAGET 4 --#define XDR_RDMASET 5 -+#define XDR_PEEK 2 -+#define XDR_SKIPBYTES 3 -+#define XDR_RDMAGET 4 -+#define XDR_RDMASET 5 - -diff --git a/lib/libspl/include/stdio.h b/lib/libspl/include/stdio.h -index f80fdc0..6152b09 100644 ---- a/lib/libspl/include/stdio.h -+++ b/lib/libspl/include/stdio.h -@@ -29,5 +29,5 @@ - #ifndef _LIBSPL_STDIO_H --#define _LIBSPL_STDIO_H -+#define _LIBSPL_STDIO_H - --#define enable_extended_FILE_stdio(fd, sig) ((void) 0) -+#define enable_extended_FILE_stdio(fd, sig) ((void) 0) - -diff --git a/lib/libspl/include/stdlib.h b/lib/libspl/include/stdlib.h -index 67d6e96..a4ce4f7 100644 ---- a/lib/libspl/include/stdlib.h -+++ b/lib/libspl/include/stdlib.h -@@ -29,3 +29,3 @@ - #ifndef _LIBSPL_STDLIB_H --#define _LIBSPL_STDLIB_H -+#define _LIBSPL_STDLIB_H - -diff --git a/lib/libspl/include/string.h b/lib/libspl/include/string.h -index 213977d..9e5133e 100644 ---- a/lib/libspl/include/string.h -+++ b/lib/libspl/include/string.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_STRING_H --#define _LIBSPL_STRING_H -+#define _LIBSPL_STRING_H - -diff --git a/lib/libspl/include/strings.h b/lib/libspl/include/strings.h -index 48944e1..3f35af4 100644 ---- a/lib/libspl/include/strings.h -+++ b/lib/libspl/include/strings.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_STRINGS_H --#define _LIBSPL_STRINGS_H -+#define _LIBSPL_STRINGS_H - -diff --git a/lib/libspl/include/synch.h b/lib/libspl/include/synch.h -index 2da270a..7ce2a53 100644 ---- a/lib/libspl/include/synch.h -+++ b/lib/libspl/include/synch.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYNCH_H --#define _LIBSPL_SYNCH_H -+#define _LIBSPL_SYNCH_H - -diff --git a/lib/libspl/include/sys/bitmap.h b/lib/libspl/include/sys/bitmap.h -index 8fef7fc..95122ab 100644 ---- a/lib/libspl/include/sys/bitmap.h -+++ b/lib/libspl/include/sys/bitmap.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_BITMAP_H --#define _LIBSPL_SYS_BITMAP_H -+#define _LIBSPL_SYS_BITMAP_H - -diff --git a/lib/libspl/include/sys/callb.h b/lib/libspl/include/sys/callb.h -index 29a6a67..8ffd187 100644 ---- a/lib/libspl/include/sys/callb.h -+++ b/lib/libspl/include/sys/callb.h -@@ -27,3 +27,3 @@ - #ifndef _SYS_CALLB_H --#define _SYS_CALLB_H -+#define _SYS_CALLB_H - -diff --git a/lib/libspl/include/sys/cmn_err.h b/lib/libspl/include/sys/cmn_err.h -index d199361..63ff4eb 100644 ---- a/lib/libspl/include/sys/cmn_err.h -+++ b/lib/libspl/include/sys/cmn_err.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_CMN_ERR_H --#define _LIBSPL_SYS_CMN_ERR_H -+#define _LIBSPL_SYS_CMN_ERR_H - -diff --git a/lib/libspl/include/sys/compress.h b/lib/libspl/include/sys/compress.h -index 6e03e73..282f178 100644 ---- a/lib/libspl/include/sys/compress.h -+++ b/lib/libspl/include/sys/compress.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_COMPRESS_H --#define _LIBSPL_SYS_COMPRESS_H -+#define _LIBSPL_SYS_COMPRESS_H - -diff --git a/lib/libspl/include/sys/cred.h b/lib/libspl/include/sys/cred.h -index 6a58315..463b3ab 100644 ---- a/lib/libspl/include/sys/cred.h -+++ b/lib/libspl/include/sys/cred.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_CRED_H --#define _LIBSPL_SYS_CRED_H -+#define _LIBSPL_SYS_CRED_H - -diff --git a/lib/libspl/include/sys/debug.h b/lib/libspl/include/sys/debug.h -index 0069620..fde4a01 100644 ---- a/lib/libspl/include/sys/debug.h -+++ 
b/lib/libspl/include/sys/debug.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_DEBUG_H --#define _LIBSPL_SYS_DEBUG_H -+#define _LIBSPL_SYS_DEBUG_H - -diff --git a/lib/libspl/include/sys/feature_tests.h b/lib/libspl/include/sys/feature_tests.h -index 96f6271..1a68b75 100644 ---- a/lib/libspl/include/sys/feature_tests.h -+++ b/lib/libspl/include/sys/feature_tests.h -@@ -27,5 +27,5 @@ - #ifndef _SYS_FEATURE_TESTS_H --#define _SYS_FEATURE_TESTS_H -+#define _SYS_FEATURE_TESTS_H - --#define __NORETURN __attribute__((__noreturn__)) -+#define __NORETURN __attribute__((__noreturn__)) - -diff --git a/lib/libspl/include/sys/file.h b/lib/libspl/include/sys/file.h -index 9aaba35..163a4dc 100644 ---- a/lib/libspl/include/sys/file.h -+++ b/lib/libspl/include/sys/file.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_FILE_H --#define _LIBSPL_SYS_FILE_H -+#define _LIBSPL_SYS_FILE_H - -@@ -33,17 +33,17 @@ - --#define FREAD 1 --#define FWRITE 2 --//#define FAPPEND 8 -+#define FREAD 1 -+#define FWRITE 2 -+// #define FAPPEND 8 - --#define FCREAT O_CREAT --#define FTRUNC O_TRUNC --#define FOFFMAX O_LARGEFILE --#define FSYNC O_SYNC --#define FDSYNC O_DSYNC --#define FRSYNC O_RSYNC --#define FEXCL O_EXCL -+#define FCREAT O_CREAT -+#define FTRUNC O_TRUNC -+#define FOFFMAX O_LARGEFILE -+#define FSYNC O_SYNC -+#define FDSYNC O_DSYNC -+#define FRSYNC O_RSYNC -+#define FEXCL O_EXCL - --#define FNODSYNC 0x10000 /* fsync pseudo flag */ --#define FNOFOLLOW 0x20000 /* don't follow symlinks */ --#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ -+#define FNODSYNC 0x10000 /* fsync pseudo flag */ -+#define FNOFOLLOW 0x20000 /* don't follow symlinks */ -+#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ - -diff --git a/lib/libspl/include/sys/frame.h b/lib/libspl/include/sys/frame.h -index f936ab8..a4c7d8b 100644 ---- a/lib/libspl/include/sys/frame.h -+++ b/lib/libspl/include/sys/frame.h -@@ -27,3 +27,3 @@ - #ifndef _SYS_FRAME_H --#define _SYS_FRAME_H -+#define _SYS_FRAME_H - -diff --git a/lib/libspl/include/sys/int_limits.h b/lib/libspl/include/sys/int_limits.h -index 2b50ddd..7af68cd 100644 ---- a/lib/libspl/include/sys/int_limits.h -+++ b/lib/libspl/include/sys/int_limits.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_INT_LIMITS_H --#define _LIBSPL_SYS_INT_LIMITS_H -+#define _LIBSPL_SYS_INT_LIMITS_H - -diff --git a/lib/libspl/include/sys/int_types.h b/lib/libspl/include/sys/int_types.h -index b325122..51e9e02 100644 ---- a/lib/libspl/include/sys/int_types.h -+++ b/lib/libspl/include/sys/int_types.h -@@ -27,3 +27,3 @@ - #ifndef _SOL_SYS_INT_TYPES_H --#define _SOL_SYS_INT_TYPES_H -+#define _SOL_SYS_INT_TYPES_H - -diff --git a/lib/libspl/include/sys/inttypes.h b/lib/libspl/include/sys/inttypes.h -index 7630f2d..d7d0639 100644 ---- a/lib/libspl/include/sys/inttypes.h -+++ b/lib/libspl/include/sys/inttypes.h -@@ -27,3 +27,3 @@ - #ifndef _SOL_SYS_INTTYPES_H --#define _SOL_SYS_INTTYPES_H -+#define _SOL_SYS_INTTYPES_H - -@@ -31,3 +31,3 @@ - --#define _INT64_TYPE -+#define _INT64_TYPE - -diff --git a/lib/libspl/include/sys/isa_defs.h b/lib/libspl/include/sys/isa_defs.h -index 4ab07eb..446dbfc 100644 ---- a/lib/libspl/include/sys/isa_defs.h -+++ b/lib/libspl/include/sys/isa_defs.h -@@ -37,3 +37,3 @@ extern "C" { - #if !defined(__x86_64) --#define __x86_64 -+#define __x86_64 - #endif -@@ -41,3 +41,3 @@ extern "C" { - #if !defined(__amd64) --#define __amd64 -+#define __amd64 - #endif -@@ -45,3 +45,3 @@ extern "C" { - #if !defined(__x86) --#define __x86 -+#define __x86 - #endif -@@ -49,3 +49,3 @@ extern "C" { - #if 
!defined(_LP64) --#define _LP64 -+#define _LP64 - #endif -@@ -53,6 +53,6 @@ extern "C" { - #if !defined(_LITTLE_ENDIAN) --#define _LITTLE_ENDIAN -+#define _LITTLE_ENDIAN - #endif - --#define _SUNOS_VTOC_16 -+#define _SUNOS_VTOC_16 - -@@ -62,3 +62,3 @@ extern "C" { - #if !defined(__i386) --#define __i386 -+#define __i386 - #endif -@@ -66,3 +66,3 @@ extern "C" { - #if !defined(__x86) --#define __x86 -+#define __x86 - #endif -@@ -70,3 +70,3 @@ extern "C" { - #if !defined(_ILP32) --#define _ILP32 -+#define _ILP32 - #endif -@@ -74,6 +74,6 @@ extern "C" { - #if !defined(_LITTLE_ENDIAN) --#define _LITTLE_ENDIAN -+#define _LITTLE_ENDIAN - #endif - --#define _SUNOS_VTOC_16 -+#define _SUNOS_VTOC_16 - -@@ -83,3 +83,3 @@ extern "C" { - #if !defined(__powerpc) --#define __powerpc -+#define __powerpc - #endif -@@ -87,3 +87,3 @@ extern "C" { - #if !defined(__powerpc__) --#define __powerpc__ -+#define __powerpc__ - #endif -@@ -92,5 +92,5 @@ extern "C" { - #ifdef __powerpc64__ --#define _LP64 -+#define _LP64 - #else --#define _LP32 -+#define _LP32 - #endif -@@ -99,6 +99,6 @@ extern "C" { - #if !defined(_BIG_ENDIAN) --#define _BIG_ENDIAN -+#define _BIG_ENDIAN - #endif - --#define _SUNOS_VTOC_16 -+#define _SUNOS_VTOC_16 - -@@ -108,3 +108,3 @@ extern "C" { - #if !defined(__arm) --#define __arm -+#define __arm - #endif -@@ -112,3 +112,3 @@ extern "C" { - #if !defined(__arm__) --#define __arm__ -+#define __arm__ - #endif -@@ -116,10 +116,38 @@ extern "C" { - #if defined(__ARMEL__) --#define _LITTLE_ENDIAN -+#define _LITTLE_ENDIAN - #else --#define _BIG_ENDIAN -+#define _BIG_ENDIAN - #endif - --#define _SUNOS_VTOC_16 -+#define _SUNOS_VTOC_16 - --#else /* Currently only x86_64, i386, arm, and powerpc arches supported */ -+/* sparc arch specific defines */ -+#elif defined(__sparc) || defined(__sparc__) -+ -+#if !defined(__sparc) -+#define __sparc -+#endif -+ -+#if !defined(__sparc__) -+#define __sparc__ -+#endif -+ -+#define _BIG_ENDIAN -+#define _SUNOS_VTOC_16 -+ -+/* sparc64 arch specific defines */ -+#elif defined(__sparc64) || defined(__sparc64__) -+ -+#if !defined(__sparc64) -+#define __sparc64 -+#endif -+ -+#if !defined(__sparc64__) -+#define __sparc64__ -+#endif -+ -+#define _BIG_ENDIAN -+#define _SUNOS_VTOC_16 -+ -+#else /* Currently x86_64, i386, arm, powerpc, and sparc are supported */ - #error "Unsupported ISA type" -diff --git a/lib/libspl/include/sys/kmem.h b/lib/libspl/include/sys/kmem.h -index 401e040..83d4756 100644 ---- a/lib/libspl/include/sys/kmem.h -+++ b/lib/libspl/include/sys/kmem.h -@@ -37,4 +37,4 @@ extern "C" { - --#define kmem_alloc(size, flags) malloc(size) --#define kmem_free(ptr, size) free(ptr) -+#define kmem_alloc(size, flags) malloc(size) -+#define kmem_free(ptr, size) free(ptr) - -diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h -index 6bd2ec8..fcd3ed9 100644 ---- a/lib/libspl/include/sys/kstat.h -+++ b/lib/libspl/include/sys/kstat.h -@@ -230,6 +230,4 @@ typedef struct kstat32 { - /* ks_ndata >= 1 */ --#define KSTAT_TYPE_TXG 5 /* txg statistics */ -- /* ks_ndata >= 0 */ - --#define KSTAT_NUM_TYPES 6 -+#define KSTAT_NUM_TYPES 5 - -@@ -702,25 +700,2 @@ typedef struct kstat_timer { - --/* -- * TXG statistics - bytes read/written and iops performed -- */ --typedef enum kstat_txg_state { -- TXG_STATE_OPEN = 1, -- TXG_STATE_QUIESCING = 2, -- TXG_STATE_SYNCING = 3, -- TXG_STATE_COMMITTED = 4, --} kstat_txg_state_t; -- --typedef struct kstat_txg { -- u_longlong_t txg; /* txg id */ -- kstat_txg_state_t state; /* txg state */ -- hrtime_t birth; /* birth time stamp 
*/ -- u_longlong_t nread; /* number of bytes read */ -- u_longlong_t nwritten; /* number of bytes written */ -- uint_t reads; /* number of read operations */ -- uint_t writes; /* number of write operations */ -- hrtime_t open_time; /* open time */ -- hrtime_t quiesce_time; /* quiesce time */ -- hrtime_t sync_time; /* sync time */ --} kstat_txg_t; -- - #if defined(_KERNEL) -diff --git a/lib/libspl/include/sys/mkdev.h b/lib/libspl/include/sys/mkdev.h -index 76e3a4f..5978de6 100644 ---- a/lib/libspl/include/sys/mkdev.h -+++ b/lib/libspl/include/sys/mkdev.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_MKDEV_H --#define _LIBSPL_SYS_MKDEV_H -+#define _LIBSPL_SYS_MKDEV_H - -diff --git a/lib/libspl/include/sys/mntent.h b/lib/libspl/include/sys/mntent.h -index 8fad65b..b57ffee 100644 ---- a/lib/libspl/include/sys/mntent.h -+++ b/lib/libspl/include/sys/mntent.h -@@ -41,2 +41,3 @@ - #define MOUNT_SOMEOK 0x40 /* At least on mount succeeded */ -+#define MOUNT_BUSY 0x80 /* Mount failed due to EBUSY */ - -@@ -48,3 +49,2 @@ - #define MNTOPT_CONTEXT "context" /* selinux context */ --#define MNTOPT_NOCONTEXT "nocontext" /* No selinux context (zfs-only) */ - #define MNTOPT_FSCONTEXT "fscontext" /* selinux fscontext */ -@@ -94,6 +94,8 @@ - #define MNTOPT_ZFSUTIL "zfsutil" /* called by zfs utility */ -+#define MNTOPT_ACL "acl" /* passed by util-linux-2.24 mount */ -+#define MNTOPT_NOACL "noacl" /* likewise */ -+#define MNTOPT_POSIXACL "posixacl" /* likewise */ - --#define ZS_COMMENT 0x00000000 /* comment */ --#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */ --#define ZS_NOCONTEXT 0x00000002 /* do not add selinux context */ -+#define ZS_COMMENT 0x00000000 /* comment */ -+#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */ - -diff --git a/lib/libspl/include/sys/mnttab.h b/lib/libspl/include/sys/mnttab.h -index a30549a..6bfbdd6 100644 ---- a/lib/libspl/include/sys/mnttab.h -+++ b/lib/libspl/include/sys/mnttab.h -@@ -21,3 +21,3 @@ - */ --/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T*/ -+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ - /* All Rights Reserved */ -@@ -30,3 +30,3 @@ - #ifndef _SYS_MNTTAB_H --#define _SYS_MNTTAB_H -+#define _SYS_MNTTAB_H - -@@ -79,7 +79,7 @@ static inline char *_sol_hasmntopt(struct mnttab *mnt, char *opt) - -- return hasmntopt(&mnt_new, opt); -+ return (hasmntopt(&mnt_new, opt)); - } - --#define hasmntopt _sol_hasmntopt --#define getmntent _sol_getmntent -+#define hasmntopt _sol_hasmntopt -+#define getmntent _sol_getmntent - -diff --git a/lib/libspl/include/sys/mount.h b/lib/libspl/include/sys/mount.h -index 7b1e06b..41cd839 100644 ---- a/lib/libspl/include/sys/mount.h -+++ b/lib/libspl/include/sys/mount.h -@@ -29,3 +29,3 @@ - #ifndef _LIBSPL_SYS_MOUNT_H --#define _LIBSPL_SYS_MOUNT_H -+#define _LIBSPL_SYS_MOUNT_H - -@@ -41,3 +41,3 @@ - #if !defined(BLKGETSIZE64) --#define BLKGETSIZE64 _IOR(0x12, 114, size_t) -+#define BLKGETSIZE64 _IOR(0x12, 114, size_t) - #endif -@@ -50,3 +50,12 @@ - #if !defined(MS_DIRSYNC) --#define MS_DIRSYNC S_WRITE -+#define MS_DIRSYNC S_WRITE -+#endif -+ -+/* -+ * Some old glibc headers don't correctly define MS_POSIXACL and -+ * instead leave it undefined. When using these older headers define -+ * MS_POSIXACL to the reserved value of (1<<16). 
-+ */ -+#if !defined(MS_POSIXACL) -+#define MS_POSIXACL (1<<16) - #endif -@@ -64,5 +73,5 @@ - #ifdef MNT_FORCE --# define MS_FORCE MNT_FORCE -+#define MS_FORCE MNT_FORCE - #else --# define MS_FORCE 0x00000001 -+#define MS_FORCE 0x00000001 - #endif /* MNT_FORCE */ -@@ -70,5 +79,5 @@ - #ifdef MNT_DETACH --# define MS_DETACH MNT_DETACH -+#define MS_DETACH MNT_DETACH - #else --# define MS_DETACH 0x00000002 -+#define MS_DETACH 0x00000002 - #endif /* MNT_DETACH */ -@@ -80,3 +89,3 @@ - */ --#define MS_OVERLAY 0x00000004 -+#define MS_OVERLAY 0x00000004 - -diff --git a/lib/libspl/include/sys/param.h b/lib/libspl/include/sys/param.h -index 75cf0b7..4090cef 100644 ---- a/lib/libspl/include/sys/param.h -+++ b/lib/libspl/include/sys/param.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_PARAM_H --#define _LIBSPL_SYS_PARAM_H -+#define _LIBSPL_SYS_PARAM_H - -@@ -45,17 +45,17 @@ - */ --#define MAXBSIZE 8192 --#define DEV_BSIZE 512 --#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ -+#define MAXBSIZE 8192 -+#define DEV_BSIZE 512 -+#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ - --#define MAXNAMELEN 256 --#define MAXOFFSET_T LLONG_MAX -+#define MAXNAMELEN 256 -+#define MAXOFFSET_T LLONG_MAX - --#define UID_NOBODY 60001 /* user ID no body */ --#define GID_NOBODY UID_NOBODY --#define UID_NOACCESS 60002 /* user ID no access */ -+#define UID_NOBODY 60001 /* user ID no body */ -+#define GID_NOBODY UID_NOBODY -+#define UID_NOACCESS 60002 /* user ID no access */ - --#define MAXUID UINT32_MAX /* max user id */ --#define MAXPROJID MAXUID /* max project id */ -+#define MAXUID UINT32_MAX /* max user id */ -+#define MAXPROJID MAXUID /* max project id */ - --#define PAGESIZE (sysconf(_SC_PAGESIZE)) -+#define PAGESIZE (sysconf(_SC_PAGESIZE)) - -diff --git a/lib/libspl/include/sys/priv.h b/lib/libspl/include/sys/priv.h -index 4a3ab96..76c76d1 100644 ---- a/lib/libspl/include/sys/priv.h -+++ b/lib/libspl/include/sys/priv.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_PRIV_H --#define _LIBSPL_SYS_PRIV_H -+#define _LIBSPL_SYS_PRIV_H - -diff --git a/lib/libspl/include/sys/processor.h b/lib/libspl/include/sys/processor.h -index 0af9dc0..78e95d0 100644 ---- a/lib/libspl/include/sys/processor.h -+++ b/lib/libspl/include/sys/processor.h -@@ -27,5 +27,5 @@ - #ifndef _LIBSPL_SYS_PROCESSOR_H --#define _LIBSPL_SYS_PROCESSOR_H -+#define _LIBSPL_SYS_PROCESSOR_H - --#define getcpuid() (-1) -+#define getcpuid() (-1) - -diff --git a/lib/libspl/include/sys/sdt.h b/lib/libspl/include/sys/sdt.h -index 79733ee..f68f790 100644 ---- a/lib/libspl/include/sys/sdt.h -+++ b/lib/libspl/include/sys/sdt.h -@@ -27,9 +27,9 @@ - #ifndef _LIBSPL_SYS_SDT_H --#define _LIBSPL_SYS_SDT_H -+#define _LIBSPL_SYS_SDT_H - --#define DTRACE_PROBE(a) ((void) 0) --#define DTRACE_PROBE1(a,b,c) ((void) 0) --#define DTRACE_PROBE2(a,b,c,d,e) ((void) 0) --#define DTRACE_PROBE3(a,b,c,d,e,f,g) ((void) 0) --#define DTRACE_PROBE4(a,b,c,d,e,f,g,h,i) ((void) 0) -+#define DTRACE_PROBE(a) ((void) 0) -+#define DTRACE_PROBE1(a, b, c) ((void) 0) -+#define DTRACE_PROBE2(a, b, c, d, e) ((void) 0) -+#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void) 0) -+#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void) 0) - -diff --git a/lib/libspl/include/sys/stack.h b/lib/libspl/include/sys/stack.h -index 41f0beb..59807e9 100644 ---- a/lib/libspl/include/sys/stack.h -+++ b/lib/libspl/include/sys/stack.h -@@ -25,3 +25,3 @@ - #ifndef _SYS_STACK_H --#define _SYS_STACK_H -+#define _SYS_STACK_H - -@@ -29,3 +29,3 @@ - --#define STACK_BIAS 0 -+#define STACK_BIAS 0 - -@@ -41,3 +41,3 @@ stack_getbounds(stack_t *sp) - if 
(rc) -- return rc; -+ return (rc); - -@@ -49,3 +49,3 @@ stack_getbounds(stack_t *sp) - -- return rc; -+ return (rc); - } -@@ -59,7 +59,9 @@ thr_stksegment(stack_t *sp) - if (rc) -- return rc; -+ return (rc); - -- /* thr_stksegment() is expected to set sp.ss_sp to the high stack -- * address, but the stack_getbounds() interface is expected to -- * set sp.ss_sp to the low address. Adjust accordingly. */ -+ /* -+ * thr_stksegment() is expected to set sp.ss_sp to the high stack -+ * address, but the stack_getbounds() interface is expected to -+ * set sp.ss_sp to the low address. Adjust accordingly. -+ */ - sp->ss_sp = (void *)(((uintptr_t)sp->ss_sp) + sp->ss_size); -@@ -67,3 +69,3 @@ thr_stksegment(stack_t *sp) - -- return rc; -+ return (rc); - } -diff --git a/lib/libspl/include/sys/stat.h b/lib/libspl/include/sys/stat.h -index b9ad152..3e8d27e 100644 ---- a/lib/libspl/include/sys/stat.h -+++ b/lib/libspl/include/sys/stat.h -@@ -26,3 +26,3 @@ - #ifndef _LIBSPL_SYS_STAT_H --#define _LIBSPL_SYS_STAT_H -+#define _LIBSPL_SYS_STAT_H - -@@ -39,3 +39,3 @@ fstat64_blk(int fd, struct stat64 *st) - if (fstat64(fd, st) == -1) -- return -1; -+ return (-1); - -@@ -44,6 +44,6 @@ fstat64_blk(int fd, struct stat64 *st) - if (ioctl(fd, BLKGETSIZE64, &st->st_size) != 0) -- return -1; -+ return (-1); - } - -- return 0; -+ return (0); - } -diff --git a/lib/libspl/include/sys/stropts.h b/lib/libspl/include/sys/stropts.h -index e036b0e..08c2e79 100644 ---- a/lib/libspl/include/sys/stropts.h -+++ b/lib/libspl/include/sys/stropts.h -@@ -26,3 +26,3 @@ - #ifndef _LIBSPL_SYS_STROPTS_H --#define _LIBSPL_SYS_STROPTS_H -+#define _LIBSPL_SYS_STROPTS_H - -diff --git a/lib/libspl/include/sys/sysevent.h b/lib/libspl/include/sys/sysevent.h -index 980d145..074d841 100644 ---- a/lib/libspl/include/sys/sysevent.h -+++ b/lib/libspl/include/sys/sysevent.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_SYSEVENT_H --#define _LIBSPL_SYS_SYSEVENT_H -+#define _LIBSPL_SYS_SYSEVENT_H - -diff --git a/lib/libspl/include/sys/sysmacros.h b/lib/libspl/include/sys/sysmacros.h -index 07ab8c9..698b0a7 100644 ---- a/lib/libspl/include/sys/sysmacros.h -+++ b/lib/libspl/include/sys/sysmacros.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_SYSMACROS_H --#define _LIBSPL_SYS_SYSMACROS_H -+#define _LIBSPL_SYS_SYSMACROS_H - -@@ -33,14 +33,14 @@ - #ifndef MIN --#define MIN(a, b) ((a) < (b) ? (a) : (b)) -+#define MIN(a, b) ((a) < (b) ? (a) : (b)) - #endif - #ifndef MAX --#define MAX(a, b) ((a) < (b) ? (b) : (a)) -+#define MAX(a, b) ((a) < (b) ? (b) : (a)) - #endif - #ifndef ABS --#define ABS(a) ((a) < 0 ? -(a) : (a)) -+#define ABS(a) ((a) < 0 ? 
-(a) : (a)) - #endif - --#define makedevice(maj,min) makedev(maj,min) --#define _sysconf(a) sysconf(a) --#define __NORETURN __attribute__ ((noreturn)) -+#define makedevice(maj, min) makedev(maj, min) -+#define _sysconf(a) sysconf(a) -+#define __NORETURN __attribute__((noreturn)) - -@@ -49,15 +49,15 @@ - */ --#define P2ALIGN(x, align) ((x) & -(align)) --#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) --#define P2ROUNDUP(x, align) (-(-(x) & -(align))) --#define P2ROUNDUP_TYPED(x, align, type) \ -+#define P2ALIGN(x, align) ((x) & -(align)) -+#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) -+#define P2ROUNDUP(x, align) (-(-(x) & -(align))) -+#define P2ROUNDUP_TYPED(x, align, type) \ - (-(-(type)(x) & -(type)(align))) --#define P2BOUNDARY(off, len, align) \ -+#define P2BOUNDARY(off, len, align) \ - (((off) ^ ((off) + (len) - 1)) > (align) - 1) --#define P2PHASE(x, align) ((x) & ((align) - 1)) --#define P2NPHASE(x, align) (-(x) & ((align) - 1)) --#define P2NPHASE_TYPED(x, align, type) \ -+#define P2PHASE(x, align) ((x) & ((align) - 1)) -+#define P2NPHASE(x, align) (-(x) & ((align) - 1)) -+#define P2NPHASE_TYPED(x, align, type) \ - (-(type)(x) & ((type)(align) - 1)) --#define ISP2(x) (((x) & ((x) - 1)) == 0) --#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) -+#define ISP2(x) (((x) & ((x) - 1)) == 0) -+#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) - -@@ -74,18 +74,18 @@ - */ --#define P2ALIGN_TYPED(x, align, type) \ -- ((type)(x) & -(type)(align)) --#define P2PHASE_TYPED(x, align, type) \ -- ((type)(x) & ((type)(align) - 1)) --#define P2NPHASE_TYPED(x, align, type) \ -- (-(type)(x) & ((type)(align) - 1)) --#define P2ROUNDUP_TYPED(x, align, type) \ -- (-(-(type)(x) & -(type)(align))) --#define P2END_TYPED(x, align, type) \ -- (-(~(type)(x) & -(type)(align))) --#define P2PHASEUP_TYPED(x, align, phase, type) \ -- ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) --#define P2CROSS_TYPED(x, y, align, type) \ -- (((type)(x) ^ (type)(y)) > (type)(align) - 1) --#define P2SAMEHIGHBIT_TYPED(x, y, type) \ -- (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) -+#define P2ALIGN_TYPED(x, align, type) \ -+ ((type)(x) & -(type)(align)) -+#define P2PHASE_TYPED(x, align, type) \ -+ ((type)(x) & ((type)(align) - 1)) -+#define P2NPHASE_TYPED(x, align, type) \ -+ (-(type)(x) & ((type)(align) - 1)) -+#define P2ROUNDUP_TYPED(x, align, type) \ -+ (-(-(type)(x) & -(type)(align))) -+#define P2END_TYPED(x, align, type) \ -+ (-(~(type)(x) & -(type)(align))) -+#define P2PHASEUP_TYPED(x, align, phase, type) \ -+ ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) -+#define P2CROSS_TYPED(x, y, align, type) \ -+ (((type)(x) ^ (type)(y)) > (type)(align) - 1) -+#define P2SAMEHIGHBIT_TYPED(x, y, type) \ -+ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) - -diff --git a/lib/libspl/include/sys/systeminfo.h b/lib/libspl/include/sys/systeminfo.h -index 9f561aa..3f7cef5 100644 ---- a/lib/libspl/include/sys/systeminfo.h -+++ b/lib/libspl/include/sys/systeminfo.h -@@ -27,6 +27,6 @@ - #ifndef _LIBSPL_SYS_SYSTEMINFO_H --#define _LIBSPL_SYS_SYSTEMINFO_H -+#define _LIBSPL_SYS_SYSTEMINFO_H - --#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ --#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ -+#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ -+#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ - /* to hold a decimal or hex */ -@@ -34,3 +34,3 @@ - --#define sysinfo(cmd,buf,cnt) (-1) -+#define 
sysinfo(cmd, buf, cnt) (-1) - -diff --git a/lib/libspl/include/sys/systm.h b/lib/libspl/include/sys/systm.h -index 5cb088d..1ed031d 100644 ---- a/lib/libspl/include/sys/systm.h -+++ b/lib/libspl/include/sys/systm.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_SYSTM_H --#define _LIBSPL_SYS_SYSTM_H -+#define _LIBSPL_SYS_SYSTM_H - -diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h -index 0cbbd92..f0da440 100644 ---- a/lib/libspl/include/sys/time.h -+++ b/lib/libspl/include/sys/time.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_TIME_H --#define _LIBSPL_SYS_TIME_H -+#define _LIBSPL_SYS_TIME_H - -@@ -33,3 +33,3 @@ - #ifndef SEC --#define SEC 1 -+#define SEC 1 - #endif -@@ -37,3 +37,3 @@ - #ifndef MILLISEC --#define MILLISEC 1000 -+#define MILLISEC 1000 - #endif -@@ -41,3 +41,3 @@ - #ifndef MICROSEC --#define MICROSEC 1000000 -+#define MICROSEC 1000000 - #endif -@@ -45,3 +45,3 @@ - #ifndef NANOSEC --#define NANOSEC 1000000000 -+#define NANOSEC 1000000000 - #endif -@@ -49,3 +49,11 @@ - #ifndef NSEC_PER_USEC --#define NSEC_PER_USEC 1000L -+#define NSEC_PER_USEC 1000L -+#endif -+ -+#ifndef MSEC2NSEC -+#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) -+#endif -+ -+#ifndef NSEC2MSEC -+#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) - #endif -diff --git a/lib/libspl/include/sys/types.h b/lib/libspl/include/sys/types.h -index 77a5b23..bd34dec 100644 ---- a/lib/libspl/include/sys/types.h -+++ b/lib/libspl/include/sys/types.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_TYPES_H --#define _LIBSPL_SYS_TYPES_H -+#define _LIBSPL_SYS_TYPES_H - -@@ -55,5 +55,5 @@ typedef longlong_t diskaddr_t; - --typedef ulong_t pfn_t; /* page frame number */ --typedef ulong_t pgcnt_t; /* number of pages */ --typedef long spgcnt_t; /* signed number of pages */ -+typedef ulong_t pfn_t; /* page frame number */ -+typedef ulong_t pgcnt_t; /* number of pages */ -+typedef long spgcnt_t; /* signed number of pages */ - -diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h -index 8adc923..97e8412 100644 ---- a/lib/libspl/include/sys/uio.h -+++ b/lib/libspl/include/sys/uio.h -@@ -47,4 +47,4 @@ typedef struct iovec iovec_t; - typedef enum uio_rw { -- UIO_READ = 0, -- UIO_WRITE = 1, -+ UIO_READ = 0, -+ UIO_WRITE = 1, - } uio_rw_t; -@@ -52,5 +52,5 @@ typedef enum uio_rw { - typedef enum uio_seg { -- UIO_USERSPACE = 0, -- UIO_SYSSPACE = 1, -- UIO_USERISPACE= 2, -+ UIO_USERSPACE = 0, -+ UIO_SYSSPACE = 1, -+ UIO_USERISPACE = 2, - } uio_seg_t; -@@ -104,4 +104,4 @@ typedef struct xuio { - --#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv --#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw -+#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -+#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - -diff --git a/lib/libspl/include/sys/utsname.h b/lib/libspl/include/sys/utsname.h -index fd323b9..e16e22d 100644 ---- a/lib/libspl/include/sys/utsname.h -+++ b/lib/libspl/include/sys/utsname.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_UTSNAME_H --#define _LIBSPL_UTSNAME_H -+#define _LIBSPL_UTSNAME_H - -diff --git a/lib/libspl/include/sys/va_list.h b/lib/libspl/include/sys/va_list.h -index cf60454..04ad148 100644 ---- a/lib/libspl/include/sys/va_list.h -+++ b/lib/libspl/include/sys/va_list.h -@@ -27,3 +27,3 @@ - #ifndef _SYS_VA_LIST_H --#define _SYS_VA_LIST_H -+#define _SYS_VA_LIST_H - -diff --git a/lib/libspl/include/sys/varargs.h b/lib/libspl/include/sys/varargs.h -index b8a63d8..3d00a33 100644 ---- a/lib/libspl/include/sys/varargs.h -+++ b/lib/libspl/include/sys/varargs.h -@@ -27,3 
+27,3 @@ - #ifndef _LIBSPL_SYS_VARARGS_H --#define _LIBSPL_SYS_VARARGS_H -+#define _LIBSPL_SYS_VARARGS_H - -diff --git a/lib/libspl/include/sys/vnode.h b/lib/libspl/include/sys/vnode.h -index f25e9e9..efcdd2c 100644 ---- a/lib/libspl/include/sys/vnode.h -+++ b/lib/libspl/include/sys/vnode.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_VNODE_H --#define _LIBSPL_SYS_VNODE_H -+#define _LIBSPL_SYS_VNODE_H - -diff --git a/lib/libspl/include/sys/zone.h b/lib/libspl/include/sys/zone.h -index ea7c8bd..bbb964d 100644 ---- a/lib/libspl/include/sys/zone.h -+++ b/lib/libspl/include/sys/zone.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_ZONE_H --#define _LIBSPL_SYS_ZONE_H -+#define _LIBSPL_SYS_ZONE_H - -diff --git a/lib/libspl/include/thread.h b/lib/libspl/include/thread.h -index a72f6d2..74694e2 100644 ---- a/lib/libspl/include/thread.h -+++ b/lib/libspl/include/thread.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_THREAD_H --#define _LIBSPL_THREAD_H -+#define _LIBSPL_THREAD_H - -diff --git a/lib/libspl/include/tzfile.h b/lib/libspl/include/tzfile.h -index 441b8cf..7bd4087 100644 ---- a/lib/libspl/include/tzfile.h -+++ b/lib/libspl/include/tzfile.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_TZFILE_H --#define _LIBSPL_TZFILE_H -+#define _LIBSPL_TZFILE_H - -diff --git a/lib/libspl/include/ucred.h b/lib/libspl/include/ucred.h -index 4ca424e..8178fde 100644 ---- a/lib/libspl/include/ucred.h -+++ b/lib/libspl/include/ucred.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_UCRED_H --#define _LIBSPL_UCRED_H -+#define _LIBSPL_UCRED_H - -diff --git a/lib/libspl/include/umem.h b/lib/libspl/include/umem.h -index f102f66..0d0778c 100644 ---- a/lib/libspl/include/umem.h -+++ b/lib/libspl/include/umem.h -@@ -27,5 +27,6 @@ - #ifndef _LIBSPL_UMEM_H --#define _LIBSPL_UMEM_H -+#define _LIBSPL_UMEM_H - --/* XXX: We should use the real portable umem library if it is detected -+/* -+ * XXX: We should use the real portable umem library if it is detected - * at configure time. 
However, if the library is not available, we can -@@ -50,4 +51,4 @@ typedef void vmem_t; - */ --#define UMEM_DEFAULT 0x0000 /* normal -- may fail */ --#define UMEM_NOFAIL 0x0100 /* Never fails */ -+#define UMEM_DEFAULT 0x0000 /* normal -- may fail */ -+#define UMEM_NOFAIL 0x0100 /* Never fails */ - -@@ -56,8 +57,8 @@ typedef void vmem_t; - */ --#define UMC_NOTOUCH 0x00010000 --#define UMC_NODEBUG 0x00020000 --#define UMC_NOMAGAZINE 0x00040000 --#define UMC_NOHASH 0x00080000 -+#define UMC_NOTOUCH 0x00010000 -+#define UMC_NODEBUG 0x00020000 -+#define UMC_NOMAGAZINE 0x00040000 -+#define UMC_NOHASH 0x00080000 - --#define UMEM_CACHE_NAMELEN 31 -+#define UMEM_CACHE_NAMELEN 31 - -@@ -89,3 +90,3 @@ umem_alloc(size_t size, int flags) - -- return ptr; -+ return (ptr); - } -@@ -107,6 +108,8 @@ umem_alloc_aligned(size_t size, size_t align, int flags) - abort(); -- return NULL; -+ return (NULL); - } - -- return ptr; -+ ASSERT0(P2PHASE_TYPED(ptr, align, uint64_t)); -+ -+ return (ptr); - } -@@ -122,3 +125,3 @@ umem_zalloc(size_t size, int flags) - -- return ptr; -+ return (ptr); - } -@@ -135,7 +138,8 @@ umem_nofail_callback(umem_nofail_callback_t *cb) {} - static inline umem_cache_t * --umem_cache_create(char *name, size_t bufsize, size_t align, -- umem_constructor_t *constructor, -- umem_destructor_t *destructor, -- umem_reclaim_t *reclaim, -- void *priv, void *vmp, int cflags) -+umem_cache_create( -+ char *name, size_t bufsize, size_t align, -+ umem_constructor_t *constructor, -+ umem_destructor_t *destructor, -+ umem_reclaim_t *reclaim, -+ void *priv, void *vmp, int cflags) - { -@@ -143,3 +147,3 @@ umem_cache_create(char *name, size_t bufsize, size_t align, - -- cp = umem_alloc(sizeof(umem_cache_t), UMEM_DEFAULT); -+ cp = umem_alloc(sizeof (umem_cache_t), UMEM_DEFAULT); - if (cp) { -@@ -156,3 +160,3 @@ umem_cache_create(char *name, size_t bufsize, size_t align, - -- return cp; -+ return (cp); - } -@@ -162,3 +166,3 @@ umem_cache_destroy(umem_cache_t *cp) - { -- umem_free(cp, sizeof(umem_cache_t)); -+ umem_free(cp, sizeof (umem_cache_t)); - } -@@ -171,3 +175,4 @@ umem_cache_alloc(umem_cache_t *cp, int flags) - if (cp->cache_align != 0) -- ptr = umem_alloc_aligned(cp->cache_bufsize, cp->cache_align, flags); -+ ptr = umem_alloc_aligned( -+ cp->cache_bufsize, cp->cache_align, flags); - else -@@ -178,3 +183,3 @@ umem_cache_alloc(umem_cache_t *cp, int flags) - -- return ptr; -+ return (ptr); - } -diff --git a/lib/libspl/include/unistd.h b/lib/libspl/include/unistd.h -index dc95e28..53851f4 100644 ---- a/lib/libspl/include/unistd.h -+++ b/lib/libspl/include/unistd.h -@@ -29,17 +29,17 @@ - #ifndef _LIBSPL_UNISTD_H --#define _LIBSPL_UNISTD_H -+#define _LIBSPL_UNISTD_H - - #if !defined(HAVE_IOCTL_IN_UNISTD_H) --# if defined(HAVE_IOCTL_IN_SYS_IOCTL_H) --# include --# elif defined(HAVE_IOCTL_IN_STROPTS_H) --# include --# else --# error "System call ioctl() unavailable" --# endif --#endif -+#if defined(HAVE_IOCTL_IN_SYS_IOCTL_H) -+#include -+#elif defined(HAVE_IOCTL_IN_STROPTS_H) -+#include -+#else /* HAVE_IOCTL_IN_STROPTS_H */ -+#error "System call ioctl() unavailable" -+#endif /* HAVE_IOCTL_IN_SYS_IOCTL_H */ -+#endif /* !HAVE_IOCTL_IN_UNISTD_H */ - - #if !defined(HAVE_ISSETUGID) --# include --# define issetugid() (geteuid() == 0 || getegid() == 0) -+#include -+#define issetugid() (geteuid() == 0 || getegid() == 0) - #endif -diff --git a/lib/libspl/include/util/sscanf.h b/lib/libspl/include/util/sscanf.h -index 9d13bf2..ead36ac 100644 ---- a/lib/libspl/include/util/sscanf.h -+++ b/lib/libspl/include/util/sscanf.h 
-@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_UTIL_SSCANF_H --#define _LIBSPL_UTIL_SSCANF_H -+#define _LIBSPL_UTIL_SSCANF_H - -diff --git a/lib/libspl/include/zone.h b/lib/libspl/include/zone.h -index dd24a1b..b4a6deb 100644 ---- a/lib/libspl/include/zone.h -+++ b/lib/libspl/include/zone.h -@@ -38,4 +38,4 @@ extern "C" { - --#define GLOBAL_ZONEID 0 --#define GLOBAL_ZONEID_NAME "global" -+#define GLOBAL_ZONEID 0 -+#define GLOBAL_ZONEID_NAME "global" - -diff --git a/lib/libspl/mkdirp.c b/lib/libspl/mkdirp.c -index f98e31e..2f09188 100644 ---- a/lib/libspl/mkdirp.c -+++ b/lib/libspl/mkdirp.c -@@ -148,4 +148,6 @@ simplify(const char *str) - -- if (!str) -+ if (!str) { -+ errno = ENOENT; - return (NULL); -+ } - -diff --git a/lib/libspl/zone.c b/lib/libspl/zone.c -index f4269a7..5ca93b2 100644 ---- a/lib/libspl/zone.c -+++ b/lib/libspl/zone.c -@@ -29,22 +29,25 @@ - --zoneid_t getzoneid() -+zoneid_t -+getzoneid() - { -- return GLOBAL_ZONEID; -+ return (GLOBAL_ZONEID); - } - --zoneid_t getzoneidbyname(const char *name) -+zoneid_t -+getzoneidbyname(const char *name) - { -- if(name == NULL) -- return GLOBAL_ZONEID; -+ if (name == NULL) -+ return (GLOBAL_ZONEID); - -- if(strcmp(name, GLOBAL_ZONEID_NAME) == 0) -- return GLOBAL_ZONEID; -+ if (strcmp(name, GLOBAL_ZONEID_NAME) == 0) -+ return (GLOBAL_ZONEID); - -- return EINVAL; -+ return (EINVAL); - } - --ssize_t getzonenamebyid(zoneid_t id, char *buf, size_t buflen) -+ssize_t -+getzonenamebyid(zoneid_t id, char *buf, size_t buflen) - { -- if(id != GLOBAL_ZONEID) -- return EINVAL; -+ if (id != GLOBAL_ZONEID) -+ return (EINVAL); - -@@ -52,4 +55,4 @@ ssize_t getzonenamebyid(zoneid_t id, char *buf, size_t buflen) - -- if(buf == NULL || buflen == 0) -- return ret; -+ if (buf == NULL || buflen == 0) -+ return (ret); - -@@ -58,3 +61,3 @@ ssize_t getzonenamebyid(zoneid_t id, char *buf, size_t buflen) - -- return ret; -+ return (ret); - } -diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am -index 524efaa..8b1f517 100644 ---- a/lib/libzfs/Makefile.am -+++ b/lib/libzfs/Makefile.am -@@ -24,2 +24,3 @@ libzfs_la_SOURCES = \ - libzfs_la_LIBADD = \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la \ - $(top_builddir)/lib/libshare/libshare.la \ -@@ -28,2 +29,3 @@ libzfs_la_LIBADD = \ - --libzfs_la_LDFLAGS = -lm -ldl -version-info 1:1:0 $(LIBSELINUX) -+libzfs_la_LIBADD += -lm -ldl $(LIBBLKID) -+libzfs_la_LDFLAGS = -version-info 2:0:0 -diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c -index 3a83e2d..0bcfc04 100644 ---- a/lib/libzfs/libzfs_changelist.c -+++ b/lib/libzfs/libzfs_changelist.c -@@ -293,6 +293,2 @@ changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) - cn = uu_list_next(clp->cl_list, cn)) { -- zfs_handle_t *hdl; -- -- hdl = cn->cn_handle; -- - /* -@@ -300,3 +296,3 @@ changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) - */ -- if (!isa_child_of(hdl->zfs_name, src)) -+ if (!isa_child_of(cn->cn_handle->zfs_name, src)) - continue; -@@ -306,13 +302,9 @@ changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) - */ -- remove_mountpoint(hdl); -+ remove_mountpoint(cn->cn_handle); - - (void) strlcpy(newname, dst, sizeof (newname)); -- (void) strcat(newname, hdl->zfs_name + strlen(src)); -- -- if (ZFS_IS_VOLUME(hdl)) { -- (void) zvol_remove_link(hdl->zfs_hdl, hdl->zfs_name); -- (void) zvol_create_link(hdl->zfs_hdl, newname); -- } -+ (void) strcat(newname, cn->cn_handle->zfs_name + strlen(src)); - -- (void) strlcpy(hdl->zfs_name, newname, sizeof (hdl->zfs_name)); -+ (void) 
strlcpy(cn->cn_handle->zfs_name, newname, -+ sizeof (cn->cn_handle->zfs_name)); - } -diff --git a/lib/libzfs/libzfs_config.c b/lib/libzfs/libzfs_config.c -index ee94fe1..4175635 100644 ---- a/lib/libzfs/libzfs_config.c -+++ b/lib/libzfs/libzfs_config.c -@@ -108,3 +108,3 @@ namespace_reload(libzfs_handle_t *hdl) - nvpair_t *elem; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - void *cookie; -@@ -263,3 +263,3 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int error; -diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c -index 244b687..5532531 100644 ---- a/lib/libzfs/libzfs_dataset.c -+++ b/lib/libzfs/libzfs_dataset.c -@@ -23,6 +23,8 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. - * Copyright (c) 2012 Pawel Jakub Dawidek . -- * Copyright 2012 Nexenta Systems, Inc. All rights reserved. -+ * Copyright (c) 2013 Martin Matuska. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -63,3 +65,2 @@ - --static int zvol_create_link_common(libzfs_handle_t *, const char *, int); - static int userquota_propname_decode(const char *propname, boolean_t zoned, -@@ -315,3 +316,3 @@ get_recvd_props_ioctl(zfs_handle_t *zhp) - nvlist_t *recvdprops; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int err; -@@ -378,3 +379,3 @@ get_stats(zfs_handle_t *zhp) - int rc = 0; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -418,4 +419,3 @@ make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) - else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER) -- return (-1); /* zpios' and other testing datasets are -- of this type, ignore if encountered */ -+ return (-1); - else -@@ -441,3 +441,3 @@ make_dataset_handle(libzfs_handle_t *hdl, const char *path) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -642,2 +642,3 @@ libzfs_mnttab_update(libzfs_handle_t *hdl) - mnttab_node_t *mtn; -+ avl_index_t where; - -@@ -645,2 +646,3 @@ libzfs_mnttab_update(libzfs_handle_t *hdl) - continue; -+ - mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); -@@ -650,2 +652,13 @@ libzfs_mnttab_update(libzfs_handle_t *hdl) - mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); -+ -+ /* Exclude duplicate mounts */ -+ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, &where) != NULL) { -+ free(mtn->mtn_mt.mnt_special); -+ free(mtn->mtn_mt.mnt_mountp); -+ free(mtn->mtn_mt.mnt_fstype); -+ free(mtn->mtn_mt.mnt_mntopts); -+ free(mtn); -+ continue; -+ } -+ - avl_add(&hdl->libzfs_mnttab_cache, mtn); -@@ -1410,2 +1423,3 @@ zfs_is_namespace_prop(zfs_prop_t prop) - case ZFS_PROP_ATIME: -+ case ZFS_PROP_RELATIME: - case ZFS_PROP_DEVICES: -@@ -1429,3 +1443,3 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int ret = -1; -@@ -1436,4 +1450,3 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) - zfs_prop_t prop; -- boolean_t do_prefix; -- uint64_t idx; -+ boolean_t do_prefix = B_TRUE; - int added_resv = 0; -@@ -1476,8 +1489,13 @@ zfs_prop_set(zfs_handle_t *zhp, const char 
*propname, const char *propval) - /* -- * If the dataset's canmount property is being set to noauto, -- * then we want to prevent unmounting & remounting it. -+ * We don't want to unmount & remount the dataset when changing -+ * its canmount property to 'on' or 'noauto'. We only use -+ * the changelist logic to unmount when setting canmount=off. - */ -- do_prefix = !((prop == ZFS_PROP_CANMOUNT) && -- (zprop_string_to_index(prop, propval, &idx, -- ZFS_TYPE_DATASET) == 0) && (idx == ZFS_CANMOUNT_NOAUTO)); -+ if (prop == ZFS_PROP_CANMOUNT) { -+ uint64_t idx; -+ int err = zprop_string_to_index(prop, propval, &idx, -+ ZFS_TYPE_DATASET); -+ if (err == 0 && idx != ZFS_CANMOUNT_OFF) -+ do_prefix = B_FALSE; -+ } - -@@ -1551,3 +1569,3 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int ret; -@@ -1639,2 +1657,11 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) - (void) get_stats(zhp); -+ -+ /* -+ * Remount the filesystem to propagate the change -+ * if one of the options handled by the generic -+ * Linux namespace layer has been modified. -+ */ -+ if (zfs_is_namespace_prop(prop) && -+ zfs_is_mounted(zhp, NULL)) -+ ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); - } -@@ -1726,3 +1753,3 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - nvlist_t *zplprops = NULL; -@@ -1741,2 +1768,7 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, - -+ case ZFS_PROP_RELATIME: -+ mntopt_on = MNTOPT_RELATIME; -+ mntopt_off = MNTOPT_NORELATIME; -+ break; -+ - case ZFS_PROP_DEVICES: -@@ -1801,2 +1833,3 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, - case ZFS_PROP_ATIME: -+ case ZFS_PROP_RELATIME: - case ZFS_PROP_DEVICES: -@@ -1868,2 +1901,6 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, - -+ case ZFS_PROP_INCONSISTENT: -+ *val = zhp->zfs_dmustats.dds_inconsistent; -+ break; -+ - default: -@@ -2000,6 +2037,3 @@ get_clones_cb(zfs_handle_t *zhp, void *arg) - if (strcmp(gca->buf, gca->origin) == 0) { -- if (nvlist_add_boolean(gca->value, zfs_get_name(zhp)) != 0) { -- zfs_close(zhp); -- return (no_memory(zhp->zfs_hdl)); -- } -+ fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); - gca->numclones--; -@@ -2116,3 +2150,4 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, - &t) == 0) -- (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t) val); -+ (void) snprintf(propbuf, proplen, "%llu", -+ (u_longlong_t) val); - } -@@ -2578,3 +2613,3 @@ zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, - int err; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -2623,3 +2658,3 @@ zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, - (void) snprintf(propbuf, proplen, "%llu", -- (u_longlong_t)propvalue); -+ (u_longlong_t)propvalue); - } else if (propvalue == 0 && -@@ -2638,3 +2673,3 @@ zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, - int err; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - const char *snapname; -@@ -2680,3 +2715,4 @@ zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, - if (literal) { -- (void) snprintf(propbuf, proplen, "%llu", (long long unsigned int)propvalue); -+ (void) snprintf(propbuf, proplen, "%llu", -+ 
(u_longlong_t)propvalue); - } else { -@@ -2688,21 +2724,2 @@ zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, - --int --zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, -- uint64_t *usedp) --{ -- int err; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -- -- (void) strlcpy(zc.zc_name, lastsnap->zfs_name, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, firstsnap->zfs_name, sizeof (zc.zc_value)); -- -- err = ioctl(lastsnap->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_SNAPS, &zc); -- if (err) -- return (err); -- -- *usedp = zc.zc_cookie; -- -- return (0); --} -- - /* -@@ -2777,3 +2794,3 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char parent[ZFS_MAXNAMELEN]; -@@ -2907,3 +2924,2 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) - (cp = strchr(cp, '/')); *cp = '/', cp++) { -- char *logstr; - -@@ -2918,7 +2934,4 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) - -- logstr = hdl->libzfs_log_str; -- hdl->libzfs_log_str = NULL; - if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, - NULL) != 0) { -- hdl->libzfs_log_str = logstr; - opname = dgettext(TEXT_DOMAIN, "create"); -@@ -2927,3 +2940,2 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) - -- hdl->libzfs_log_str = logstr; - h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); -@@ -2985,3 +2997,2 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; - int ret; -@@ -2991,2 +3002,3 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - uint64_t zoned; -+ dmu_objset_type_t ost; - -@@ -3010,4 +3022,3 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - */ -- (void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name)); -- if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { -+ if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -@@ -3018,5 +3029,5 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - if (type == ZFS_TYPE_VOLUME) -- zc.zc_objset_type = DMU_OST_ZVOL; -+ ost = DMU_OST_ZVOL; - else -- zc.zc_objset_type = DMU_OST_ZFS; -+ ost = DMU_OST_ZFS; - -@@ -3072,22 +3083,5 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - -- if (props && zcmd_write_src_nvlist(hdl, &zc, props) != 0) -- return (-1); -- nvlist_free(props); -- - /* create the dataset */ -- ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc); -- -- if (ret == 0 && type == ZFS_TYPE_VOLUME) { -- ret = zvol_create_link(hdl, path); -- if (ret) { -- (void) zfs_standard_error(hdl, errno, -- dgettext(TEXT_DOMAIN, -- "Volume successfully created, but device links " -- "were not created")); -- zcmd_free_nvlists(&zc); -- return (-1); -- } -- } -- -- zcmd_free_nvlists(&zc); -+ ret = lzc_create(path, ost, props); -+ nvlist_free(props); - -@@ -3149,3 +3143,3 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -3154,5 +3148,2 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer) - if (ZFS_IS_VOLUME(zhp)) { -- if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) -- return (-1); -- - zc.zc_objset_type = DMU_OST_ZVOL; -@@ -3184,3 +3175,2 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) - struct destroydata *dd = arg; -- zfs_handle_t *szhp; - char name[ZFS_MAXNAMELEN]; -@@ -3191,16 +3181,4 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) - -- szhp 
= make_dataset_handle(zhp->zfs_hdl, name); -- if (szhp) { -+ if (lzc_exists(name)) - verify(nvlist_add_boolean(dd->nvl, name) == 0); -- zfs_close(szhp); -- } -- -- if (zhp->zfs_type == ZFS_TYPE_VOLUME) { -- (void) zvol_remove_link(zhp->zfs_hdl, name); -- /* -- * NB: this is simply a best-effort. We don't want to -- * return an error, because then we wouldn't visit all -- * the volumes. -- */ -- } - -@@ -3224,3 +3202,3 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) - -- if (nvlist_next_nvpair(dd.nvl, NULL) == NULL) { -+ if (nvlist_empty(dd.nvl)) { - ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, -@@ -3229,3 +3207,3 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) - } else { -- ret = zfs_destroy_snaps_nvl(zhp, dd.nvl, defer); -+ ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); - } -@@ -3236,32 +3214,39 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) - /* -- * Destroys all the snapshots named in the nvlist. They must be underneath -- * the zhp (either snapshots of it, or snapshots of its descendants). -+ * Destroys all the snapshots named in the nvlist. - */ - int --zfs_destroy_snaps_nvl(zfs_handle_t *zhp, nvlist_t *snaps, boolean_t defer) -+zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) - { - int ret; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ nvlist_t *errlist; -+ nvpair_t *pair; - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- if (zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, snaps) != 0) -- return (-1); -- zc.zc_defer_destroy = defer; -+ ret = lzc_destroy_snaps(snaps, defer, &errlist); - -- ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS_NVL, &zc); -- if (ret != 0) { -+ if (ret == 0) -+ return (0); -+ -+ if (nvlist_empty(errlist)) { - char errbuf[1024]; -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); - -- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot destroy snapshots in %s"), zc.zc_name); -+ ret = zfs_standard_error(hdl, ret, errbuf); -+ } -+ for (pair = nvlist_next_nvpair(errlist, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { -+ char errbuf[1024]; -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), -+ nvpair_name(pair)); - -- switch (errno) { -+ switch (fnvpair_value_int32(pair)) { - case EEXIST: -- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, -- "snapshot is cloned")); -- return (zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf)); -- -+ zfs_error_aux(hdl, -+ dgettext(TEXT_DOMAIN, "snapshot is cloned")); -+ ret = zfs_error(hdl, EZFS_EXISTS, errbuf); -+ break; - default: -- return (zfs_standard_error(zhp->zfs_hdl, errno, -- errbuf)); -+ ret = zfs_standard_error(hdl, errno, errbuf); -+ break; - } -@@ -3269,3 +3254,3 @@ zfs_destroy_snaps_nvl(zfs_handle_t *zhp, nvlist_t *snaps, boolean_t defer) - -- return (0); -+ return (ret); - } -@@ -3278,3 +3263,2 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; - char parent[ZFS_MAXNAMELEN]; -@@ -3283,3 +3267,2 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - libzfs_handle_t *hdl = zhp->zfs_hdl; -- zfs_type_t type; - uint64_t zoned; -@@ -3302,11 +3285,10 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - /* do the clone */ -- if (ZFS_IS_VOLUME(zhp)) { -- zc.zc_objset_type = DMU_OST_ZVOL; -- type = ZFS_TYPE_VOLUME; -- } else { -- zc.zc_objset_type 
= DMU_OST_ZFS; -- type = ZFS_TYPE_FILESYSTEM; -- } - - if (props) { -+ zfs_type_t type; -+ if (ZFS_IS_VOLUME(zhp)) { -+ type = ZFS_TYPE_VOLUME; -+ } else { -+ type = ZFS_TYPE_FILESYSTEM; -+ } - if ((props = zfs_valid_proplist(hdl, type, props, zoned, -@@ -3314,16 +3296,6 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - return (-1); -- -- if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { -- nvlist_free(props); -- return (-1); -- } -- -- nvlist_free(props); - } - -- (void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); -- ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_CREATE, &zc); -- -- zcmd_free_nvlists(&zc); -+ ret = lzc_clone(target, zhp->zfs_name, props); -+ nvlist_free(props); - -@@ -3356,4 +3328,2 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - } -- } else if (ZFS_IS_VOLUME(zhp)) { -- ret = zvol_create_link(zhp->zfs_hdl, target); - } -@@ -3363,59 +3333,2 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - --typedef struct promote_data { -- char cb_mountpoint[MAXPATHLEN]; -- const char *cb_target; -- const char *cb_errbuf; -- uint64_t cb_pivot_txg; --} promote_data_t; -- --static int --promote_snap_cb(zfs_handle_t *zhp, void *data) --{ -- promote_data_t *pd = data; -- zfs_handle_t *szhp; -- char snapname[MAXPATHLEN]; -- int rv = 0; -- -- /* We don't care about snapshots after the pivot point */ -- if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) { -- zfs_close(zhp); -- return (0); -- } -- -- /* Remove the device link if it's a zvol. */ -- if (ZFS_IS_VOLUME(zhp)) -- (void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name); -- -- /* Check for conflicting names */ -- (void) strlcpy(snapname, pd->cb_target, sizeof (snapname)); -- (void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname)); -- szhp = make_dataset_handle(zhp->zfs_hdl, snapname); -- if (szhp != NULL) { -- zfs_close(szhp); -- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, -- "snapshot name '%s' from origin \n" -- "conflicts with '%s' from target"), -- zhp->zfs_name, snapname); -- rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf); -- } -- zfs_close(zhp); -- return (rv); --} -- --static int --promote_snap_done_cb(zfs_handle_t *zhp, void *data) --{ -- promote_data_t *pd = data; -- -- /* We don't care about snapshots after the pivot point */ -- if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) { -- /* Create the device link if it's a zvol. 
*/ -- if (ZFS_IS_VOLUME(zhp)) -- (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name); -- } -- -- zfs_close(zhp); -- return (0); --} -- - /* -@@ -3427,8 +3340,5 @@ zfs_promote(zfs_handle_t *zhp) - libzfs_handle_t *hdl = zhp->zfs_hdl; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char parent[MAXPATHLEN]; -- char *cp; - int ret; -- zfs_handle_t *pzhp; -- promote_data_t pd; - char errbuf[1024]; -@@ -3450,25 +3360,3 @@ zfs_promote(zfs_handle_t *zhp) - } -- cp = strchr(parent, '@'); -- *cp = '\0'; - -- /* Walk the snapshots we will be moving */ -- pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); -- if (pzhp == NULL) -- return (-1); -- pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG); -- zfs_close(pzhp); -- pd.cb_target = zhp->zfs_name; -- pd.cb_errbuf = errbuf; -- pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET); -- if (pzhp == NULL) -- return (-1); -- (void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint, -- sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE); -- ret = zfs_iter_snapshots(pzhp, B_FALSE, promote_snap_cb, &pd); -- if (ret != 0) { -- zfs_close(pzhp); -- return (-1); -- } -- -- /* issue the ioctl */ - (void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin, -@@ -3481,13 +3369,5 @@ zfs_promote(zfs_handle_t *zhp) - -- (void) zfs_iter_snapshots(pzhp, B_FALSE, promote_snap_done_cb, -- &pd); -- zfs_close(pzhp); -- - switch (save_errno) { - case EEXIST: -- /* -- * There is a conflicting snapshot name. We -- * should have caught this above, but they could -- * have renamed something in the mean time. -- */ -+ /* There is a conflicting snapshot name. */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -@@ -3500,8 +3380,3 @@ zfs_promote(zfs_handle_t *zhp) - } -- } else { -- (void) zfs_iter_snapshots(zhp, B_FALSE, promote_snap_done_cb, -- &pd); - } -- -- zfs_close(pzhp); - return (ret); -@@ -3509,33 +3384,25 @@ zfs_promote(zfs_handle_t *zhp) - --struct createdata { -- const char *cd_snapname; -- int cd_ifexists; --}; -+typedef struct snapdata { -+ nvlist_t *sd_nvl; -+ const char *sd_snapname; -+} snapdata_t; - - static int --zfs_create_link_cb(zfs_handle_t *zhp, void *arg) -+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) - { -- struct createdata *cd = arg; -- int ret; -- -- if (zhp->zfs_type == ZFS_TYPE_VOLUME) { -- char name[MAXPATHLEN]; -+ snapdata_t *sd = arg; -+ char name[ZFS_MAXNAMELEN]; -+ int rv = 0; - -- (void) strlcpy(name, zhp->zfs_name, sizeof (name)); -- (void) strlcat(name, "@", sizeof (name)); -- (void) strlcat(name, cd->cd_snapname, sizeof (name)); -- (void) zvol_create_link_common(zhp->zfs_hdl, name, -- cd->cd_ifexists); -- /* -- * NB: this is simply a best-effort. We don't want to -- * return an error, because then we wouldn't visit all -- * the volumes. -- */ -- } -+ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { -+ (void) snprintf(name, sizeof (name), -+ "%s@%s", zfs_get_name(zhp), sd->sd_snapname); - -- ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd); -+ fnvlist_add_boolean(sd->sd_nvl, name); - -+ rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); -+ } - zfs_close(zhp); - -- return (ret); -+ return (rv); - } -@@ -3543,89 +3410,107 @@ zfs_create_link_cb(zfs_handle_t *zhp, void *arg) - /* -- * Takes a snapshot of the given dataset. -+ * Creates snapshots. The keys in the snaps nvlist are the snapshots to be -+ * created. 
- */ - int --zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, -- nvlist_t *props) -+zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) - { -- const char *delim; -- char parent[ZFS_MAXNAMELEN]; -- zfs_handle_t *zhp; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; - int ret; - char errbuf[1024]; -+ nvpair_t *elem; -+ nvlist_t *errors; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot snapshot '%s'"), path); -+ "cannot create snapshots ")); - -- /* validate the target name */ -- if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) -- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); -- -- if (props) { -- if ((props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, -- props, B_FALSE, NULL, errbuf)) == NULL) -- return (-1); -+ elem = NULL; -+ while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { -+ const char *snapname = nvpair_name(elem); - -- if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { -- nvlist_free(props); -- return (-1); -+ /* validate the target name */ -+ if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, -+ B_TRUE)) { -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot create snapshot '%s'"), snapname); -+ return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } -+ } - -- nvlist_free(props); -+ if (props != NULL && -+ (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, -+ props, B_FALSE, NULL, errbuf)) == NULL) { -+ return (-1); - } - -- /* make sure the parent exists and is of the appropriate type */ -- delim = strchr(path, '@'); -- (void) strncpy(parent, path, delim - path); -- parent[delim - path] = '\0'; -+ ret = lzc_snapshot(snaps, props, &errors); - -- if ((zhp = zfs_open(hdl, parent, ZFS_TYPE_FILESYSTEM | -- ZFS_TYPE_VOLUME)) == NULL) { -- zcmd_free_nvlists(&zc); -- return (-1); -+ if (ret != 0) { -+ boolean_t printed = B_FALSE; -+ for (elem = nvlist_next_nvpair(errors, NULL); -+ elem != NULL; -+ elem = nvlist_next_nvpair(errors, elem)) { -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot create snapshot '%s'"), nvpair_name(elem)); -+ (void) zfs_standard_error(hdl, -+ fnvpair_value_int32(elem), errbuf); -+ printed = B_TRUE; -+ } -+ if (!printed) { -+ switch (ret) { -+ case EXDEV: -+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -+ "multiple snapshots of same " -+ "fs not allowed")); -+ (void) zfs_error(hdl, EZFS_EXISTS, errbuf); -+ -+ break; -+ default: -+ (void) zfs_standard_error(hdl, ret, errbuf); -+ } -+ } - } - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, delim+1, sizeof (zc.zc_value)); -- if (ZFS_IS_VOLUME(zhp)) -- zc.zc_objset_type = DMU_OST_ZVOL; -- else -- zc.zc_objset_type = DMU_OST_ZFS; -- zc.zc_cookie = recursive; -- ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SNAPSHOT, &zc); -+ nvlist_free(props); -+ nvlist_free(errors); -+ return (ret); -+} - -- zcmd_free_nvlists(&zc); -+int -+zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, -+ nvlist_t *props) -+{ -+ int ret; -+ snapdata_t sd = { 0 }; -+ char fsname[ZFS_MAXNAMELEN]; -+ char *cp; -+ zfs_handle_t *zhp; -+ char errbuf[1024]; - -- /* -- * if it was recursive, the one that actually failed will be in -- * zc.zc_name. 
-- */ -- if (ret != 0) -- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value); -+ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -+ "cannot snapshot %s"), path); - -- if (ret == 0 && recursive) { -- struct createdata cd; -+ if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) -+ return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - -- cd.cd_snapname = delim + 1; -- cd.cd_ifexists = B_FALSE; -- (void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd); -- } -- if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) { -- ret = zvol_create_link(zhp->zfs_hdl, path); -- if (ret != 0) { -- (void) zfs_standard_error(hdl, errno, -- dgettext(TEXT_DOMAIN, -- "Volume successfully snapshotted, but device links " -- "were not created")); -- zfs_close(zhp); -- return (-1); -- } -+ (void) strlcpy(fsname, path, sizeof (fsname)); -+ cp = strchr(fsname, '@'); -+ *cp = '\0'; -+ sd.sd_snapname = cp + 1; -+ -+ if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | -+ ZFS_TYPE_VOLUME)) == NULL) { -+ return (-1); - } - -- if (ret != 0) -- (void) zfs_standard_error(hdl, errno, errbuf); -+ verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0); -+ if (recursive) { -+ (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); -+ } else { -+ fnvlist_add_boolean(sd.sd_nvl, path); -+ } - -+ ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); -+ nvlist_free(sd.sd_nvl); - zfs_close(zhp); -- - return (ret); -@@ -3657,3 +3542,2 @@ rollback_destroy(zfs_handle_t *zhp, void *data) - cbp->cb_create) { -- char *logstr; - -@@ -3664,6 +3548,3 @@ rollback_destroy(zfs_handle_t *zhp, void *data) - -- logstr = zhp->zfs_hdl->libzfs_log_str; -- zhp->zfs_hdl->libzfs_log_str = NULL; - cbp->cb_error |= zfs_destroy(zhp, B_FALSE); -- zhp->zfs_hdl->libzfs_log_str = logstr; - } -@@ -3704,3 +3585,2 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - int err; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; - boolean_t restore_resv = 0; -@@ -3729,4 +3609,2 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - if (zhp->zfs_type == ZFS_TYPE_VOLUME) { -- if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) -- return (-1); - if (zfs_which_resv_prop(zhp, &resv_prop) < 0) -@@ -3738,9 +3616,2 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- -- if (ZFS_IS_VOLUME(zhp)) -- zc.zc_objset_type = DMU_OST_ZVOL; -- else -- zc.zc_objset_type = DMU_OST_ZFS; -- - /* -@@ -3751,5 +3622,5 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - * snapshot since we verified that this was the most recent. 
-- * - */ -- if ((err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_ROLLBACK, &zc)) != 0) { -+ err = lzc_rollback(zhp->zfs_name, NULL, 0); -+ if (err != 0) { - (void) zfs_standard_error_fmt(zhp->zfs_hdl, errno, -@@ -3768,6 +3639,2 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { -- if ((err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name))) { -- zfs_close(zhp); -- return (err); -- } - if (restore_resv) { -@@ -3791,3 +3658,3 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - int ret; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char *delim; -@@ -3885,3 +3752,2 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - if (recursive) { -- struct destroydata dd; - -@@ -3900,11 +3766,2 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - -- dd.snapname = delim + 1; -- -- /* We remove any zvol links prior to renaming them */ -- verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0); -- ret = zfs_iter_filesystems(zhrp, zfs_check_snap_cb, &dd); -- nvlist_free(dd.nvl); -- if (ret) { -- goto error; -- } - } else { -@@ -3958,23 +3815,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - */ -- if (recursive) { -- struct createdata cd; -- -- /* only create links for datasets that had existed */ -- cd.cd_snapname = delim + 1; -- cd.cd_ifexists = B_TRUE; -- (void) zfs_iter_filesystems(zhrp, zfs_create_link_cb, -- &cd); -- } else { -+ if (!recursive) - (void) changelist_postfix(cl); -- } - } else { -- if (recursive) { -- struct createdata cd; -- -- /* only create links for datasets that had existed */ -- cd.cd_snapname = strchr(target, '@') + 1; -- cd.cd_ifexists = B_TRUE; -- ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb, -- &cd); -- } else { -+ if (!recursive) { - changelist_rename(cl, zfs_get_name(zhp), target); -@@ -3997,122 +3837,2 @@ error: - --/* -- * Given a zvol dataset, issue the ioctl to create the appropriate minor node, -- * and wait briefly for udev to create the /dev link. -- */ --int --zvol_create_link(libzfs_handle_t *hdl, const char *dataset) --{ -- return (zvol_create_link_common(hdl, dataset, B_FALSE)); --} -- --static int --zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists) --{ -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -- char path[MAXPATHLEN]; -- int error; -- -- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); -- -- /* -- * Issue the appropriate ioctl. -- */ -- if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) { -- switch (errno) { -- case EEXIST: -- /* -- * Silently ignore the case where the link already -- * exists. This allows 'zfs volinit' to be run multiple -- * times without errors. -- */ -- return (0); -- -- case ENODEV: -- /* -- * snapdev set to hidden : -- * device creation was not permitted (see zvol.c) -- * ignore error quietly -- */ -- return (0); -- -- case ENOENT: -- /* -- * Dataset does not exist in the kernel. If we -- * don't care (see zfs_rename), then ignore the -- * error quietly. -- */ -- if (ifexists) { -- return (0); -- } -- -- /* FALLTHROUGH */ -- -- default: -- return (zfs_standard_error_fmt(hdl, errno, -- dgettext(TEXT_DOMAIN, "cannot create device links " -- "for '%s'"), dataset)); -- } -- } -- -- /* -- * Wait up to 10 seconds for udev to create the device. 
-- */ -- (void) snprintf(path, sizeof (path), "%s/%s", ZVOL_DIR, dataset); -- error = zpool_label_disk_wait(path, 10000); -- if (error) -- (void) printf(gettext("%s may not be immediately " -- "available\n"), path); -- -- return (0); --} -- --/* -- * Remove a minor node for the given zvol and the associated /dev links. -- */ --int --zvol_remove_link(libzfs_handle_t *hdl, const char *dataset) --{ -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -- int timeout = 3000; /* in milliseconds */ -- int error = 0; -- int i; -- -- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); -- -- /* -- * Due to concurrent updates by udev the device may be reported as -- * busy. In this case don't immediately fail. Instead briefly delay -- * and retry the ioctl() which is now likely to succeed. If unable -- * remove the link after timeout milliseconds return the failure. -- */ -- for (i = 0; i < timeout; i++) { -- error = ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc); -- if (error && errno == EBUSY) { -- usleep(1000); -- continue; -- } else { -- break; -- } -- } -- -- if (error) { -- switch (errno) { -- case ENXIO: -- /* -- * Silently ignore the case where the link no longer -- * exists, so that 'zfs volfini' can be run multiple -- * times without errors. -- */ -- return (0); -- -- default: -- return (zfs_standard_error_fmt(hdl, errno, -- dgettext(TEXT_DOMAIN, "cannot remove device " -- "links for '%s': %s"), dataset, strerror(errno))); -- } -- } -- -- return (0); --} -- - nvlist_t * -@@ -4137,3 +3857,4 @@ zfs_get_user_props(zfs_handle_t *zhp) - int --zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) -+zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, -+ boolean_t literal) - { -@@ -4199,3 +3920,3 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) - for (entry = *plp; entry != NULL; entry = entry->pl_next) { -- if (entry->pl_fixed) -+ if (entry->pl_fixed && !literal) - continue; -@@ -4204,3 +3925,3 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) - if (zfs_prop_get(zhp, entry->pl_prop, -- buf, sizeof (buf), NULL, NULL, 0, B_FALSE) == 0) { -+ buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { - if (strlen(buf) > entry->pl_width) -@@ -4210,3 +3931,3 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) - zfs_prop_to_name(entry->pl_prop), -- buf, sizeof (buf), B_FALSE) == 0) -+ buf, sizeof (buf), literal) == 0) - if (strlen(buf) > entry->pl_recvd_width) -@@ -4223,3 +3944,3 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) - entry->pl_user_prop, -- buf, sizeof (buf), B_FALSE) == 0) -+ buf, sizeof (buf), literal) == 0) - if (strlen(buf) > entry->pl_recvd_width) -@@ -4267,3 +3988,3 @@ zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - nvlist_t *nvlist = NULL; -@@ -4349,3 +4070,3 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_useracct_t buf[100]; -@@ -4386,33 +4107,104 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, - -+struct holdarg { -+ nvlist_t *nvl; -+ const char *snapname; -+ const char *tag; -+ boolean_t recursive; -+ int error; -+}; -+ -+static int -+zfs_hold_one(zfs_handle_t *zhp, void *arg) -+{ -+ struct holdarg *ha = arg; -+ char name[ZFS_MAXNAMELEN]; -+ int rv = 0; -+ -+ (void) snprintf(name, sizeof (name), -+ 
"%s@%s", zhp->zfs_name, ha->snapname); -+ -+ if (lzc_exists(name)) -+ fnvlist_add_string(ha->nvl, name, ha->tag); -+ -+ if (ha->recursive) -+ rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); -+ zfs_close(zhp); -+ return (rv); -+} -+ - int - zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, -- boolean_t recursive, boolean_t temphold, boolean_t enoent_ok, -- int cleanup_fd, uint64_t dsobj, uint64_t createtxg) -+ boolean_t recursive, int cleanup_fd) -+{ -+ int ret; -+ struct holdarg ha; -+ -+ ha.nvl = fnvlist_alloc(); -+ ha.snapname = snapname; -+ ha.tag = tag; -+ ha.recursive = recursive; -+ (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); -+ -+ if (nvlist_empty(ha.nvl)) { -+ char errbuf[1024]; -+ -+ fnvlist_free(ha.nvl); -+ ret = ENOENT; -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot hold snapshot '%s@%s'"), -+ zhp->zfs_name, snapname); -+ (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); -+ return (ret); -+ } -+ -+ ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); -+ fnvlist_free(ha.nvl); -+ -+ return (ret); -+} -+ -+int -+zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ int ret; -+ nvlist_t *errors; - libzfs_handle_t *hdl = zhp->zfs_hdl; -+ char errbuf[1024]; -+ nvpair_t *elem; - -- ASSERT(!recursive || dsobj == 0); -+ errors = NULL; -+ ret = lzc_hold(holds, cleanup_fd, &errors); - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); -- if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) -- >= sizeof (zc.zc_string)) -- return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); -- zc.zc_cookie = recursive; -- zc.zc_temphold = temphold; -- zc.zc_cleanup_fd = cleanup_fd; -- zc.zc_sendobj = dsobj; -- zc.zc_createtxg = createtxg; -+ if (ret == 0) { -+ /* There may be errors even in the success case. */ -+ fnvlist_free(errors); -+ return (0); -+ } - -- if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) { -- char errbuf[ZFS_MAXNAMELEN+32]; -+ if (nvlist_empty(errors)) { -+ /* no hold-specific errors */ -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, "cannot hold")); -+ switch (ret) { -+ case ENOTSUP: -+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -+ "pool must be upgraded")); -+ (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); -+ break; -+ case EINVAL: -+ (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); -+ break; -+ default: -+ (void) zfs_standard_error(hdl, ret, errbuf); -+ } -+ } - -- /* -- * if it was recursive, the one that actually failed will be in -- * zc.zc_name. 
-- */ -- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot hold '%s@%s'"), zc.zc_name, snapname); -- switch (errno) { -+ for (elem = nvlist_next_nvpair(errors, NULL); -+ elem != NULL; -+ elem = nvlist_next_nvpair(errors, elem)) { -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot hold snapshot '%s'"), nvpair_name(elem)); -+ switch (fnvpair_value_int32(elem)) { - case E2BIG: -@@ -4424,17 +4216,13 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, - */ -- return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf)); -- case ENOTSUP: -- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -- "pool must be upgraded")); -- return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); -+ (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); -+ break; - case EINVAL: -- return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); -+ (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); -+ break; - case EEXIST: -- return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf)); -- case ENOENT: -- if (enoent_ok) -- return (ENOENT); -- /* FALLTHROUGH */ -+ (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); -+ break; - default: -- return (zfs_standard_error_fmt(hdl, errno, errbuf)); -+ (void) zfs_standard_error(hdl, -+ fnvpair_value_int32(elem), errbuf); - } -@@ -4442,3 +4230,32 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, - -- return (0); -+ fnvlist_free(errors); -+ return (ret); -+} -+ -+static int -+zfs_release_one(zfs_handle_t *zhp, void *arg) -+{ -+ struct holdarg *ha = arg; -+ char name[ZFS_MAXNAMELEN]; -+ int rv = 0; -+ nvlist_t *existing_holds; -+ -+ (void) snprintf(name, sizeof (name), -+ "%s@%s", zhp->zfs_name, ha->snapname); -+ -+ if (lzc_get_holds(name, &existing_holds) != 0) { -+ ha->error = ENOENT; -+ } else if (!nvlist_exists(existing_holds, ha->tag)) { -+ ha->error = ESRCH; -+ } else { -+ nvlist_t *torelease = fnvlist_alloc(); -+ fnvlist_add_boolean(torelease, ha->tag); -+ fnvlist_add_nvlist(ha->nvl, name, torelease); -+ fnvlist_free(torelease); -+ } -+ -+ if (ha->recursive) -+ rv = zfs_iter_filesystems(zhp, zfs_release_one, ha); -+ zfs_close(zhp); -+ return (rv); - } -@@ -4449,25 +4266,45 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ int ret; -+ struct holdarg ha; -+ nvlist_t *errors = NULL; -+ nvpair_t *elem; - libzfs_handle_t *hdl = zhp->zfs_hdl; -+ char errbuf[1024]; - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); -- if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) -- >= sizeof (zc.zc_string)) -- return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); -- zc.zc_cookie = recursive; -+ ha.nvl = fnvlist_alloc(); -+ ha.snapname = snapname; -+ ha.tag = tag; -+ ha.recursive = recursive; -+ ha.error = 0; -+ (void) zfs_release_one(zfs_handle_dup(zhp), &ha); -+ -+ if (nvlist_empty(ha.nvl)) { -+ fnvlist_free(ha.nvl); -+ ret = ha.error; -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot release hold from snapshot '%s@%s'"), -+ zhp->zfs_name, snapname); -+ if (ret == ESRCH) { -+ (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); -+ } else { -+ (void) zfs_standard_error(hdl, ret, errbuf); -+ } -+ return (ret); -+ } - -- if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) { -- char errbuf[ZFS_MAXNAMELEN+32]; -+ ret = lzc_release(ha.nvl, &errors); -+ fnvlist_free(ha.nvl); - -- /* -- * if it was recursive, the one that actually failed will be in -- * zc.zc_name. 
-- */ -+ if (ret == 0) { -+ /* There may be errors even in the success case. */ -+ fnvlist_free(errors); -+ return (0); -+ } -+ -+ if (nvlist_empty(errors)) { -+ /* no hold-specific errors */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot release '%s' from '%s@%s'"), tag, zc.zc_name, -- snapname); -+ "cannot release")); - switch (errno) { -- case ESRCH: -- return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf)); - case ENOTSUP: -@@ -4475,7 +4312,26 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, - "pool must be upgraded")); -- return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); -+ (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); -+ break; -+ default: -+ (void) zfs_standard_error_fmt(hdl, errno, errbuf); -+ } -+ } -+ -+ for (elem = nvlist_next_nvpair(errors, NULL); -+ elem != NULL; -+ elem = nvlist_next_nvpair(errors, elem)) { -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot release hold from snapshot '%s'"), -+ nvpair_name(elem)); -+ switch (fnvpair_value_int32(elem)) { -+ case ESRCH: -+ (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); -+ break; - case EINVAL: -- return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); -+ (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); -+ break; - default: -- return (zfs_standard_error_fmt(hdl, errno, errbuf)); -+ (void) zfs_standard_error_fmt(hdl, -+ fnvpair_value_int32(elem), errbuf); - } -@@ -4483,3 +4339,4 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, - -- return (0); -+ fnvlist_free(errors); -+ return (ret); - } -@@ -4489,3 +4346,3 @@ zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zfs_hdl; -@@ -4494,3 +4351,3 @@ zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) - int err = 0; -- char errbuf[ZFS_MAXNAMELEN+32]; -+ char errbuf[1024]; - -@@ -4556,6 +4413,6 @@ zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zfs_hdl; - char *nvbuf; -- char errbuf[ZFS_MAXNAMELEN+32]; -+ char errbuf[1024]; - size_t nvsz; -@@ -4610,34 +4467,14 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -- libzfs_handle_t *hdl = zhp->zfs_hdl; -- int nvsz = 2048; -- void *nvbuf; -- int err = 0; -- char errbuf[ZFS_MAXNAMELEN+32]; -- -- assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); -- --tryagain: -- -- nvbuf = malloc(nvsz); -- if (nvbuf == NULL) { -- err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); -- goto out; -- } -+ int err; -+ char errbuf[1024]; - -- zc.zc_nvlist_dst_size = nvsz; -- zc.zc_nvlist_dst = (uintptr_t)nvbuf; -+ err = lzc_get_holds(zhp->zfs_name, nvl); - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN); -+ if (err != 0) { -+ libzfs_handle_t *hdl = zhp->zfs_hdl; - -- if (zfs_ioctl(hdl, ZFS_IOC_GET_HOLDS, &zc) != 0) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), -- zc.zc_name); -- switch (errno) { -- case ENOMEM: -- free(nvbuf); -- nvsz = zc.zc_nvlist_dst_size; -- goto tryagain; -- -+ zhp->zfs_name); -+ switch (err) { - case ENOTSUP: -@@ -4657,15 +4494,4 @@ tryagain: - } -- } else { -- /* success */ -- int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); -- if (rc) { -- (void) snprintf(errbuf, sizeof (errbuf), -- dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), -- zc.zc_name); -- err = zfs_standard_error_fmt(hdl, rc, errbuf); -- } 
- } - -- free(nvbuf); --out: - return (err); -@@ -4673,2 +4499,7 @@ out: - -+/* -+ * Convert the zvol's volume size to an appropriate reservation. -+ * Note: If this routine is updated, it is necessary to update the ZFS test -+ * suite's shell version in reservation.kshlib. -+ */ - uint64_t -diff --git a/lib/libzfs/libzfs_diff.c b/lib/libzfs/libzfs_diff.c -index 77d5a09..7472d24 100644 ---- a/lib/libzfs/libzfs_diff.c -+++ b/lib/libzfs/libzfs_diff.c -@@ -92,3 +92,3 @@ get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int error; -@@ -381,3 +381,3 @@ write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *lhdl = di->zhp->zfs_hdl; -@@ -509,3 +509,3 @@ make_temp_snapshot(differ_info_t *di) - libzfs_handle_t *hdl = di->zhp->zfs_hdl; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -751,3 +751,3 @@ zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char errbuf[1024]; -diff --git a/lib/libzfs/libzfs_fru.c b/lib/libzfs/libzfs_fru.c -index 78f2f9c..6be927f 100644 ---- a/lib/libzfs/libzfs_fru.c -+++ b/lib/libzfs/libzfs_fru.c -@@ -363,3 +363,3 @@ zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru) - { -- zfs_cmd_t zc = { 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -463,3 +463,2 @@ libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final) - { -- return; - } -@@ -467,3 +466 @@ libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final) - #endif /* HAVE_LIBTOPO */ -- -- -diff --git a/lib/libzfs/libzfs_graph.c b/lib/libzfs/libzfs_graph.c -index 0e538e3..63d9138 100644 ---- a/lib/libzfs/libzfs_graph.c -+++ b/lib/libzfs/libzfs_graph.c -@@ -381,3 +381,3 @@ iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_vertex_t *zvp; -@@ -475,3 +475,3 @@ external_dependents(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c -index 9e79bd9..b5a079c 100644 ---- a/lib/libzfs/libzfs_import.c -+++ b/lib/libzfs/libzfs_import.c -@@ -170,3 +170,3 @@ fix_paths(nvlist_t *nv, name_entry_t *names) - if ((strlen(path) == strlen(ne->ne_name)) && -- !strncmp(path, ne->ne_name, strlen(path))) { -+ strncmp(path, ne->ne_name, strlen(path)) == 0) { - best = ne; -@@ -367,3 +367,3 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) - nvlist_t *nvl; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int err; -@@ -967,3 +967,3 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, pool_list_t *pools) - -- err = blkid_dev_set_search(iter, "TYPE", "zfs"); -+ err = blkid_dev_set_search(iter, "TYPE", "zfs_member"); - if (err != 0) { -@@ -999,3 +999,3 @@ err_blkid2: - err_blkid1: -- return err; -+ return (err); - } -@@ -1127,10 +1127,10 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) - if ((strncmp(name, "watchdog", 8) == 0) || -- (strncmp(name, "fuse", 4) == 0) || -- (strncmp(name, "ppp", 3) == 0) || -- (strncmp(name, "tty", 3) == 0) || -- (strncmp(name, "vcs", 3) == 0) || -- (strncmp(name, "parport", 7) == 0) || -- (strncmp(name, "lp", 2) == 0) || -- (strncmp(name, "fd", 2) == 0) || -- 
(strncmp(name, "hpet", 4) == 0) || -+ (strncmp(name, "fuse", 4) == 0) || -+ (strncmp(name, "ppp", 3) == 0) || -+ (strncmp(name, "tty", 3) == 0) || -+ (strncmp(name, "vcs", 3) == 0) || -+ (strncmp(name, "parport", 7) == 0) || -+ (strncmp(name, "lp", 2) == 0) || -+ (strncmp(name, "fd", 2) == 0) || -+ (strncmp(name, "hpet", 4) == 0) || - (strncmp(name, "core", 4) == 0)) -@@ -1167,3 +1167,3 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) - if (strcmp(iarg->poolname, pname)) -- matched = B_FALSE; -+ matched = B_FALSE; - -diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c -index 8215d3c..e527bdc 100644 ---- a/lib/libzfs/libzfs_iter.c -+++ b/lib/libzfs/libzfs_iter.c -@@ -23,4 +23,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2010 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -105,3 +105,3 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_handle_t *nzhp; -@@ -142,3 +142,3 @@ zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_handle_t *nzhp; -@@ -308,4 +308,3 @@ zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, - { -- char buf[ZFS_MAXNAMELEN]; -- char *comma_separated, *cp; -+ char *buf, *comma_separated, *cp; - int err = 0; -@@ -313,3 +312,3 @@ zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, - -- (void) strlcpy(buf, spec_orig, sizeof (buf)); -+ buf = zfs_strdup(fs_zhp->zfs_hdl, spec_orig); - cp = buf; -@@ -371,2 +370,3 @@ zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, - -+ free(buf); - return (ret); -@@ -451,4 +451,8 @@ iter_dependents_cb(zfs_handle_t *zhp, void *arg) - } -+ - if (!first && err == 0) - err = ida->func(zhp, ida->data); -+ else -+ zfs_close(zhp); -+ - return (err); -diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c -index bded1f0..b85c5d0 100644 ---- a/lib/libzfs/libzfs_mount.c -+++ b/lib/libzfs/libzfs_mount.c -@@ -281,3 +281,3 @@ do_mount(const char *src, const char *mntpt, char *opts) - (char *)src, -- (char *)mntpt, -+ (char *)mntpt, - (char *)NULL }; -@@ -289,16 +289,18 @@ do_mount(const char *src, const char *mntpt, char *opts) - if (rc & MOUNT_FILEIO) -- return EIO; -+ return (EIO); - if (rc & MOUNT_USER) -- return EINTR; -+ return (EINTR); - if (rc & MOUNT_SOFTWARE) -- return EPIPE; -+ return (EPIPE); -+ if (rc & MOUNT_BUSY) -+ return (EBUSY); - if (rc & MOUNT_SYSERR) -- return EAGAIN; -+ return (EAGAIN); - if (rc & MOUNT_USAGE) -- return EINVAL; -+ return (EINVAL); - -- return ENXIO; /* Generic error */ -+ return (ENXIO); /* Generic error */ - } - -- return 0; -+ return (0); - } -@@ -890,3 +892,3 @@ zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, - for (curr_proto = proto; *curr_proto != PROTO_END; -- curr_proto++) { -+ curr_proto++) { - -@@ -1165,3 +1167,6 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) - -- rewind(hdl->libzfs_mnttab); -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) -+ return (ENOENT); -+ - used = alloc = 0; -diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c -index a6cacd3..b1ddd98 100644 ---- 
a/lib/libzfs/libzfs_pool.c -+++ b/lib/libzfs/libzfs_pool.c -@@ -36,2 +36,3 @@ - #include -+#include - #include -@@ -65,3 +66,3 @@ zpool_get_all_props(zpool_handle_t *zhp) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zpool_hdl; -@@ -237,4 +238,3 @@ zpool_pool_state_to_name(pool_state_t state) - /* -- * Get a zpool property value for 'prop' and return the value in -- * a pre-allocated buffer. -+ * API compatibility wrapper around zpool_get_prop_literal - */ -@@ -244,2 +244,13 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, - { -+ return (zpool_get_prop_literal(zhp, prop, buf, len, srctype, B_FALSE)); -+} -+ -+/* -+ * Get a zpool property value for 'prop' and return the value in -+ * a pre-allocated buffer. -+ */ -+int -+zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, -+ size_t len, zprop_source_t *srctype, boolean_t literal) -+{ - uint64_t intval; -@@ -309,3 +320,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, - case ZPOOL_PROP_ASHIFT: -- (void) zfs_nicenum(intval, buf, len); -+ if (literal) -+ (void) snprintf(buf, len, "%llu", -+ (u_longlong_t)intval); -+ else -+ (void) zfs_nicenum(intval, buf, len); - break; -@@ -693,3 +708,3 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int ret = -1; -@@ -1142,3 +1157,3 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - nvlist_t *zc_fsprops = NULL; -@@ -1146,3 +1161,2 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, - char msg[1024]; -- char *altroot; - int ret = -1; -@@ -1210,4 +1224,5 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -- "one or more vdevs refer to the same device, or one of\n" -- "the devices is part of an active md or lvm device")); -+ "one or more vdevs refer to the same device, or " -+ "one of\nthe devices is part of an active md or " -+ "lvm device")); - return (zfs_error(hdl, EZFS_BADDEV, msg)); -@@ -1247,17 +1262,2 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, - -- /* -- * If this is an alternate root pool, then we automatically set the -- * mountpoint of the root dataset to be '/'. 
-- */ -- if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), -- &altroot) == 0) { -- zfs_handle_t *zhp; -- -- verify((zhp = zfs_open(hdl, pool, ZFS_TYPE_DATASET)) != NULL); -- verify(zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), -- "/") == 0); -- -- zfs_close(zhp); -- } -- - create_failed: -@@ -1274,5 +1274,5 @@ create_failed: - int --zpool_destroy(zpool_handle_t *zhp) -+zpool_destroy(zpool_handle_t *zhp, const char *log_str) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_handle_t *zfp = NULL; -@@ -1286,2 +1286,3 @@ zpool_destroy(zpool_handle_t *zhp) - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); -+ zc.zc_history = (uint64_t)(uintptr_t)log_str; - -@@ -1319,3 +1320,3 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int ret; -@@ -1411,9 +1412,2 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) - -- case EDOM: -- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -- "root pool can not have multiple vdevs" -- " or separate logs")); -- (void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg); -- break; -- - case ENOTBLK: -@@ -1442,6 +1436,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) - */ --int --zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) -+static int -+zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, -+ const char *log_str) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -1454,2 +1449,3 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) - zc.zc_guid = hardforce; -+ zc.zc_history = (uint64_t)(uintptr_t)log_str; - -@@ -1475,5 +1471,5 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) - int --zpool_export(zpool_handle_t *zhp, boolean_t force) -+zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) - { -- return (zpool_export_common(zhp, force, B_FALSE)); -+ return (zpool_export_common(zhp, force, B_FALSE, log_str)); - } -@@ -1481,5 +1477,5 @@ zpool_export(zpool_handle_t *zhp, boolean_t force) - int --zpool_export_force(zpool_handle_t *zhp) -+zpool_export_force(zpool_handle_t *zhp, const char *log_str) - { -- return (zpool_export_common(zhp, B_TRUE, B_TRUE)); -+ return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); - } -@@ -1719,3 +1715,3 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zpool_rewind_policy_t policy; -@@ -1911,3 +1907,3 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2173,3 +2169,3 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, - -- guid = strtoull(path, &end, 10); -+ guid = strtoull(path, &end, 0); - if (guid != 0 && *end == '\0') { -@@ -2387,3 +2383,3 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2434,3 +2430,3 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, - error = zfs_resolve_shortname(path, buf, -- sizeof(buf)); -+ sizeof (buf)); - if (error != 0) -@@ -2471,3 +2467,3 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc 
= {"\0"}; - char msg[1024]; -@@ -2521,3 +2517,3 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2526,3 +2522,3 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) - (void) snprintf(msg, sizeof (msg), -- dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid); -+ dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid); - -@@ -2556,3 +2552,3 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2561,3 +2557,3 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) - (void) snprintf(msg, sizeof (msg), -- dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid); -+ dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid); - -@@ -2610,3 +2606,3 @@ zpool_vdev_attach(zpool_handle_t *zhp, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2786,3 +2782,3 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2884,3 +2880,3 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3095,3 +3091,3 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3140,3 +3136,3 @@ zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3216,3 +3212,3 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3222,3 +3218,3 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) - dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), -- (u_longlong_t)guid); -+ (u_longlong_t)guid); - -@@ -3242,3 +3238,3 @@ zpool_reguid(zpool_handle_t *zhp) - libzfs_handle_t *hdl = zhp->zpool_hdl; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -3260,3 +3256,3 @@ zpool_reopen(zpool_handle_t *zhp) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3340,3 +3336,3 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -3467,3 +3463,3 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, - &value) == 0 && value) { -- return strip_partition(hdl, path); -+ return (strip_partition(hdl, path)); - } -@@ -3515,3 +3511,3 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - uint64_t count; -@@ -3611,3 +3607,3 @@ zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zpool_hdl; -@@ -3625,4 +3621,3 @@ zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) - void --zpool_set_history_str(const char *subcommand, int argc, char **argv, -- char *history_str) -+zfs_save_arguments(int argc, char **argv, char *string, int len) - { -@@ -3630,9 +3625,6 
@@ zpool_set_history_str(const char *subcommand, int argc, char **argv, - -- (void) strlcpy(history_str, subcommand, HIS_MAX_RECORD_LEN); -+ (void) strlcpy(string, basename(argv[0]), len); - for (i = 1; i < argc; i++) { -- if (strlen(history_str) + 1 + strlen(argv[i]) > -- HIS_MAX_RECORD_LEN) -- break; -- (void) strlcat(history_str, " ", HIS_MAX_RECORD_LEN); -- (void) strlcat(history_str, argv[i], HIS_MAX_RECORD_LEN); -+ (void) strlcat(string, " ", len); -+ (void) strlcat(string, argv[i], len); - } -@@ -3640,21 +3632,17 @@ zpool_set_history_str(const char *subcommand, int argc, char **argv, - --/* -- * Stage command history for logging. -- */ - int --zpool_stage_history(libzfs_handle_t *hdl, const char *history_str) -+zpool_log_history(libzfs_handle_t *hdl, const char *message) - { -- if (history_str == NULL) -- return (EINVAL); -- -- if (strlen(history_str) > HIS_MAX_RECORD_LEN) -- return (EINVAL); -- -- if (hdl->libzfs_log_str != NULL) -- free(hdl->libzfs_log_str); -- -- if ((hdl->libzfs_log_str = strdup(history_str)) == NULL) -- return (no_memory(hdl)); -- -- return (0); -+ zfs_cmd_t zc = {"\0"}; -+ nvlist_t *args; -+ int err; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_string(args, "message", message); -+ err = zcmd_write_src_nvlist(hdl, &zc, args); -+ if (err == 0) -+ err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc); -+ nvlist_free(args); -+ zcmd_free_nvlists(&zc); -+ return (err); - } -@@ -3673,3 +3661,3 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zpool_hdl; -@@ -3797,10 +3785,11 @@ zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp) - /* -- * Retrieve the next event. If there is a new event available 'nvp' will -- * contain a newly allocated nvlist and 'dropped' will be set to the number -- * of missed events since the last call to this function. When 'nvp' is -- * set to NULL it indicates no new events are available. In either case -- * the function returns 0 and it is up to the caller to free 'nvp'. In -- * the case of a fatal error the function will return a non-zero value. -- * When the function is called in blocking mode it will not return until -- * a new event is available. -+ * Retrieve the next event given the passed 'zevent_fd' file descriptor. -+ * If there is a new event available 'nvp' will contain a newly allocated -+ * nvlist and 'dropped' will be set to the number of missed events since -+ * the last call to this function. When 'nvp' is set to NULL it indicates -+ * no new events are available. In either case the function returns 0 and -+ * it is up to the caller to free 'nvp'. In the case of a fatal error the -+ * function will return a non-zero value. When the function is called in -+ * blocking mode (the default, unless the ZEVENT_NONBLOCK flag is passed), -+ * it will not return until a new event is available. 
- */ -@@ -3808,5 +3797,5 @@ int - zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp, -- int *dropped, int block, int cleanup_fd) -+ int *dropped, unsigned flags, int zevent_fd) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int error = 0; -@@ -3815,5 +3804,5 @@ zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp, - *dropped = 0; -- zc.zc_cleanup_fd = cleanup_fd; -+ zc.zc_cleanup_fd = zevent_fd; - -- if (!block) -+ if (flags & ZEVENT_NONBLOCK) - zc.zc_guid = ZEVENT_NONBLOCK; -@@ -3832,3 +3821,3 @@ retry: - /* Blocking error case should not occur */ -- if (block) -+ if (!(flags & ZEVENT_NONBLOCK)) - error = zpool_standard_error_fmt(hdl, errno, -@@ -3869,3 +3858,3 @@ zpool_events_clear(libzfs_handle_t *hdl, int *count) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3884,2 +3873,38 @@ zpool_events_clear(libzfs_handle_t *hdl, int *count) - -+/* -+ * Seek to a specific EID, ZEVENT_SEEK_START, or ZEVENT_SEEK_END for -+ * the passed zevent_fd file handle. On success zero is returned, -+ * otherwise -1 is returned and hdl->libzfs_error is set to the errno. -+ */ -+int -+zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd) -+{ -+ zfs_cmd_t zc = {"\0"}; -+ int error = 0; -+ -+ zc.zc_guid = eid; -+ zc.zc_cleanup_fd = zevent_fd; -+ -+ if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_SEEK, &zc) != 0) { -+ switch (errno) { -+ case ENOENT: -+ error = zfs_error_fmt(hdl, EZFS_NOENT, -+ dgettext(TEXT_DOMAIN, "cannot get event")); -+ break; -+ -+ case ENOMEM: -+ error = zfs_error_fmt(hdl, EZFS_NOMEM, -+ dgettext(TEXT_DOMAIN, "cannot get event")); -+ break; -+ -+ default: -+ error = zpool_standard_error_fmt(hdl, errno, -+ dgettext(TEXT_DOMAIN, "cannot get event")); -+ break; -+ } -+ } -+ -+ return (error); -+} -+ - void -@@ -3888,3 +3913,3 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - boolean_t mounted = B_FALSE; -@@ -3895,3 +3920,4 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, - /* special case for the MOS */ -- (void) snprintf(pathname, len, ":<0x%llx>", (longlong_t)obj); -+ (void) snprintf(pathname, len, ":<0x%llx>", -+ (longlong_t)obj); - return; -@@ -3927,3 +3953,4 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, - } else { -- (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, (longlong_t)obj); -+ (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, -+ (longlong_t)obj); - } -@@ -4027,3 +4054,3 @@ zpool_label_disk_check(char *path) - if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) -- return errno; -+ return (errno); - -@@ -4031,3 +4058,3 @@ zpool_label_disk_check(char *path) - (void) close(fd); -- return err; -+ return (err); - } -@@ -4037,3 +4064,3 @@ zpool_label_disk_check(char *path) - (void) close(fd); -- return EIDRM; -+ return (EIDRM); - } -@@ -4042,3 +4069,3 @@ zpool_label_disk_check(char *path) - (void) close(fd); -- return 0; -+ return (0); - } -@@ -4182,3 +4209,3 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) - -- return 0; -+ return (0); - } -diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c -index 9dbfb16..12ac9bd 100644 ---- a/lib/libzfs/libzfs_sendrecv.c -+++ b/lib/libzfs/libzfs_sendrecv.c -@@ -24,5 +24,6 @@ - * Copyright (c) 2012 by Delphix. All rights reserved. -- * Copyright (c) 2012 Pawel Jakub Dawidek . - * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
-+ * Copyright (c) 2012 Pawel Jakub Dawidek . - * All rights reserved -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ -@@ -801,2 +802,3 @@ typedef struct send_dump_data { - nvlist_t *fss; -+ nvlist_t *snapholds; - avl_tree_t *fsavl; -@@ -814,3 +816,3 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zfs_hdl; -@@ -878,3 +880,3 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zfs_hdl; -@@ -950,39 +952,15 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, - --static int --hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd) -+static void -+gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) - { -- zfs_handle_t *pzhp; -- int error = 0; -- char *thissnap; -- - assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); - -- if (sdd->dryrun) -- return (0); -- - /* -- * zfs_send() only opens a cleanup_fd for sends that need it, -+ * zfs_send() only sets snapholds for sends that need them, - * e.g. replication and doall. - */ -- if (sdd->cleanup_fd == -1) -- return (0); -- -- thissnap = strchr(zhp->zfs_name, '@') + 1; -- *(thissnap - 1) = '\0'; -- pzhp = zfs_open(zhp->zfs_hdl, zhp->zfs_name, ZFS_TYPE_DATASET); -- *(thissnap - 1) = '@'; -- -- /* -- * It's OK if the parent no longer exists. The send code will -- * handle that error. -- */ -- if (pzhp) { -- error = zfs_hold(pzhp, thissnap, sdd->holdtag, -- B_FALSE, B_TRUE, B_TRUE, sdd->cleanup_fd, -- zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID), -- zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG)); -- zfs_close(pzhp); -- } -+ if (sdd->snapholds == NULL) -+ return; - -- return (error); -+ fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); - } -@@ -994,3 +972,3 @@ send_progress_thread(void *arg) - -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_handle_t *zhp = pa->pa_zhp; -@@ -1042,3 +1020,2 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) - pthread_t tid; -- - char *thissnap; -@@ -1048,2 +1025,3 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) - -+ err = 0; - thissnap = strchr(zhp->zfs_name, '@') + 1; -@@ -1053,13 +1031,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) - if (!sdd->seenfrom && isfromsnap) { -- err = hold_for_send(zhp, sdd); -- if (err == 0) { -- sdd->seenfrom = B_TRUE; -- (void) strcpy(sdd->prevsnap, thissnap); -- sdd->prevsnap_obj = zfs_prop_get_int(zhp, -- ZFS_PROP_OBJSETID); -- } else if (err == ENOENT) { -- err = 0; -- } -+ gather_holds(zhp, sdd); -+ sdd->seenfrom = B_TRUE; -+ (void) strcpy(sdd->prevsnap, thissnap); -+ sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); - zfs_close(zhp); -- return (err); -+ return (0); - } -@@ -1114,10 +1087,3 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) - -- err = hold_for_send(zhp, sdd); -- if (err) { -- if (err == ENOENT) -- err = 0; -- zfs_close(zhp); -- return (err); -- } -- -+ gather_holds(zhp, sdd); - fromorigin = sdd->prevsnap[0] == '\0' && -@@ -1197,3 +1163,3 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) - boolean_t missingfrom = B_FALSE; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -1389,3 +1355,3 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - int spa_version; -- pthread_t tid; -+ pthread_t tid = 0; - int pipefd[2]; -@@ -1462,7 +1428,4 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const 
char *tosnap, - nvlist_free(hdrnv); -- if (err) { -- fsavl_destroy(fsavl); -- nvlist_free(fss); -+ if (err) - goto stderr_out; -- } - } -@@ -1490,4 +1453,2 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - if (err == -1) { -- fsavl_destroy(fsavl); -- nvlist_free(fss); - err = errno; -@@ -1502,4 +1463,2 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - if (err == -1) { -- fsavl_destroy(fsavl); -- nvlist_free(fss); - err = errno; -@@ -1515,3 +1474,3 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - sdd.tosnap = tosnap; -- if (flags->dedup) -+ if (tid != 0) - sdd.outfd = pipefd[0]; -@@ -1552,10 +1511,12 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - } -+ sdd.snapholds = fnvlist_alloc(); - } else { - sdd.cleanup_fd = -1; -+ sdd.snapholds = NULL; - } -- if (flags->verbose) { -+ if (flags->verbose || sdd.snapholds != NULL) { - /* - * Do a verbose no-op dry run to get all the verbose output -- * before generating any data. Then do a non-verbose real -- * run to generate the streams. -+ * or to gather snapshot hold's before generating any data, -+ * then do a non-verbose real run to generate the streams. - */ -@@ -1563,14 +1524,41 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - err = dump_filesystems(zhp, &sdd); -- sdd.dryrun = flags->dryrun; -- sdd.verbose = B_FALSE; -- if (flags->parsable) { -- (void) fprintf(stderr, "size\t%llu\n", -- (longlong_t)sdd.size); -- } else { -- char buf[16]; -- zfs_nicenum(sdd.size, buf, sizeof (buf)); -- (void) fprintf(stderr, dgettext(TEXT_DOMAIN, -- "total estimated size is %s\n"), buf); -+ -+ if (err != 0) -+ goto stderr_out; -+ -+ if (flags->verbose) { -+ if (flags->parsable) { -+ (void) fprintf(stderr, "size\t%llu\n", -+ (longlong_t)sdd.size); -+ } else { -+ char buf[16]; -+ zfs_nicenum(sdd.size, buf, sizeof (buf)); -+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN, -+ "total estimated size is %s\n"), buf); -+ } -+ } -+ -+ /* Ensure no snaps found is treated as an error. */ -+ if (!sdd.seento) { -+ err = ENOENT; -+ goto err_out; -+ } -+ -+ /* Skip the second run if dryrun was requested. */ -+ if (flags->dryrun) -+ goto err_out; -+ -+ if (sdd.snapholds != NULL) { -+ err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); -+ if (err != 0) -+ goto stderr_out; -+ -+ fnvlist_free(sdd.snapholds); -+ sdd.snapholds = NULL; - } -+ -+ sdd.dryrun = B_FALSE; -+ sdd.verbose = B_FALSE; - } -+ - err = dump_filesystems(zhp, &sdd); -@@ -1579,3 +1567,9 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - -- if (flags->dedup) { -+ /* Ensure no snaps found is treated as an error. 
*/ -+ if (err == 0 && !sdd.seento) -+ err = ENOENT; -+ -+ if (tid != 0) { -+ if (err != 0) -+ (void) pthread_cancel(tid); - (void) close(pipefd[0]); -@@ -1609,8 +1603,12 @@ stderr_out: - err_out: -+ fsavl_destroy(fsavl); -+ nvlist_free(fss); -+ fnvlist_free(sdd.snapholds); -+ - if (sdd.cleanup_fd != -1) - VERIFY(0 == close(sdd.cleanup_fd)); -- if (flags->dedup) { -+ if (tid != 0) { - (void) pthread_cancel(tid); -- (void) pthread_join(tid, NULL); - (void) close(pipefd[0]); -+ (void) pthread_join(tid, NULL); - } -@@ -1685,3 +1683,3 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, - static int seq; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int err; -@@ -1721,8 +1719,7 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, - -- if (err != 0 && strncmp(name+baselen, "recv-", 5) != 0) { -+ if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { - seq++; - -- (void) strncpy(newname, name, baselen); -- (void) snprintf(newname+baselen, ZFS_MAXNAMELEN-baselen, -- "recv-%ld-%u", (long) getpid(), seq); -+ (void) snprintf(newname, ZFS_MAXNAMELEN, "%.*srecv-%u-%u", -+ baselen, name, getpid(), seq); - (void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value)); -@@ -1758,3 +1755,3 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int err = 0; -@@ -2017,3 +2014,3 @@ again: - /* promote it! */ -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - nvlist_t *origin_nvfs; -@@ -2089,3 +2086,3 @@ again: - stream_snapname, &props)) { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -2520,3 +2517,3 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - time_t begin_time; -@@ -2651,3 +2648,2 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - */ -- (void) strcpy(zc.zc_top_ds, tosnap); - (void) strcpy(zc.zc_value, tosnap); -@@ -2796,8 +2792,2 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - } -- if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME && -- zvol_remove_link(hdl, zhp->zfs_name) != 0) { -- zfs_close(zhp); -- zcmd_free_nvlists(&zc); -- return (-1); -- } - zfs_close(zhp); -@@ -2894,3 +2884,3 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - if (err == 0 && snapprops_nvlist) { -- zfs_cmd_t zc2 = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc2 = {"\0"}; - -@@ -3007,6 +2997,2 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - *cp = '@'; -- err = zvol_create_link(hdl, h->zfs_name); -- if (err == 0 && ioctl_err == 0) -- err = zvol_create_link(hdl, -- zc.zc_value); - } else if (newfs || stream_avl) { -diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c -index e6e9230..534ff85 100644 ---- a/lib/libzfs/libzfs_status.c -+++ b/lib/libzfs/libzfs_status.c -@@ -24,2 +24,3 @@ - * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. 
- */ -@@ -68,2 +69,3 @@ static char *zfs_msgid_table[] = { - "ZFS-8000-K4", -+ "ZFS-8000-ER", - }; -@@ -152,2 +154,12 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) - -+ /* -+ * Check any L2 cache devs -+ */ -+ if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, -+ &children) == 0) { -+ for (c = 0; c < children; c++) -+ if (find_vdev_problem(child[c], func)) -+ return (B_TRUE); -+ } -+ - return (B_FALSE); -@@ -173,3 +185,3 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) - static zpool_status_t --check_status(nvlist_t *config, boolean_t isimport) -+check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) - { -@@ -184,2 +196,3 @@ check_status(nvlist_t *config, boolean_t isimport) - uint64_t hostid = 0; -+ uint64_t errata = 0; - unsigned long system_hostid = gethostid() & 0xffffffff; -@@ -347,2 +360,11 @@ check_status(nvlist_t *config, boolean_t isimport) - -+ /* -+ * Informational errata available. -+ */ -+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRATA, &errata); -+ if (errata) { -+ *erratap = errata; -+ return (ZPOOL_STATUS_ERRATA); -+ } -+ - return (ZPOOL_STATUS_OK); -@@ -351,5 +373,5 @@ check_status(nvlist_t *config, boolean_t isimport) - zpool_status_t --zpool_get_status(zpool_handle_t *zhp, char **msgid) -+zpool_get_status(zpool_handle_t *zhp, char **msgid, zpool_errata_t *errata) - { -- zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE); -+ zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata); - -@@ -364,5 +386,5 @@ zpool_get_status(zpool_handle_t *zhp, char **msgid) - zpool_status_t --zpool_import_status(nvlist_t *config, char **msgid) -+zpool_import_status(nvlist_t *config, char **msgid, zpool_errata_t *errata) - { -- zpool_status_t ret = check_status(config, B_TRUE); -+ zpool_status_t ret = check_status(config, B_TRUE, errata); - -diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c -index 5bb88e9..e99603b 100644 ---- a/lib/libzfs/libzfs_util.c -+++ b/lib/libzfs/libzfs_util.c -@@ -47,2 +47,3 @@ - #include -+#include - -@@ -618,4 +619,4 @@ libzfs_module_loaded(const char *module) - -- memcpy(path, path_prefix, sizeof(path_prefix) - 1); -- strcpy(path + sizeof(path_prefix) - 1, module); -+ memcpy(path, path_prefix, sizeof (path_prefix) - 1); -+ strcpy(path + sizeof (path_prefix) - 1, module); - -@@ -653,8 +654,8 @@ libzfs_run_process(const char *path, char *argv[], int flags) - if (rc < 0 || !WIFEXITED(status)) -- return -1; -+ return (-1); - -- return WEXITSTATUS(status); -+ return (WEXITSTATUS(status)); - } - -- return -1; -+ return (-1); - } -@@ -667,5 +668,5 @@ libzfs_load_module(const char *module) - if (libzfs_module_loaded(module)) -- return 0; -+ return (0); - -- return libzfs_run_process("/sbin/modprobe", argv, 0); -+ return (libzfs_run_process("/sbin/modprobe", argv, 0)); - } -@@ -679,4 +680,4 @@ libzfs_init(void) - (void) fprintf(stderr, gettext("Failed to load ZFS module " -- "stack.\nLoad the module manually by running " -- "'insmod /zfs.ko' as root.\n")); -+ "stack.\nLoad the module manually by running " -+ "'insmod /zfs.ko' as root.\n")); - return (NULL); -@@ -690,7 +691,7 @@ libzfs_init(void) - (void) fprintf(stderr, gettext("Unable to open %s: %s.\n"), -- ZFS_DEV, strerror(errno)); -+ ZFS_DEV, strerror(errno)); - if (errno == ENOENT) - (void) fprintf(stderr, -- gettext("Verify the ZFS module stack is " -- "loaded by running '/sbin/modprobe zfs'.\n")); -+ gettext("Verify the ZFS module stack is " -+ "loaded by running 
'/sbin/modprobe zfs'.\n")); - -@@ -714,2 +715,10 @@ libzfs_init(void) - -+ if (libzfs_core_init() != 0) { -+ (void) close(hdl->libzfs_fd); -+ (void) fclose(hdl->libzfs_mnttab); -+ (void) fclose(hdl->libzfs_sharetab); -+ free(hdl); -+ return (NULL); -+ } -+ - zfs_prop_init(); -@@ -735,4 +744,2 @@ libzfs_fini(libzfs_handle_t *hdl) - zfs_uninit_libshare(hdl); -- if (hdl->libzfs_log_str) -- (void) free(hdl->libzfs_log_str); - zpool_free_handles(hdl); -@@ -741,2 +748,3 @@ libzfs_fini(libzfs_handle_t *hdl) - libzfs_mnttab_fini(hdl); -+ libzfs_core_fini(); - free(hdl); -@@ -787,3 +795,6 @@ zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype) - -- rewind(hdl->libzfs_mnttab); -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) -+ return (NULL); -+ - while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) { -@@ -908,3 +919,3 @@ zfs_strcmp_shortname(char *name, char *cmp_name, int wholedisk) - -- if ((path_len == cmp_len) && !strcmp(path_name, cmp_name)) { -+ if ((path_len == cmp_len) && strcmp(path_name, cmp_name) == 0) { - error = 0; -@@ -951,3 +962,3 @@ zfs_strcmp_pathname(char *name, char *cmp, int wholedisk) - if (name[0] != '/') -- return zfs_strcmp_shortname(name, cmp_name, wholedisk); -+ return (zfs_strcmp_shortname(name, cmp_name, wholedisk)); - -@@ -1063,13 +1074,3 @@ zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) - { -- int error; -- -- zc->zc_history = (uint64_t)(uintptr_t)hdl->libzfs_log_str; -- error = ioctl(hdl->libzfs_fd, request, zc); -- if (hdl->libzfs_log_str) { -- free(hdl->libzfs_log_str); -- hdl->libzfs_log_str = NULL; -- } -- zc->zc_history = 0; -- -- return (error); -+ return (ioctl(hdl->libzfs_fd, request, zc)); - } -@@ -1317,6 +1318,6 @@ str2shift(libzfs_handle_t *hdl, const char *buf) - (toupper(buf[0]) != 'B' && -- ((toupper(buf[1]) == 'B' && buf[2] == '\0') || -- (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'B' && -- buf[3] == '\0')))) -- return (10*i); -+ ((toupper(buf[1]) == 'B' && buf[2] == '\0') || -+ (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'B' && -+ buf[3] == '\0')))) -+ return (10 * i); - -diff --git a/lib/libzfs_core/Makefile.am b/lib/libzfs_core/Makefile.am -new file mode 100644 -index 0000000..0ecd208 ---- /dev/null -+++ b/lib/libzfs_core/Makefile.am -@@ -0,0 +1,15 @@ -+include $(top_srcdir)/config/Rules.am -+ -+DEFAULT_INCLUDES += \ -+ -I$(top_srcdir)/include \ -+ -I$(top_srcdir)/lib/libspl/include -+ -+lib_LTLIBRARIES = libzfs_core.la -+ -+libzfs_core_la_SOURCES = \ -+ $(top_srcdir)/lib/libzfs_core/libzfs_core.c -+ -+libzfs_core_la_LIBADD = \ -+ $(top_builddir)/lib/libnvpair/libnvpair.la -+ -+libzfs_core_la_LDFLAGS = -version-info 1:0:0 -diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c -new file mode 100644 -index 0000000..3befa4d ---- /dev/null -+++ b/lib/libzfs_core/libzfs_core.c -@@ -0,0 +1,607 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ */ -+ -+/* -+ * LibZFS_Core (lzc) is intended to replace most functionality in libzfs. -+ * It has the following characteristics: -+ * -+ * - Thread Safe. libzfs_core is accessible concurrently from multiple -+ * threads. This is accomplished primarily by avoiding global data -+ * (e.g. caching). Since it's thread-safe, there is no reason for a -+ * process to have multiple libzfs "instances". Therefore, we store -+ * our few pieces of data (e.g. the file descriptor) in global -+ * variables. The fd is reference-counted so that the libzfs_core -+ * library can be "initialized" multiple times (e.g. by different -+ * consumers within the same process). -+ * -+ * - Committed Interface. The libzfs_core interface will be committed, -+ * therefore consumers can compile against it and be confident that -+ * their code will continue to work on future releases of this code. -+ * Currently, the interface is Evolving (not Committed), but we intend -+ * to commit to it once it is more complete and we determine that it -+ * meets the needs of all consumers. -+ * -+ * - Programatic Error Handling. libzfs_core communicates errors with -+ * defined error numbers, and doesn't print anything to stdout/stderr. -+ * -+ * - Thin Layer. libzfs_core is a thin layer, marshaling arguments -+ * to/from the kernel ioctls. There is generally a 1:1 correspondence -+ * between libzfs_core functions and ioctls to /dev/zfs. -+ * -+ * - Clear Atomicity. Because libzfs_core functions are generally 1:1 -+ * with kernel ioctls, and kernel ioctls are general atomic, each -+ * libzfs_core function is atomic. For example, creating multiple -+ * snapshots with a single call to lzc_snapshot() is atomic -- it -+ * can't fail with only some of the requested snapshots created, even -+ * in the event of power loss or system crash. -+ * -+ * - Continued libzfs Support. Some higher-level operations (e.g. -+ * support for "zfs send -R") are too complicated to fit the scope of -+ * libzfs_core. This functionality will continue to live in libzfs. -+ * Where appropriate, libzfs will use the underlying atomic operations -+ * of libzfs_core. For example, libzfs may implement "zfs send -R | -+ * zfs receive" by using individual "send one snapshot", rename, -+ * destroy, and "receive one snapshot" operations in libzfs_core. -+ * /sbin/zfs and /zbin/zpool will link with both libzfs and -+ * libzfs_core. Other consumers should aim to use only libzfs_core, -+ * since that will be the supported, stable interface going forwards. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static int g_fd; -+static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; -+static int g_refcount; -+ -+int -+libzfs_core_init(void) -+{ -+ (void) pthread_mutex_lock(&g_lock); -+ if (g_refcount == 0) { -+ g_fd = open("/dev/zfs", O_RDWR); -+ if (g_fd < 0) { -+ (void) pthread_mutex_unlock(&g_lock); -+ return (errno); -+ } -+ } -+ g_refcount++; -+ (void) pthread_mutex_unlock(&g_lock); -+ return (0); -+} -+ -+void -+libzfs_core_fini(void) -+{ -+ (void) pthread_mutex_lock(&g_lock); -+ ASSERT3S(g_refcount, >, 0); -+ g_refcount--; -+ if (g_refcount == 0) -+ (void) close(g_fd); -+ (void) pthread_mutex_unlock(&g_lock); -+} -+ -+static int -+lzc_ioctl(zfs_ioc_t ioc, const char *name, -+ nvlist_t *source, nvlist_t **resultp) -+{ -+ zfs_cmd_t zc = {"\0"}; -+ int error = 0; -+ char *packed; -+ size_t size; -+ -+ ASSERT3S(g_refcount, >, 0); -+ -+ (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); -+ -+ packed = fnvlist_pack(source, &size); -+ zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; -+ zc.zc_nvlist_src_size = size; -+ -+ if (resultp != NULL) { -+ *resultp = NULL; -+ zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); -+ zc.zc_nvlist_dst = (uint64_t)(uintptr_t) -+ malloc(zc.zc_nvlist_dst_size); -+ if (zc.zc_nvlist_dst == (uint64_t)0) { -+ error = ENOMEM; -+ goto out; -+ } -+ } -+ -+ while (ioctl(g_fd, ioc, &zc) != 0) { -+ if (errno == ENOMEM && resultp != NULL) { -+ free((void *)(uintptr_t)zc.zc_nvlist_dst); -+ zc.zc_nvlist_dst_size *= 2; -+ zc.zc_nvlist_dst = (uint64_t)(uintptr_t) -+ malloc(zc.zc_nvlist_dst_size); -+ if (zc.zc_nvlist_dst == (uint64_t)0) { -+ error = ENOMEM; -+ goto out; -+ } -+ } else { -+ error = errno; -+ break; -+ } -+ } -+ if (zc.zc_nvlist_dst_filled) { -+ *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, -+ zc.zc_nvlist_dst_size); -+ } -+ -+out: -+ fnvlist_pack_free(packed, size); -+ free((void *)(uintptr_t)zc.zc_nvlist_dst); -+ return (error); -+} -+ -+int -+lzc_create(const char *fsname, dmu_objset_type_t type, nvlist_t *props) -+{ -+ int error; -+ nvlist_t *args = fnvlist_alloc(); -+ fnvlist_add_int32(args, "type", type); -+ if (props != NULL) -+ fnvlist_add_nvlist(args, "props", props); -+ error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL); -+ nvlist_free(args); -+ return (error); -+} -+ -+int -+lzc_clone(const char *fsname, const char *origin, -+ nvlist_t *props) -+{ -+ int error; -+ nvlist_t *args = fnvlist_alloc(); -+ fnvlist_add_string(args, "origin", origin); -+ if (props != NULL) -+ fnvlist_add_nvlist(args, "props", props); -+ error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL); -+ nvlist_free(args); -+ return (error); -+} -+ -+/* -+ * Creates snapshots. -+ * -+ * The keys in the snaps nvlist are the snapshots to be created. -+ * They must all be in the same pool. -+ * -+ * The props nvlist is properties to set. Currently only user properties -+ * are supported. { user:prop_name -> string value } -+ * -+ * The returned results nvlist will have an entry for each snapshot that failed. -+ * The value will be the (int32) error code. -+ * -+ * The return value will be 0 if all snapshots were created, otherwise it will -+ * be the errno of a (unspecified) snapshot that failed. 
-+ */ -+int -+lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist) -+{ -+ nvpair_t *elem; -+ nvlist_t *args; -+ int error; -+ char pool[MAXNAMELEN]; -+ -+ *errlist = NULL; -+ -+ /* determine the pool name */ -+ elem = nvlist_next_nvpair(snaps, NULL); -+ if (elem == NULL) -+ return (0); -+ (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); -+ pool[strcspn(pool, "/@")] = '\0'; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_nvlist(args, "snaps", snaps); -+ if (props != NULL) -+ fnvlist_add_nvlist(args, "props", props); -+ -+ error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist); -+ nvlist_free(args); -+ -+ return (error); -+} -+ -+/* -+ * Destroys snapshots. -+ * -+ * The keys in the snaps nvlist are the snapshots to be destroyed. -+ * They must all be in the same pool. -+ * -+ * Snapshots that do not exist will be silently ignored. -+ * -+ * If 'defer' is not set, and a snapshot has user holds or clones, the -+ * destroy operation will fail and none of the snapshots will be -+ * destroyed. -+ * -+ * If 'defer' is set, and a snapshot has user holds or clones, it will be -+ * marked for deferred destruction, and will be destroyed when the last hold -+ * or clone is removed/destroyed. -+ * -+ * The return value will be 0 if all snapshots were destroyed (or marked for -+ * later destruction if 'defer' is set) or didn't exist to begin with. -+ * -+ * Otherwise the return value will be the errno of a (unspecified) snapshot -+ * that failed, no snapshots will be destroyed, and the errlist will have an -+ * entry for each snapshot that failed. The value in the errlist will be -+ * the (int32) error code. -+ */ -+int -+lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist) -+{ -+ nvpair_t *elem; -+ nvlist_t *args; -+ int error; -+ char pool[MAXNAMELEN]; -+ -+ /* determine the pool name */ -+ elem = nvlist_next_nvpair(snaps, NULL); -+ if (elem == NULL) -+ return (0); -+ (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); -+ pool[strcspn(pool, "/@")] = '\0'; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_nvlist(args, "snaps", snaps); -+ if (defer) -+ fnvlist_add_boolean(args, "defer"); -+ -+ error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist); -+ nvlist_free(args); -+ -+ return (error); -+} -+ -+int -+lzc_snaprange_space(const char *firstsnap, const char *lastsnap, -+ uint64_t *usedp) -+{ -+ nvlist_t *args; -+ nvlist_t *result; -+ int err; -+ char fs[MAXNAMELEN]; -+ char *atp; -+ -+ /* determine the fs name */ -+ (void) strlcpy(fs, firstsnap, sizeof (fs)); -+ atp = strchr(fs, '@'); -+ if (atp == NULL) -+ return (EINVAL); -+ *atp = '\0'; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_string(args, "firstsnap", firstsnap); -+ -+ err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result); -+ nvlist_free(args); -+ if (err == 0) -+ *usedp = fnvlist_lookup_uint64(result, "used"); -+ fnvlist_free(result); -+ -+ return (err); -+} -+ -+boolean_t -+lzc_exists(const char *dataset) -+{ -+ /* -+ * The objset_stats ioctl is still legacy, so we need to construct our -+ * own zfs_cmd_t rather than using zfsc_ioctl(). -+ */ -+ zfs_cmd_t zc = {"\0"}; -+ -+ (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); -+ return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); -+} -+ -+/* -+ * Create "user holds" on snapshots. If there is a hold on a snapshot, -+ * the snapshot can not be destroyed. (However, it can be marked for deletion -+ * by lzc_destroy_snaps(defer=B_TRUE).) -+ * -+ * The keys in the nvlist are snapshot names. -+ * The snapshots must all be in the same pool. 
-+ * The value is the name of the hold (string type). -+ * -+ * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL). -+ * In this case, when the cleanup_fd is closed (including on process -+ * termination), the holds will be released. If the system is shut down -+ * uncleanly, the holds will be released when the pool is next opened -+ * or imported. -+ * -+ * Holds for snapshots which don't exist will be skipped and have an entry -+ * added to errlist, but will not cause an overall failure. -+ * -+ * The return value will be 0 if all holds, for snapshots that existed, -+ * were succesfully created. -+ * -+ * Otherwise the return value will be the errno of a (unspecified) hold that -+ * failed and no holds will be created. -+ * -+ * In all cases the errlist will have an entry for each hold that failed -+ * (name = snapshot), with its value being the error code (int32). -+ */ -+int -+lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist) -+{ -+ char pool[MAXNAMELEN]; -+ nvlist_t *args; -+ nvpair_t *elem; -+ int error; -+ -+ /* determine the pool name */ -+ elem = nvlist_next_nvpair(holds, NULL); -+ if (elem == NULL) -+ return (0); -+ (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); -+ pool[strcspn(pool, "/@")] = '\0'; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_nvlist(args, "holds", holds); -+ if (cleanup_fd != -1) -+ fnvlist_add_int32(args, "cleanup_fd", cleanup_fd); -+ -+ error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist); -+ nvlist_free(args); -+ return (error); -+} -+ -+/* -+ * Release "user holds" on snapshots. If the snapshot has been marked for -+ * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have -+ * any clones, and all the user holds are removed, then the snapshot will be -+ * destroyed. -+ * -+ * The keys in the nvlist are snapshot names. -+ * The snapshots must all be in the same pool. -+ * The value is a nvlist whose keys are the holds to remove. -+ * -+ * Holds which failed to release because they didn't exist will have an entry -+ * added to errlist, but will not cause an overall failure. -+ * -+ * The return value will be 0 if the nvl holds was empty or all holds that -+ * existed, were successfully removed. -+ * -+ * Otherwise the return value will be the errno of a (unspecified) hold that -+ * failed to release and no holds will be released. -+ * -+ * In all cases the errlist will have an entry for each hold that failed to -+ * to release. -+ */ -+int -+lzc_release(nvlist_t *holds, nvlist_t **errlist) -+{ -+ char pool[MAXNAMELEN]; -+ nvpair_t *elem; -+ -+ /* determine the pool name */ -+ elem = nvlist_next_nvpair(holds, NULL); -+ if (elem == NULL) -+ return (0); -+ (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); -+ pool[strcspn(pool, "/@")] = '\0'; -+ -+ return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist)); -+} -+ -+/* -+ * Retrieve list of user holds on the specified snapshot. -+ * -+ * On success, *holdsp will be set to a nvlist which the caller must free. -+ * The keys are the names of the holds, and the value is the creation time -+ * of the hold (uint64) in seconds since the epoch. -+ */ -+int -+lzc_get_holds(const char *snapname, nvlist_t **holdsp) -+{ -+ int error; -+ nvlist_t *innvl = fnvlist_alloc(); -+ error = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, innvl, holdsp); -+ fnvlist_free(innvl); -+ return (error); -+} -+ -+/* -+ * If fromsnap is NULL, a full (non-incremental) stream will be sent. 
-+ */ -+int -+lzc_send(const char *snapname, const char *fromsnap, int fd) -+{ -+ nvlist_t *args; -+ int err; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_int32(args, "fd", fd); -+ if (fromsnap != NULL) -+ fnvlist_add_string(args, "fromsnap", fromsnap); -+ err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); -+ nvlist_free(args); -+ return (err); -+} -+ -+/* -+ * If fromsnap is NULL, a full (non-incremental) stream will be estimated. -+ */ -+int -+lzc_send_space(const char *snapname, const char *fromsnap, uint64_t *spacep) -+{ -+ nvlist_t *args; -+ nvlist_t *result; -+ int err; -+ -+ args = fnvlist_alloc(); -+ if (fromsnap != NULL) -+ fnvlist_add_string(args, "fromsnap", fromsnap); -+ err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); -+ nvlist_free(args); -+ if (err == 0) -+ *spacep = fnvlist_lookup_uint64(result, "space"); -+ nvlist_free(result); -+ return (err); -+} -+ -+static int -+recv_read(int fd, void *buf, int ilen) -+{ -+ char *cp = buf; -+ int rv; -+ int len = ilen; -+ -+ do { -+ rv = read(fd, cp, len); -+ cp += rv; -+ len -= rv; -+ } while (rv > 0); -+ -+ if (rv < 0 || len != 0) -+ return (EIO); -+ -+ return (0); -+} -+ -+/* -+ * The simplest receive case: receive from the specified fd, creating the -+ * specified snapshot. Apply the specified properties a "received" properties -+ * (which can be overridden by locally-set properties). If the stream is a -+ * clone, its origin snapshot must be specified by 'origin'. The 'force' -+ * flag will cause the target filesystem to be rolled back or destroyed if -+ * necessary to receive. -+ * -+ * Return 0 on success or an errno on failure. -+ * -+ * Note: this interface does not work on dedup'd streams -+ * (those with DMU_BACKUP_FEATURE_DEDUP). -+ */ -+int -+lzc_receive(const char *snapname, nvlist_t *props, const char *origin, -+ boolean_t force, int fd) -+{ -+ /* -+ * The receive ioctl is still legacy, so we need to construct our own -+ * zfs_cmd_t rather than using zfsc_ioctl(). -+ */ -+ zfs_cmd_t zc = {"\0"}; -+ char *atp; -+ char *packed = NULL; -+ size_t size; -+ dmu_replay_record_t drr; -+ int error; -+ -+ ASSERT3S(g_refcount, >, 0); -+ -+ /* zc_name is name of containing filesystem */ -+ (void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name)); -+ atp = strchr(zc.zc_name, '@'); -+ if (atp == NULL) -+ return (EINVAL); -+ *atp = '\0'; -+ -+ /* if the fs does not exist, try its parent. 
*/ -+ if (!lzc_exists(zc.zc_name)) { -+ char *slashp = strrchr(zc.zc_name, '/'); -+ if (slashp == NULL) -+ return (ENOENT); -+ *slashp = '\0'; -+ -+ } -+ -+ /* zc_value is full name of the snapshot to create */ -+ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); -+ -+ if (props != NULL) { -+ /* zc_nvlist_src is props to set */ -+ packed = fnvlist_pack(props, &size); -+ zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; -+ zc.zc_nvlist_src_size = size; -+ } -+ -+ /* zc_string is name of clone origin (if DRR_FLAG_CLONE) */ -+ if (origin != NULL) -+ (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string)); -+ -+ /* zc_begin_record is non-byteswapped BEGIN record */ -+ error = recv_read(fd, &drr, sizeof (drr)); -+ if (error != 0) -+ goto out; -+ zc.zc_begin_record = drr.drr_u.drr_begin; -+ -+ /* zc_cookie is fd to read from */ -+ zc.zc_cookie = fd; -+ -+ /* zc guid is force flag */ -+ zc.zc_guid = force; -+ -+ /* zc_cleanup_fd is unused */ -+ zc.zc_cleanup_fd = -1; -+ -+ error = ioctl(g_fd, ZFS_IOC_RECV, &zc); -+ if (error != 0) -+ error = errno; -+ -+out: -+ if (packed != NULL) -+ fnvlist_pack_free(packed, size); -+ free((void*)(uintptr_t)zc.zc_nvlist_dst); -+ return (error); -+} -+ -+/* -+ * Roll back this filesystem or volume to its most recent snapshot. -+ * If snapnamebuf is not NULL, it will be filled in with the name -+ * of the most recent snapshot. -+ * -+ * Return 0 on success or an errno on failure. -+ */ -+int -+lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen) -+{ -+ nvlist_t *args; -+ nvlist_t *result; -+ int err; -+ -+ args = fnvlist_alloc(); -+ err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); -+ nvlist_free(args); -+ if (err == 0 && snapnamebuf != NULL) { -+ const char *snapname = fnvlist_lookup_string(result, "target"); -+ (void) strlcpy(snapnamebuf, snapname, snapnamelen); -+ } -+ return (err); -+} -diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am -index cbba388..e4189a3 100644 ---- a/lib/libzpool/Makefile.am -+++ b/lib/libzpool/Makefile.am -@@ -27,2 +27,3 @@ libzpool_la_SOURCES = \ - $(top_srcdir)/module/zfs/dbuf.c \ -+ $(top_srcdir)/module/zfs/dbuf_stats.c \ - $(top_srcdir)/module/zfs/ddt.c \ -@@ -47,2 +48,4 @@ libzpool_la_SOURCES = \ - $(top_srcdir)/module/zfs/dsl_synctask.c \ -+ $(top_srcdir)/module/zfs/dsl_destroy.c \ -+ $(top_srcdir)/module/zfs/dsl_userhold.c \ - $(top_srcdir)/module/zfs/fm.c \ -@@ -62,2 +65,3 @@ libzpool_la_SOURCES = \ - $(top_srcdir)/module/zfs/spa_misc.c \ -+ $(top_srcdir)/module/zfs/spa_stats.c \ - $(top_srcdir)/module/zfs/space_map.c \ -@@ -99,3 +103,4 @@ libzpool_la_LIBADD = \ - --libzpool_la_LDFLAGS = -pthread -version-info 1:1:0 -+libzpool_la_LIBADD += $(ZLIB) -+libzpool_la_LDFLAGS = -version-info 2:0:0 - -diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c -index f7aeeb4..05bbd06 100644 ---- a/lib/libzpool/kernel.c -+++ b/lib/libzpool/kernel.c -@@ -36,2 +36,3 @@ - #include -+#include - #include -@@ -75,3 +76,3 @@ thread_init(void) - /* Create entry for primary kthread */ -- kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL); -+ kt = umem_zalloc(sizeof (kthread_t), UMEM_NOFAIL); - kt->t_tid = pthread_self(); -@@ -94,3 +95,3 @@ thread_fini(void) - -- umem_free(kt, sizeof(kthread_t)); -+ umem_free(kt, sizeof (kthread_t)); - -@@ -118,3 +119,3 @@ zk_thread_current(void) - -- return kt; -+ return (kt); - } -@@ -138,3 +139,3 @@ zk_thread_helper(void *arg) - -- return NULL; -+ return (NULL); - } -@@ -143,3 +144,3 @@ kthread_t * - zk_thread_create(caddr_t stk, size_t stksize, thread_func_t 
func, void *arg, -- size_t len, proc_t *pp, int state, pri_t pri, int detachstate) -+ size_t len, proc_t *pp, int state, pri_t pri, int detachstate) - { -@@ -151,3 +152,3 @@ zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, - -- kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL); -+ kt = umem_zalloc(sizeof (kthread_t), UMEM_NOFAIL); - kt->t_func = func; -@@ -189,3 +190,3 @@ zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, - -- return kt; -+ return (kt); - } -@@ -199,3 +200,3 @@ zk_thread_exit(void) - -- umem_free(kt, sizeof(kthread_t)); -+ umem_free(kt, sizeof (kthread_t)); - -@@ -225,4 +226,4 @@ zk_thread_join(kt_did_t tid) - kstat_t * --kstat_create(char *module, int instance, char *name, char *class, -- uchar_t type, ulong_t ndata, uchar_t ks_flag) -+kstat_create(const char *module, int instance, const char *name, -+ const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag) - { -@@ -241,2 +242,39 @@ kstat_delete(kstat_t *ksp) - -+/*ARGSUSED*/ -+void -+kstat_waitq_enter(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_waitq_exit(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_runq_enter(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_runq_exit(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_waitq_to_runq(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_runq_back_to_waitq(kstat_io_t *kiop) -+{} -+ -+void -+kstat_set_raw_ops(kstat_t *ksp, -+ int (*headers)(char *buf, size_t size), -+ int (*data)(char *buf, size_t size, void *data), -+ void *(*addr)(kstat_t *ksp, loff_t index)) -+{} -+ - /* -@@ -492,2 +530,37 @@ top: - -+/*ARGSUSED*/ -+clock_t -+cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, -+ int flag) -+{ -+ int error; -+ timestruc_t ts; -+ hrtime_t delta; -+ -+ ASSERT(flag == 0); -+ -+top: -+ delta = tim - gethrtime(); -+ if (delta <= 0) -+ return (-1); -+ -+ ts.tv_sec = delta / NANOSEC; -+ ts.tv_nsec = delta % NANOSEC; -+ -+ ASSERT(mutex_owner(mp) == curthread); -+ mp->m_owner = NULL; -+ error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts); -+ mp->m_owner = curthread; -+ -+ if (error == ETIME) -+ return (-1); -+ -+ if (error == EINTR) -+ goto top; -+ -+ ASSERT(error == 0); -+ -+ return (1); -+} -+ - void -@@ -666,3 +739,3 @@ vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, - */ -- abort(); -+ abort(); - } -@@ -1044,2 +1117,4 @@ kernel_init(int mode) - { -+ extern uint_t rrw_tsd_key; -+ - umem_nofail_callback(umem_out_of_memory); -@@ -1061,2 +1136,4 @@ kernel_init(int mode) - spa_init(mode); -+ -+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy); - } -@@ -1084,2 +1161,8 @@ crgetuid(cred_t *cr) - -+uid_t -+crgetruid(cred_t *cr) -+{ -+ return (0); -+} -+ - gid_t -diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c -index 96c0d5c..72807f6 100644 ---- a/lib/libzpool/taskq.c -+++ b/lib/libzpool/taskq.c -@@ -159,3 +159,3 @@ taskq_empty_ent(taskq_ent_t *t) - { -- return t->tqent_next == NULL; -+ return (t->tqent_next == NULL); - } -@@ -289,3 +289,4 @@ taskq_create(const char *name, int nthreads, pri_t pri, - tq->tq_task.tqent_prev = &tq->tq_task; -- tq->tq_threadlist = kmem_alloc(nthreads*sizeof(kthread_t *), KM_SLEEP); -+ tq->tq_threadlist = kmem_alloc(nthreads * sizeof (kthread_t *), -+ KM_SLEEP); - -diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am -index 9d44398..113cd0d 100644 ---- a/man/man1/Makefile.am -+++ b/man/man1/Makefile.am -@@ -1,3 +1,3 @@ --man_MANS = zhack.1 zpios.1 ztest.1 --EXTRA_DIST = $(man_MANS) -+dist_man_MANS = 
zhack.1 zpios.1 ztest.1 -+EXTRA_DIST = cstyle.1 - -diff --git a/man/man1/cstyle.1 b/man/man1/cstyle.1 -new file mode 100644 -index 0000000..f467c55 --- /dev/null -+++ b/man/man1/cstyle.1 -@@ -0,0 +1,167 @@ -+.\" Copyright 2009 Sun Microsystems, Inc. All rights reserved. -+.\" Use is subject to license terms. -+.\" -+.\" CDDL HEADER START -+.\" -+.\" The contents of this file are subject to the terms of the -+.\" Common Development and Distribution License (the "License"). -+.\" You may not use this file except in compliance with the License. -+.\" -+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+.\" or http://www.opensolaris.org/os/licensing. -+.\" See the License for the specific language governing permissions -+.\" and limitations under the License. -+.\" -+.\" When distributing Covered Code, include this CDDL HEADER in each -+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+.\" If applicable, add the following below this CDDL HEADER, with the -+.\" fields enclosed by brackets "[]" replaced with your own identifying -+.\" information: Portions Copyright [yyyy] [name of copyright owner] -+.\" -+.\" CDDL HEADER END -+.\" -+.TH cstyle 1 "28 March 2005" -+.SH NAME -+.I cstyle -+\- check for some common stylistic errors in C source files -+.SH SYNOPSIS -+\fBcstyle [-chpvCP] [-o constructs] [file...]\fP -+.LP -+.SH DESCRIPTION -+.IX "OS-Net build tools" "cstyle" "" "\fBcstyle\fP" -+.LP -+.I cstyle -+inspects C source files (*.c and *.h) for common stylistic errors. It -+attempts to check for the cstyle documented in -+\fIhttp://www.cis.upenn.edu/~lee/06cse480/data/cstyle.ms.pdf\fP. -+Note that there is much in that document that -+.I cannot -+be checked for; just because your code is \fBcstyle(1)\fP clean does not -+mean that you've followed Sun's C style. \fICaveat emptor\fP. -+.LP -+.SH OPTIONS -+.LP -+The following options are supported: -+.TP 4 -+.B \-c -+Check continuation line indentation inside of functions. Sun's C style -+states that all statements must be indented to an appropriate tab stop, -+and any continuation lines after them must be indented \fIexactly\fP four -+spaces from the start line. This option enables a series of checks -+designed to find continuation line problems within functions only. The -+checks have some limitations; see CONTINUATION CHECKING, below. -+.LP -+.TP 4 -+.B \-h -+Performs heuristic checks that are sometimes wrong. Not generally used. -+.LP -+.TP 4 -+.B \-p -+Performs some of the more picky checks. Includes ANSI #else and #endif -+rules, and tries to detect spaces after casts. Used as part of the -+putback checks. -+.LP -+.TP 4 -+.B \-v -+Verbose output; includes the text of the line of error, and, for -+\fB-c\fP, the first statement in the current continuation block. -+.LP -+.TP 4 -+.B \-C -+Ignore errors in header comments (i.e. block comments starting in the -+first column). Not generally used. -+.LP -+.TP 4 -+.B \-P -+Check for use of non-POSIX types. Historically, types like "u_int" and -+"u_long" were used, but they are now deprecated in favor of the POSIX -+types uint_t, ulong_t, etc. This detects any use of the deprecated -+types. Used as part of the putback checks. -+.LP -+.TP 4 -+.B \-o \fIconstructs\fP -+Allow a comma-separated list of additional constructs.
Available -+constructs include: -+.LP -+.TP 10 -+.B doxygen -+Allow doxygen-style block comments (\fB/**\fP and \fB/*!\fP) -+.LP -+.TP 10 -+.B splint -+Allow splint-style lint comments (\fB/*@...@*/\fP) -+.LP -+.SH NOTES -+.LP -+The cstyle rule for the OS/Net consolidation is that all new files must -+be \fB-pP\fP clean. For existing files, the following invocations are -+run against both the old and new files: -+.LP -+.TP 4 -+\fBcstyle file\fR -+.LP -+.TP 4 -+\fBcstyle -p file\fR -+.LP -+.TP 4 -+\fBcstyle -pP file\fR -+.LP -+If the old file gave no errors for one of the invocations, the new file -+must also give no errors. This way, files can only become more clean. -+.LP -+.SH CONTINUATION CHECKING -+.LP -+The continuation checker is a reasonably simple state machine that knows -+something about how C is laid out, and can match parenthesis, etc. over -+multiple lines. It does have some limitations: -+.LP -+.TP 4 -+.B 1. -+Preprocessor macros which cause unmatched parenthesis will confuse the -+checker for that line. To fix this, you'll need to make sure that each -+branch of the #if statement has balanced parenthesis. -+.LP -+.TP 4 -+.B 2. -+Some \fBcpp\fP macros do not require ;s after them. Any such macros -+*must* be ALL_CAPS; any lower case letters will cause bad output. -+.LP -+The bad output will generally be corrected after the next \fB;\fP, -+\fB{\fP, or \fB}\fP. -+.LP -+Some continuation error messages deserve some additional explanation: -+.LP -+.TP 4 -+.B -+multiple statements continued over multiple lines -+A multi-line statement which is not broken at statement -+boundaries. For example: -+.RS 4 -+.HP 4 -+if (this_is_a_long_variable == another_variable) a = -+.br -+b + c; -+.LP -+Will trigger this error. Instead, do: -+.HP 8 -+if (this_is_a_long_variable == another_variable) -+.br -+a = b + c; -+.RE -+.LP -+.TP 4 -+.B -+empty if/for/while body not on its own line -+For visibility, empty bodies for if, for, and while statements should be -+on their own line. For example: -+.RS 4 -+.HP 4 -+while (do_something(&x) == 0); -+.LP -+Will trigger this error. Instead, do: -+.HP 8 -+while (do_something(&x) == 0) -+.br -+; -+.RE -+ -diff --git a/man/man1/zhack.1 b/man/man1/zhack.1 -index 26a46f1..007be77 100644 ---- a/man/man1/zhack.1 -+++ b/man/man1/zhack.1 -@@ -25,4 +25,5 @@ - .TH zhack 1 "2013 MAR 16" "ZFS on Linux" "User Commands" -+ - .SH NAME --.BR zhack " \- libzpool debugging tool" -+zhack \- libzpool debugging tool - .SH DESCRIPTION -diff --git a/man/man5/Makefile.am b/man/man5/Makefile.am -index aac4d0b..fcb73f4 100644 ---- a/man/man5/Makefile.am -+++ b/man/man5/Makefile.am -@@ -1,3 +1,2 @@ --man_MANS = vdev_id.conf.5 zpool-features.5 --EXTRA_DIST = $(man_MANS) -+dist_man_MANS = vdev_id.conf.5 zpool-features.5 zfs-module-parameters.5 - -diff --git a/man/man5/vdev_id.conf.5 b/man/man5/vdev_id.conf.5 -index df3f59f..7ac3247 100644 ---- a/man/man5/vdev_id.conf.5 -+++ b/man/man5/vdev_id.conf.5 -@@ -51,5 +51,13 @@ connected to the disk enclosure being mapped. - .TP --\fIslot\fR --Maps a disk slot number as reported by the operating system --to an alternative slot number. -+\fIslot\fR [channel] -+Maps a disk slot number as reported by the operating system to an -+alternative slot number. If the \fIchannel\fR parameter is specified -+then the mapping is only applied to slots in the named channel, -+otherwise the mapping is applied to all channels. The first-specified -+\fIslot\fR rule that can match a slot takes precedence.
Therefore a -+channel-specific mapping for a given slot should generally appear before -+a generic mapping for the same slot. In this way a custom mapping may -+be applied to a particular channel and a default mapping applied to the -+others. -+ - .TP -@@ -86,46 +94,29 @@ arbitrary slot re-mapping. - .P -+.nf - multipath no --.br - topology sas_direct --.br - phys_per_port 4 --.br - --.br - # PCI_SLOT HBA PORT CHANNEL NAME --.br - channel 85:00.0 1 A --.br - channel 85:00.0 0 B --.br - channel 86:00.0 1 C --.br - channel 86:00.0 0 D --.br - --.br -+ # Custom mapping for Channel A -+ - # Linux Mapped --.br -- # Slot Slot --.br -- slot 1 7 --.br -- slot 2 10 --.br -- slot 3 3 --.br -- slot 4 6 --.br -- slot 5 2 --.br -- slot 6 8 --.br -- slot 7 1 --.br -- slot 8 4 --.br -- slot 9 9 --.br -- slot 10 5 --.br -+ # Slot Slot Channel -+ slot 1 7 A -+ slot 2 10 A -+ slot 3 3 A -+ slot 4 6 A -+ -+ # Default mapping for B, C, and D -+ -+ slot 1 4 -+ slot 2 2 -+ slot 3 1 -+ slot 4 3 -+.fi - .P -@@ -135,16 +126,11 @@ keyword takes only two arguments in this example. - .P -+.nf - topology sas_switch --.br - --.br - # SWITCH PORT CHANNEL NAME --.br - channel 1 A --.br - channel 2 B --.br - channel 3 C --.br - channel 4 D --.br -+.fi - .P -@@ -153,16 +139,11 @@ definitions - one per physical path. - .P -+.nf - multipath yes --.br - --.br - # PCI_SLOT HBA PORT CHANNEL NAME --.br - channel 85:00.0 1 A --.br - channel 85:00.0 0 B --.br - channel 86:00.0 1 A --.br - channel 86:00.0 0 B --.br -+.fi - .P -@@ -170,11 +151,8 @@ A configuration using device link aliases. - .P --.br -+.nf - # by-vdev --.br - # name fully qualified or base name of device link --.br - alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca --.br - alias d2 wwn-0x5000c5002def789e --.br -+.fi - .P -diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 -new file mode 100644 -index 0000000..e0d44d2 ---- /dev/null -+++ b/man/man5/zfs-module-parameters.5 -@@ -0,0 +1,1375 @@ -+'\" te -+.\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. -+.\" The contents of this file are subject to the terms of the Common Development -+.\" and Distribution License (the "License"). You may not use this file except -+.\" in compliance with the License. You can obtain a copy of the license at -+.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. -+.\" -+.\" See the License for the specific language governing permissions and -+.\" limitations under the License. When distributing Covered Code, include this -+.\" CDDL HEADER in each file and include the License file at -+.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this -+.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your -+.\" own identifying information: -+.\" Portions Copyright [yyyy] [name of copyright owner] -+.TH ZFS-MODULE-PARAMETERS 5 "Nov 16, 2013" -+.SH NAME -+zfs\-module\-parameters \- ZFS module parameters -+.SH DESCRIPTION -+.sp -+.LP -+Description of the different parameters to the ZFS module. -+ -+.SS "Module parameters" -+.sp -+.LP -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_feed_again\fR (int) -+.ad -+.RS 12n -+Turbo L2ARC warmup -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_feed_min_ms\fR (ulong) -+.ad -+.RS 12n -+Min feed interval in milliseconds -+.sp -+Default value: \fB200\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_feed_secs\fR (ulong) -+.ad -+.RS 12n -+Seconds between L2ARC writing -+.sp -+Default value: \fB1\fR. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_headroom\fR (ulong) -+.ad -+.RS 12n -+Number of max device writes to precache -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_headroom_boost\fR (ulong) -+.ad -+.RS 12n -+Compressed l2arc_headroom multiplier -+.sp -+Default value: \fB200\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_nocompress\fR (int) -+.ad -+.RS 12n -+Skip compressing L2ARC buffers -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_noprefetch\fR (int) -+.ad -+.RS 12n -+Skip caching prefetched buffers -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_norw\fR (int) -+.ad -+.RS 12n -+No reads during writes -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_write_boost\fR (ulong) -+.ad -+.RS 12n -+Extra write bytes during device warmup -+.sp -+Default value: \fB8,388,608\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_write_max\fR (ulong) -+.ad -+.RS 12n -+Max write bytes per interval -+.sp -+Default value: \fB8,388,608\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBmetaslab_debug\fR (int) -+.ad -+.RS 12n -+Keep space maps in core to verify frees -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspa_config_path\fR (charp) -+.ad -+.RS 12n -+SPA config file -+.sp -+Default value: \fB/etc/zfs/zpool.cache\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspa_asize_inflation\fR (int) -+.ad -+.RS 12n -+Multiplication factor used to estimate actual disk consumption from the -+size of data being written. The default value is a worst case estimate, -+but lower values may be valid for a given pool depending on its -+configuration. Pool administrators who understand the factors involved -+may wish to specify a more realistic inflation factor, particularly if -+they operate close to quota or capacity limits. -+.sp -+Default value: 24 -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfetch_array_rd_sz\fR (ulong) -+.ad -+.RS 12n -+Number of bytes in a array_read -+.sp -+Default value: \fB1,048,576\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfetch_block_cap\fR (uint) -+.ad -+.RS 12n -+Max number of blocks to fetch at a time -+.sp -+Default value: \fB256\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfetch_max_streams\fR (uint) -+.ad -+.RS 12n -+Max number of streams per zfetch -+.sp -+Default value: \fB8\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfetch_min_sec_reap\fR (uint) -+.ad -+.RS 12n -+Min time before stream reclaim -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_grow_retry\fR (int) -+.ad -+.RS 12n -+Seconds before growing arc size -+.sp -+Default value: \fB5\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_max\fR (ulong) -+.ad -+.RS 12n -+Max arc size -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_memory_throttle_disable\fR (int) -+.ad -+.RS 12n -+Disable memory throttle -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_meta_limit\fR (ulong) -+.ad -+.RS 12n -+Meta limit for arc size -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_meta_prune\fR (int) -+.ad -+.RS 12n -+Bytes of meta data to prune -+.sp -+Default value: \fB1,048,576\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_min\fR (ulong) -+.ad -+.RS 12n -+Min arc size -+.sp -+Default value: \fB100\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_min_prefetch_lifespan\fR (int) -+.ad -+.RS 12n -+Min life of prefetch block -+.sp -+Default value: \fB100\fR. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_p_aggressive_disable\fR (int) -+.ad -+.RS 12n -+Disable aggressive arc_p growth -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_p_dampener_disable\fR (int) -+.ad -+.RS 12n -+Disable arc_p adapt dampener -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_shrink_shift\fR (int) -+.ad -+.RS 12n -+log2(fraction of arc to reclaim) -+.sp -+Default value: \fB5\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_autoimport_disable\fR (int) -+.ad -+.RS 12n -+Disable pool import at module load -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dbuf_state_index\fR (int) -+.ad -+.RS 12n -+Calculate arc header index -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_deadman_enabled\fR (int) -+.ad -+.RS 12n -+Enable deadman timer -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_deadman_synctime_ms\fR (ulong) -+.ad -+.RS 12n -+Expiration time in milliseconds. This value has two meanings. First it is -+used to determine when the spa_deadman() logic should fire. By default the -+spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. -+Secondly, the value determines if an I/O is considered "hung". Any I/O that -+has not completed in zfs_deadman_synctime_ms is considered "hung" resulting -+in a zevent being logged. -+.sp -+Default value: \fB1,000,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dedup_prefetch\fR (int) -+.ad -+.RS 12n -+Enable prefetching dedup-ed blks -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_delay_min_dirty_percent\fR (int) -+.ad -+.RS 12n -+Start to delay each transaction once there is this amount of dirty data, -+expressed as a percentage of \fBzfs_dirty_data_max\fR. -+This value should be >= zfs_vdev_async_write_active_max_dirty_percent. -+See the section "ZFS TRANSACTION DELAY". -+.sp -+Default value: \fB60\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_delay_scale\fR (int) -+.ad -+.RS 12n -+This controls how quickly the transaction delay approaches infinity. -+Larger values cause longer delays for a given amount of dirty data. -+.sp -+For the smoothest delay, this value should be about 1 billion divided -+by the maximum number of operations per second. This will smoothly -+handle between 10x and 1/10th this number. -+.sp -+See the section "ZFS TRANSACTION DELAY". -+.sp -+Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64. -+.sp -+Default value: \fB500,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_max\fR (int) -+.ad -+.RS 12n -+Determines the dirty space limit in bytes. Once this limit is exceeded, new -+writes are halted until space frees up. This parameter takes precedence -+over \fBzfs_dirty_data_max_percent\fR. -+See the section "ZFS TRANSACTION DELAY". -+.sp -+Default value: 10 percent of all memory, capped at \fBzfs_dirty_data_max_max\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_max_max\fR (int) -+.ad -+.RS 12n -+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes. -+This limit is only enforced at module load time, and will be ignored if -+\fBzfs_dirty_data_max\fR is later changed. This parameter takes -+precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section -+"ZFS TRANSACTION DELAY". -+.sp -+Default value: 25% of physical RAM. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_max_max_percent\fR (int) -+.ad -+.RS 12n -+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a -+percentage of physical RAM. This limit is only enforced at module load -+time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed. -+The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this -+one. See the section "ZFS TRANSACTION DELAY". -+.sp -+Default value: 25 -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_max_percent\fR (int) -+.ad -+.RS 12n -+Determines the dirty space limit, expressed as a percentage of all -+memory. Once this limit is exceeded, new writes are halted until space frees -+up. The parameter \fBzfs_dirty_data_max\fR takes precedence over this -+one. See the section "ZFS TRANSACTION DELAY". -+.sp -+Default value: 10%, subject to \fBzfs_dirty_data_max_max\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_sync\fR (int) -+.ad -+.RS 12n -+Start syncing out a transaction group if there is at least this much dirty data. -+.sp -+Default value: \fB67,108,864\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_read_max_active\fR (int) -+.ad -+.RS 12n -+Maxium asynchronous read I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB3\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_read_min_active\fR (int) -+.ad -+.RS 12n -+Minimum asynchronous read I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB1\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_write_active_max_dirty_percent\fR (int) -+.ad -+.RS 12n -+When the pool has more than -+\fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use -+\fBzfs_vdev_async_write_max_active\fR to limit active async writes. If -+the dirty data is between min and max, the active I/O limit is linearly -+interpolated. See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB60\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_write_active_min_dirty_percent\fR (int) -+.ad -+.RS 12n -+When the pool has less than -+\fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use -+\fBzfs_vdev_async_write_min_active\fR to limit active async writes. If -+the dirty data is between min and max, the active I/O limit is linearly -+interpolated. See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB30\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_write_max_active\fR (int) -+.ad -+.RS 12n -+Maxium asynchronous write I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_write_min_active\fR (int) -+.ad -+.RS 12n -+Minimum asynchronous write I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB1\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_max_active\fR (int) -+.ad -+.RS 12n -+The maximum number of I/Os active to each device. Ideally, this will be >= -+the sum of each queue's max_active. It must be at least the sum of each -+queue's min_active. See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB1,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_scrub_max_active\fR (int) -+.ad -+.RS 12n -+Maxium scrub I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_scrub_min_active\fR (int) -+.ad -+.RS 12n -+Minimum scrub I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB1\fR. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_sync_read_max_active\fR (int) -+.ad -+.RS 12n -+Maxium synchronous read I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_sync_read_min_active\fR (int) -+.ad -+.RS 12n -+Minimum synchronous read I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_sync_write_max_active\fR (int) -+.ad -+.RS 12n -+Maxium synchronous write I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_sync_write_min_active\fR (int) -+.ad -+.RS 12n -+Minimum synchronous write I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_disable_dup_eviction\fR (int) -+.ad -+.RS 12n -+Disable duplicate buffer eviction -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_expire_snapshot\fR (int) -+.ad -+.RS 12n -+Seconds to expire .zfs/snapshot -+.sp -+Default value: \fB300\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_flags\fR (int) -+.ad -+.RS 12n -+Set additional debugging flags -+.sp -+Default value: \fB1\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_free_min_time_ms\fR (int) -+.ad -+.RS 12n -+Min millisecs to free per txg -+.sp -+Default value: \fB1,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_immediate_write_sz\fR (long) -+.ad -+.RS 12n -+Largest data block to write to zil -+.sp -+Default value: \fB32,768\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_mdcomp_disable\fR (int) -+.ad -+.RS 12n -+Disable meta data compression -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_no_scrub_io\fR (int) -+.ad -+.RS 12n -+Set for no scrub I/O -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_no_scrub_prefetch\fR (int) -+.ad -+.RS 12n -+Set for no scrub prefetching -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_nocacheflush\fR (int) -+.ad -+.RS 12n -+Disable cache flushes -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_nopwrite_enabled\fR (int) -+.ad -+.RS 12n -+Enable NOP writes -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_pd_blks_max\fR (int) -+.ad -+.RS 12n -+Max number of blocks to prefetch -+.sp -+Default value: \fB100\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_prefetch_disable\fR (int) -+.ad -+.RS 12n -+Disable all ZFS prefetching -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_read_chunk_size\fR (long) -+.ad -+.RS 12n -+Bytes to read per chunk -+.sp -+Default value: \fB1,048,576\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_read_history\fR (int) -+.ad -+.RS 12n -+Historic statistics for the last N reads -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_read_history_hits\fR (int) -+.ad -+.RS 12n -+Include cache hits in read history -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_recover\fR (int) -+.ad -+.RS 12n -+Set to attempt to recover from fatal errors. This should only be used as a -+last resort, as it typically results in leaked space, or worse. -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_resilver_delay\fR (int) -+.ad -+.RS 12n -+Number of ticks to delay resilver -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_resilver_min_time_ms\fR (int) -+.ad -+.RS 12n -+Min millisecs to resilver per txg -+.sp -+Default value: \fB3,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_scan_idle\fR (int) -+.ad -+.RS 12n -+Idle window in clock ticks -+.sp -+Default value: \fB50\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_scan_min_time_ms\fR (int) -+.ad -+.RS 12n -+Min millisecs to scrub per txg -+.sp -+Default value: \fB1,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_scrub_delay\fR (int) -+.ad -+.RS 12n -+Number of ticks to delay scrub -+.sp -+Default value: \fB4\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_send_corrupt_data\fR (int) -+.ad -+.RS 12n -+Allow to send corrupt data (ignore read/checksum errors when sending data) -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_sync_pass_deferred_free\fR (int) -+.ad -+.RS 12n -+Defer frees starting in this pass -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_sync_pass_dont_compress\fR (int) -+.ad -+.RS 12n -+Don't compress starting in this pass -+.sp -+Default value: \fB5\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_sync_pass_rewrite\fR (int) -+.ad -+.RS 12n -+Rewrite new bps starting in this pass -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_top_maxinflight\fR (int) -+.ad -+.RS 12n -+Max I/Os per top-level -+.sp -+Default value: \fB32\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_txg_history\fR (int) -+.ad -+.RS 12n -+Historic statistics for the last N txgs -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_txg_timeout\fR (int) -+.ad -+.RS 12n -+Max seconds worth of delta per txg -+.sp -+Default value: \fB5\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_aggregation_limit\fR (int) -+.ad -+.RS 12n -+Max vdev I/O aggregation size -+.sp -+Default value: \fB131,072\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_cache_bshift\fR (int) -+.ad -+.RS 12n -+Shift size to inflate reads too -+.sp -+Default value: \fB16\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_cache_max\fR (int) -+.ad -+.RS 12n -+Inflate reads small than max -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_cache_size\fR (int) -+.ad -+.RS 12n -+Total size of the per-disk cache -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_mirror_switch_us\fR (int) -+.ad -+.RS 12n -+Switch mirrors every N usecs -+.sp -+Default value: \fB10,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_read_gap_limit\fR (int) -+.ad -+.RS 12n -+Aggregate read I/O over gap -+.sp -+Default value: \fB32,768\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_scheduler\fR (charp) -+.ad -+.RS 12n -+I/O scheduler -+.sp -+Default value: \fBnoop\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_write_gap_limit\fR (int) -+.ad -+.RS 12n -+Aggregate write I/O over gap -+.sp -+Default value: \fB4,096\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_zevent_cols\fR (int) -+.ad -+.RS 12n -+Max event column width -+.sp -+Default value: \fB80\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_zevent_console\fR (int) -+.ad -+.RS 12n -+Log events to the console -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_zevent_len_max\fR (int) -+.ad -+.RS 12n -+Max event queue length -+.sp -+Default value: \fB0\fR. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzil_replay_disable\fR (int) -+.ad -+.RS 12n -+Disable intent logging replay -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzil_slog_limit\fR (ulong) -+.ad -+.RS 12n -+Max commit bytes to separate log device -+.sp -+Default value: \fB1,048,576\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzio_bulk_flags\fR (int) -+.ad -+.RS 12n -+Additional flags to pass to bulk buffers -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzio_delay_max\fR (int) -+.ad -+.RS 12n -+Max zio millisec delay before posting event -+.sp -+Default value: \fB30,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzio_injection_enabled\fR (int) -+.ad -+.RS 12n -+Enable fault injection -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzio_requeue_io_start_cut_in_line\fR (int) -+.ad -+.RS 12n -+Prioritize requeued I/O -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzvol_inhibit_dev\fR (uint) -+.ad -+.RS 12n -+Do not create zvol device nodes -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzvol_major\fR (uint) -+.ad -+.RS 12n -+Major number for zvol device -+.sp -+Default value: \fB230\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzvol_max_discard_blocks\fR (ulong) -+.ad -+.RS 12n -+Max number of blocks to discard at once -+.sp -+Default value: \fB16,384\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzvol_threads\fR (uint) -+.ad -+.RS 12n -+Number of threads for zvol device -+.sp -+Default value: \fB32\fR. -+.RE -+ -+.SH ZFS I/O SCHEDULER -+ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os. -+The I/O scheduler determines when and in what order those operations are -+issued. The I/O scheduler divides operations into five I/O classes -+prioritized in the following order: sync read, sync write, async read, -+async write, and scrub/resilver. Each queue defines the minimum and -+maximum number of concurrent operations that may be issued to the -+device. In addition, the device has an aggregate maximum, -+\fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums -+must not exceed the aggregate maximum. If the sum of the per-queue -+maximums exceeds the aggregate maximum, then the number of active I/Os -+may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will -+be issued regardless of whether all per-queue minimums have been met. -+.sp -+For many physical devices, throughput increases with the number of -+concurrent operations, but latency typically suffers. Further, physical -+devices typically have a limit at which more concurrent operations have no -+effect on throughput or can actually cause it to decrease. -+.sp -+The scheduler selects the next operation to issue by first looking for an -+I/O class whose minimum has not been satisfied. Once all are satisfied and -+the aggregate maximum has not been hit, the scheduler looks for classes -+whose maximum has not been satisfied. Iteration through the I/O classes is -+done in the order specified above. No further operations are issued if the -+aggregate maximum number of concurrent operations has been hit or if there -+are no operations queued for an I/O class that has not hit its maximum. -+Every time an I/O is queued or an operation completes, the I/O scheduler -+looks for new operations to issue. -+.sp -+In general, smaller max_active's will lead to lower latency of synchronous -+operations. 
Larger max_active's may lead to higher overall throughput, -+depending on underlying storage. -+.sp -+The ratio of the queues' max_actives determines the balance of performance -+between reads, writes, and scrubs. E.g., increasing -+\fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete -+more quickly, but reads and writes to have higher latency and lower throughput. -+.sp -+All I/O classes have a fixed maximum number of outstanding operations -+except for the async write class. Asynchronous writes represent the data -+that is committed to stable storage during the syncing stage for -+transaction groups. Transaction groups enter the syncing state -+periodically so the number of queued async writes will quickly burst up -+and then bleed down to zero. Rather than servicing them as quickly as -+possible, the I/O scheduler changes the maximum number of active async -+write I/Os according to the amount of dirty data in the pool. Since -+both throughput and latency typically increase with the number of -+concurrent operations issued to physical devices, reducing the -+burstiness in the number of concurrent operations also stabilizes the -+response time of operations from other -- and in particular synchronous -+-- queues. In broad strokes, the I/O scheduler will issue more -+concurrent operations from the async write queue as there's more dirty -+data in the pool. -+.sp -+Async Writes -+.sp -+The number of concurrent operations issued for the async write I/O class -+follows a piece-wise linear function defined by a few adjustable points. -+.nf -+ -+ | o---------| <-- zfs_vdev_async_write_max_active -+ ^ | /^ | -+ | | / | | -+active | / | | -+ I/O | / | | -+count | / | | -+ | / | | -+ |-------o | | <-- zfs_vdev_async_write_min_active -+ 0|_______^______|_________| -+ 0% | | 100% of zfs_dirty_data_max -+ | | -+ | `-- zfs_vdev_async_write_active_max_dirty_percent -+ `--------- zfs_vdev_async_write_active_min_dirty_percent -+ -+.fi -+Until the amount of dirty data exceeds a minimum percentage of the dirty -+data allowed in the pool, the I/O scheduler will limit the number of -+concurrent operations to the minimum. As that threshold is crossed, the -+number of concurrent operations issued increases linearly to the maximum at -+the specified maximum percentage of the dirty data allowed in the pool. -+.sp -+Ideally, the amount of dirty data on a busy pool will stay in the sloped -+part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR -+and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the -+maximum percentage, this indicates that the rate of incoming data is -+greater than the rate that the backend storage can handle. In this case, we -+must further throttle incoming writes, as described in the next section. -+ -+.SH ZFS TRANSACTION DELAY -+We delay transactions when we've determined that the backend storage -+isn't able to accommodate the rate of incoming writes. -+.sp -+If there is already a transaction waiting, we delay relative to when -+that transaction will finish waiting. This way the calculated delay time -+is independent of the number of threads concurrently executing -+transactions. -+.sp -+If we are the only waiter, wait relative to when the transaction -+started, rather than the current time. This credits the transaction for -+"time already served", e.g. reading indirect blocks. 
-+.sp -+The minimum time for a transaction to take is calculated as: -+.nf -+ min_time = zfs_delay_scale * (dirty - min) / (max - dirty) -+ min_time is then capped at 100 milliseconds. -+.fi -+.sp -+The delay has two degrees of freedom that can be adjusted via tunables. The -+percentage of dirty data at which we start to delay is defined by -+\fBzfs_delay_min_dirty_percent\fR. This should typically be at or above -+\fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to -+delay after writing at full speed has failed to keep up with the incoming write -+rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking, -+this variable determines the amount of delay at the midpoint of the curve. -+.sp -+.nf -+delay -+ 10ms +-------------------------------------------------------------*+ -+ | *| -+ 9ms + *+ -+ | *| -+ 8ms + *+ -+ | * | -+ 7ms + * + -+ | * | -+ 6ms + * + -+ | * | -+ 5ms + * + -+ | * | -+ 4ms + * + -+ | * | -+ 3ms + * + -+ | * | -+ 2ms + (midpoint) * + -+ | | ** | -+ 1ms + v *** + -+ | zfs_delay_scale ----------> ******** | -+ 0 +-------------------------------------*********----------------+ -+ 0% <- zfs_dirty_data_max -> 100% -+.fi -+.sp -+Note that since the delay is added to the outstanding time remaining on the -+most recent transaction, the delay is effectively the inverse of IOPS. -+Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve -+was chosen such that small changes in the amount of accumulated dirty data -+in the first 3/4 of the curve yield relatively small differences in the -+amount of delay. -+.sp -+The effects can be easier to understand when the amount of delay is -+represented on a log scale: -+.sp -+.nf -+delay -+100ms +-------------------------------------------------------------++ -+ + + -+ | | -+ + *+ -+ 10ms + *+ -+ + ** + -+ | (midpoint) ** | -+ + | ** + -+ 1ms + v **** + -+ + zfs_delay_scale ----------> ***** + -+ | **** | -+ + **** + -+100us + ** + -+ + * + -+ | * | -+ + * + -+ 10us + * + -+ + + -+ | | -+ + + -+ +--------------------------------------------------------------+ -+ 0% <- zfs_dirty_data_max -> 100% -+.fi -+.sp -+Note here that only as the amount of dirty data approaches its limit does -+the delay start to increase rapidly. The goal of a properly tuned system -+should be to keep the amount of dirty data out of that range by first -+ensuring that the appropriate limits are set for the I/O scheduler to reach -+optimal throughput on the backend storage, and then by changing the value -+of \fBzfs_delay_scale\fR to increase the steepness of the curve. 
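The interaction between the scheduler limits and the write throttle described in the two man page sections above can be made concrete with a small, self-contained C sketch. It only re-evaluates the documented formulas (linear interpolation of active async writes between the min/max dirty percentages, and min_time = zfs_delay_scale * (dirty - min) / (max - dirty) capped at 100 ms); it is not the kernel implementation, and the 1 GiB zfs_dirty_data_max used below is an arbitrary assumption chosen purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* Piece-wise linear scaling of active async writes with dirty data. */
static int
async_write_active(uint64_t dirty, uint64_t dirty_max, int min_active,
    int max_active, int min_pct, int max_pct)
{
	uint64_t lo = dirty_max * min_pct / 100;
	uint64_t hi = dirty_max * max_pct / 100;

	if (dirty <= lo)
		return (min_active);
	if (dirty >= hi)
		return (max_active);
	return (min_active + (int)((dirty - lo) *
	    (uint64_t)(max_active - min_active) / (hi - lo)));
}

/* min_time = zfs_delay_scale * (dirty - min) / (max - dirty), capped at 100 ms. */
static uint64_t
delay_min_time_ns(uint64_t dirty, uint64_t dirty_max, uint64_t delay_scale,
    int delay_min_dirty_pct)
{
	uint64_t lo = dirty_max * delay_min_dirty_pct / 100;
	uint64_t cap = 100ULL * 1000 * 1000;	/* 100 ms in nanoseconds */
	uint64_t ns;

	if (dirty <= lo)
		return (0);
	if (dirty >= dirty_max)
		return (cap);
	ns = delay_scale * (dirty - lo) / (dirty_max - dirty);
	return (ns < cap ? ns : cap);
}

int
main(void)
{
	uint64_t dirty_max = 1ULL << 30;	/* assumed zfs_dirty_data_max: 1 GiB */
	uint64_t dirty;

	/* Defaults from above: 1/10 active writes between 30%/60%, scale 500,000, delay from 60%. */
	for (dirty = 0; dirty <= dirty_max; dirty += dirty_max / 20) {
		(void) printf("dirty %3llu%%  async_write_active %2d  delay %8llu ns\n",
		    (unsigned long long)(dirty * 100 / dirty_max),
		    async_write_active(dirty, dirty_max, 1, 10, 30, 60),
		    (unsigned long long)delay_min_time_ns(dirty, dirty_max,
		    500000, 60));
	}
	return (0);
}

With the default tunables this prints a delay of zero until 60% dirty, roughly 500 us at the midpoint between the delay threshold and zfs_dirty_data_max, and a rapid climb toward the 100 ms cap as dirty data approaches the limit, matching the plots above.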
-diff --git a/man/man8/.gitignore b/man/man8/.gitignore -new file mode 100644 -index 0000000..be7e904 ---- /dev/null -+++ b/man/man8/.gitignore -@@ -0,0 +1 @@ -+/zed.8 -diff --git a/man/man8/Makefile.am b/man/man8/Makefile.am -index be7bc1d..b89e34d 100644 ---- a/man/man8/Makefile.am -+++ b/man/man8/Makefile.am -@@ -1,2 +1,2 @@ --man_MANS = \ -+dist_man_MANS = \ - fsck.zfs.8 \ -@@ -10,3 +10,21 @@ man_MANS = \ - --EXTRA_DIST = $(man_MANS) -+nodist_man_MANS = \ -+ zed.8 -+ -+EXTRA_DIST = \ -+ zed.8.in -+ -+zed.8: $(srcdir)/zed.8.in -+ -+do_subst = $(SED) \ -+ -e 's|@libexecdir[@]|$(libexecdir)|g' \ -+ -e 's|@runstatedir[@]|$(runstatedir)|g' \ -+ -e 's|@sysconfdir[@]|$(sysconfdir)|g' -+ -+$(nodist_man_MANS): Makefile -+ $(RM) $@ $@.tmp -+ srcdir=''; \ -+ test -f ./$@.in || srcdir=$(srcdir)/; \ -+ $(do_subst) $${srcdir}$@.in >$@.tmp -+ mv $@.tmp $@ - -@@ -14 +32,4 @@ install-data-local: - $(INSTALL) -d -m 0755 "$(DESTDIR)$(mandir)/man8" -+ -+CLEANFILES = \ -+ $(nodist_man_MANS) -diff --git a/man/man8/fsck.zfs.8 b/man/man8/fsck.zfs.8 -index 08b4308..baa8c33 100644 ---- a/man/man8/fsck.zfs.8 -+++ b/man/man8/fsck.zfs.8 -@@ -27,3 +27,3 @@ - .SH NAME --.BR fsck.zfs " \- Dummy ZFS filesystem checker." -+fsck.zfs \- Dummy ZFS filesystem checker. - -diff --git a/man/man8/mount.zfs.8 b/man/man8/mount.zfs.8 -index 60c36fe..b4e2406 100644 ---- a/man/man8/mount.zfs.8 -+++ b/man/man8/mount.zfs.8 -@@ -77,2 +77,15 @@ Print the usage message. - .TP -+.BI "\-o context" -+This flag sets the SELinux context for all files in the filesytem -+under that mountpoint. -+.TP -+.BI "\-o fscontext" -+This flag sets the SELinux context for the filesytem being mounted. -+.TP -+.BI "\-o defcontext" -+This flag sets the SELinux context for unlabled files. -+.TP -+.BI "\-o rootcontext" -+This flag sets the SELinux context for the root inode of the filesystem. -+.TP - .BI "\-o legacy" -diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 -index 364cf30..6f4f40d 100644 ---- a/man/man8/zdb.8 -+++ b/man/man8/zdb.8 -@@ -135,2 +135,8 @@ the allocated (physically present on disk) and referenced (logically - referenced in the pool) block counts and sizes by reference count. -+.sp -+If specified a third time, display the statistics independently for each deduplication table. -+.sp -+If specified a fourth time, dump the contents of the deduplication tables describing duplicate blocks. -+.sp -+If specified a fifth time, also dump the contents of the deduplication tables describing unique blocks. - .RE -diff --git a/man/man8/zed.8.in b/man/man8/zed.8.in -new file mode 100644 -index 0000000..b853d86 ---- /dev/null -+++ b/man/man8/zed.8.in -@@ -0,0 +1,265 @@ -+.\" -+.\" CDDL HEADER START -+.\" -+.\" The contents of this file are subject to the terms of the -+.\" Common Development and Distribution License (the "License"). -+.\" You may not use this file except in compliance with the License. -+.\" -+.\" You can obtain a copy of the license from the top-level -+.\" OPENSOLARIS.LICENSE or . -+.\" See the License for the specific language governing permissions -+.\" and limitations under the License. -+.\" -+.\" When distributing Covered Code, include this CDDL HEADER in each file -+.\" and include the License file from the top-level OPENSOLARIS.LICENSE. 
-+.\" If applicable, add the following below this CDDL HEADER, with the -+.\" fields enclosed by brackets "[]" replaced with your own identifying -+.\" information: Portions Copyright [yyyy] [name of copyright owner] -+.\" -+.\" CDDL HEADER END -+.\" -+.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+.\" Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+.\" -+.TH ZED 8 "Octember 1, 2013" "ZFS on Linux" "System Administration Commands" -+ -+.SH NAME -+zed \- ZFS Event Daemon -+ -+.SH SYNOPSIS -+.HP -+.B zed -+.\" [\fB\-c\fR \fIconfigfile\fR] -+[\fB\-d\fR \fIscriptdir\fR] -+[\fB\-f\fR] -+[\fB\-F\fR] -+[\fB\-h\fR] -+[\fB\-L\fR] -+[\fB\-M\fR] -+[\fB\-p\fR \fIpidfile\fR] -+[\fB\-s\fR \fIstatefile\fR] -+[\fB\-v\fR] -+[\fB\-V\fR] -+[\fB\-Z\fR] -+ -+.SH DESCRIPTION -+.PP -+\fBzed\fR (ZFS Event Daemon) monitors events generated by the ZFS kernel -+module. When a ZFS event (zevent) is posted, \fBzed\fR will run any scripts -+that have been enabled for the corresponding zevent class. -+ -+.SH OPTIONS -+.TP -+.BI \-h -+Display a summary of the command-line options. -+.TP -+.BI \-L -+Display license information. -+.TP -+.BI \-V -+Display version information. -+.TP -+.BI \-v -+Be verbose. -+.TP -+.BI \-f -+Force the daemon to run if at all possible, disabling security checks and -+throwing caution to the wind. Not recommended for use in production. -+.TP -+.BI \-F -+Run the daemon in the foreground. -+.TP -+.BI \-M -+Lock all current and future pages in the virtual memory address space. -+This may help the daemon remain responsive when the system is under heavy -+memory pressure. -+.TP -+.BI \-Z -+Zero the daemon's state, thereby allowing zevents still within the kernel -+to be reprocessed. -+.\" .TP -+.\" .BI \-c\ configfile -+.\" Read the configuration from the specified file. -+.TP -+.BI \-d\ scriptdir -+Read the enabled scripts from the specified directory. -+.TP -+.BI \-p\ pidfile -+Write the daemon's process ID to the specified file. -+.TP -+.BI \-s\ statefile -+Write the daemon's state to the specified file. -+ -+.SH ZEVENTS -+.PP -+A zevent is comprised of a list of name/value pairs (nvpairs). Each zevent -+contains an EID (Event IDentifier) that uniquely identifies it throughout -+the lifetime of the loaded ZFS kernel module; this EID is a monotonically -+increasing integer that resets to 1 each time the kernel module is loaded. -+Each zevent also contains a class string that identifies the type of event. -+For brevity, a subclass string is defined that omits the leading components -+of the class string. Additional nvpairs exist to provide event details. -+.PP -+The kernel maintains a list of recent zevents that can be viewed (along with -+their associated lists of nvpairs) using the "\fBzpool events \-v\fR" command. -+ -+.SH CONFIGURATION -+.PP -+The scripts to be invoked in response to zevents are located in the -+enabled-scripts directory. These can be symlinked or copied from the -+installed-scripts directory; symlinks allow for automatic updates from the -+installed scripts, whereas copies preserve local modifications. As a security -+measure, scripts must be owned by root. They must have execute permissions -+for the user, but they must not have write permissions for group or other. -+Dotfiles are ignored. -+.PP -+Scripts are named after the zevent class for which they should be invoked. 
-+In particular, a script will be invoked for a given zevent if either its -+class or subclass string is a prefix of its filename (and is followed by -+a non-alphabetic character). As a special case, the prefix "all" matches -+all zevents. Multiple scripts may be invoked for a given zevent. -+ -+.SH SCRIPTS -+.PP -+Scripts should be written under the presumption they can be invoked -+concurrently, and they should use appropriate locking to access any shared -+resources. Common variables used by the scripts can be stored in the default -+rc file which is sourced by the scripts; these variables should be prefixed -+with "ZED_". -+.PP -+The zevent nvpairs are passed to the scripts as environment variables. -+Each nvpair name is converted to an environment variable in the following -+manner: 1) it is prefixed with "ZEVENT_", 2) it is converted to uppercase, -+and 3) each non-alphanumeric character is converted to an underscore. -+Some additional environment variables have been defined to present certain -+nvpair values in a more convenient form. An incomplete list of zevent -+environment variables is as follows: -+.TP -+.B -+ZEVENT_EID -+The Event IDentifier. -+.TP -+.B -+ZEVENT_CLASS -+The zevent class string. -+.TP -+.B -+ZEVENT_SUBCLASS -+The zevent subclass string. -+.TP -+.B -+ZEVENT_TIME -+The time at which the zevent was posted as -+"\fIseconds\fR\ \fInanoseconds\fR" since the Epoch. -+.TP -+.B -+ZEVENT_TIME_SECS -+The \fIseconds\fR component of ZEVENT_TIME. -+.TP -+.B -+ZEVENT_TIME_NSECS -+The \fInanoseconds\fR component of ZEVENT_TIME. -+.TP -+.B -+ZEVENT_TIME_STRING -+An almost-RFC3339-compliant string for ZEVENT_TIME. -+.PP -+Additionally, the following ZED & ZFS variables are defined: -+.TP -+.B -+ZED_PID -+The daemon's process ID. -+.TP -+.B -+ZED_SCRIPT_DIR -+The daemon's current enabled-scripts directory. -+.TP -+.B -+ZFS_ALIAS -+The ZFS alias (name-ver-rel) string used to build the daemon. -+.TP -+.B -+ZFS_VERSION -+The ZFS version used to build the daemon. -+.TP -+.B -+ZFS_RELEASE -+The ZFS release used to build the daemon. -+.PP -+Scripts may need to call other ZFS commands. The installation paths of -+the following executables are defined: \fBZDB\fR, \fBZED\fR, \fBZFS\fR, -+\fBZINJECT\fR, and \fBZPOOL\fR. These variables can be overridden in the -+zed.rc if needed. -+ -+.SH FILES -+.\" .TP -+.\" @sysconfdir@/zfs/zed.conf -+.\" The default configuration file for the daemon. -+.TP -+.I @sysconfdir@/zfs/zed.d -+The default directory for enabled scripts. -+.TP -+.I @sysconfdir@/zfs/zed.d/zed.rc -+The default rc file for common variables used by the scripts. -+.TP -+.I @libexecdir@/zfs/zed.d -+The default directory for installed scripts. -+.TP -+.I @runstatedir@/zed.pid -+The default file containing the daemon's process ID. -+.TP -+.I @runstatedir@/zed.state -+The default file containing the daemon's state. -+ -+.SH SIGNALS -+.TP -+.B HUP -+Reconfigure the daemon and rescan the directory for enabled scripts. -+.TP -+.B TERM -+Terminate the daemon. -+ -+.SH NOTES -+.PP -+\fBzed\fR requires root privileges. -+.\" Do not taunt zed. -+ -+.SH BUGS -+.PP -+Events are processed synchronously by a single thread. This can delay the -+processing of simultaneous zevents. -+.PP -+There is no maximum timeout for script execution. Consequently, a misbehaving -+script can delay the processing of subsequent zevents. -+.PP -+The ownership and permissions of the enabled-scripts directory (along -+with all parent directories) are not checked. 
If any of these directories -+are improperly owned or permissioned, an unprivileged user could insert a -+script to be executed as root. The requirement that scripts be owned by -+root mitigates this to some extent. -+.PP -+Scripts are unable to return state/status information to the kernel. -+.PP -+Some zevent nvpair types are not handled. These are denoted by zevent -+environment variables having a "_NOT_IMPLEMENTED_" value. -+.PP -+Internationalization support via gettext has not been added. -+.PP -+The configuration file is not yet implemented. -+.PP -+The diagnosis engine is not yet implemented. -+ -+.SH COPYRIGHT -+.PP -+Developed at Lawrence Livermore National Laboratory (LLNL\-CODE\-403049). -+.br -+Copyright (C) 2013\-2014 Lawrence Livermore National Security, LLC. -+ -+.SH LICENSE -+.PP -+\fBzed\fR (ZFS Event Daemon) is distributed under the terms of the -+Common Development and Distribution License (CDDL\-1.0). -+ -+.SH SEE ALSO -+.BR zfs (8), -+.BR zpool (8) -diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 -index a0089e6..a45f640 100644 ---- a/man/man8/zfs.8 -+++ b/man/man8/zfs.8 -@@ -29,3 +29,3 @@ - .\" --.TH zfs 8 "Jan 10, 2013" "ZFS pool 28, filesystem 5" "System Administration Commands" -+.TH zfs 8 "Nov 19, 2013" "ZFS pool 28, filesystem 5" "System Administration Commands" - .SH NAME -@@ -60,4 +60,4 @@ zfs \- configures ZFS file systems - .nf --\fBzfs\fR \fBsnapshot | snap\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... -- \fIfilesystem@snapname\fR|\fIvolume@snapname\fR -+\fBzfs\fR \fBsnapshot | snap\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... -+ \fIfilesystem@snapname\fR|\fIvolume@snapname\fR ... - .fi -@@ -97,4 +97,4 @@ zfs \- configures ZFS file systems - .nf --\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-H\fR][\fB-o\fR \fIproperty\fR[,...]] [\fB-t\fR \fItype\fR[,...]] -- [\fB-s\fR \fIproperty\fR] ... [\fB-S\fR \fIproperty\fR] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fIsnap\fR] ... -+\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIproperty\fR[,\fIproperty\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]..] -+ [\fB-s\fR \fIproperty\fR] ... [\fB-S\fR \fIproperty\fR] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR] ... - .fi -@@ -246,3 +246,2 @@ zfs \- configures ZFS file systems - .SH DESCRIPTION --.sp - .LP -@@ -297,3 +296,2 @@ A read-only version of a file system or volume at a given point in time. It is s - .SS "ZFS File System Hierarchy" --.sp - .LP -@@ -307,3 +305,2 @@ See \fBzpool\fR(8) for more information on creating and administering pools. - .SS "Snapshots" --.sp - .LP -@@ -312,3 +309,3 @@ A snapshot is a read-only copy of a file system or volume. Snapshots can be crea - .LP --Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, but cannot be accessed independently. -+Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back. Visibility is determined by the \fBsnapdev\fR property of the parent volume. 
- .sp -@@ -317,3 +314,2 @@ File system snapshots can be accessed under the \fB\&.zfs/snapshot\fR directory - .SS "Clones" --.sp - .LP -@@ -327,3 +323,2 @@ The clone parent-child dependency relationship can be reversed by using the \fBp - .SS "Mount Points" --.sp - .LP -@@ -343,3 +338,2 @@ If needed, \fBZFS\fR file systems can also be managed with traditional tools (\f - .SS "Deduplication" --.sp - .LP -@@ -347,3 +341,2 @@ Deduplication is the process for removing redundant data at the block-level, red - .SS "Native Properties" --.sp - .LP -@@ -433,2 +426,36 @@ This property is \fBon\fR if the snapshot has been marked for deferred destructi - .na -+\fB\fBlogicalreferenced\fR\fR -+.ad -+.sp .6 -+.RS 4n -+The amount of space that is "logically" accessible by this dataset. See -+the \fBreferenced\fR property. The logical space ignores the effect of -+the \fBcompression\fR and \fBcopies\fR properties, giving a quantity -+closer to the amount of data that applications see. However, it does -+include space consumed by metadata. -+.sp -+This property can also be referred to by its shortened column name, -+\fBlrefer\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fB\fBlogicalused\fR\fR -+.ad -+.sp .6 -+.RS 4n -+The amount of space that is "logically" consumed by this dataset and all -+its descendents. See the \fBused\fR property. The logical space -+ignores the effect of the \fBcompression\fR and \fBcopies\fR properties, -+giving a quantity closer to the amount of data that applications see. -+However, it does include space consumed by metadata. -+.sp -+This property can also be referred to by its shortened column name, -+\fBlused\fR. -+.RE -+ -+.sp -+.ne 2 -+.na - \fB\fBmounted\fR\fR -@@ -678,2 +705,4 @@ Controls how \fBACL\fR entries are inherited when files and directories are crea - When the property value is set to \fBpassthrough\fR, files are created with a mode determined by the inheritable \fBACE\fRs. If no inheritable \fBACE\fRs exist that affect the mode, then the mode is set in accordance to the requested mode from the application. -+.sp -+The \fBaclinherit\fR property does not apply to Posix ACLs. - .RE -@@ -684,3 +713,3 @@ When the property value is set to \fBpassthrough\fR, files are created with a mo - .na --\fB\fBaclmode\fR=\fBdiscard\fR | \fBgroupmask\fR | \fBpassthrough\fR\fR -+\fB\fBacltype\fR=\fBnoacl\fR | \fBposixacl\fR \fR - .ad -@@ -688,3 +717,15 @@ When the property value is set to \fBpassthrough\fR, files are created with a mo - .RS 4n --Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with an \fBaclmode\fR property of \fBdiscard\fR deletes all \fBACL\fR entries that do not represent the mode of the file. An \fBaclmode\fR property of \fBgroupmask\fR (the default) reduces user or group permissions. The permissions are reduced, such that they are no greater than the group permission bits, unless it is a user entry that has the same \fBUID\fR as the owner of the file or directory. In this case, the \fBACL\fR permissions are reduced so that they are no greater than owner permission bits. A file system with an \fBaclmode\fR property of \fBpassthrough\fR indicates that no changes are made to the \fBACL\fR other than generating the necessary \fBACL\fR entries to represent the new mode of the file or directory. -+Controls whether ACLs are enabled and if so what type of ACL to use. When -+a file system has the \fBacltype\fR property set to \fBnoacl\fR (the default) -+then ACLs are disabled. 
Setting the \fBacltype\fR property to \fBposixacl\fR -+indicates Posix ACLs should be used. Posix ACLs are specific to Linux and -+are not functional on other platforms. Posix ACLs are stored as an xattr and -+therefore will not overwrite any existing ZFS/NFSv4 ACLs which may be set. -+Currently only \fBposixacls\fR are supported on Linux. -+.sp -+To obtain the best performance when setting \fBposixacl\fR users are strongly -+encouraged to set the \fBxattr=sa\fR property. This will result in the -+Posix ACL being stored more efficiently on disk. But as a consequence of this -+all new xattrs will only be accessable from ZFS implementations which support -+the \fBxattr=sa\fR property. See the \fBxattr\fR property for more details. - .RE -@@ -699,3 +740,3 @@ Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with - .RS 4n --Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value is \fBon\fR. -+Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value is \fBon\fR. See also \fBrelatime\fR below. - .RE -@@ -986,2 +1027,13 @@ This property can also be referred to by its shortened column name, \fBrefreserv - .na -+\fB\fBrelatime\fR=\fBon\fR | \fBoff\fR\fR -+.ad -+.sp .6 -+.RS 4n -+Controls the manner in which the access time is updated when \fBatime=on\fR is set. Turning this property \fBon\fR causes the access time to be updated relative to the modify or change time. Access time is only updated if the previous access time was earlier than the current modify or change time or if the existing access time hasn't been updated within the past 24 hours. The default value is \fBoff\fR. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na - \fB\fBreservation\fR=\fIsize\fR | \fBnone\fR\fR -@@ -1178,3 +1230,3 @@ Controls whether regular files should be scanned for viruses when a file is open - .na --\fB\fBxattr\fR=\fBon\fR | \fBoff\fR\fR -+\fB\fBxattr\fR=\fBon\fR | \fBoff\fR | \fBsa\fR\fR - .ad -@@ -1182,3 +1234,25 @@ Controls whether regular files should be scanned for viruses when a file is open - .RS 4n --Controls whether extended attributes are enabled for this file system. The default value is \fBon\fR. -+Controls whether extended attributes are enabled for this file system. Two -+styles of extended attributes are supported either directory based or system -+attribute based. -+.sp -+The default value of \fBon\fR enables directory based extended attributes. -+This style of xattr imposes no practical limit on either the size or number of -+xattrs which may be set on a file. Although under Linux the \fBgetxattr\fR(2) -+and \fBsetxattr\fR(2) system calls limit the maximum xattr size to 64K. This -+is the most compatible style of xattr and it is supported by the majority of -+ZFS implementations. -+.sp -+System attribute based xattrs may be enabled by setting the value to \fBsa\fR. -+The key advantage of this type of xattr is improved performance. Storing -+xattrs as system attributes significantly decreases the amount of disk IO -+required. Up to 64K of xattr data may be stored per file in the space reserved -+for system attributes. 
If there is not enough space available for an xattr then -+it will be automatically written as a directory based xattr. System attribute -+based xattrs are not accessable on platforms which do not support the -+\fBxattr=sa\fR feature. -+.sp -+The use of system attribute based xattrs is strongly encouraged for users of -+SELinux or Posix ACLs. Both of these features heavily rely of xattrs and -+benefit significantly from the reduced xattr access time. - .RE -@@ -1237,4 +1311,49 @@ Indicates whether the file system should reject file names that include characte - The \fBcasesensitivity\fR, \fBnormalization\fR, and \fButf8only\fR properties are also new permissions that can be assigned to non-privileged users by using the \fBZFS\fR delegated administration feature. --.SS "Temporary Mount Point Properties" -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na -+\fB\fBcontext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR -+.ad -+.sp .6 -+.RS 4n -+This flag sets the SELinux context for all files in the filesytem under the mountpoint for that filesystem. See \fBselinux\fR(8) for more information. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na -+\fB\fBfscontext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR -+.ad -+.sp .6 -+.RS 4n -+This flag sets the SELinux context for the filesytem being mounted. See \fBselinux\fR(8) for more information. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na -+\fB\fBdefntext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR -+.ad -+.sp .6 -+.RS 4n -+This flag sets the SELinux context for unlabeled files. See \fBselinux\fR(8) for more information. -+.RE -+ - .sp -+.ne 2 -+.mk -+.na -+\fB\fBrootcontext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR -+.ad -+.sp .6 -+.RS 4n -+This flag sets the SELinux context for the root inode of the filesystem. See \fBselinux\fR(8) for more information. -+.RE -+ -+.SS "Temporary Mount Point Properties" - .LP -@@ -1258,3 +1377,2 @@ In addition, these options can be set on a per-mount basis using the \fB-o\fR op - .SS "User Properties" --.sp - .LP -@@ -1271,3 +1389,2 @@ The values of user properties are arbitrary strings, are always inherited, and a - .SS "ZFS Volumes as Swap" --.sp - .LP -@@ -1278,3 +1395,2 @@ with the \fBzfs create\fR command set up and enable the swap area using the - .SH SUBCOMMANDS --.sp - .LP -@@ -1513,3 +1629,5 @@ Destroy (or mark for deferred destruction) all snapshots with this name in desce - .RS 4n --Recursively destroy all dependents. -+Recursively destroy all clones of these snapshots, including the clones, -+snapshots, and children. If this flag is specified, the \fB-d\fR flag will -+have no effect. - .RE -@@ -1549,3 +1667,3 @@ Print verbose information about the deleted data. - .sp --Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR -+Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR - options, as they can destroy large portions of a pool and cause unexpected -@@ -1560,3 +1678,3 @@ behavior for mounted file systems in use. - .na --\fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR -+\fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR ... - .ad -@@ -1564,3 +1682,3 @@ behavior for mounted file systems in use. - .RS 4n --Creates a snapshot with the given name. All previous modifications by successful system calls to the file system are part of the snapshot. 
See the "Snapshots" section for details. -+Creates snapshots with the given names. All previous modifications by successful system calls to the file system are part of the snapshots. Snapshots are taken atomically, so that all snapshots correspond to the same moment in time. See the "Snapshots" section for details. - .sp -@@ -1573,3 +1691,3 @@ Creates a snapshot with the given name. All previous modifications by successful - .RS 4n --Recursively create snapshots of all descendent datasets. Snapshots are taken atomically, so that all recursive snapshots correspond to the same moment in time. -+Recursively create snapshots of all descendent datasets. - .RE -@@ -1736,3 +1854,3 @@ Recursively rename the snapshots of all descendent datasets. Snapshots are the o - .na --\fB\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-H\fR] [\fB-o\fR \fIproperty\fR[,\fI\&...\fR]] [ \fB-t\fR \fItype\fR[,\fI\&...\fR]] [ \fB-s\fR \fIproperty\fR ] ... [ \fB-S\fR \fIproperty\fR ] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fIsnap\fR] ...\fR -+\fB\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-Hp\fR] [\fB-o\fR \fIproperty\fR[,\fI\&...\fR]] [ \fB-t\fR \fItype\fR[,\fI\&...\fR]] [ \fB-s\fR \fIproperty\fR ] ... [ \fB-S\fR \fIproperty\fR ] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR] ...\fR - .ad -@@ -1740,3 +1858,3 @@ Recursively rename the snapshots of all descendent datasets. Snapshots are the o - .RS 4n --Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all file systems and volumes are displayed. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the default is \fBoff\fR) . The following fields are displayed, \fBname,used,available,referenced,mountpoint\fR. -+Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all file systems and volumes are displayed. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the default is \fBoff\fR). When listing hundreds or thousands of snapshots performance can be improved by restricting the output to only the name. In that case, it is recommended to use \fB-o name -s name\fR. The following fields are displayed by default, \fBname,used,available,referenced,mountpoint\fR. - .sp -@@ -1756,2 +1874,12 @@ Used for scripting mode. Do not print headers and separate fields by a single ta - .na -+\fB\fB-p\fR\fR -+.sp .6 -+.RS 4n -+Display numbers in parsable (exact) values. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na - \fB\fB-r\fR\fR -@@ -1865,3 +1993,3 @@ Same as the \fB-s\fR option, but sorts by property in descending order. - .RS 4n --A comma-separated list of types to display, where \fItype\fR is one of \fBfilesystem\fR, \fBsnapshot\fR , \fBvolume\fR, or \fBall\fR. For example, specifying \fB-t snapshot\fR displays only snapshots. -+A comma-separated list of types to display, where \fItype\fR is one of \fBfilesystem\fR, \fBsnapshot\fR, \fBsnap\fR, \fBvolume\fR, or \fBall\fR. For example, specifying \fB-t snapshot\fR displays only snapshots. - .RE -@@ -1883,4 +2011,3 @@ Sets the property to the given value for each dataset. Only some properties can - .ne 2 --.mk --.na -+.mk .na - \fB\fBzfs get\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-Hp\fR] [\fB-o\fR \fIfield\fR[,...] [\fB-t\fR \fItype\fR[,...]] [\fB-s\fR \fIsource\fR[,...] "\fIall\fR" | \fIproperty\fR[,...] 
\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR -@@ -1968,3 +2095,3 @@ A comma-separated list of sources to display. Those properties coming from a sou - .RS 4n --Display numbers in parseable (exact) values. -+Display numbers in parsable (exact) values. - .RE -@@ -2674,4 +2801,4 @@ userused other Allows reading any userused@... property - -+acltype property - aclinherit property --aclmode property - atime property -@@ -2908,3 +3035,3 @@ F Regular file - .RS 4n --Give more parseable tab-separated output, without header lines and without arrows. -+Give more parsable tab-separated output, without header lines and without arrows. - .RE -@@ -3046,3 +3173,3 @@ pool/home/bob zoned off default - pool/home/bob snapdir hidden default --pool/home/bob aclmode groupmask default -+pool/home/bob acltype off default - pool/home/bob aclinherit restricted default -@@ -3070,2 +3197,3 @@ pool/home/bob dedup off default - pool/home/bob mlslabel none default -+pool/home/bob relatime off default - .fi -@@ -3421,3 +3549,2 @@ M F /tank/test/modified - .SH EXIT STATUS --.sp - .LP -@@ -3458,3 +3585,2 @@ Invalid command line options were specified. - .SH SEE ALSO --.sp - .LP -diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 -index b4b0f46..2cfa855 100644 ---- a/man/man8/zpool.8 -+++ b/man/man8/zpool.8 -@@ -64,3 +64,3 @@ zpool \- configures ZFS storage pools - .nf --\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ... -+\fBzpool get\fR [\fB-p\fR] "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ... - .fi -@@ -86,3 +86,3 @@ zpool \- configures ZFS storage pools - \fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] -- [\fB-D\fR] [\fB-f\fR] [\fB-m\fR] [\fB-R\fR \fIroot\fR] [\fB-F\fR [\fB-n\fR]] \fIpool\fR |\fIid\fR [\fInewpool\fR] -+ [\fB-D\fR] [\fB-f\fR] [\fB-m\fR] [\fB-R\fR \fIroot\fR] [\fB-F\fR [\fB-n\fR]] [\fB-t\fR]] \fIpool\fR |\fIid\fR [\fInewpool\fR] - .fi -@@ -91,3 +91,3 @@ zpool \- configures ZFS storage pools - .nf --\fBzpool iostat\fR [\fB-T\fR u | d ] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]] -+\fBzpool iostat\fR [\fB-T\fR d | u ] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]] - .fi -@@ -101,3 +101,4 @@ zpool \- configures ZFS storage pools - .nf --\fBzpool list\fR [\fB-T\fR u | d ] [\fB-Hv\fR] [\fB-o\fR \fIproperty\fR[,...]] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]] -+\fBzpool list\fR [\fB-T\fR d | u ] [\fB-Hv\fR] [\fB-o\fR \fIproperty\fR[,...]] [\fIpool\fR] ... -+ [\fIinterval\fR[\fIcount\fR]] - .fi -@@ -151,3 +152,3 @@ zpool \- configures ZFS storage pools - .nf --\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ... -+\fBzpool status\fR [\fB-xvD\fR] [\fB-T\fR d | u] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]] - .fi -@@ -682,2 +683,13 @@ A text string consisting of printable ASCII characters that will be stored such - .na -+\fB\fBdedupditto\fR=\fB\fInumber\fR\fR -+.ad -+.sp .6 -+.RS 4n -+Threshold for the number of block ditto copies. If the reference count for a deduplicated block increases above this number, a new ditto copy of this block is automatically stored. The default setting is 0 which causes no ditto copies to be created for deduplicated blocks. The miniumum legal nonzero setting is 100. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na - \fB\fBdelegation\fR=\fBon\fR | \fBoff\fR\fR -@@ -1003,3 +1015,3 @@ Forces any active datasets contained within the pool to be unmounted. - .RS 4n --Detaches \fIdevice\fR from a mirror. 
The operation is refused if there are no other valid replicas of the data. -+Detaches \fIdevice\fR from a mirror. The operation is refused if there are no other valid replicas of the data. If \fIdevice\fR may be re-added to the pool later on then consider the "\fBzpool offline\fR" command instead. - .RE -@@ -1038,3 +1050,3 @@ This command will forcefully export the pool even if it has a shared spare that - .na --\fB\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...\fR -+\fB\fBzpool get\fR [\fB-p\fR] "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...\fR - .ad -@@ -1055,2 +1067,13 @@ Retrieves the given list of properties (or all properties if "\fBall\fR" is used - See the "Properties" section for more information on the available pool properties. -+.sp -+.ne 2 -+.mk -+.na -+\fB\fB-p\fR\fR -+.ad -+.RS 6n -+.rt -+Display numbers in parseable (exact) values. -+.RE -+ - .RE -@@ -1279,3 +1302,3 @@ Used with the \fB-F\fR recovery option. Determines whether a non-importable pool - .na --\fB\fBzpool import\fR [\fB-o\fR \fImntopts\fR] [ \fB-o\fR \fIproperty\fR=\fIvalue\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] [\fB-D\fR] [\fB-f\fR] [\fB-m\fR] [\fB-R\fR \fIroot\fR] [\fB-F\fR [\fB-n\fR]] \fIpool\fR | \fIid\fR [\fInewpool\fR]\fR -+\fB\fBzpool import\fR [\fB-o\fR \fImntopts\fR] [ \fB-o\fR \fIproperty\fR=\fIvalue\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] [\fB-D\fR] [\fB-f\fR] [\fB-m\fR] [\fB-R\fR \fIroot\fR] [\fB-F\fR [\fB-n\fR]] [\fB-t\fR]] \fIpool\fR | \fIid\fR [\fInewpool\fR]\fR - .ad -@@ -1389,2 +1412,13 @@ Used with the \fB-F\fR recovery option. Determines whether a non-importable pool - .na -+\fB\fB-t\fR\fR -+.ad -+.sp .6 -+.RS 4n -+Used with "\fBnewpool\fR". Specifies that "\fBnewpool\fR" is temporary. Temporary pool names last until export. Ensures that the original pool name will be used in all label updates and therefore is retained upon export. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na - \fB\fB-m\fR\fR -@@ -1402,3 +1436,3 @@ Allows a pool to import when there is a missing log device. - .na --\fB\fBzpool iostat\fR [\fB-T\fR \fBu\fR | \fBd\fR] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR -+\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR - .ad -@@ -1459,3 +1493,3 @@ Treat exported or foreign devices as inactive. - .na --\fB\fBzpool list\fR [\fB-T\fR \fBu\fR | \fBd\fR] [\fB-Hv\fR] [\fB-o\fR \fIprops\fR[,...]] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR -+\fB\fBzpool list\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-Hv\fR] [\fB-o\fR \fIprops\fR[,...]] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR - .ad -@@ -1478,3 +1512,3 @@ Scripted mode. Do not display headers, and separate fields by a single tab inste - .na --\fB\fB-T\fR \fBu\fR | \fBd\fR\fR -+\fB\fB-T\fR \fBd\fR | \fBu\fR\fR - .ad -@@ -1703,3 +1737,3 @@ Sets the specified property for \fInewpool\fR. See the “Properties” section - .na --\fB\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...\fR -+\fBzpool status\fR [\fB-xvD\fR] [\fB-T\fR d | u] [\fIpool\fR] ... 
[\fIinterval\fR [\fIcount\fR]] - .ad -@@ -1716,3 +1750,3 @@ If a scrub or resilver is in progress, this command reports the percentage done - .ad --.RS 6n -+.RS 12n - .rt -@@ -1727,3 +1761,3 @@ Only display status for pools that are exhibiting errors or are otherwise unavai - .ad --.RS 6n -+.RS 12n - .rt -@@ -1732,2 +1766,25 @@ Displays verbose data error information, printing out a complete list of all dat - -+.sp -+.ne 2 -+.mk -+.na -+\fB\fB-D\fR\fR -+.ad -+.RS 12n -+.rt -+Display a histogram of deduplication statistics, showing the allocated (physically present on disk) and -+referenced (logically referenced in the pool) block counts and sizes by reference count. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na -+\fB\fB-T\fR \fBd\fR | \fBu\fR\fR -+.ad -+.RS 12n -+.rt -+Display a time stamp. -+.sp -+Specify \fBu\fR for a printed representation of the internal representation of time. See \fBtime\fR(2). Specify \fBd\fR for standard date format. See \fBdate\fR(1). - .RE -diff --git a/module/avl/avl.c b/module/avl/avl.c -index e000647..b598cc9 100644 ---- a/module/avl/avl.c -+++ b/module/avl/avl.c -@@ -1042,2 +1042,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - -diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c -index 1758371..a91b952 100644 ---- a/module/nvpair/fnvpair.c -+++ b/module/nvpair/fnvpair.c -@@ -28,2 +28,3 @@ - #include -+#include - #ifndef _KERNEL -@@ -116,2 +117,14 @@ fnvlist_merge(nvlist_t *dst, nvlist_t *src) - -+size_t -+fnvlist_num_pairs(nvlist_t *nvl) -+{ -+ size_t count = 0; -+ nvpair_t *pair; -+ -+ for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL; -+ pair = nvlist_next_nvpair(nvl, pair)) -+ count++; -+ return (count); -+} -+ - void -@@ -503,2 +516,3 @@ EXPORT_SYMBOL(fnvlist_size); - EXPORT_SYMBOL(fnvlist_pack); -+EXPORT_SYMBOL(fnvlist_pack_free); - EXPORT_SYMBOL(fnvlist_unpack); -@@ -564,2 +578,3 @@ EXPORT_SYMBOL(fnvpair_value_string); - EXPORT_SYMBOL(fnvpair_value_nvlist); -+EXPORT_SYMBOL(fnvlist_num_pairs); - -diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c -index 36f4e4d..f5c3166 100644 ---- a/module/nvpair/nvpair.c -+++ b/module/nvpair/nvpair.c -@@ -264,11 +264,5 @@ nvlist_nvflag(nvlist_t *nvl) - --/* -- * nvlist_alloc - Allocate nvlist. -- */ --/*ARGSUSED1*/ --int --nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) -+static nv_alloc_t * -+nvlist_nv_alloc(int kmflag) - { -- nv_alloc_t *nva = nv_alloc_nosleep; -- - #if defined(_KERNEL) && !defined(_BOOT) -@@ -276,16 +270,20 @@ nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) - case KM_SLEEP: -- nva = nv_alloc_sleep; -- break; -+ return (nv_alloc_sleep); - case KM_PUSHPAGE: -- nva = nv_alloc_pushpage; -- break; -- case KM_NOSLEEP: -- nva = nv_alloc_nosleep; -- break; -+ return (nv_alloc_pushpage); - default: -- return (EINVAL); -+ return (nv_alloc_nosleep); - } --#endif -+#else -+ return (nv_alloc_nosleep); -+#endif /* _KERNEL && !_BOOT */ -+} - -- return (nvlist_xalloc(nvlp, nvflag, nva)); -+/* -+ * nvlist_alloc - Allocate nvlist. -+ */ -+int -+nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) -+{ -+ return (nvlist_xalloc(nvlp, nvflag, nvlist_nv_alloc(kmflag))); - } -@@ -616,3 +614,2 @@ nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp) - */ --/*ARGSUSED1*/ - int -@@ -620,8 +617,3 @@ nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag) - { --#if defined(_KERNEL) && !defined(_BOOT) -- return (nvlist_xdup(nvl, nvlp, -- (kmflag == KM_SLEEP ? 
nv_alloc_sleep : nv_alloc_nosleep))); --#else -- return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep)); --#endif -+ return (nvlist_xdup(nvl, nvlp, nvlist_nv_alloc(kmflag))); - } -@@ -1626,3 +1618,3 @@ nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep, - const char *np; -- char *sepp=NULL; -+ char *sepp = NULL; - char *idxp, *idxep; -@@ -2354,3 +2346,2 @@ nvlist_size(nvlist_t *nvl, size_t *size, int encoding) - */ --/*ARGSUSED1*/ - int -@@ -2359,8 +2350,4 @@ nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, - { --#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xpack(nvl, bufp, buflen, encoding, -- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); --#else -- return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep)); --#endif -+ nvlist_nv_alloc(kmflag))); - } -@@ -2417,3 +2404,2 @@ nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, - */ --/*ARGSUSED1*/ - int -@@ -2421,8 +2407,3 @@ nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) - { --#if defined(_KERNEL) && !defined(_BOOT) -- return (nvlist_xunpack(buf, buflen, nvlp, -- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); --#else -- return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep)); --#endif -+ return (nvlist_xunpack(buf, buflen, nvlp, nvlist_nv_alloc(kmflag))); - } -@@ -2601,3 +2582,4 @@ nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) - */ -- bzero(&packed->nvl_priv, sizeof (packed->nvl_priv)); -+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv), -+ sizeof (uint64_t)); - } -@@ -2629,3 +2611,4 @@ nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) - */ -- bzero(&packed->nvl_priv, sizeof (packed->nvl_priv)); -+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv), -+ sizeof (uint64_t)); - } -@@ -3322,2 +3305,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - -diff --git a/module/nvpair/nvpair_alloc_spl.c b/module/nvpair/nvpair_alloc_spl.c -index be6e8f0..a75b4a6 100644 ---- a/module/nvpair/nvpair_alloc_spl.c -+++ b/module/nvpair/nvpair_alloc_spl.c -@@ -54,7 +54,7 @@ nv_free_spl(nv_alloc_t *nva, void *buf, size_t size) - const nv_alloc_ops_t spl_sleep_ops_def = { -- NULL, /* nv_ao_init() */ -- NULL, /* nv_ao_fini() */ -- nv_alloc_sleep_spl, /* nv_ao_alloc() */ -- nv_free_spl, /* nv_ao_free() */ -- NULL /* nv_ao_reset() */ -+ NULL, /* nv_ao_init() */ -+ NULL, /* nv_ao_fini() */ -+ nv_alloc_sleep_spl, /* nv_ao_alloc() */ -+ nv_free_spl, /* nv_ao_free() */ -+ NULL /* nv_ao_reset() */ - }; -@@ -62,7 +62,7 @@ const nv_alloc_ops_t spl_sleep_ops_def = { - const nv_alloc_ops_t spl_pushpage_ops_def = { -- NULL, /* nv_ao_init() */ -- NULL, /* nv_ao_fini() */ -- nv_alloc_pushpage_spl, /* nv_ao_alloc() */ -- nv_free_spl, /* nv_ao_free() */ -- NULL /* nv_ao_reset() */ -+ NULL, /* nv_ao_init() */ -+ NULL, /* nv_ao_fini() */ -+ nv_alloc_pushpage_spl, /* nv_ao_alloc() */ -+ nv_free_spl, /* nv_ao_free() */ -+ NULL /* nv_ao_reset() */ - }; -@@ -70,7 +70,7 @@ const nv_alloc_ops_t spl_pushpage_ops_def = { - const nv_alloc_ops_t spl_nosleep_ops_def = { -- NULL, /* nv_ao_init() */ -- NULL, /* nv_ao_fini() */ -- nv_alloc_nosleep_spl, /* nv_ao_alloc() */ -- nv_free_spl, /* nv_ao_free() */ -- NULL /* nv_ao_reset() */ -+ NULL, /* nv_ao_init() */ -+ NULL, /* nv_ao_fini() */ -+ nv_alloc_nosleep_spl, /* nv_ao_alloc() */ -+ nv_free_spl, /* nv_ao_free() */ -+ NULL /* nv_ao_reset() */ - }; -diff --git a/module/unicode/u8_textprep.c 
b/module/unicode/u8_textprep.c -index df6dcf5..60e586d 100644 ---- a/module/unicode/u8_textprep.c -+++ b/module/unicode/u8_textprep.c -@@ -148,3 +148,3 @@ - (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \ -- (((uint32_t)(b2) & 0x3F) << 6) | \ -+ (((uint32_t)(b2) & 0x3F) << 6) | \ - ((uint32_t)(b3) & 0x3F)); -@@ -2145,2 +2145,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - -diff --git a/module/zcommon/zfs_comutil.c b/module/zcommon/zfs_comutil.c -index ccf169b..6d0314f 100644 ---- a/module/zcommon/zfs_comutil.c -+++ b/module/zcommon/zfs_comutil.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -159,3 +160,7 @@ zfs_spa_version_map(int zpl_version) - --const char *zfs_history_event_names[LOG_END] = { -+/* -+ * This is the table of legacy internal event names; it should not be modified. -+ * The internal events are now stored in the history log as strings. -+ */ -+const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = { - "invalid event", -diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c -index b27e4f3..dd456b5 100644 ---- a/module/zcommon/zfs_prop.c -+++ b/module/zcommon/zfs_prop.c -@@ -22,3 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - */ -@@ -114,2 +115,10 @@ zfs_prop_init(void) - -+ static zprop_index_t acltype_table[] = { -+ { "off", ZFS_ACLTYPE_OFF }, -+ { "disabled", ZFS_ACLTYPE_OFF }, -+ { "noacl", ZFS_ACLTYPE_OFF }, -+ { "posixacl", ZFS_ACLTYPE_POSIXACL }, -+ { NULL } -+ }; -+ - static zprop_index_t acl_inherit_table[] = { -@@ -228,2 +237,5 @@ zfs_prop_init(void) - "hidden | visible", "SNAPDEV", snapdev_table); -+ zprop_register_index(ZFS_PROP_ACLTYPE, "acltype", ZFS_ACLTYPE_OFF, -+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, -+ "noacl | posixacl", "ACLTYPE", acltype_table); - zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", -@@ -253,2 +265,4 @@ zfs_prop_init(void) - ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); -+ zprop_register_index(ZFS_PROP_RELATIME, "relatime", 0, PROP_INHERIT, -+ ZFS_TYPE_FILESYSTEM, "on | off", "RELATIME", boolean_table); - zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, -@@ -322,2 +336,14 @@ zfs_prop_init(void) - "", "MLSLABEL"); -+ zprop_register_string(ZFS_PROP_SELINUX_CONTEXT, "context", -+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", -+ "CONTEXT"); -+ zprop_register_string(ZFS_PROP_SELINUX_FSCONTEXT, "fscontext", -+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", -+ "FSCONTEXT"); -+ zprop_register_string(ZFS_PROP_SELINUX_DEFCONTEXT, "defcontext", -+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", -+ "DEFCONTEXT"); -+ zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext", -+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", -+ "ROOTCONTEXT"); - -@@ -355,2 +381,6 @@ zfs_prop_init(void) - ZFS_TYPE_DATASET, "", "WRITTEN"); -+ zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, -+ PROP_READONLY, ZFS_TYPE_DATASET, "", "LUSED"); -+ zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", -+ 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); - -@@ -396,2 +426,4 @@ zfs_prop_init(void) - PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); -+ 
zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", -+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); - -@@ -644,2 +676,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - -diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c -index 9904645..90376f2 100644 ---- a/module/zcommon/zfs_uio.c -+++ b/module/zcommon/zfs_uio.c -@@ -72,15 +72,12 @@ uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio) - case UIO_USERISPACE: -- /* p = kernel data pointer -- * iov->iov_base = user data pointer */ -- -+ /* -+ * p = kernel data pointer -+ * iov->iov_base = user data pointer -+ */ - if (rw == UIO_READ) { - if (copy_to_user(iov->iov_base, p, cnt)) -- return EFAULT; -- /* error = xcopyout_nta(p, iov->iov_base, cnt, -- * (uio->uio_extflg & UIO_COPY_CACHED)); */ -+ return (EFAULT); - } else { -- /* error = xcopyin_nta(iov->iov_base, p, cnt, -- * (uio->uio_extflg & UIO_COPY_CACHED)); */ - if (copy_from_user(p, iov->iov_base, cnt)) -- return EFAULT; -+ return (EFAULT); - } -@@ -105,3 +102,3 @@ EXPORT_SYMBOL(uiomove); - --#define fuword8(uptr, vptr) get_user((*vptr), (uptr)) -+#define fuword8(uptr, vptr) get_user((*vptr), (uptr)) - -@@ -196,17 +193,14 @@ uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) - case UIO_USERISPACE: -- /* p = kernel data pointer -- * iov->iov_base = user data pointer */ -- -+ /* -+ * p = kernel data pointer -+ * iov->iov_base = user data pointer -+ */ - if (rw == UIO_READ) { -- /* * UIO_READ = copy data from kernel to user * */ -+ /* UIO_READ = copy data from kernel to user */ - if (copy_to_user(iov->iov_base, p, cnt)) -- return EFAULT; -- /* error = xcopyout_nta(p, iov->iov_base, cnt, -- * (uio->uio_extflg & UIO_COPY_CACHED)); */ -+ return (EFAULT); - } else { -- /* * UIO_WRITE = copy data from user to kernel * */ -- /* error = xcopyin_nta(iov->iov_base, p, cnt, -- * (uio->uio_extflg & UIO_COPY_CACHED)); */ -+ /* UIO_WRITE = copy data from user to kernel */ - if (copy_from_user(p, iov->iov_base, cnt)) -- return EFAULT; -+ return (EFAULT); - } -diff --git a/module/zcommon/zprop_common.c b/module/zcommon/zprop_common.c -index 0a0af23..6d9f89a 100644 ---- a/module/zcommon/zprop_common.c -+++ b/module/zcommon/zprop_common.c -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2012 by Delphix. All rights reserved. 
-+ */ - -@@ -131,3 +134,4 @@ zprop_register_hidden(int prop, const char *name, zprop_type_t type, - zprop_register_impl(prop, name, type, 0, NULL, attr, -- objset_types, NULL, colname, B_FALSE, B_FALSE, NULL); -+ objset_types, NULL, colname, -+ type == PROP_TYPE_NUMBER, B_FALSE, NULL); - } -diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in -index 81b1680..5552436 100644 ---- a/module/zfs/Makefile.in -+++ b/module/zfs/Makefile.in -@@ -10,2 +10,3 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o - $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o -+$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o - $(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o -@@ -46,2 +47,3 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o - $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o -+$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o - $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o -@@ -95 +97,3 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/zrlock.o - $(MODULE)-objs += @top_srcdir@/module/zfs/zvol.o -+$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_destroy.o -+$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_userhold.o -diff --git a/module/zfs/arc.c b/module/zfs/arc.c -index ce4a023..00d2659 100644 ---- a/module/zfs/arc.c -+++ b/module/zfs/arc.c -@@ -22,5 +22,5 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -60,3 +60,3 @@ - * 3. The Megiddo and Modha model assumes a fixed page size. All -- * elements of the cache are therefor exactly the same size. So -+ * elements of the cache are therefore exactly the same size. So - * when adjusting the cache size following a cache miss, its simply -@@ -64,3 +64,3 @@ - * have variable sized cache blocks (rangeing from 512 bytes to -- * 128K bytes). We therefor choose a set of blocks to evict to make -+ * 128K bytes). We therefore choose a set of blocks to evict to make - * space for a cache miss that approximates as closely as possible -@@ -79,3 +79,3 @@ - * uses method 1, while the internal arc algorithms for -- * adjusting the cache use method 2. We therefor provide two -+ * adjusting the cache use method 2. We therefore provide two - * types of locks: 1) the hash table lock array, and 2) the -@@ -136,2 +136,3 @@ - #include -+#include - #ifdef _KERNEL -@@ -147,2 +148,7 @@ - -+#ifndef _KERNEL -+/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ -+boolean_t arc_watch = B_FALSE; -+#endif -+ - static kmutex_t arc_reclaim_thr_lock; -@@ -159,2 +165,8 @@ typedef enum arc_reclaim_strategy { - -+/* -+ * The number of iterations through arc_evict_*() before we -+ * drop & reacquire the lock. -+ */ -+int arc_evict_iterations = 100; -+ - /* number of seconds before growing cache again */ -@@ -162,4 +174,7 @@ int zfs_arc_grow_retry = 5; - --/* shift of arc_c for calculating both min and max arc_p */ --int zfs_arc_p_min_shift = 4; -+/* disable anon data aggressively growing arc_p */ -+int zfs_arc_p_aggressive_disable = 1; -+ -+/* disable arc_p adapt dampener in arc_adapt */ -+int zfs_arc_p_dampener_disable = 1; - -@@ -180,2 +195,7 @@ int zfs_disable_dup_eviction = 0; - -+/* -+ * If this percent of memory is free, don't throttle. 
-+ */ -+int arc_lotsfree_percent = 10; -+ - static int arc_dead; -@@ -234,2 +254,3 @@ typedef struct arc_state { - kmutex_t arcs_mtx; -+ arc_state_type_t arcs_state; - } arc_state_t; -@@ -261,3 +282,14 @@ typedef struct arc_stats { - kstat_named_t arcstat_recycle_miss; -+ /* -+ * Number of buffers that could not be evicted because the hash lock -+ * was held by another thread. The lock may not necessarily be held -+ * by something using the same buffer, since hash locks are shared -+ * by multiple buffers. -+ */ - kstat_named_t arcstat_mutex_miss; -+ /* -+ * Number of buffers skipped because they have I/O in progress, are -+ * indrect prefetch buffers that have not lived long enough, or are -+ * not from the spa we're trying to evict from. -+ */ - kstat_named_t arcstat_evict_skip; -@@ -278,2 +310,3 @@ typedef struct arc_stats { - kstat_named_t arcstat_data_size; -+ kstat_named_t arcstat_meta_size; - kstat_named_t arcstat_other_size; -@@ -365,2 +398,3 @@ static arc_stats_t arc_stats = { - { "data_size", KSTAT_DATA_UINT64 }, -+ { "meta_size", KSTAT_DATA_UINT64 }, - { "other_size", KSTAT_DATA_UINT64 }, -@@ -421,3 +455,3 @@ static arc_stats_t arc_stats = { - #define ARCSTAT_INCR(stat, val) \ -- atomic_add_64(&arc_stats.stat.value.ui64, (val)); -+ atomic_add_64(&arc_stats.stat.value.ui64, (val)) - -@@ -480,5 +514,5 @@ static arc_state_t *arc_l2c_only; - #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) --#define arc_meta_used ARCSTAT(arcstat_meta_used) --#define arc_meta_limit ARCSTAT(arcstat_meta_limit) --#define arc_meta_max ARCSTAT(arcstat_meta_max) -+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -+#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ -+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ - -@@ -504,2 +538,3 @@ struct arc_write_callback { - arc_done_func_t *awcb_ready; -+ arc_done_func_t *awcb_physdone; - arc_done_func_t *awcb_done; -@@ -536,2 +571,7 @@ struct arc_buf_hdr { - clock_t b_arc_access; -+ uint32_t b_mru_hits; -+ uint32_t b_mru_ghost_hits; -+ uint32_t b_mfu_hits; -+ uint32_t b_mfu_ghost_hits; -+ uint32_t b_l2_hits; - -@@ -554,2 +594,3 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type); -+static void arc_buf_watch(arc_buf_t *buf); - -@@ -650,5 +691,3 @@ uint64_t zfs_crc64_table[256]; - --/* -- * L2ARC Performance Tunables -- */ -+/* L2ARC Performance Tunables */ - unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -@@ -711,3 +750,4 @@ struct l2arc_buf_hdr { - /* real alloc'd buffer size depending on b_compress applied */ -- int b_asize; -+ uint32_t b_hits; -+ uint64_t b_asize; - /* temporary buffer holder for in-flight compressed data */ -@@ -866,2 +906,3 @@ static kmem_cache_t *hdr_cache; - static kmem_cache_t *buf_cache; -+static kmem_cache_t *l2arc_hdr_cache; - -@@ -873,4 +914,6 @@ buf_fini(void) - #if defined(_KERNEL) && defined(HAVE_SPL) -- /* Large allocations which do not require contiguous pages -- * should be using vmem_free() in the linux kernel */ -+ /* -+ * Large allocations which do not require contiguous pages -+ * should be using vmem_free() in the linux kernel\ -+ */ - vmem_free(buf_hash_table.ht_table, -@@ -885,2 +928,3 @@ buf_fini(void) - kmem_cache_destroy(buf_cache); -+ kmem_cache_destroy(l2arc_hdr_cache); - } -@@ -965,4 +1009,6 @@ retry: - #if defined(_KERNEL) && defined(HAVE_SPL) -- /* Large allocations which do not require contiguous pages -- * should be using vmem_alloc() in the linux 
kernel */ -+ /* -+ * Large allocations which do not require contiguous pages -+ * should be using vmem_alloc() in the linux kernel -+ */ - buf_hash_table.ht_table = -@@ -983,2 +1029,4 @@ retry: - 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); -+ l2arc_hdr_cache = kmem_cache_create("l2arc_buf_hdr_t", L2HDR_SIZE, -+ 0, NULL, NULL, NULL, NULL, NULL, 0); - -@@ -1042,3 +1090,3 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), -- KM_PUSHPAGE); -+ KM_PUSHPAGE); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, -@@ -1046,2 +1094,33 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) - mutex_exit(&buf->b_hdr->b_freeze_lock); -+ arc_buf_watch(buf); -+} -+ -+#ifndef _KERNEL -+void -+arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) -+{ -+ panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr); -+} -+#endif -+ -+/* ARGSUSED */ -+static void -+arc_buf_unwatch(arc_buf_t *buf) -+{ -+#ifndef _KERNEL -+ if (arc_watch) { -+ ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, -+ PROT_READ | PROT_WRITE)); -+ } -+#endif -+} -+ -+/* ARGSUSED */ -+static void -+arc_buf_watch(arc_buf_t *buf) -+{ -+#ifndef _KERNEL -+ if (arc_watch) -+ ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ)); -+#endif - } -@@ -1066,2 +1145,4 @@ arc_buf_thaw(arc_buf_t *buf) - mutex_exit(&buf->b_hdr->b_freeze_lock); -+ -+ arc_buf_unwatch(buf); - } -@@ -1083,2 +1164,3 @@ arc_buf_freeze(arc_buf_t *buf) - mutex_exit(hash_lock); -+ - } -@@ -1140,2 +1222,50 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) - /* -+ * Returns detailed information about a specific arc buffer. When the -+ * state_index argument is set the function will calculate the arc header -+ * list position for its arc state. Since this requires a linear traversal -+ * callers are strongly encourage not to do this. However, it can be helpful -+ * for targeted analysis so the functionality is provided. -+ */ -+void -+arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) -+{ -+ arc_buf_hdr_t *hdr = ab->b_hdr; -+ arc_state_t *state = hdr->b_state; -+ -+ memset(abi, 0, sizeof (arc_buf_info_t)); -+ abi->abi_flags = hdr->b_flags; -+ abi->abi_datacnt = hdr->b_datacnt; -+ abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; -+ abi->abi_state_contents = hdr->b_type; -+ abi->abi_state_index = -1; -+ abi->abi_size = hdr->b_size; -+ abi->abi_access = hdr->b_arc_access; -+ abi->abi_mru_hits = hdr->b_mru_hits; -+ abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; -+ abi->abi_mfu_hits = hdr->b_mfu_hits; -+ abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; -+ abi->abi_holds = refcount_count(&hdr->b_refcnt); -+ -+ if (hdr->b_l2hdr) { -+ abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; -+ abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; -+ abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; -+ abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; -+ } -+ -+ if (state && state_index && list_link_active(&hdr->b_arc_node)) { -+ list_t *list = &state->arcs_list[hdr->b_type]; -+ arc_buf_hdr_t *h; -+ -+ mutex_enter(&state->arcs_mtx); -+ for (h = list_head(list); h != NULL; h = list_next(list, h)) { -+ abi->abi_state_index++; -+ if (h == hdr) -+ break; -+ } -+ mutex_exit(&state->arcs_mtx); -+ } -+} -+ -+/* - * Move the supplied buffer to the indicated state. 
The mutex -@@ -1151,3 +1281,3 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) - ASSERT(MUTEX_HELD(hash_lock)); -- ASSERT(new_state != old_state); -+ ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || ab->b_datacnt > 0); -@@ -1241,2 +1371,5 @@ arc_space_consume(uint64_t space, arc_space_type_t type) - break; -+ case ARC_SPACE_META: -+ ARCSTAT_INCR(arcstat_meta_size, space); -+ break; - case ARC_SPACE_OTHER: -@@ -1252,3 +1385,5 @@ arc_space_consume(uint64_t space, arc_space_type_t type) - -- atomic_add_64(&arc_meta_used, space); -+ if (type != ARC_SPACE_DATA) -+ ARCSTAT_INCR(arcstat_meta_used, space); -+ - atomic_add_64(&arc_size, space); -@@ -1267,2 +1402,5 @@ arc_space_return(uint64_t space, arc_space_type_t type) - break; -+ case ARC_SPACE_META: -+ ARCSTAT_INCR(arcstat_meta_size, -space); -+ break; - case ARC_SPACE_OTHER: -@@ -1278,6 +1416,9 @@ arc_space_return(uint64_t space, arc_space_type_t type) - -- ASSERT(arc_meta_used >= space); -- if (arc_meta_max < arc_meta_used) -- arc_meta_max = arc_meta_used; -- atomic_add_64(&arc_meta_used, -space); -+ if (type != ARC_SPACE_DATA) { -+ ASSERT(arc_meta_used >= space); -+ if (arc_meta_max < arc_meta_used) -+ arc_meta_max = arc_meta_used; -+ ARCSTAT_INCR(arcstat_meta_used, -space); -+ } -+ - ASSERT(arc_size >= space); -@@ -1300,2 +1441,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) - hdr->b_arc_access = 0; -+ hdr->b_mru_hits = 0; -+ hdr->b_mru_ghost_hits = 0; -+ hdr->b_mfu_hits = 0; -+ hdr->b_mfu_ghost_hits = 0; -+ hdr->b_l2_hits = 0; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); -@@ -1437,5 +1583,6 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) - static void --arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), -- void *data, size_t size) -+arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) - { -+ arc_buf_hdr_t *hdr = buf->b_hdr; -+ - if (HDR_L2_WRITING(hdr)) { -@@ -1443,4 +1590,4 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE); -- df->l2df_data = data; -- df->l2df_size = size; -+ df->l2df_data = buf->b_data; -+ df->l2df_size = hdr->b_size; - df->l2df_func = free_func; -@@ -1451,3 +1598,3 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), - } else { -- free_func(data, size); -+ free_func(buf->b_data, hdr->b_size); - } -@@ -1467,2 +1614,3 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) - arc_cksum_verify(buf); -+ arc_buf_unwatch(buf); - -@@ -1470,11 +1618,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) - if (type == ARC_BUFC_METADATA) { -- arc_buf_data_free(buf->b_hdr, zio_buf_free, -- buf->b_data, size); -- arc_space_return(size, ARC_SPACE_DATA); -+ arc_buf_data_free(buf, zio_buf_free); -+ arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); -- arc_buf_data_free(buf->b_hdr, -- zio_data_buf_free, buf->b_data, size); -- ARCSTAT_INCR(arcstat_data_size, -size); -- atomic_add_64(&arc_size, -size); -+ arc_buf_data_free(buf, zio_data_buf_free); -+ arc_space_return(size, ARC_SPACE_DATA); - } -@@ -1554,3 +1699,3 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); -- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); -+ kmem_cache_free(l2arc_hdr_cache, l2hdr); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -1645,3 +1790,3 @@ arc_buf_free(arc_buf_t *buf, void *tag) - --int -+boolean_t - arc_buf_remove_ref(arc_buf_t *buf, 
void* tag) -@@ -1650,3 +1795,3 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) - kmutex_t *hash_lock = NULL; -- int no_callback = (buf->b_efunc == NULL); -+ boolean_t no_callback = (buf->b_efunc == NULL); - -@@ -1752,2 +1897,4 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - void *stolen = NULL; -+ arc_buf_hdr_t marker = {{{ 0 }}}; -+ int count = 0; - -@@ -1757,2 +1904,3 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - -+top: - mutex_enter(&state->arcs_mtx); -@@ -1775,2 +1923,29 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - continue; -+ -+ /* ignore markers */ -+ if (ab->b_spa == 0) -+ continue; -+ -+ /* -+ * It may take a long time to evict all the bufs requested. -+ * To avoid blocking all arc activity, periodically drop -+ * the arcs_mtx and give other threads a chance to run -+ * before reacquiring the lock. -+ * -+ * If we are looking for a buffer to recycle, we are in -+ * the hot code path, so don't sleep. -+ */ -+ if (!recycle && count++ > arc_evict_iterations) { -+ list_insert_after(list, ab, &marker); -+ mutex_exit(&evicted_state->arcs_mtx); -+ mutex_exit(&state->arcs_mtx); -+ kpreempt(KPREEMPT_SYNC); -+ mutex_enter(&state->arcs_mtx); -+ mutex_enter(&evicted_state->arcs_mtx); -+ ab_prev = list_prev(list, &marker); -+ list_remove(list, &marker); -+ count = 0; -+ continue; -+ } -+ - hash_lock = HDR_LOCK(ab); -@@ -1845,2 +2020,11 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - -+ if (list == &state->arcs_list[ARC_BUFC_DATA] && -+ (bytes < 0 || bytes_evicted < bytes)) { -+ /* Prevent second pass from recycling metadata into data */ -+ recycle = FALSE; -+ type = ARC_BUFC_METADATA; -+ list = &state->arcs_list[type]; -+ goto top; -+ } -+ - if (bytes_evicted < bytes) -@@ -1856,23 +2040,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - /* -- * We have just evicted some date into the ghost state, make -- * sure we also adjust the ghost state size if necessary. -+ * Note: we have just evicted some data into the ghost state, -+ * potentially putting the ghost size over the desired size. Rather -+ * that evicting from the ghost list in this hot code path, leave -+ * this chore to the arc_reclaim_thread(). - */ -- if (arc_no_grow && -- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { -- int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + -- arc_mru_ghost->arcs_size - arc_c; -- -- if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { -- int64_t todelete = -- MIN(arc_mru_ghost->arcs_lsize[type], mru_over); -- arc_evict_ghost(arc_mru_ghost, 0, todelete, -- ARC_BUFC_DATA); -- } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { -- int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], -- arc_mru_ghost->arcs_size + -- arc_mfu_ghost->arcs_size - arc_c); -- arc_evict_ghost(arc_mfu_ghost, 0, todelete, -- ARC_BUFC_DATA); -- } -- } - -@@ -1895,5 +2063,6 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - uint64_t bufs_skipped = 0; -+ int count = 0; - - ASSERT(GHOST_STATE(state)); -- bzero(&marker, sizeof(marker)); -+ bzero(&marker, sizeof (marker)); - top: -@@ -1902,2 +2071,4 @@ top: - ab_prev = list_prev(list, ab); -+ if (ab->b_type > ARC_BUFC_NUMTYPES) -+ panic("invalid ab=%p", (void *)ab); - if (spa && ab->b_spa != spa) -@@ -1913,2 +2084,19 @@ top: - continue; -+ -+ /* -+ * It may take a long time to evict all the bufs requested. 
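
The marker scheme above (a zeroed header with b_spa == 0 linked in next to the current element, so the thread can drop arcs_mtx, call kpreempt(), and later find its place again) is a general way to walk a long mutex-protected list without monopolizing the lock. A single-threaded, stand-alone sketch of the same idea using a hypothetical singly linked node type rather than the kernel list_t API:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define	WALK_ITERATIONS	4	/* stands in for arc_evict_iterations */

struct node {
	struct node *next;
	int is_marker;		/* stands in for the b_spa == 0 check */
	int value;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
walk_with_marker(struct node *head)
{
	struct node marker = { NULL, 1, 0 };
	struct node *prev, *cur;
	int count = 0;

	pthread_mutex_lock(&list_lock);
	prev = head;
	cur = head->next;
	while (cur != NULL) {
		if (cur->is_marker) {		/* ignore other walkers' markers */
			prev = cur;
			cur = cur->next;
			continue;
		}
		if (count++ > WALK_ITERATIONS) {
			/* Park a marker in front of cur, then let others run. */
			marker.next = cur;
			prev->next = &marker;
			pthread_mutex_unlock(&list_lock);
			sched_yield();		/* kpreempt(KPREEMPT_SYNC) above */
			pthread_mutex_lock(&list_lock);
			/* Unlink the marker and resume from whatever follows it. */
			for (prev = head; prev->next != &marker; prev = prev->next)
				;
			prev->next = marker.next;
			cur = marker.next;
			count = 0;
			continue;
		}
		printf("visited %d\n", cur->value);
		prev = cur;
		cur = cur->next;
	}
	pthread_mutex_unlock(&list_lock);
}

int
main(void)
{
	struct node nodes[10], head = { NULL, 1, 0 };
	int i;

	for (i = 9; i >= 0; i--) {
		nodes[i].next = head.next;
		nodes[i].is_marker = 0;
		nodes[i].value = i;
		head.next = &nodes[i];
	}
	walk_with_marker(&head);
	return (0);
}
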
-+ * To avoid blocking all arc activity, periodically drop -+ * the arcs_mtx and give other threads a chance to run -+ * before reacquiring the lock. -+ */ -+ if (count++ > arc_evict_iterations) { -+ list_insert_after(list, ab, &marker); -+ mutex_exit(&state->arcs_mtx); -+ kpreempt(KPREEMPT_SYNC); -+ mutex_enter(&state->arcs_mtx); -+ ab_prev = list_prev(list, &marker); -+ list_remove(list, &marker); -+ count = 0; -+ continue; -+ } - if (mutex_tryenter(hash_lock)) { -@@ -1948,4 +2136,5 @@ top: - list_remove(list, &marker); -- } else -+ } else { - bufs_skipped += 1; -+ } - } -@@ -1979,15 +2168,7 @@ arc_adjust(void) - adjustment = MIN((int64_t)(arc_size - arc_c), -- (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - -- arc_p)); -+ (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); - -- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { -- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); -+ if (adjustment > 0 && arc_mru->arcs_size > 0) { -+ delta = MIN(arc_mru->arcs_size, adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); -- adjustment -= delta; -- } -- -- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { -- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); -- (void) arc_evict(arc_mru, 0, delta, FALSE, -- ARC_BUFC_METADATA); - } -@@ -2000,13 +2181,5 @@ arc_adjust(void) - -- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { -- delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); -+ if (adjustment > 0 && arc_mfu->arcs_size > 0) { -+ delta = MIN(arc_mfu->arcs_size, adjustment); - (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); -- adjustment -= delta; -- } -- -- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { -- int64_t delta = MIN(adjustment, -- arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); -- (void) arc_evict(arc_mfu, 0, delta, FALSE, -- ARC_BUFC_METADATA); - } -@@ -2103,20 +2276,57 @@ arc_do_user_evicts(void) - */ --void --arc_adjust_meta(int64_t adjustment, boolean_t may_prune) -+static void -+arc_adjust_meta(void) - { -- int64_t delta; -+ int64_t adjustmnt, delta; -+ -+ /* -+ * This slightly differs than the way we evict from the mru in -+ * arc_adjust because we don't have a "target" value (i.e. no -+ * "meta" arc_p). As a result, I think we can completely -+ * cannibalize the metadata in the MRU before we evict the -+ * metadata from the MFU. I think we probably need to implement a -+ * "metadata arc_p" value to do this properly. -+ */ -+ adjustmnt = arc_meta_used - arc_meta_limit; - -- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { -- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); -+ if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { -+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); - arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); -- adjustment -= delta; -+ adjustmnt -= delta; - } - -- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { -- delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment); -+ /* -+ * We can't afford to recalculate adjustmnt here. If we do, -+ * new metadata buffers can sneak into the MRU or ANON lists, -+ * thus penalize the MFU metadata. Although the fudge factor is -+ * small, it has been empirically shown to be significant for -+ * certain workloads (e.g. creating many empty directories). 
As -+ * such, we use the original calculation for adjustmnt, and -+ * simply decrement the amount of data evicted from the MRU. -+ */ -+ -+ if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { -+ delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); - arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); -- adjustment -= delta; - } - -- if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit)) -+ adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] + -+ arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit; -+ -+ if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) { -+ delta = MIN(adjustmnt, -+ arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]); -+ arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA); -+ } -+ -+ adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] + -+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit; -+ -+ if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) { -+ delta = MIN(adjustmnt, -+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]); -+ arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA); -+ } -+ -+ if (arc_meta_used > arc_meta_limit) - arc_do_user_prune(zfs_arc_meta_prune); -@@ -2179,3 +2389,9 @@ arc_shrink(uint64_t bytes) - -- atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift)); -+ to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift; -+ -+ if (arc_p > to_free) -+ atomic_add_64(&arc_p, -to_free); -+ else -+ arc_p = 0; -+ - if (arc_c > arc_size) -@@ -2234,3 +2450,2 @@ arc_adapt_thread(void) - callb_cpr_t cpr; -- int64_t prune; - -@@ -2258,3 +2473,4 @@ arc_adapt_thread(void) - /* reset the growth delay for every reclaim */ -- arc_grow_time = ddi_get_lbolt()+(zfs_arc_grow_retry * hz); -+ arc_grow_time = ddi_get_lbolt() + -+ (zfs_arc_grow_retry * hz); - -@@ -2269,10 +2485,3 @@ arc_adapt_thread(void) - -- /* -- * Keep meta data usage within limits, arc_shrink() is not -- * used to avoid collapsing the arc_c value when only the -- * arc_meta_limit is being exceeded. 
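
The metadata adjustment above boils down to: compute the overage against arc_meta_limit once, trim MRU metadata first, then take only the remaining (deliberately not recomputed) amount from MFU metadata, with separate passes for the ghost lists afterwards. A stand-alone rendering of the first two passes with made-up sizes (function name and values are hypothetical):

#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static void
adjust_meta(int64_t meta_used, int64_t meta_limit,
    int64_t mru_meta_lsize, int64_t mfu_meta_lsize)
{
	int64_t adjustmnt = meta_used - meta_limit;	/* overage, computed once */
	int64_t delta;

	if (adjustmnt > 0 && mru_meta_lsize > 0) {
		delta = MIN(mru_meta_lsize, adjustmnt);
		printf("evict %lld bytes of MRU metadata\n", (long long)delta);
		adjustmnt -= delta;	/* decremented, not recomputed */
	}
	if (adjustmnt > 0 && mfu_meta_lsize > 0) {
		delta = MIN(mfu_meta_lsize, adjustmnt);
		printf("evict %lld bytes of MFU metadata\n", (long long)delta);
	}
}

int
main(void)
{
	/* 100 MiB over the limit; 60 MiB of it is evictable from the MRU. */
	adjust_meta(900LL << 20, 800LL << 20, 60LL << 20, 300LL << 20);
	return (0);
}
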
-- */ -- prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit; -- if (prune > 0) -- arc_adjust_meta(prune, B_TRUE); -+ arc_adjust_meta(); - -@@ -2411,4 +2620,6 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) - arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan)); -+ pages = btop(arc_evictable_memory()); - } else { - arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan)); -+ pages = -1; - } -@@ -2432,3 +2643,3 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) - -- return (-1); -+ return (pages); - } -@@ -2448,3 +2659,2 @@ arc_adapt(int bytes, arc_state_t *state) - int mult; -- uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift); - -@@ -2465,5 +2675,7 @@ arc_adapt(int bytes, arc_state_t *state) - 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); -- mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - -- arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); -+ if (!zfs_arc_p_dampener_disable) -+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ -+ -+ arc_p = MIN(arc_c, arc_p + bytes * mult); - } else if (state == arc_mfu_ghost) { -@@ -2473,6 +2685,8 @@ arc_adapt(int bytes, arc_state_t *state) - 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); -- mult = MIN(mult, 10); -+ -+ if (!zfs_arc_p_dampener_disable) -+ mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); -- arc_p = MAX(arc_p_min, arc_p - delta); -+ arc_p = MAX(0, arc_p - delta); - } -@@ -2547,2 +2761,4 @@ arc_get_data_buf(arc_buf_t *buf) - arc_buf_contents_t type = buf->b_hdr->b_type; -+ arc_buf_contents_t evict = ARC_BUFC_DATA; -+ boolean_t recycle = TRUE; - -@@ -2557,3 +2773,3 @@ arc_get_data_buf(arc_buf_t *buf) - buf->b_data = zio_buf_alloc(size); -- arc_space_consume(size, ARC_SPACE_DATA); -+ arc_space_consume(size, ARC_SPACE_META); - } else { -@@ -2561,4 +2777,3 @@ arc_get_data_buf(arc_buf_t *buf) - buf->b_data = zio_data_buf_alloc(size); -- ARCSTAT_INCR(arcstat_data_size, size); -- atomic_add_64(&arc_size, size); -+ arc_space_consume(size, ARC_SPACE_DATA); - } -@@ -2587,6 +2802,23 @@ arc_get_data_buf(arc_buf_t *buf) - -- if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { -+ /* -+ * Evict data buffers prior to metadata buffers, unless we're -+ * over the metadata limit and adding a metadata buffer. -+ */ -+ if (type == ARC_BUFC_METADATA) { -+ if (arc_meta_used >= arc_meta_limit) -+ evict = ARC_BUFC_METADATA; -+ else -+ /* -+ * In this case, we're evicting data while -+ * adding metadata. Thus, to prevent recycling a -+ * data buffer into a metadata buffer, recycling -+ * is disabled in the following arc_evict call. -+ */ -+ recycle = FALSE; -+ } -+ -+ if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); -- arc_space_consume(size, ARC_SPACE_DATA); -+ arc_space_consume(size, ARC_SPACE_META); - -@@ -2598,4 +2830,6 @@ arc_get_data_buf(arc_buf_t *buf) - * thread to avoid deadlocking on the hash_lock. -+ * Of course, only do this when recycle is true. 
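
The recycle/evict selection above reduces to a small decision table: a data buffer always displaces data and may recycle the evicted buffer; a metadata buffer displaces metadata only once arc_meta_used has reached arc_meta_limit, and when it has to displace data instead, recycling is switched off so a data buffer is never recycled into a metadata buffer. A hypothetical stand-alone condensation of that choice:

#include <stdint.h>
#include <stdio.h>

typedef enum { BUFC_DATA, BUFC_METADATA } bufc_type_t;

static void
choose_eviction(bufc_type_t type, uint64_t meta_used, uint64_t meta_limit,
    bufc_type_t *evict, int *recycle)
{
	*evict = BUFC_DATA;	/* default: displace data, recycle the buffer */
	*recycle = 1;

	if (type == BUFC_METADATA) {
		if (meta_used >= meta_limit)
			*evict = BUFC_METADATA;	/* over the limit: shed metadata */
		else
			*recycle = 0;	/* evicting data for metadata: no recycling */
	}
}

int
main(void)
{
	bufc_type_t evict;
	int recycle;

	choose_eviction(BUFC_METADATA, 100, 200, &evict, &recycle);
	printf("evict=%s recycle=%d\n",
	    evict == BUFC_DATA ? "data" : "metadata", recycle);
	return (0);
}
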
- */ -- cv_signal(&arc_reclaim_thr_cv); -+ if (recycle) -+ cv_signal(&arc_reclaim_thr_cv); - } else { -@@ -2603,7 +2837,8 @@ arc_get_data_buf(arc_buf_t *buf) - buf->b_data = zio_data_buf_alloc(size); -- ARCSTAT_INCR(arcstat_data_size, size); -- atomic_add_64(&arc_size, size); -+ arc_space_consume(size, ARC_SPACE_DATA); - } - -- ARCSTAT_BUMP(arcstat_recycle_miss); -+ /* Only bump this if we tried to recycle and failed */ -+ if (recycle) -+ ARCSTAT_BUMP(arcstat_recycle_miss); - } -@@ -2627,3 +2862,4 @@ out: - */ -- if (arc_size < arc_c && hdr->b_state == arc_anon && -+ if (!zfs_arc_p_aggressive_disable && -+ arc_size < arc_c && hdr->b_state == arc_anon && - arc_anon->arcs_size + arc_mru->arcs_size > arc_p) -@@ -2672,2 +2908,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - buf->b_flags &= ~ARC_PREFETCH; -+ atomic_inc_32(&buf->b_mru_hits); - ARCSTAT_BUMP(arcstat_mru_hits); -@@ -2693,2 +2930,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - } -+ atomic_inc_32(&buf->b_mru_hits); - ARCSTAT_BUMP(arcstat_mru_hits); -@@ -2715,2 +2953,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - -+ atomic_inc_32(&buf->b_mru_ghost_hits); - ARCSTAT_BUMP(arcstat_mru_ghost_hits); -@@ -2730,2 +2969,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - } -+ atomic_inc_32(&buf->b_mfu_hits); - ARCSTAT_BUMP(arcstat_mfu_hits); -@@ -2753,2 +2993,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - -+ atomic_inc_32(&buf->b_mfu_ghost_hits); - ARCSTAT_BUMP(arcstat_mfu_ghost_hits); -@@ -2774,3 +3015,3 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); -- VERIFY(arc_buf_remove_ref(buf, arg) == 1); -+ VERIFY(arc_buf_remove_ref(buf, arg)); - } -@@ -2783,3 +3024,3 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) - if (zio && zio->io_error) { -- VERIFY(arc_buf_remove_ref(buf, arg) == 1); -+ VERIFY(arc_buf_remove_ref(buf, arg)); - *bufp = NULL; -@@ -2836,2 +3077,3 @@ arc_read_done(zio_t *zio) - arc_cksum_compute(buf, B_FALSE); -+ arc_buf_watch(buf); - -@@ -2937,3 +3179,3 @@ int - arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, -- void *private, int priority, int zio_flags, uint32_t *arc_flags, -+ void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, - const zbookmark_t *zb) -@@ -2945,2 +3187,3 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, - uint64_t guid = spa_load_guid(spa); -+ int rc = 0; - -@@ -2978,6 +3221,6 @@ top: - mutex_exit(hash_lock); -- return (0); -+ goto out; - } - mutex_exit(hash_lock); -- return (0); -+ goto out; - } -@@ -3025,4 +3268,6 @@ top: - vdev_t *vd = NULL; -- uint64_t addr = -1; -+ uint64_t addr = 0; - boolean_t devw = B_FALSE; -+ enum zio_compress b_compress = ZIO_COMPRESS_OFF; -+ uint64_t b_asize = 0; - -@@ -3096,3 +3341,3 @@ top: - -- if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && -+ if (hdr->b_l2hdr != NULL && - (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { -@@ -3100,2 +3345,4 @@ top: - addr = hdr->b_l2hdr->b_daddr; -+ b_compress = hdr->b_l2hdr->b_compress; -+ b_asize = hdr->b_l2hdr->b_asize; - /* -@@ -3110,2 +3357,6 @@ top: - -+ /* -+ * At this point, we have a level 1 cache miss. Try again in -+ * L2ARC if possible. 
-+ */ - ASSERT3U(hdr->b_size, ==, size); -@@ -3135,2 +3386,3 @@ top: - ARCSTAT_BUMP(arcstat_l2_hits); -+ atomic_inc_32(&hdr->b_l2hdr->b_hits); - -@@ -3143,3 +3395,7 @@ top: - cb->l2rcb_flags = zio_flags; -- cb->l2rcb_compress = hdr->b_l2hdr->b_compress; -+ cb->l2rcb_compress = b_compress; -+ -+ ASSERT(addr >= VDEV_LABEL_START_SIZE && -+ addr + size < vd->vdev_psize - -+ VDEV_LABEL_END_SIZE); - -@@ -3151,4 +3407,3 @@ top: - */ -- if (hdr->b_l2hdr->b_compress == -- ZIO_COMPRESS_EMPTY) { -+ if (b_compress == ZIO_COMPRESS_EMPTY) { - rzio = zio_null(pio, spa, vd, -@@ -3161,4 +3416,4 @@ top: - rzio = zio_read_phys(pio, vd, addr, -- hdr->b_l2hdr->b_asize, -- buf->b_data, ZIO_CHECKSUM_OFF, -+ b_asize, buf->b_data, -+ ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, -@@ -3171,4 +3426,3 @@ top: - zio_t *, rzio); -- ARCSTAT_INCR(arcstat_l2_read_bytes, -- hdr->b_l2hdr->b_asize); -+ ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); - -@@ -3176,3 +3430,3 @@ top: - zio_nowait(rzio); -- return (0); -+ goto out; - } -@@ -3181,3 +3435,3 @@ top: - if (zio_wait(rzio) == 0) -- return (0); -+ goto out; - -@@ -3205,4 +3459,6 @@ top: - -- if (*arc_flags & ARC_WAIT) -- return (zio_wait(rzio)); -+ if (*arc_flags & ARC_WAIT) { -+ rc = zio_wait(rzio); -+ goto out; -+ } - -@@ -3211,3 +3467,6 @@ top: - } -- return (0); -+ -+out: -+ spa_read_history_add(spa, zb, *arc_flags); -+ return (rc); - } -@@ -3219,3 +3478,3 @@ arc_add_prune_callback(arc_prune_func_t *func, void *private) - -- p = kmem_alloc(sizeof(*p), KM_SLEEP); -+ p = kmem_alloc(sizeof (*p), KM_SLEEP); - p->p_pfunc = func; -@@ -3371,4 +3630,4 @@ arc_buf_evict(arc_buf_t *buf) - /* -- * Release this buffer from the cache. This must be done -- * after a read and prior to modifying the buffer contents. -+ * Release this buffer from the cache, making it an anonymous buffer. This -+ * must be done after a read and prior to modifying the buffer contents. - * If the buffer has more than one reference, we must make -@@ -3410,4 +3669,5 @@ arc_release(arc_buf_t *buf, void *tag) - hdr->b_l2hdr = NULL; -- buf_size = hdr->b_size; -+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - } -+ buf_size = hdr->b_size; - -@@ -3455,2 +3715,3 @@ arc_release(arc_buf_t *buf, void *tag) - arc_cksum_verify(buf); -+ arc_buf_unwatch(buf); - -@@ -3465,2 +3726,7 @@ arc_release(arc_buf_t *buf, void *tag) - nhdr->b_arc_access = 0; -+ nhdr->b_mru_hits = 0; -+ nhdr->b_mru_ghost_hits = 0; -+ nhdr->b_mfu_hits = 0; -+ nhdr->b_mfu_ghost_hits = 0; -+ nhdr->b_l2_hits = 0; - nhdr->b_flags = flags & ARC_L2_WRITING; -@@ -3481,2 +3747,7 @@ arc_release(arc_buf_t *buf, void *tag) - hdr->b_arc_access = 0; -+ hdr->b_mru_hits = 0; -+ hdr->b_mru_ghost_hits = 0; -+ hdr->b_mfu_hits = 0; -+ hdr->b_mfu_ghost_hits = 0; -+ hdr->b_l2_hits = 0; - if (hash_lock) -@@ -3492,4 +3763,3 @@ arc_release(arc_buf_t *buf, void *tag) - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); -- list_remove(l2hdr->b_dev->l2ad_buflist, hdr); -- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); -+ kmem_cache_free(l2arc_hdr_cache, l2hdr); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -3563,2 +3833,14 @@ arc_write_ready(zio_t *zio) - -+/* -+ * The SPA calls this callback for each physical write that happens on behalf -+ * of a logical write. See the comment in dbuf_write_physdone() for details. 
-+ */ -+static void -+arc_write_physdone(zio_t *zio) -+{ -+ arc_write_callback_t *cb = zio->io_private; -+ if (cb->awcb_physdone != NULL) -+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); -+} -+ - static void -@@ -3611,2 +3893,8 @@ arc_write_done(zio_t *zio) - ASSERT3P(exists, ==, NULL); -+ } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { -+ /* nopwrite */ -+ ASSERT(zio->io_prop.zp_nopwrite); -+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) -+ panic("bad nopwrite, hdr=%p exists=%p", -+ (void *)hdr, (void *)exists); - } else { -@@ -3637,4 +3925,5 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, -- const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, -- void *private, int priority, int zio_flags, const zbookmark_t *zb) -+ const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, -+ arc_done_func_t *done, void *private, zio_priority_t priority, -+ int zio_flags, const zbookmark_t *zb) - { -@@ -3655,2 +3944,3 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - callback->awcb_ready = ready; -+ callback->awcb_physdone = physdone; - callback->awcb_done = done; -@@ -3660,3 +3950,4 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, -- arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); -+ arc_write_ready, arc_write_physdone, arc_write_done, callback, -+ priority, zio_flags, zb); - -@@ -3666,7 +3957,5 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - static int --arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) -+arc_memory_throttle(uint64_t reserve, uint64_t txg) - { - #ifdef _KERNEL -- uint64_t available_memory; -- - if (zfs_arc_memory_throttle_disable) -@@ -3674,15 +3963,6 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) - -- /* Easily reclaimable memory (free + inactive + arc-evictable) */ -- available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory(); -- -- if (available_memory <= zfs_write_limit_max) { -+ if (freemem <= physmem * arc_lotsfree_percent / 100) { - ARCSTAT_INCR(arcstat_memory_throttle_count, 1); - DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); -- return (EAGAIN); -- } -- -- if (inflight_data > available_memory / 4) { -- ARCSTAT_INCR(arcstat_memory_throttle_count, 1); -- DMU_TX_STAT_BUMP(dmu_tx_memory_inflight); -- return (ERESTART); -+ return (SET_ERROR(EAGAIN)); - } -@@ -3705,11 +3985,2 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) - --#ifdef ZFS_DEBUG -- /* -- * Once in a while, fail for no reason. Everything should cope. -- */ -- if (spa_get_random(10000) == 0) { -- dprintf("forcing random failure\n"); -- return (ERESTART); -- } --#endif - if (reserve > arc_c/4 && !arc_no_grow) -@@ -3718,3 +3989,3 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) - DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - } -@@ -3730,6 +4001,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) - * Writes will, almost always, require additional memory allocations -- * in order to compress/encrypt/etc the data. We therefor need to -+ * in order to compress/encrypt/etc the data. We therefore need to - * make sure that there is sufficient available memory for this. 
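
With the rewrite above, arc_memory_throttle() no longer weighs inflight data at all; it simply returns EAGAIN whenever free memory falls to or below arc_lotsfree_percent of physical memory (10 by default, per the variable added at the top of this arc.c hunk). A stand-alone sketch of that single test with hypothetical page counts:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static int
memory_throttle(uint64_t freemem, uint64_t physmem, uint64_t lotsfree_percent)
{
	/* freemem and physmem are in pages, as with the kernel globals. */
	if (freemem <= physmem * lotsfree_percent / 100)
		return (EAGAIN);	/* caller backs off and retries */
	return (0);
}

int
main(void)
{
	printf("throttle=%d\n", memory_throttle(50000, 1048576, 10));
	printf("throttle=%d\n", memory_throttle(500000, 1048576, 10));
	return (0);
}
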
- */ -- if ((error = arc_memory_throttle(reserve, anon_size, txg))) -+ error = arc_memory_throttle(reserve, txg); -+ if (error != 0) - return (error); -@@ -3753,3 +4025,3 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) - DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); -- return (ERESTART); -+ return (SET_ERROR(ERESTART)); - } -@@ -3774,3 +4046,3 @@ arc_kstat_update(kstat_t *ksp, int rw) - if (rw == KSTAT_WRITE) { -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } else { -@@ -3828,6 +4100,6 @@ arc_init(void) - -- /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ -- arc_c_min = MAX(arc_c / 4, 64<<20); -+ /* set min cache to zero */ -+ arc_c_min = 4<<20; - /* set max to 1/2 of all memory */ -- arc_c_max = MAX(arc_c * 4, arc_c_max); -+ arc_c_max = arc_c * 4; - -@@ -3839,3 +4111,3 @@ arc_init(void) - arc_c_max = zfs_arc_max; -- if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) -+ if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max) - arc_c_min = zfs_arc_min; -@@ -3845,4 +4117,4 @@ arc_init(void) - -- /* limit meta-data to 1/4 of the arc capacity */ -- arc_meta_limit = arc_c_max / 4; -+ /* limit meta-data to 3/4 of the arc capacity */ -+ arc_meta_limit = (3 * arc_c_max) / 4; - arc_meta_max = 0; -@@ -3853,5 +4125,2 @@ arc_init(void) - -- if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) -- arc_c_min = arc_meta_limit / 2; -- - /* if kmem_flags are set, lets try to use less memory */ -@@ -3898,2 +4167,9 @@ arc_init(void) - -+ arc_anon->arcs_state = ARC_STATE_ANON; -+ arc_mru->arcs_state = ARC_STATE_MRU; -+ arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; -+ arc_mfu->arcs_state = ARC_STATE_MFU; -+ arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; -+ arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; -+ - buf_init(); -@@ -3923,7 +4199,20 @@ arc_init(void) - -- if (zfs_write_limit_max == 0) -- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; -- else -- zfs_write_limit_shift = 0; -- mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); -+ /* -+ * Calculate maximum amount of dirty data per pool. -+ * -+ * If it has been set by a module parameter, take that. -+ * Otherwise, use a percentage of physical memory defined by -+ * zfs_dirty_data_max_percent (default 10%) with a cap at -+ * zfs_dirty_data_max_max (default 25% of physical memory). -+ */ -+ if (zfs_dirty_data_max_max == 0) -+ zfs_dirty_data_max_max = physmem * PAGESIZE * -+ zfs_dirty_data_max_max_percent / 100; -+ -+ if (zfs_dirty_data_max == 0) { -+ zfs_dirty_data_max = physmem * PAGESIZE * -+ zfs_dirty_data_max_percent / 100; -+ zfs_dirty_data_max = MIN(zfs_dirty_data_max, -+ zfs_dirty_data_max_max); -+ } - } -@@ -3985,4 +4274,2 @@ arc_fini(void) - -- mutex_destroy(&zfs_write_limit_lock); -- - buf_fini(); -@@ -4335,2 +4622,9 @@ l2arc_write_done(zio_t *zio) - ab_prev = list_prev(buflist, ab); -+ abl2 = ab->b_l2hdr; -+ -+ /* -+ * Release the temporary compressed buffer as soon as possible. -+ */ -+ if (abl2->b_compress != ZIO_COMPRESS_OFF) -+ l2arc_release_cdata_buf(ab); - -@@ -4347,10 +4641,2 @@ l2arc_write_done(zio_t *zio) - -- abl2 = ab->b_l2hdr; -- -- /* -- * Release the temporary compressed buffer as soon as possible. 
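
The dirty-data sizing described in the comment above is plain arithmetic: the per-pool limit defaults to zfs_dirty_data_max_percent (10%) of physical memory, clamped to zfs_dirty_data_max_max, which itself defaults to 25% of physical memory. A stand-alone calculation for a hypothetical 4 GiB machine with 4 KiB pages:

#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t pagesize = 4096;
	uint64_t physmem = (4ULL << 30) / pagesize;	/* pages on a 4 GiB box */
	uint64_t dirty_data_max_percent = 10;		/* default per the comment */
	uint64_t dirty_data_max_max_percent = 25;	/* default cap per the comment */
	uint64_t dirty_data_max_max, dirty_data_max;

	dirty_data_max_max = physmem * pagesize * dirty_data_max_max_percent / 100;
	dirty_data_max = physmem * pagesize * dirty_data_max_percent / 100;
	dirty_data_max = MIN(dirty_data_max, dirty_data_max_max);

	printf("zfs_dirty_data_max     = %llu bytes\n",	/* ~410 MiB */
	    (unsigned long long)dirty_data_max);
	printf("zfs_dirty_data_max_max = %llu bytes\n",	/* 1 GiB exactly */
	    (unsigned long long)dirty_data_max_max);
	return (0);
}
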
-- */ -- if (abl2->b_compress != ZIO_COMPRESS_OFF) -- l2arc_release_cdata_buf(ab); -- - if (zio->io_error != 0) { -@@ -4362,3 +4648,3 @@ l2arc_write_done(zio_t *zio) - ab->b_l2hdr = NULL; -- kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); -+ kmem_cache_free(l2arc_hdr_cache, abl2); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -4439,3 +4725,3 @@ l2arc_read_done(zio_t *zio) - } else { -- zio->io_error = EIO; -+ zio->io_error = SET_ERROR(EIO); - } -@@ -4617,3 +4903,3 @@ top: - ab->b_l2hdr = NULL; -- kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); -+ kmem_cache_free(l2arc_hdr_cache, abl2); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -4753,3 +5039,3 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - cb = kmem_alloc(sizeof (l2arc_write_callback_t), -- KM_PUSHPAGE); -+ KM_PUSHPAGE); - cb->l2wcb_dev = dev; -@@ -4763,5 +5049,5 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - */ -- l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), -- KM_PUSHPAGE); -+ l2hdr = kmem_cache_alloc(l2arc_hdr_cache, KM_PUSHPAGE); - l2hdr->b_dev = dev; -+ l2hdr->b_daddr = 0; - arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -4781,2 +5067,3 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - l2hdr->b_tmp_cdata = ab->b_buf->b_data; -+ l2hdr->b_hits = 0; - -@@ -5017,3 +5304,3 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) - hdr->b_size) != 0) -- zio->io_error = EIO; -+ zio->io_error = SET_ERROR(EIO); - zio_data_buf_free(cdata, csize); -@@ -5313,2 +5600,3 @@ EXPORT_SYMBOL(arc_read); - EXPORT_SYMBOL(arc_buf_remove_ref); -+EXPORT_SYMBOL(arc_buf_info); - EXPORT_SYMBOL(arc_getbuf_func); -@@ -5332,2 +5620,8 @@ MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); - -+module_param(zfs_arc_p_aggressive_disable, int, 0644); -+MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow"); -+ -+module_param(zfs_arc_p_dampener_disable, int, 0644); -+MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener"); -+ - module_param(zfs_arc_shrink_shift, int, 0644); -@@ -5335,5 +5629,2 @@ MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); - --module_param(zfs_arc_p_min_shift, int, 0644); --MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); -- - module_param(zfs_disable_dup_eviction, int, 0644); -diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c -index d196351..c3927e7 100644 ---- a/module/zfs/bplist.c -+++ b/module/zfs/bplist.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -54,2 +55,8 @@ bplist_append(bplist_t *bpl, const blkptr_t *bp) - -+/* -+ * To aid debugging, we keep the most recently removed entry. This way if -+ * we are in the callback, we can easily locate the entry. -+ */ -+static bplist_entry_t *bplist_iterate_last_removed; -+ - void -@@ -61,2 +68,3 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) - while ((bpe = list_head(&bpl->bpl_list))) { -+ bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); -diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c -index 1920da4..5787a6f 100644 ---- a/module/zfs/bpobj.c -+++ b/module/zfs/bpobj.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -368,2 +368,3 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) - uint64_t used, comp, uncomp, subsubobjs; -+ ASSERTV(dmu_object_info_t doi); - -@@ -394,2 +395,5 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) - -+ ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); -+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); -+ - mutex_enter(&bpo->bpo_lock); -@@ -416,2 +420,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) - 0, FTAG, &subdb, 0)); -+ /* -+ * Make sure that we are not asking dmu_write() -+ * to write more data than we have in our buffer. -+ */ -+ VERIFY3U(subdb->db_size, >=, -+ numsubsub * sizeof (subobj)); - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, -diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c -index 73922db..c03cb1f 100644 ---- a/module/zfs/bptree.c -+++ b/module/zfs/bptree.c -@@ -22,3 +22,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -45,3 +45,3 @@ - * -- * Note that while bt_begin and bt_end are only ever incremented in this code -+ * Note that while bt_begin and bt_end are only ever incremented in this code, - * they are effectively reset to 0 every time the entire bptree is freed because -@@ -182,2 +182,3 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, - bptree_entry_phys_t bte; -+ int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; - -@@ -190,9 +191,9 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, - -+ if (zfs_recover) -+ flags |= TRAVERSE_HARD; - err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, -- bte.be_birth_txg, &bte.be_zb, -- TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST, -+ bte.be_birth_txg, &bte.be_zb, flags, - bptree_visit_cb, &ba); - if (free) { -- ASSERT(err == 0 || err == ERESTART); -- if (err != 0) { -+ if (err == ERESTART) { - /* save bookmark for future resume */ -@@ -204,7 +205,17 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, - break; -- } else { -- ba.ba_phys->bt_begin++; -- (void) dmu_free_range(os, obj, -- i * sizeof (bte), sizeof (bte), tx); - } -+ if (err != 0) { -+ /* -+ * We can not properly handle an i/o -+ * error, because the traversal code -+ * does not know how to resume from an -+ * arbitrary bookmark. -+ */ -+ zfs_panic_recover("error %u from " -+ "traverse_dataset_destroyed()", err); -+ } -+ -+ ba.ba_phys->bt_begin++; -+ (void) dmu_free_range(os, obj, -+ i * sizeof (bte), sizeof (bte), tx); - } -diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c -index faa6cc3..c8a5261 100644 ---- a/module/zfs/dbuf.c -+++ b/module/zfs/dbuf.c -@@ -23,3 +23,3 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -30,2 +30,3 @@ - #include -+#include - #include -@@ -65,4 +66,10 @@ static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); - -+/* -+ * Number of times that zfs_free_range() took the slow path while doing -+ * a zfs receive. A nonzero value indicates a potential performance problem. 
-+ */ -+uint64_t zfs_free_range_recv_miss; -+ - static void dbuf_destroy(dmu_buf_impl_t *db); --static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -+static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); - static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); -@@ -300,4 +307,6 @@ retry: - #if defined(_KERNEL) && defined(HAVE_SPL) -- /* Large allocations which do not require contiguous pages -- * should be using vmem_alloc() in the linux kernel */ -+ /* -+ * Large allocations which do not require contiguous pages -+ * should be using vmem_alloc() in the linux kernel -+ */ - h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE); -@@ -319,2 +328,4 @@ retry: - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); -+ -+ dbuf_stats_init(h); - } -@@ -327,2 +338,4 @@ dbuf_fini(void) - -+ dbuf_stats_destroy(); -+ - for (i = 0; i < DBUF_MUTEXES; i++) -@@ -330,4 +343,6 @@ dbuf_fini(void) - #if defined(_KERNEL) && defined(HAVE_SPL) -- /* Large allocations which do not require contiguous pages -- * should be using vmem_free() in the linux kernel */ -+ /* -+ * Large allocations which do not require contiguous pages -+ * should be using vmem_free() in the linux kernel -+ */ - vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); -@@ -548,3 +563,3 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) - ASSERT3P(db->db_buf, ==, NULL); -- VERIFY(arc_buf_remove_ref(buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(buf, db)); - db->db_state = DB_UNCACHED; -@@ -649,3 +664,3 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) - if (db->db_state == DB_NOFILL) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -689,2 +704,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) - } else { -+ /* -+ * Another reader came in while the dbuf was in flight -+ * between UNCACHED and CACHED. Either a writer will finish -+ * writing the buffer (sending the dbuf to CACHED) or the -+ * first reader's request will reach the read_done callback -+ * and send the dbuf to CACHED. Otherwise, a failure -+ * occurred and the dbuf went to UNCACHED. -+ */ - mutex_exit(&db->db_mtx); -@@ -697,2 +720,3 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) - -+ /* Skip the wait per the caller's request. */ - mutex_enter(&db->db_mtx); -@@ -706,3 +730,3 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) - if (db->db_state == DB_UNCACHED) -- err = EIO; -+ err = SET_ERROR(EIO); - } -@@ -812,3 +836,3 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) - /* free this block */ -- if (!BP_IS_HOLE(bp)) { -+ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) { - spa_t *spa; -@@ -819,2 +843,4 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; -+ dr->dt.dl.dr_nopwrite = B_FALSE; -+ - /* -@@ -833,5 +859,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) - * data blocks in the free range, so that any future readers will find -- * empty blocks. Also, if we happen accross any level-1 dbufs in the -+ * empty blocks. Also, if we happen across any level-1 dbufs in the - * range that have not already been marked dirty, mark them dirty so - * they stay in memory. -+ * -+ * This is a no-op if the dataset is in the middle of an incremental -+ * receive; see comment below for details. 
- */ -@@ -851,4 +880,20 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) - dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); -+ - mutex_enter(&dn->dn_dbufs_mtx); -- for (db = list_head(&dn->dn_dbufs); db; db = db_next) { -+ if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) { -+ /* There can't be any dbufs in this range; no need to search. */ -+ mutex_exit(&dn->dn_dbufs_mtx); -+ return; -+ } else if (dmu_objset_is_receiving(dn->dn_objset)) { -+ /* -+ * If we are receiving, we expect there to be no dbufs in -+ * the range to be freed, because receive modifies each -+ * block at most once, and in offset order. If this is -+ * not the case, it can lead to performance problems, -+ * so note that we unexpectedly took the slow path. -+ */ -+ atomic_inc_64(&zfs_free_range_recv_miss); -+ } -+ -+ for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); -@@ -877,6 +922,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) - /* found a level 0 buffer in the range */ -- if (dbuf_undirty(db, tx)) -+ mutex_enter(&db->db_mtx); -+ if (dbuf_undirty(db, tx)) { -+ /* mutex has been dropped and dbuf destroyed */ - continue; -+ } - -- mutex_enter(&db->db_mtx); - if (db->db_state == DB_UNCACHED || -@@ -1007,3 +1054,3 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) - dbuf_set_data(db, buf); -- VERIFY(arc_buf_remove_ref(obuf, db) == 1); -+ VERIFY(arc_buf_remove_ref(obuf, db)); - db->db.db_size = size; -@@ -1197,2 +1244,4 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - } -+ if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) -+ dr->dr_accounted = db->db.db_size; - dr->dr_dbuf = db; -@@ -1280,3 +1329,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - mutex_enter(&db->db_mtx); -- /* possible race with dbuf_undirty() */ -+ /* -+ * Since we've dropped the mutex, it's possible that -+ * dbuf_undirty() might have changed this out from under us. -+ */ - if (db->db_last_dirty == dr || -@@ -1308,3 +1360,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - --static int -+/* -+ * Undirty a buffer in the transaction group referenced by the given -+ * transaction. Return whether this evicted the dbuf. -+ */ -+static boolean_t - dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -@@ -1317,4 +1373,5 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - ASSERT(db->db_blkid != DMU_BONUS_BLKID); -+ ASSERT0(db->db_level); -+ ASSERT(MUTEX_HELD(&db->db_mtx)); - -- mutex_enter(&db->db_mtx); - /* -@@ -1325,6 +1382,4 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - break; -- if (dr == NULL || dr->dr_txg < txg) { -- mutex_exit(&db->db_mtx); -- return (0); -- } -+ if (dr == NULL || dr->dr_txg < txg) -+ return (B_FALSE); - ASSERT(dr->dr_txg == txg); -@@ -1336,20 +1391,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - /* -- * If this buffer is currently held, we cannot undirty -- * it, since one of the current holders may be in the -- * middle of an update. Note that users of dbuf_undirty() -- * should not place a hold on the dbuf before the call. -- * Also note: we can get here with a spill block, so -- * test for that similar to how dbuf_dirty does. -+ * Note: This code will probably work even if there are concurrent -+ * holders, but it is untested in that scenerio, as the ZPL and -+ * ztest have additional locking (the range locks) that prevents -+ * that type of concurrent access. 
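
The early checks added to dbuf_free_range() above hinge on a per-dnode watermark: dn_unlisted_l0_blkid (kept one past the highest level-0 blkid that ever received a dbuf; see the dbuf_create() hunk further below) lets a free that starts beyond every possible cached dbuf return immediately, while during a zfs receive the slow path is only counted in zfs_free_range_recv_miss. A hypothetical stand-alone condensation of that decision:

#include <stdint.h>
#include <stdio.h>

static uint64_t free_range_recv_miss;	/* models zfs_free_range_recv_miss */

/* "receiving" stands in for dmu_objset_is_receiving(). */
static int
free_range_can_skip(uint64_t start, uint64_t unlisted_l0_blkid,
    uint64_t datablksz, int receiving)
{
	if (start >= unlisted_l0_blkid * datablksz)
		return (1);	/* no dbufs can exist in this range */
	if (receiving)
		free_range_recv_miss++;	/* unexpected slow path during receive */
	return (0);
}

int
main(void)
{
	printf("skip=%d\n", free_range_can_skip(1 << 20, 4, 131072, 0));
	printf("skip=%d (misses=%llu)\n",
	    free_range_can_skip(0, 4, 131072, 1),
	    (unsigned long long)free_range_recv_miss);
	return (0);
}
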
- */ -- if (refcount_count(&db->db_holds) > db->db_dirtycnt) { -- mutex_exit(&db->db_mtx); -- /* Make sure we don't toss this buffer at sync phase */ -- if (db->db_blkid != DMU_SPILL_BLKID) { -- mutex_enter(&dn->dn_mtx); -- dnode_clear_range(dn, db->db_blkid, 1, tx); -- mutex_exit(&dn->dn_mtx); -- } -- DB_DNODE_EXIT(db); -- return (0); -- } -+ ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt); - -@@ -1359,3 +1402,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - -- /* XXX would be nice to fix up dn_towrite_space[] */ -+ /* -+ * Any space we accounted for in dp_dirty_* will be cleaned up by -+ * dsl_pool_sync(). This is relatively rare so the discrepancy -+ * is not a big deal. -+ */ - -@@ -1382,17 +1429,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - -- if (db->db_level == 0) { -- if (db->db_state != DB_NOFILL) { -- dbuf_unoverride(dr); -+ if (db->db_state != DB_NOFILL) { -+ dbuf_unoverride(dr); - -- ASSERT(db->db_buf != NULL); -- ASSERT(dr->dt.dl.dr_data != NULL); -- if (dr->dt.dl.dr_data != db->db_buf) -- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, -- db) == 1); -- } -- } else { - ASSERT(db->db_buf != NULL); -- ASSERT(list_head(&dr->dt.di.dr_children) == NULL); -- mutex_destroy(&dr->dt.di.dr_mtx); -- list_destroy(&dr->dt.di.dr_children); -+ ASSERT(dr->dt.dl.dr_data != NULL); -+ if (dr->dt.dl.dr_data != db->db_buf) -+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); - } -@@ -1408,9 +1447,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - dbuf_set_data(db, NULL); -- VERIFY(arc_buf_remove_ref(buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); -- return (1); -+ return (B_TRUE); - } - -- mutex_exit(&db->db_mtx); -- return (0); -+ return (B_FALSE); - } -@@ -1513,3 +1551,3 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) - bcopy(buf->b_data, db->db.db_data, db->db.db_size); -- VERIFY(arc_buf_remove_ref(buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(buf, db)); - xuio_stat_wbuf_copied(); -@@ -1531,6 +1569,6 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) - dr->dt.dl.dr_data = buf; -- VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(db->db_buf, db)); - } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { - arc_release(db->db_buf, db); -- VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(db->db_buf, db)); - } -@@ -1548,3 +1586,3 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) - * "Clear" the contents of this dbuf. This will mark the dbuf -- * EVICTING and clear *most* of its references. Unfortunetely, -+ * EVICTING and clear *most* of its references. Unfortunately, - * when we are not holding the dn_dbufs_mtx, we can't clear the -@@ -1661,3 +1699,3 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, - /* the buffer has no parent yet */ -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } else if (level < nlevels-1) { -@@ -1668,4 +1706,3 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, - fail_sparse, NULL, parentp); -- } -- else { -+ } else { - __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, -@@ -1745,3 +1782,3 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - int blocksize = -- db->db_level ? 1<dn_indblkshift : dn->dn_datablksz; -+ db->db_level ? 
1 << dn->dn_indblkshift : dn->dn_datablksz; - db->db.db_size = blocksize; -@@ -1766,2 +1803,5 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - list_insert_head(&dn->dn_dbufs, db); -+ if (db->db_level == 0 && db->db_blkid >= -+ dn->dn_unlisted_l0_blkid) -+ dn->dn_unlisted_l0_blkid = db->db_blkid + 1; - db->db_state = DB_UNCACHED; -@@ -1851,3 +1891,3 @@ dbuf_destroy(dmu_buf_impl_t *db) - void --dbuf_prefetch(dnode_t *dn, uint64_t blkid) -+dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) - { -@@ -1875,4 +1915,2 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) - if (bp && !BP_IS_HOLE(bp)) { -- int priority = dn->dn_type == DMU_OT_DDT_ZAP ? -- ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; -@@ -1885,3 +1923,3 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) - (void) arc_read(NULL, dn->dn_objset->os_spa, -- bp, NULL, NULL, priority, -+ bp, NULL, NULL, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, -@@ -1894,3 +1932,3 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) - --#define DBUF_HOLD_IMPL_MAX_DEPTH 20 -+#define DBUF_HOLD_IMPL_MAX_DEPTH 20 - -@@ -1923,4 +1961,5 @@ top: - if (dh->dh_fail_sparse) { -- if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) -- dh->dh_err = ENOENT; -+ if (dh->dh_err == 0 && -+ dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) -+ dh->dh_err = SET_ERROR(ENOENT); - if (dh->dh_err) { -@@ -2004,3 +2043,3 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, - -- dh = kmem_zalloc(sizeof(struct dbuf_hold_impl_data) * -+ dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE); -@@ -2010,3 +2049,3 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, - -- kmem_free(dh, sizeof(struct dbuf_hold_impl_data) * -+ kmem_free(dh, sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH); -@@ -2062,3 +2101,3 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) - if (db->db_blkid != DMU_SPILL_BLKID) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (blksz == 0) -@@ -2170,6 +2209,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) - dbuf_set_data(db, NULL); -- VERIFY(arc_buf_remove_ref(buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); - } else { -- VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); -+ VERIFY(!arc_buf_remove_ref(db->db_buf, db)); - -@@ -2274,2 +2313,9 @@ dmu_buf_freeable(dmu_buf_t *dbuf) - -+blkptr_t * -+dmu_buf_get_blkptr(dmu_buf_t *db) -+{ -+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; -+ return (dbi->db_blkptr); -+} -+ - static void -@@ -2319,3 +2365,4 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) - --/* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it -+/* -+ * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it - * is critical the we not allow the compiler to inline this function in to -@@ -2339,2 +2386,3 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) - -+ /* Read the block if it hasn't been read yet. */ - if (db->db_buf == NULL) { -@@ -2349,2 +2397,3 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) - dn = DB_DNODE(db); -+ /* Indirect block size must match what the dnode thinks it is. 
*/ - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); -@@ -2353,2 +2402,3 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) - -+ /* Provide the pending dirty record to child dbufs */ - db->db_data_pending = dr; -@@ -2366,3 +2416,4 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) - --/* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is -+/* -+ * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is - * critical the we not allow the compiler to inline this function in to -@@ -2611,2 +2662,34 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) - -+/* -+ * The SPA will call this callback several times for each zio - once -+ * for every physical child i/o (zio->io_phys_children times). This -+ * allows the DMU to monitor the progress of each logical i/o. For example, -+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z -+ * block. There may be a long delay before all copies/fragments are completed, -+ * so this callback allows us to retire dirty space gradually, as the physical -+ * i/os complete. -+ */ -+/* ARGSUSED */ -+static void -+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) -+{ -+ dmu_buf_impl_t *db = arg; -+ objset_t *os = db->db_objset; -+ dsl_pool_t *dp = dmu_objset_pool(os); -+ dbuf_dirty_record_t *dr; -+ int delta = 0; -+ -+ dr = db->db_data_pending; -+ ASSERT3U(dr->dr_txg, ==, zio->io_txg); -+ -+ /* -+ * The callback will be called io_phys_children times. Retire one -+ * portion of our dirty space each time we are called. Any rounding -+ * error will be cleaned up by dsl_pool_sync()'s call to -+ * dsl_pool_undirty_space(). -+ */ -+ delta = dr->dr_accounted / zio->io_phys_children; -+ dsl_pool_undirty_space(dp, delta, zio->io_txg); -+} -+ - /* ARGSUSED */ -@@ -2624,3 +2707,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) - -- if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { -+ /* -+ * For nopwrites and rewrites we ensure that the bp matches our -+ * original and bypass all the accounting. -+ */ -+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { - ASSERT(BP_EQUAL(bp, bp_orig)); -@@ -2671,3 +2758,3 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, -- db) == 1); -+ db)); - else if (!arc_released(db->db_buf)) -@@ -2701,2 +2788,3 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) - db->db_data_pending = NULL; -+ - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); -@@ -2743,2 +2831,3 @@ dbuf_write_override_done(zio_t *zio) - -+/* Issue I/O to commit a dirty buffer to disk. */ - static void -@@ -2777,4 +2866,11 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - if (parent != dn->dn_dbuf) { -+ /* Our parent is an indirect block. */ -+ /* We have a dirty parent that has been scheduled for write. */ - ASSERT(parent && parent->db_data_pending); -+ /* Our parent's buffer is one level closer to the dnode. */ - ASSERT(db->db_level == parent->db_level-1); -+ /* -+ * We're about to modify our parent's db_data by modifying -+ * our block pointer, so the parent must be released. -+ */ - ASSERT(arc_released(parent->db_buf)); -@@ -2782,2 +2878,3 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - } else { -+ /* Our parent is the dnode itself. 
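
The accounting in dbuf_write_physdone() above is a simple division: each dirty record remembers how much dirty space it charged (dr_accounted, added to dbuf_dirty() earlier in this patch), and every physical-child completion retires one io_phys_children'th of it, leaving any rounding remainder for dsl_pool_sync(). A stand-alone illustration with hypothetical numbers:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t dr_accounted = 131072;	/* dirty space charged for one record */
	int io_phys_children = 3;	/* e.g. three copies/fragments on disk */
	uint64_t retired = 0, delta;
	int i;

	for (i = 0; i < io_phys_children; i++) {
		delta = dr_accounted / io_phys_children;	/* per-callback share */
		retired += delta;
		printf("physdone %d: undirty %llu bytes\n", i,
		    (unsigned long long)delta);
	}
	/* 131072 / 3 leaves a remainder; dsl_pool_sync() cleans that up. */
	printf("rounding remainder: %llu bytes\n",
	    (unsigned long long)(dr_accounted - retired));
	return (0);
}
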
*/ - ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && -@@ -2810,4 +2907,4 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - db->db_blkptr, data->b_data, arc_buf_size(data), &zp, -- dbuf_write_override_ready, dbuf_write_override_done, dr, -- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); -+ dbuf_write_override_ready, NULL, dbuf_write_override_done, -+ dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - mutex_enter(&db->db_mtx); -@@ -2815,3 +2912,3 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, -- dr->dt.dl.dr_copies); -+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); - mutex_exit(&db->db_mtx); -@@ -2821,3 +2918,3 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - db->db_blkptr, NULL, db->db.db_size, &zp, -- dbuf_write_nofill_ready, dbuf_write_nofill_done, db, -+ dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, - ZIO_PRIORITY_ASYNC_WRITE, -@@ -2829,4 +2926,4 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, -- dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, -- ZIO_FLAG_MUSTSUCCEED, &zb); -+ dbuf_write_physdone, dbuf_write_done, db, -+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - } -diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c -new file mode 100644 -index 0000000..0cad9ef ---- /dev/null -+++ b/module/zfs/dbuf_stats.c -@@ -0,0 +1,230 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+#include -+#include -+#include -+ -+/* -+ * Calculate the index of the arc header for the state, disabled by default. 
-+ */ -+int zfs_dbuf_state_index = 0; -+ -+/* -+ * ========================================================================== -+ * Dbuf Hash Read Routines -+ * ========================================================================== -+ */ -+typedef struct dbuf_stats_t { -+ kmutex_t lock; -+ kstat_t *kstat; -+ dbuf_hash_table_t *hash; -+ int idx; -+} dbuf_stats_t; -+ -+static dbuf_stats_t dbuf_stats_hash_table; -+ -+static int -+dbuf_stats_hash_table_headers(char *buf, size_t size) -+{ -+ size = snprintf(buf, size - 1, -+ "%-88s | %-124s | %s\n" -+ "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " -+ "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " -+ "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " -+ "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", -+ "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", -+ "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", -+ "atype", "index", "flags", "count", "asize", "access", -+ "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", -+ "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", -+ "bsize", "lvls", "dholds", "blocks", "dsize"); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+int -+__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) -+{ -+ arc_buf_info_t abi = { 0 }; -+ dmu_object_info_t doi = { 0 }; -+ dnode_t *dn = DB_DNODE(db); -+ -+ if (db->db_buf) -+ arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); -+ -+ if (dn) -+ __dmu_object_info_from_dnode(dn, &doi); -+ -+ size = snprintf(buf, size - 1, -+ "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " -+ "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " -+ "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " -+ "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", -+ /* dmu_buf_impl_t */ -+ spa_name(dn->dn_objset->os_spa), -+ (u_longlong_t)dmu_objset_id(db->db_objset), -+ (longlong_t)db->db.db_object, -+ (longlong_t)db->db_level, -+ (longlong_t)db->db_blkid, -+ (u_longlong_t)db->db.db_offset, -+ (u_longlong_t)db->db.db_size, -+ !!dbuf_is_metadata(db), -+ db->db_state, -+ (ulong_t)refcount_count(&db->db_holds), -+ /* arc_buf_info_t */ -+ abi.abi_state_type, -+ abi.abi_state_contents, -+ (longlong_t)abi.abi_state_index, -+ abi.abi_flags, -+ (ulong_t)abi.abi_datacnt, -+ (u_longlong_t)abi.abi_size, -+ (u_longlong_t)abi.abi_access, -+ (ulong_t)abi.abi_mru_hits, -+ (ulong_t)abi.abi_mru_ghost_hits, -+ (ulong_t)abi.abi_mfu_hits, -+ (ulong_t)abi.abi_mfu_ghost_hits, -+ (ulong_t)abi.abi_l2arc_hits, -+ (u_longlong_t)abi.abi_l2arc_dattr, -+ (u_longlong_t)abi.abi_l2arc_asize, -+ abi.abi_l2arc_compress, -+ (ulong_t)abi.abi_holds, -+ /* dmu_object_info_t */ -+ doi.doi_type, -+ doi.doi_bonus_type, -+ (ulong_t)doi.doi_data_block_size, -+ (ulong_t)doi.doi_metadata_block_size, -+ (u_longlong_t)doi.doi_bonus_size, -+ (ulong_t)doi.doi_indirection, -+ (ulong_t)refcount_count(&dn->dn_holds), -+ (u_longlong_t)doi.doi_fill_count, -+ (u_longlong_t)doi.doi_max_offset); -+ buf[size] = '\0'; -+ -+ return (size); -+} -+ -+static int -+dbuf_stats_hash_table_data(char *buf, size_t size, void *data) -+{ -+ dbuf_stats_t *dsh = (dbuf_stats_t *)data; -+ dbuf_hash_table_t *h = dsh->hash; -+ dmu_buf_impl_t *db; -+ int length, error = 0; -+ -+ ASSERT3S(dsh->idx, >=, 0); -+ ASSERT3S(dsh->idx, <=, h->hash_table_mask); -+ memset(buf, 0, size); -+ -+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); -+ for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { -+ /* -+ * Returning ENOMEM will cause the data and header functions -+ * to be 
called with a larger scratch buffers. -+ */ -+ if (size < 512) { -+ error = ENOMEM; -+ break; -+ } -+ -+ mutex_enter(&db->db_mtx); -+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); -+ -+ length = __dbuf_stats_hash_table_data(buf, size, db); -+ buf += length; -+ size -= length; -+ -+ mutex_exit(&db->db_mtx); -+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); -+ } -+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); -+ -+ return (error); -+} -+ -+static void * -+dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n) -+{ -+ dbuf_stats_t *dsh = ksp->ks_private; -+ -+ ASSERT(MUTEX_HELD(&dsh->lock)); -+ -+ if (n <= dsh->hash->hash_table_mask) { -+ dsh->idx = n; -+ return (dsh); -+ } -+ -+ return (NULL); -+} -+ -+static void -+dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) -+{ -+ dbuf_stats_t *dsh = &dbuf_stats_hash_table; -+ kstat_t *ksp; -+ -+ mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); -+ dsh->hash = hash; -+ -+ ksp = kstat_create("zfs", 0, "dbufs", "misc", -+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); -+ dsh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &dsh->lock; -+ ksp->ks_ndata = UINT32_MAX; -+ ksp->ks_private = dsh; -+ kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, -+ dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); -+ kstat_install(ksp); -+ } -+} -+ -+static void -+dbuf_stats_hash_table_destroy(void) -+{ -+ dbuf_stats_t *dsh = &dbuf_stats_hash_table; -+ kstat_t *ksp; -+ -+ ksp = dsh->kstat; -+ if (ksp) -+ kstat_delete(ksp); -+ -+ mutex_destroy(&dsh->lock); -+} -+ -+void -+dbuf_stats_init(dbuf_hash_table_t *hash) -+{ -+ dbuf_stats_hash_table_init(hash); -+} -+ -+void -+dbuf_stats_destroy(void) -+{ -+ dbuf_stats_hash_table_destroy(); -+} -+ -+#if defined(_KERNEL) && defined(HAVE_SPL) -+module_param(zfs_dbuf_state_index, int, 0644); -+MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index"); -+#endif -diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c -index 286f3bb..070c831 100644 ---- a/module/zfs/ddt.c -+++ b/module/zfs/ddt.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -39,2 +39,5 @@ - -+static kmem_cache_t *ddt_cache; -+static kmem_cache_t *ddt_entry_cache; -+ - /* -@@ -172,3 +175,3 @@ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - if (!ddt_object_exists(ddt, type, class)) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -234,3 +237,3 @@ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - if (!ddt_object_exists(ddt, type, class)) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -517,3 +520,2 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) - -- /* XXX: Move to a slab */ - ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_PUSHPAGE); -@@ -661,2 +663,18 @@ ddt_exit(ddt_t *ddt) - -+void -+ddt_init(void) -+{ -+ ddt_cache = kmem_cache_create("ddt_cache", -+ sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -+ ddt_entry_cache = kmem_cache_create("ddt_entry_cache", -+ sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -+} -+ -+void -+ddt_fini(void) -+{ -+ kmem_cache_destroy(ddt_entry_cache); -+ kmem_cache_destroy(ddt_cache); -+} -+ - static ddt_entry_t * -@@ -666,4 +684,4 @@ ddt_alloc(const ddt_key_t *ddk) - -- /* XXX: Move to a slab */ -- dde = kmem_zalloc(sizeof (ddt_entry_t), KM_PUSHPAGE); -+ dde = kmem_cache_alloc(ddt_entry_cache, KM_PUSHPAGE); -+ bzero(dde, sizeof (ddt_entry_t)); - cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); -@@ -690,3 +708,3 @@ ddt_free(ddt_entry_t *dde) - cv_destroy(&dde->dde_cv); -- kmem_free(dde, sizeof (*dde)); -+ kmem_cache_free(ddt_entry_cache, dde); - } -@@ -815,4 +833,4 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) - -- /* XXX: Move to a slab */ -- ddt = kmem_zalloc(sizeof (*ddt), KM_PUSHPAGE | KM_NODEBUG); -+ ddt = kmem_cache_alloc(ddt_cache, KM_PUSHPAGE | KM_NODEBUG); -+ bzero(ddt, sizeof (ddt_t)); - -@@ -838,3 +856,3 @@ ddt_table_free(ddt_t *ddt) - mutex_destroy(&ddt->ddt_lock); -- kmem_free(ddt, sizeof (*ddt)); -+ kmem_cache_free(ddt_cache, ddt); - } -@@ -918,3 +936,3 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) - ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; -- dde = kmem_alloc(sizeof(ddt_entry_t), KM_PUSHPAGE); -+ dde = kmem_cache_alloc(ddt_entry_cache, KM_PUSHPAGE); - -@@ -925,3 +943,3 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) - if (ddt_object_lookup(ddt, type, class, dde) == 0) { -- kmem_free(dde, sizeof(ddt_entry_t)); -+ kmem_cache_free(ddt_entry_cache, dde); - return (B_TRUE); -@@ -931,3 +949,3 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) - -- kmem_free(dde, sizeof(ddt_entry_t)); -+ kmem_cache_free(ddt_entry_cache, dde); - return (B_FALSE); -@@ -1206,3 +1224,3 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) - -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1211,3 +1229,3 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) - module_param(zfs_dedup_prefetch, int, 0644); --MODULE_PARM_DESC(zfs_dedup_prefetch,"Enable prefetching dedup-ed blks"); -+MODULE_PARM_DESC(zfs_dedup_prefetch, "Enable prefetching dedup-ed blks"); - #endif -diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c -index 65b14ab..a21ed45 100644 ---- a/module/zfs/ddt_zap.c -+++ b/module/zfs/ddt_zap.c -@@ -143,3 +143,3 @@ ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count) - { -- return zap_count(os, object, count); -+ return (zap_count(os, object, count)); - } -diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c -index 0a90333..edad9b4 100644 ---- a/module/zfs/dmu.c -+++ b/module/zfs/dmu.c -@@ -22,3 +22,3 
@@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -43,2 +43,3 @@ - #include -+#include - #include -@@ -49,2 +50,7 @@ - -+/* -+ * Enable/disable nopwrite feature. -+ */ -+int zfs_nopwrite_enabled = 1; -+ - const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { -@@ -140,3 +146,3 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - if (db == NULL) { -- err = EIO; -+ err = SET_ERROR(EIO); - } else { -@@ -171,5 +177,5 @@ dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) - if (dn->dn_bonus != db) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } else if (newsize < 0 || newsize > db_fake->db_size) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } else { -@@ -194,5 +200,5 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) - if (!DMU_OT_IS_VALID(type)) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } else if (dn->dn_bonus != db) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } else { -@@ -323,3 +329,3 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) - if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else { -@@ -328,3 +334,3 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) - if (!dn->dn_have_spill) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { -@@ -366,3 +372,2 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - { -- dsl_pool_t *dp = NULL; - dmu_buf_t **dbp; -@@ -372,3 +377,2 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - zio_t *zio; -- hrtime_t start = 0; - -@@ -394,3 +398,3 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - rw_exit(&dn->dn_struct_rwlock); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -398,8 +402,5 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - } -- dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_PUSHPAGE | KM_NODEBUG); -+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, -+ KM_PUSHPAGE | KM_NODEBUG); - -- if (dn->dn_objset->os_dsl_dataset) -- dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; -- if (dp && dsl_pool_sync_context(dp)) -- start = gethrtime(); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); -@@ -412,3 +413,3 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - zio_nowait(zio); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -424,5 +425,2 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - err = zio_wait(zio); -- /* track read overhead when we are in sync context */ -- if (dp && dsl_pool_sync_context(dp)) -- dp->dp_read_overhead += gethrtime() - start; - if (err) { -@@ -441,3 +439,3 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - if (db->db_state == DB_UNCACHED) -- err = EIO; -+ err = SET_ERROR(EIO); - mutex_exit(&db->db_mtx); -@@ -508,2 +506,12 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) - -+/* -+ * Issue prefetch i/os for the given blocks. -+ * -+ * Note: The assumption is that we *know* these blocks will be needed -+ * almost immediately. 
Therefore, the prefetch i/os will be issued at -+ * ZIO_PRIORITY_SYNC_READ -+ * -+ * Note: indirect blocks and other metadata will be read synchronously, -+ * causing this function to block if they are not already cached. -+ */ - void -@@ -513,3 +521,3 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - uint64_t blkid; -- int nblks, i, err; -+ int nblks, err; - -@@ -526,3 +534,3 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); -- dbuf_prefetch(dn, blkid); -+ dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ); - rw_exit(&dn->dn_struct_rwlock); -@@ -543,4 +551,4 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - int blkshift = dn->dn_datablkshift; -- nblks = (P2ROUNDUP(offset+len, 1<> blkshift; -+ nblks = (P2ROUNDUP(offset + len, 1 << blkshift) - -+ P2ALIGN(offset, 1 << blkshift)) >> blkshift; - } else { -@@ -550,5 +558,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - if (nblks != 0) { -+ int i; -+ - blkid = dbuf_whichblock(dn, offset); - for (i = 0; i < nblks; i++) -- dbuf_prefetch(dn, blkid+i); -+ dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ); - } -@@ -565,16 +575,20 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - * data by simply searching the allocated level 1 indirects. -+ * -+ * On input, *start should be the first offset that does not need to be -+ * freed (e.g. "offset + length"). On return, *start will be the first -+ * offset that should be freed. - */ - static int --get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) -+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) - { -- uint64_t len = *start - limit; -- uint64_t blkcnt = 0; -- uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); -+ uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); -+ /* bytes of data covered by a level-1 indirect block */ - uint64_t iblkrange = - dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); -+ uint64_t blks; - -- ASSERT(limit <= *start); -+ ASSERT3U(minimum, <=, *start); - -- if (len <= iblkrange * maxblks) { -- *start = limit; -+ if (*start - minimum <= iblkrange * maxblks) { -+ *start = minimum; - return (0); -@@ -583,6 +597,12 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) - -- while (*start > limit && blkcnt < maxblks) { -+ for (blks = 0; *start > minimum && blks < maxblks; blks++) { - int err; - -- /* find next allocated L1 indirect */ -+ /* -+ * dnode_next_offset(BACKWARDS) will find an allocated L1 -+ * indirect block at or before the input offset. We must -+ * decrement *start so that it is at the end of the region -+ * to search. 
-+ */ -+ (*start)--; - err = dnode_next_offset(dn, -@@ -590,18 +610,15 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) - -- /* if there are no more, then we are done */ -+ /* if there are no indirect blocks before start, we are done */ - if (err == ESRCH) { -- *start = limit; -- return (0); -- } else if (err) { -+ *start = minimum; -+ break; -+ } else if (err != 0) { - return (err); - } -- blkcnt += 1; - -- /* reset offset to end of "next" block back */ -+ /* set start to the beginning of this L1 indirect */ - *start = P2ALIGN(*start, iblkrange); -- if (*start <= limit) -- *start = limit; -- else -- *start -= 1; - } -+ if (*start < minimum) -+ *start = minimum; - return (0); -@@ -611,31 +628,29 @@ static int - dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, -- uint64_t length, boolean_t free_dnode) -+ uint64_t length) - { -- dmu_tx_t *tx; -- uint64_t object_size, start, end, len; -- boolean_t trunc = (length == DMU_OBJECT_END); -- int align, err; -- -- align = 1 << dn->dn_datablkshift; -- ASSERT(align > 0); -- object_size = align == 1 ? dn->dn_datablksz : -- (dn->dn_maxblkid + 1) << dn->dn_datablkshift; -- -- end = offset + length; -- if (trunc || end > object_size) -- end = object_size; -- if (end <= offset) -+ uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; -+ int err; -+ -+ if (offset >= object_size) - return (0); -- length = end - offset; - -- while (length) { -- start = end; -- /* assert(offset <= start) */ -- err = get_next_chunk(dn, &start, offset); -+ if (length == DMU_OBJECT_END || offset + length > object_size) -+ length = object_size - offset; -+ -+ while (length != 0) { -+ uint64_t chunk_end, chunk_begin; -+ dmu_tx_t *tx; -+ -+ chunk_end = chunk_begin = offset + length; -+ -+ /* move chunk_begin backwards to the beginning of this chunk */ -+ err = get_next_chunk(dn, &chunk_begin, offset); - if (err) - return (err); -- len = trunc ? DMU_OBJECT_END : end - start; -+ ASSERT3U(chunk_begin, >=, offset); -+ ASSERT3U(chunk_begin, <=, chunk_end); - - tx = dmu_tx_create(os); -- dmu_tx_hold_free(tx, dn->dn_object, start, len); -+ dmu_tx_hold_free(tx, dn->dn_object, -+ chunk_begin, chunk_end - chunk_begin); - err = dmu_tx_assign(tx, TXG_WAIT); -@@ -645,14 +660,6 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, - } -- -- dnode_free_range(dn, start, trunc ? -1 : len, tx); -- -- if (start == 0 && free_dnode) { -- ASSERT(trunc); -- dnode_free(dn, tx); -- } -- -- length -= end - start; -- -+ dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx); - dmu_tx_commit(tx); -- end = start; -+ -+ length -= chunk_end - chunk_begin; - } -@@ -671,3 +678,13 @@ dmu_free_long_range(objset_t *os, uint64_t object, - return (err); -- err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); -+ err = dmu_free_long_range_impl(os, dn, offset, length); -+ -+ /* -+ * It is important to zero out the maxblkid when freeing the entire -+ * file, so that (a) subsequent calls to dmu_free_long_range_impl() -+ * will take the fast path, and (b) dnode_reallocate() can verify -+ * that the entire file has been freed. 
-+ */ -+ if (offset == 0 && length == DMU_OBJECT_END) -+ dn->dn_maxblkid = 0; -+ - dnode_rele(dn, FTAG); -@@ -677,5 +694,4 @@ dmu_free_long_range(objset_t *os, uint64_t object, - int --dmu_free_object(objset_t *os, uint64_t object) -+dmu_free_long_object(objset_t *os, uint64_t object) - { -- dnode_t *dn; - dmu_tx_t *tx; -@@ -683,22 +699,17 @@ dmu_free_object(objset_t *os, uint64_t object) - -- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, -- FTAG, &dn); -+ err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); - if (err != 0) - return (err); -- if (dn->dn_nlevels == 1) { -- tx = dmu_tx_create(os); -- dmu_tx_hold_bonus(tx, object); -- dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); -- err = dmu_tx_assign(tx, TXG_WAIT); -- if (err == 0) { -- dnode_free_range(dn, 0, DMU_OBJECT_END, tx); -- dnode_free(dn, tx); -- dmu_tx_commit(tx); -- } else { -- dmu_tx_abort(tx); -- } -+ -+ tx = dmu_tx_create(os); -+ dmu_tx_hold_bonus(tx, object); -+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); -+ err = dmu_tx_assign(tx, TXG_WAIT); -+ if (err == 0) { -+ err = dmu_object_free(os, object, tx); -+ dmu_tx_commit(tx); - } else { -- err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); -+ dmu_tx_abort(tx); - } -- dnode_rele(dn, FTAG); -+ - return (err); -@@ -869,5 +880,5 @@ static xuio_stats_t xuio_stats = { - --#define XUIOSTAT_INCR(stat, val) \ -- atomic_add_64(&xuio_stats.stat.value.ui64, (val)) --#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) -+#define XUIOSTAT_INCR(stat, val) \ -+ atomic_add_64(&xuio_stats.stat.value.ui64, (val)) -+#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) - -@@ -995,31 +1006,38 @@ xuio_stat_wbuf_nocopy() - * Copy up to size bytes between arg_buf and req based on the data direction -- * described by the req. If an entire req's data cannot be transfered the -- * req's is updated such that it's current index and bv offsets correctly -- * reference any residual data which could not be copied. The return value -- * is the number of bytes successfully copied to arg_buf. -+ * described by the req. If an entire req's data cannot be transfered in one -+ * pass, you should pass in @req_offset to indicate where to continue. The -+ * return value is the number of bytes successfully copied to arg_buf. - */ - static int --dmu_req_copy(void *arg_buf, int size, int *offset, struct request *req) -+dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset) - { -- struct bio_vec *bv; -+ struct bio_vec bv, *bvp; - struct req_iterator iter; - char *bv_buf; -- int tocpy; -+ int tocpy, bv_len, bv_offset; -+ int offset = 0; - -- *offset = 0; -- rq_for_each_segment(bv, req, iter) { -- -- /* Fully consumed the passed arg_buf */ -- ASSERT3S(*offset, <=, size); -- if (size == *offset) -- break; -+ rq_for_each_segment4(bv, bvp, req, iter) { -+ /* -+ * Fully consumed the passed arg_buf. 
We use goto here because -+ * rq_for_each_segment is a double loop -+ */ -+ ASSERT3S(offset, <=, size); -+ if (size == offset) -+ goto out; - -- /* Skip fully consumed bv's */ -- if (bv->bv_len == 0) -+ /* Skip already copied bv */ -+ if (req_offset >= bv.bv_len) { -+ req_offset -= bv.bv_len; - continue; -+ } - -- tocpy = MIN(bv->bv_len, size - *offset); -+ bv_len = bv.bv_len - req_offset; -+ bv_offset = bv.bv_offset + req_offset; -+ req_offset = 0; -+ -+ tocpy = MIN(bv_len, size - offset); - ASSERT3S(tocpy, >=, 0); - -- bv_buf = page_address(bv->bv_page) + bv->bv_offset; -+ bv_buf = page_address(bv.bv_page) + bv_offset; - ASSERT3P(bv_buf, !=, NULL); -@@ -1027,57 +1045,10 @@ dmu_req_copy(void *arg_buf, int size, int *offset, struct request *req) - if (rq_data_dir(req) == WRITE) -- memcpy(arg_buf + *offset, bv_buf, tocpy); -+ memcpy(arg_buf + offset, bv_buf, tocpy); - else -- memcpy(bv_buf, arg_buf + *offset, tocpy); -+ memcpy(bv_buf, arg_buf + offset, tocpy); - -- *offset += tocpy; -- bv->bv_offset += tocpy; -- bv->bv_len -= tocpy; -- } -- -- return 0; --} -- --static void --dmu_bio_put(struct bio *bio) --{ -- struct bio *bio_next; -- -- while (bio) { -- bio_next = bio->bi_next; -- bio_put(bio); -- bio = bio_next; -- } --} -- --static int --dmu_bio_clone(struct bio *bio, struct bio **bio_copy) --{ -- struct bio *bio_root = NULL; -- struct bio *bio_last = NULL; -- struct bio *bio_new; -- -- if (bio == NULL) -- return EINVAL; -- -- while (bio) { -- bio_new = bio_clone(bio, GFP_NOIO); -- if (bio_new == NULL) { -- dmu_bio_put(bio_root); -- return ENOMEM; -- } -- -- if (bio_last) { -- bio_last->bi_next = bio_new; -- bio_last = bio_new; -- } else { -- bio_root = bio_new; -- bio_last = bio_new; -- } -- -- bio = bio->bi_next; -+ offset += tocpy; - } -- -- *bio_copy = bio_root; -- -- return 0; -+out: -+ return (offset); - } -@@ -1089,5 +1060,5 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - uint64_t offset = blk_rq_pos(req) << 9; -- struct bio *bio_saved = req->bio; - dmu_buf_t **dbp; - int numbufs, i, err; -+ size_t req_offset; - -@@ -1098,3 +1069,3 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - err = dmu_buf_hold_array(os, object, offset, size, TRUE, FTAG, -- &numbufs, &dbp); -+ &numbufs, &dbp); - if (err) -@@ -1102,13 +1073,3 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - -- /* -- * Clone the bio list so the bv->bv_offset and bv->bv_len members -- * can be safely modified. The original bio list is relinked in to -- * the request when the function exits. This is required because -- * some file systems blindly assume that these values will remain -- * constant between bio_submit() and the IO completion callback. 
-- */ -- err = dmu_bio_clone(bio_saved, &req->bio); -- if (err) -- goto error; -- -+ req_offset = 0; - for (i = 0; i < numbufs; i++) { -@@ -1124,3 +1085,4 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - -- err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req); -+ didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, -+ req_offset); - -@@ -1134,8 +1096,5 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - offset += didcpy; -+ req_offset += didcpy; - err = 0; - } -- -- dmu_bio_put(req->bio); -- req->bio = bio_saved; --error: - dmu_buf_rele_array(dbp, numbufs, FTAG); -@@ -1150,7 +1109,5 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - uint64_t offset = blk_rq_pos(req) << 9; -- struct bio *bio_saved = req->bio; - dmu_buf_t **dbp; -- int numbufs; -- int err = 0; -- int i; -+ int numbufs, i, err; -+ size_t req_offset; - -@@ -1160,3 +1117,3 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, -- &numbufs, &dbp); -+ &numbufs, &dbp); - if (err) -@@ -1164,13 +1121,3 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - -- /* -- * Clone the bio list so the bv->bv_offset and bv->bv_len members -- * can be safely modified. The original bio list is relinked in to -- * the request when the function exits. This is required because -- * some file systems blindly assume that these values will remain -- * constant between bio_submit() and the IO completion callback. -- */ -- err = dmu_bio_clone(bio_saved, &req->bio); -- if (err) -- goto error; -- -+ req_offset = 0; - for (i = 0; i < numbufs; i++) { -@@ -1193,3 +1140,4 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - -- err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req); -+ didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, -+ req_offset); - -@@ -1206,2 +1154,3 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - offset += didcpy; -+ req_offset += didcpy; - err = 0; -@@ -1209,7 +1158,3 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - -- dmu_bio_put(req->bio); -- req->bio = bio_saved; --error: - dmu_buf_rele_array(dbp, numbufs, FTAG); -- - return (err); -@@ -1384,3 +1329,3 @@ dmu_return_arcbuf(arc_buf_t *buf) - arc_return_buf(buf, FTAG); -- VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); -+ VERIFY(arc_buf_remove_ref(buf, FTAG)); - } -@@ -1476,2 +1421,12 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) - if (zio->io_error == 0) { -+ dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); -+ if (dr->dt.dl.dr_nopwrite) { -+ ASSERTV(blkptr_t *bp = zio->io_bp); -+ ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); -+ ASSERTV(uint8_t chksum = BP_GET_CHECKSUM(bp_orig)); -+ -+ ASSERT(BP_EQUAL(bp, bp_orig)); -+ ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); -+ ASSERT(zio_checksum_table[chksum].ci_dedup); -+ } - dr->dt.dl.dr_overridden_by = *zio->io_bp; -@@ -1497,7 +1452,18 @@ dmu_sync_late_arrival_done(zio_t *zio) - dmu_sync_arg_t *dsa = zio->io_private; -+ ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); - - if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { -- ASSERT(zio->io_bp->blk_birth == zio->io_txg); -- ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); -- zio_free(zio->io_spa, zio->io_txg, zio->io_bp); -+ /* -+ * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) -+ * then there is nothing to do here. 
Otherwise, free the -+ * newly allocated block in this txg. -+ */ -+ if (zio->io_flags & ZIO_FLAG_NOPWRITE) { -+ ASSERT(BP_EQUAL(bp, bp_orig)); -+ } else { -+ ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); -+ ASSERT(zio->io_bp->blk_birth == zio->io_txg); -+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); -+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp); -+ } - } -@@ -1522,3 +1488,4 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - dmu_tx_abort(tx); -- return (EIO); /* Make zl_get_data do txg_waited_synced() */ -+ /* Make zl_get_data do txg_waited_synced() */ -+ return (SET_ERROR(EIO)); - } -@@ -1533,4 +1500,4 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, -- dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, -- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb)); -+ dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, -+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL|ZIO_FLAG_FASTWRITE, zb)); - -@@ -1546,3 +1513,3 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - * -- * EEXIST: this txg has already been synced, so there's nothing to to. -+ * EEXIST: this txg has already been synced, so there's nothing to do. - * The caller should not log the write. -@@ -1578,3 +1545,2 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - ASSERT(pio != NULL); -- ASSERT(BP_IS_HOLE(bp)); - ASSERT(txg != 0); -@@ -1608,3 +1574,3 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - mutex_exit(&db->db_mtx); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -1630,5 +1596,22 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - mutex_exit(&db->db_mtx); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } - -+ ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); -+ -+ /* -+ * Assume the on-disk data is X, the current syncing data is Y, -+ * and the current in-memory data is Z (currently in dmu_sync). -+ * X and Z are identical but Y is has been modified. Normally, -+ * when X and Z are the same we will perform a nopwrite but if Y -+ * is different we must disable nopwrite since the resulting write -+ * of Y to disk can free the block containing X. If we allowed a -+ * nopwrite to occur the block pointing to Z would reference a freed -+ * block. Since this is a rare case we simplify this by disabling -+ * nopwrite if the current dmu_sync-ing dbuf has been modified in -+ * a previous transaction. 
-+ */ -+ if (dr->dr_next) -+ zp.zp_nopwrite = B_FALSE; -+ - ASSERT(dr->dr_txg == txg); -@@ -1642,3 +1625,3 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - mutex_exit(&db->db_mtx); -- return (EALREADY); -+ return (SET_ERROR(EALREADY)); - } -@@ -1657,4 +1640,5 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), -- DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done, -- dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb)); -+ DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, -+ NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, -+ ZIO_FLAG_CANFAIL, &zb)); - -@@ -1717,3 +1701,4 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - enum zio_checksum dedup_checksum = os->os_dedup_checksum; -- boolean_t dedup; -+ boolean_t dedup = B_FALSE; -+ boolean_t nopwrite = B_FALSE; - boolean_t dedup_verify = os->os_dedup_verify; -@@ -1722,3 +1707,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - /* -- * Determine checksum setting. -+ * We maintain different write policies for each of the following -+ * types of data: -+ * 1. metadata -+ * 2. preallocated blocks (i.e. level-0 blocks of a dump device) -+ * 3. all other level 0 blocks - */ -@@ -1726,2 +1715,9 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - /* -+ * XXX -- we should design a compression algorithm -+ * that specializes in arrays of bps. -+ */ -+ compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : -+ ZIO_COMPRESS_LZJB; -+ -+ /* - * Metadata always gets checksummed. If the data -@@ -1735,41 +1731,43 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - checksum = ZIO_CHECKSUM_FLETCHER_4; -- } else { -- checksum = zio_checksum_select(dn->dn_checksum, checksum); -- } -+ } else if (wp & WP_NOFILL) { -+ ASSERT(level == 0); - -- /* -- * Determine compression setting. -- */ -- if (ismd) { - /* -- * XXX -- we should design a compression algorithm -- * that specializes in arrays of bps. -+ * If we're writing preallocated blocks, we aren't actually -+ * writing them so don't set any policy properties. These -+ * blocks are currently only used by an external subsystem -+ * outside of zfs (i.e. dump) and not written by the zio -+ * pipeline. - */ -- compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : -- ZIO_COMPRESS_LZJB; -+ compress = ZIO_COMPRESS_OFF; -+ checksum = ZIO_CHECKSUM_OFF; - } else { - compress = zio_compress_select(dn->dn_compress, compress); -- } - -- /* -- * Determine dedup setting. If we are in dmu_sync(), we won't -- * actually dedup now because that's all done in syncing context; -- * but we do want to use the dedup checkum. If the checksum is not -- * strong enough to ensure unique signatures, force dedup_verify. -- */ -- dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); -- if (dedup) { -- checksum = dedup_checksum; -- if (!zio_checksum_table[checksum].ci_dedup) -- dedup_verify = 1; -- } -+ checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? -+ zio_checksum_select(dn->dn_checksum, checksum) : -+ dedup_checksum; - -- if (wp & WP_DMU_SYNC) -- dedup = 0; -+ /* -+ * Determine dedup setting. If we are in dmu_sync(), -+ * we won't actually dedup now because that's all -+ * done in syncing context; but we do want to use the -+ * dedup checkum. If the checksum is not strong -+ * enough to ensure unique signatures, force -+ * dedup_verify. 
-+ */ -+ if (dedup_checksum != ZIO_CHECKSUM_OFF) { -+ dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; -+ if (!zio_checksum_table[checksum].ci_dedup) -+ dedup_verify = B_TRUE; -+ } - -- if (wp & WP_NOFILL) { -- ASSERT(!ismd && level == 0); -- checksum = ZIO_CHECKSUM_OFF; -- compress = ZIO_COMPRESS_OFF; -- dedup = B_FALSE; -+ /* -+ * Enable nopwrite if we have a cryptographically secure -+ * checksum that has no known collisions (i.e. SHA-256) -+ * and compression is enabled. We don't enable nopwrite if -+ * dedup is enabled as the two features are mutually exclusive. -+ */ -+ nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && -+ compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); - } -@@ -1783,2 +1781,3 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - zp->zp_dedup_verify = dedup && dedup_verify; -+ zp->zp_nopwrite = nopwrite; - } -@@ -1817,12 +1816,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) - void --dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) -+__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) - { -- dnode_phys_t *dnp; -+ dnode_phys_t *dnp = dn->dn_phys; - int i; - -- rw_enter(&dn->dn_struct_rwlock, RW_READER); -- mutex_enter(&dn->dn_mtx); -- -- dnp = dn->dn_phys; -- - doi->doi_data_block_size = dn->dn_datablksz; -@@ -1837,3 +1831,3 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) - doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; -- doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; -+ doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; - doi->doi_fill_count = 0; -@@ -1841,2 +1835,11 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) - doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; -+} -+ -+void -+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) -+{ -+ rw_enter(&dn->dn_struct_rwlock, RW_READER); -+ mutex_enter(&dn->dn_mtx); -+ -+ __dmu_object_info_from_dnode(dn, doi); - -@@ -1963,3 +1966,3 @@ dmu_fini(void) - { -- arc_fini(); -+ arc_fini(); /* arc depends on l2arc, so arc must go first */ - l2arc_fini(); -@@ -1982,3 +1985,3 @@ EXPORT_SYMBOL(dmu_free_range); - EXPORT_SYMBOL(dmu_free_long_range); --EXPORT_SYMBOL(dmu_free_object); -+EXPORT_SYMBOL(dmu_free_long_object); - EXPORT_SYMBOL(dmu_read); -@@ -2003,2 +2006,6 @@ module_param(zfs_mdcomp_disable, int, 0644); - MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression"); -+ -+module_param(zfs_nopwrite_enabled, int, 0644); -+MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes"); -+ - #endif -diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c -index dc23778..a2cb2fc 100644 ---- a/module/zfs/dmu_diff.c -+++ b/module/zfs/dmu_diff.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -114,3 +115,3 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - if (issig(JUSTLOOKING) && issig(FORREAL)) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - -@@ -137,3 +138,3 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - &aflags, zb) != 0) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -157,47 +158,45 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - int --dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) -+dmu_diff(const char *tosnap_name, const char *fromsnap_name, -+ struct vnode *vp, offset_t *offp) - { - struct diffarg da; -- dsl_dataset_t *ds = tosnap->os_dsl_dataset; -- dsl_dataset_t *fromds = fromsnap->os_dsl_dataset; -- dsl_dataset_t *findds; -- dsl_dataset_t *relds; -- int err = 0; -- -- /* make certain we are looking at snapshots */ -- if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds)) -- return (EINVAL); -- -- /* fromsnap must be earlier and from the same lineage as tosnap */ -- if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg) -- return (EXDEV); -- -- relds = NULL; -- findds = ds; -- -- while (fromds->ds_dir != findds->ds_dir) { -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- -- if (!dsl_dir_is_clone(findds->ds_dir)) { -- if (relds) -- dsl_dataset_rele(relds, FTAG); -- return (EXDEV); -- } -- -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dataset_hold_obj(dp, -- findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds); -- rw_exit(&dp->dp_config_rwlock); -- -- if (relds) -- dsl_dataset_rele(relds, FTAG); -+ dsl_dataset_t *fromsnap; -+ dsl_dataset_t *tosnap; -+ dsl_pool_t *dp; -+ int error; -+ uint64_t fromtxg; -+ -+ if (strchr(tosnap_name, '@') == NULL || -+ strchr(fromsnap_name, '@') == NULL) -+ return (SET_ERROR(EINVAL)); -+ -+ error = dsl_pool_hold(tosnap_name, FTAG, &dp); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } - -- if (err) -- return (EXDEV); -+ error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); -+ if (error != 0) { -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } - -- relds = findds; -+ if (!dsl_dataset_is_before(tosnap, fromsnap)) { -+ dsl_dataset_rele(fromsnap, FTAG); -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (SET_ERROR(EXDEV)); - } - -- if (relds) -- dsl_dataset_rele(relds, FTAG); -+ fromtxg = fromsnap->ds_phys->ds_creation_txg; -+ dsl_dataset_rele(fromsnap, FTAG); -+ -+ dsl_dataset_long_hold(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); - -@@ -209,7 +208,7 @@ dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) - -- err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg, -+ error = traverse_dataset(tosnap, fromtxg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); - -- if (err) { -- da.da_err = err; -+ if (error != 0) { -+ da.da_err = error; - } else { -@@ -219,2 +218,5 @@ dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) - -+ dsl_dataset_long_rele(tosnap, FTAG); -+ dsl_dataset_rele(tosnap, FTAG); -+ - return (da.da_err); -diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c -index 8bb3eb4..b6b82a2 100644 ---- a/module/zfs/dmu_object.c -+++ b/module/zfs/dmu_object.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -92,3 +93,3 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, - if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -114,3 +115,3 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - if (object == DMU_META_DNODE_OBJECT) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c -index 52d55d5..fc7c803 100644 ---- a/module/zfs/dmu_objset.c -+++ b/module/zfs/dmu_objset.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -46,2 +47,3 @@ - #include -+#include - -@@ -284,3 +286,3 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); -- if (err) { -+ if (err != 0) { - kmem_free(os, sizeof (objset_t)); -@@ -288,3 +290,3 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - if (err == ECKSUM) -- err = EIO; -+ err = SET_ERROR(EIO); - return (err); -@@ -324,30 +326,45 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - if (ds) { -- err = dsl_prop_register(ds, "primarycache", -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), - primary_cache_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "secondarycache", -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), - secondary_cache_changed_cb, os); -+ } - if (!dsl_dataset_is_snapshot(ds)) { -- if (err == 0) -- err = dsl_prop_register(ds, "checksum", -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), - checksum_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "compression", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), - compression_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "copies", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_COPIES), - copies_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "dedup", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_DEDUP), - dedup_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "logbias", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_LOGBIAS), - logbias_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "sync", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_SYNC), - sync_changed_cb, os); -+ } - } -- if (err) { -+ if (err != 0) { - VERIFY(arc_buf_remove_ref(os->os_phys_buf, -- &os->os_phys_buf) == 1); -+ &os->os_phys_buf)); - kmem_free(os, sizeof (objset_t)); -@@ -429,3 +446,6 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) - --/* called from zpl */ -+/* -+ * Holds the pool while the objset is held. Therefore only one objset -+ * can be held at a time. 
-+ */ - int -@@ -433,2 +453,3 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp) - { -+ dsl_pool_t *dp; - dsl_dataset_t *ds; -@@ -436,9 +457,16 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp) - -- err = dsl_dataset_hold(name, tag, &ds); -- if (err) -+ err = dsl_pool_hold(name, tag, &dp); -+ if (err != 0) -+ return (err); -+ err = dsl_dataset_hold(dp, name, tag, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, tag); - return (err); -+ } - - err = dmu_objset_from_ds(ds, osp); -- if (err) -+ if (err != 0) { - dsl_dataset_rele(ds, tag); -+ dsl_pool_rele(dp, tag); -+ } - -@@ -447,3 +475,7 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp) - --/* called from zpl */ -+/* -+ * dsl_pool must not be held when this is called. -+ * Upon successful return, there will be a longhold on the dataset, -+ * and the dsl_pool will not be held. -+ */ - int -@@ -452,2 +484,3 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, - { -+ dsl_pool_t *dp; - dsl_dataset_t *ds; -@@ -455,15 +488,21 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, - -- err = dsl_dataset_own(name, B_FALSE, tag, &ds); -- if (err) -+ err = dsl_pool_hold(name, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ err = dsl_dataset_own(dp, name, tag, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, FTAG); - return (err); -+ } - - err = dmu_objset_from_ds(ds, osp); -- if (err) { -+ dsl_pool_rele(dp, FTAG); -+ if (err != 0) { - dsl_dataset_disown(ds, tag); - } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { -- dmu_objset_disown(*osp, tag); -- return (EINVAL); -+ dsl_dataset_disown(ds, tag); -+ return (SET_ERROR(EINVAL)); - } else if (!readonly && dsl_dataset_is_snapshot(ds)) { -- dmu_objset_disown(*osp, tag); -- return (EROFS); -+ dsl_dataset_disown(ds, tag); -+ return (SET_ERROR(EROFS)); - } -@@ -475,3 +514,37 @@ dmu_objset_rele(objset_t *os, void *tag) - { -+ dsl_pool_t *dp = dmu_objset_pool(os); - dsl_dataset_rele(os->os_dsl_dataset, tag); -+ dsl_pool_rele(dp, tag); -+} -+ -+/* -+ * When we are called, os MUST refer to an objset associated with a dataset -+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner -+ * == tag. We will then release and reacquire ownership of the dataset while -+ * holding the pool config_rwlock to avoid intervening namespace or ownership -+ * changes may occur. -+ * -+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to -+ * release the hold on its dataset and acquire a new one on the dataset of the -+ * same name so that it can be partially torn down and reconstructed. 
-+ */ -+void -+dmu_objset_refresh_ownership(objset_t *os, void *tag) -+{ -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds, *newds; -+ char name[MAXNAMELEN]; -+ -+ ds = os->os_dsl_dataset; -+ VERIFY3P(ds, !=, NULL); -+ VERIFY3P(ds->ds_owner, ==, tag); -+ VERIFY(dsl_dataset_long_held(ds)); -+ -+ dsl_dataset_name(ds, name); -+ dp = dmu_objset_pool(os); -+ dsl_pool_config_enter(dp, FTAG); -+ dmu_objset_disown(os, tag); -+ VERIFY0(dsl_dataset_own(dp, name, tag, &newds)); -+ VERIFY3P(newds, ==, os->os_dsl_dataset); -+ dsl_pool_config_exit(dp, FTAG); - } -@@ -484,3 +557,3 @@ dmu_objset_disown(objset_t *os, void *tag) - --int -+void - dmu_objset_evict_dbufs(objset_t *os) -@@ -519,5 +592,3 @@ dmu_objset_evict_dbufs(objset_t *os) - } -- dn = list_head(&os->os_dnodes); - mutex_exit(&os->os_lock); -- return (dn != DMU_META_DNODE(os)); - } -@@ -527,5 +598,6 @@ dmu_objset_evict(objset_t *os) - { -- dsl_dataset_t *ds = os->os_dsl_dataset; - int t; - -+ dsl_dataset_t *ds = os->os_dsl_dataset; -+ - for (t = 0; t < TXG_SIZE; t++) -@@ -535,18 +607,26 @@ dmu_objset_evict(objset_t *os) - if (!dsl_dataset_is_snapshot(ds)) { -- VERIFY(0 == dsl_prop_unregister(ds, "checksum", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), - checksum_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "compression", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), - compression_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "copies", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_COPIES), - copies_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "dedup", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_DEDUP), - dedup_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "logbias", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_LOGBIAS), - logbias_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "sync", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_SYNC), - sync_changed_cb, os)); - } -- VERIFY(0 == dsl_prop_unregister(ds, "primarycache", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), - primary_cache_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), - secondary_cache_changed_cb, os)); -@@ -557,7 +637,3 @@ dmu_objset_evict(objset_t *os) - -- /* -- * We should need only a single pass over the dnode list, since -- * nothing can be added to the list at this point. 
-- */ -- (void) dmu_objset_evict_dbufs(os); -+ dmu_objset_evict_dbufs(os); - -@@ -572,3 +648,3 @@ dmu_objset_evict(objset_t *os) - -- VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); -+ VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); - -@@ -604,6 +680,7 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - ASSERT(dmu_tx_is_syncing(tx)); -+ - if (ds != NULL) -- VERIFY(0 == dmu_objset_from_ds(ds, &os)); -+ VERIFY0(dmu_objset_from_ds(ds, &os)); - else -- VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); -+ VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); - -@@ -655,11 +732,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - --struct oscarg { -- void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -- void *userarg; -- dsl_dataset_t *clone_origin; -- const char *lastname; -- dmu_objset_type_t type; -- uint64_t flags; -- cred_t *cr; --}; -+typedef struct dmu_objset_create_arg { -+ const char *doca_name; -+ cred_t *doca_cred; -+ void (*doca_userfunc)(objset_t *os, void *arg, -+ cred_t *cr, dmu_tx_t *tx); -+ void *doca_userarg; -+ dmu_objset_type_t doca_type; -+ uint64_t doca_flags; -+} dmu_objset_create_arg_t; - -@@ -667,24 +744,21 @@ struct oscarg { - static int --dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_objset_create_check(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- struct oscarg *oa = arg2; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -- int err; -- uint64_t ddobj; -- -- err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, -- oa->lastname, sizeof (uint64_t), 1, &ddobj); -- if (err != ENOENT) -- return (err ? err : EEXIST); -+ dmu_objset_create_arg_t *doca = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *pdd; -+ const char *tail; -+ int error; - -- if (oa->clone_origin != NULL) { -- /* You can't clone across pools. */ -- if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) -- return (EXDEV); -+ if (strchr(doca->doca_name, '@') != NULL) -+ return (SET_ERROR(EINVAL)); - -- /* You can only clone snapshots, not the head datasets. 
*/ -- if (!dsl_dataset_is_snapshot(oa->clone_origin)) -- return (EINVAL); -+ error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); -+ if (error != 0) -+ return (error); -+ if (tail == NULL) { -+ dsl_dir_rele(pdd, FTAG); -+ return (SET_ERROR(EEXIST)); - } -+ dsl_dir_rele(pdd, FTAG); - -@@ -694,32 +768,31 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_objset_create_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- spa_t *spa = dd->dd_pool->dp_spa; -- struct oscarg *oa = arg2; -+ dmu_objset_create_arg_t *doca = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *pdd; -+ const char *tail; -+ dsl_dataset_t *ds; - uint64_t obj; -+ blkptr_t *bp; -+ objset_t *os; - -- ASSERT(dmu_tx_is_syncing(tx)); -- -- obj = dsl_dataset_create_sync(dd, oa->lastname, -- oa->clone_origin, oa->flags, oa->cr, tx); -- -- if (oa->clone_origin == NULL) { -- dsl_pool_t *dp = dd->dd_pool; -- dsl_dataset_t *ds; -- blkptr_t *bp; -- objset_t *os; -+ VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); -- bp = dsl_dataset_get_blkptr(ds); -- ASSERT(BP_IS_HOLE(bp)); -+ obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, -+ doca->doca_cred, tx); - -- os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx); -+ VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); -+ bp = dsl_dataset_get_blkptr(ds); -+ os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, -+ ds, bp, doca->doca_type, tx); - -- if (oa->userfunc) -- oa->userfunc(os, oa->userarg, oa->cr, tx); -- dsl_dataset_rele(ds, FTAG); -+ if (doca->doca_userfunc != NULL) { -+ doca->doca_userfunc(os, doca->doca_userarg, -+ doca->doca_cred, tx); - } - -- spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj); -+ spa_history_log_internal_ds(ds, "create", tx, ""); -+ dsl_dataset_rele(ds, FTAG); -+ dsl_dir_rele(pdd, FTAG); - } -@@ -730,125 +803,67 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - { -- dsl_dir_t *pdd; -- const char *tail; -- int err = 0; -- struct oscarg oa = { 0 }; -+ dmu_objset_create_arg_t doca; - -- ASSERT(strchr(name, '@') == NULL); -- err = dsl_dir_open(name, FTAG, &pdd, &tail); -- if (err) -- return (err); -- if (tail == NULL) { -- dsl_dir_close(pdd, FTAG); -- return (EEXIST); -- } -- -- oa.userfunc = func; -- oa.userarg = arg; -- oa.lastname = tail; -- oa.type = type; -- oa.flags = flags; -- oa.cr = CRED(); -+ doca.doca_name = name; -+ doca.doca_cred = CRED(); -+ doca.doca_flags = flags; -+ doca.doca_userfunc = func; -+ doca.doca_userarg = arg; -+ doca.doca_type = type; - -- err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, -- dmu_objset_create_sync, pdd, &oa, 5); -- dsl_dir_close(pdd, FTAG); -- return (err); -+ return (dsl_sync_task(name, -+ dmu_objset_create_check, dmu_objset_create_sync, &doca, 5)); - } - --int --dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) -+typedef struct dmu_objset_clone_arg { -+ const char *doca_clone; -+ const char *doca_origin; -+ cred_t *doca_cred; -+} dmu_objset_clone_arg_t; -+ -+/*ARGSUSED*/ -+static int -+dmu_objset_clone_check(void *arg, dmu_tx_t *tx) - { -+ dmu_objset_clone_arg_t *doca = arg; - dsl_dir_t *pdd; - const char *tail; -- int err = 0; -- struct oscarg oa = { 0 }; -+ int error; -+ dsl_dataset_t *origin; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - -- ASSERT(strchr(name, '@') == NULL); -- err = dsl_dir_open(name, FTAG, 
&pdd, &tail); -- if (err) -- return (err); -+ if (strchr(doca->doca_clone, '@') != NULL) -+ return (SET_ERROR(EINVAL)); -+ -+ error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); -+ if (error != 0) -+ return (error); - if (tail == NULL) { -- dsl_dir_close(pdd, FTAG); -- return (EEXIST); -+ dsl_dir_rele(pdd, FTAG); -+ return (SET_ERROR(EEXIST)); - } -- -- oa.lastname = tail; -- oa.clone_origin = clone_origin; -- oa.flags = flags; -- oa.cr = CRED(); -- -- err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, -- dmu_objset_create_sync, pdd, &oa, 5); -- dsl_dir_close(pdd, FTAG); -- return (err); --} -- --int --dmu_objset_destroy(const char *name, boolean_t defer) --{ -- dsl_dataset_t *ds; -- int error; -- -- error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); -- if (error == 0) { -- error = dsl_dataset_destroy(ds, FTAG, defer); -- /* dsl_dataset_destroy() closes the ds. */ -+ /* You can't clone across pools. */ -+ if (pdd->dd_pool != dp) { -+ dsl_dir_rele(pdd, FTAG); -+ return (SET_ERROR(EXDEV)); - } -+ dsl_dir_rele(pdd, FTAG); - -- return (error); --} -- --struct snaparg { -- dsl_sync_task_group_t *dstg; -- char *snapname; -- char *htag; -- char failed[MAXPATHLEN]; -- boolean_t recursive; -- boolean_t needsuspend; -- boolean_t temporary; -- nvlist_t *props; -- struct dsl_ds_holdarg *ha; /* only needed in the temporary case */ -- dsl_dataset_t *newds; --}; -- --static int --snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- objset_t *os = arg1; -- struct snaparg *sn = arg2; -- int error; -- -- /* The props have already been checked by zfs_check_userprops(). */ -- -- error = dsl_dataset_snapshot_check(os->os_dsl_dataset, -- sn->snapname, tx); -- if (error) -+ error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); -+ if (error != 0) - return (error); - -- if (sn->temporary) { -- /* -- * Ideally we would just call -- * dsl_dataset_user_hold_check() and -- * dsl_dataset_destroy_check() here. However the -- * dataset we want to hold and destroy is the snapshot -- * that we just confirmed we can create, but it won't -- * exist until after these checks are run. Do any -- * checks we can here and if more checks are added to -- * those routines in the future, similar checks may be -- * necessary here. -- */ -- if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) -- return (ENOTSUP); -- /* -- * Not checking number of tags because the tag will be -- * unique, as it will be the only tag. -- */ -- if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) -- return (E2BIG); -+ /* You can't clone across pools. */ -+ if (origin->ds_dir->dd_pool != dp) { -+ dsl_dataset_rele(origin, FTAG); -+ return (SET_ERROR(EXDEV)); -+ } - -- sn->ha = kmem_alloc(sizeof(struct dsl_ds_holdarg), KM_PUSHPAGE); -- sn->ha->temphold = B_TRUE; -- sn->ha->htag = sn->htag; -+ /* You can only clone snapshots, not the head datasets. 
*/ -+ if (!dsl_dataset_is_snapshot(origin)) { -+ dsl_dataset_rele(origin, FTAG); -+ return (SET_ERROR(EINVAL)); - } -- return (error); -+ dsl_dataset_rele(origin, FTAG); -+ -+ return (0); - } -@@ -856,85 +871,38 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) - { -- objset_t *os = arg1; -- dsl_dataset_t *ds = os->os_dsl_dataset; -- struct snaparg *sn = arg2; -- -- dsl_dataset_snapshot_sync(ds, sn->snapname, tx); -- -- if (sn->props) { -- dsl_props_arg_t pa; -- pa.pa_props = sn->props; -- pa.pa_source = ZPROP_SRC_LOCAL; -- dsl_props_set_sync(ds->ds_prev, &pa, tx); -- } -+ dmu_objset_clone_arg_t *doca = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *pdd; -+ const char *tail; -+ dsl_dataset_t *origin, *ds; -+ uint64_t obj; -+ char namebuf[MAXNAMELEN]; - -- if (sn->temporary) { -- struct dsl_ds_destroyarg da; -+ VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); -+ VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); - -- dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx); -- kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg)); -- sn->ha = NULL; -- sn->newds = ds->ds_prev; -+ obj = dsl_dataset_create_sync(pdd, tail, origin, 0, -+ doca->doca_cred, tx); - -- da.ds = ds->ds_prev; -- da.defer = B_TRUE; -- dsl_dataset_destroy_sync(&da, FTAG, tx); -- } -+ VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); -+ dsl_dataset_name(origin, namebuf); -+ spa_history_log_internal_ds(ds, "clone", tx, -+ "origin=%s (%llu)", namebuf, origin->ds_object); -+ dsl_dataset_rele(ds, FTAG); -+ dsl_dataset_rele(origin, FTAG); -+ dsl_dir_rele(pdd, FTAG); - } - --static int --dmu_objset_snapshot_one(const char *name, void *arg) -+int -+dmu_objset_clone(const char *clone, const char *origin) - { -- struct snaparg *sn = arg; -- objset_t *os; -- int err; -- char *cp; -- -- /* -- * If the objset starts with a '%', then ignore it unless it was -- * explicitly named (ie, not recursive). These hidden datasets -- * are always inconsistent, and by not opening them here, we can -- * avoid a race with dsl_dir_destroy_check(). -- */ -- cp = strrchr(name, '/'); -- if (cp && cp[1] == '%' && sn->recursive) -- return (0); -- -- (void) strcpy(sn->failed, name); -+ dmu_objset_clone_arg_t doca; - -- /* -- * Check permissions if we are doing a recursive snapshot. The -- * permission checks for the starting dataset have already been -- * performed in zfs_secpolicy_snapshot() -- */ -- if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) -- return (err); -+ doca.doca_clone = clone; -+ doca.doca_origin = origin; -+ doca.doca_cred = CRED(); - -- err = dmu_objset_hold(name, sn, &os); -- if (err != 0) -- return (err); -- -- /* -- * If the objset is in an inconsistent state (eg, in the process -- * of being destroyed), don't snapshot it. As with %hidden -- * datasets, we return EBUSY if this name was explicitly -- * requested (ie, not recursive), and otherwise ignore it. -- */ -- if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { -- dmu_objset_rele(os, sn); -- return (sn->recursive ? 
0 : EBUSY); -- } -- -- if (sn->needsuspend) { -- err = zil_suspend(dmu_objset_zil(os)); -- if (err) { -- dmu_objset_rele(os, sn); -- return (err); -- } -- } -- dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync, -- os, sn, 3); -- -- return (0); -+ return (dsl_sync_task(clone, -+ dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5)); - } -@@ -942,72 +910,12 @@ dmu_objset_snapshot_one(const char *name, void *arg) - int --dmu_objset_snapshot(char *fsname, char *snapname, char *tag, -- nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd) -+dmu_objset_snapshot_one(const char *fsname, const char *snapname) - { -- dsl_sync_task_t *dst; -- struct snaparg *sn; -- spa_t *spa; -- minor_t minor; - int err; -+ char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); -+ nvlist_t *snaps = fnvlist_alloc(); - -- sn = kmem_alloc(sizeof (struct snaparg), KM_SLEEP); -- (void) strcpy(sn->failed, fsname); -- -- err = spa_open(fsname, &spa, FTAG); -- if (err) { -- kmem_free(sn, sizeof (struct snaparg)); -- return (err); -- } -- -- if (temporary) { -- if (cleanup_fd < 0) { -- spa_close(spa, FTAG); -- return (EINVAL); -- } -- if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { -- spa_close(spa, FTAG); -- return (err); -- } -- } -- -- sn->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- sn->snapname = snapname; -- sn->htag = tag; -- sn->props = props; -- sn->recursive = recursive; -- sn->needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); -- sn->temporary = temporary; -- sn->ha = NULL; -- sn->newds = NULL; -- -- if (recursive) { -- err = dmu_objset_find(fsname, -- dmu_objset_snapshot_one, sn, DS_FIND_CHILDREN); -- } else { -- err = dmu_objset_snapshot_one(fsname, sn); -- } -- -- if (err == 0) -- err = dsl_sync_task_group_wait(sn->dstg); -- -- for (dst = list_head(&sn->dstg->dstg_tasks); dst; -- dst = list_next(&sn->dstg->dstg_tasks, dst)) { -- objset_t *os = dst->dst_arg1; -- dsl_dataset_t *ds = os->os_dsl_dataset; -- if (dst->dst_err) { -- dsl_dataset_name(ds, sn->failed); -- } else if (temporary) { -- dsl_register_onexit_hold_cleanup(sn->newds, tag, minor); -- } -- if (sn->needsuspend) -- zil_resume(dmu_objset_zil(os)); -- dmu_objset_rele(os, sn); -- } -- -- if (err) -- (void) strcpy(fsname, sn->failed); -- if (temporary) -- zfs_onexit_fd_rele(cleanup_fd); -- dsl_sync_task_group_destroy(sn->dstg); -- spa_close(spa, FTAG); -- kmem_free(sn, sizeof (struct snaparg)); -+ fnvlist_add_boolean(snaps, longsnap); -+ strfree(longsnap); -+ err = dsl_dataset_snapshot(snaps, NULL, NULL); -+ fnvlist_free(snaps); - return (err); -@@ -1052,5 +960,5 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) - -- ASSERT(bp == os->os_rootbp); -- ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); -- ASSERT(BP_GET_LEVEL(bp) == 0); -+ ASSERT3P(bp, ==, os->os_rootbp); -+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); -+ ASSERT0(BP_GET_LEVEL(bp)); - -@@ -1126,3 +1034,3 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) - DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, -- dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, -+ NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED, &zb); -@@ -1161,4 +1069,4 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; -- while ((dr = list_head(list)) != NULL) { -- ASSERT(dr->dr_dbuf->db_level == 0); -+ while ((dr = list_head(list))) { -+ ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); -@@ -1221,3 +1129,3 @@ 
dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) - -- while ((dn = list_head(list)) != NULL) { -+ while ((dn = list_head(list))) { - int flags; -@@ -1324,3 +1232,4 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) - dmu_buf_impl_t *db = NULL; -- uint64_t *user = NULL, *group = NULL; -+ uint64_t *user = NULL; -+ uint64_t *group = NULL; - int flags = dn->dn_id_flags; -@@ -1435,5 +1344,5 @@ dmu_objset_userspace_upgrade(objset_t *os) - if (!dmu_objset_userused_enabled(os)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (dmu_objset_is_snapshot(os)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1453,6 +1362,6 @@ dmu_objset_userspace_upgrade(objset_t *os) - if (issig(JUSTLOOKING) && issig(FORREAL)) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - - objerr = dmu_bonus_hold(os, obj, FTAG, &db); -- if (objerr) -+ if (objerr != 0) - continue; -@@ -1461,3 +1370,3 @@ dmu_objset_userspace_upgrade(objset_t *os) - objerr = dmu_tx_assign(tx, TXG_WAIT); -- if (objerr) { -+ if (objerr != 0) { - dmu_tx_abort(tx); -@@ -1529,3 +1438,3 @@ dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, - if (ds->ds_phys->ds_snapnames_zapobj == 0) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -1544,4 +1453,6 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - -+ ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); -+ - if (ds->ds_phys->ds_snapnames_zapobj == 0) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -1553,3 +1464,3 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - zap_cursor_fini(&cursor); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1558,3 +1469,3 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - zap_cursor_fini(&cursor); -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - } -@@ -1576,3 +1487,3 @@ dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value) - { -- return dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value); -+ return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value)); - } -@@ -1590,3 +1501,3 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, - dd->dd_phys->dd_head_dataset_obj) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -1598,3 +1509,3 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, - zap_cursor_fini(&cursor); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1603,3 +1514,3 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, - zap_cursor_fini(&cursor); -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - } -@@ -1616,27 +1527,103 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, - --struct findarg { -- int (*func)(const char *, void *); -- void *arg; --}; -- --/* ARGSUSED */ --static int --findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) --{ -- struct findarg *fa = arg; -- return (fa->func(dsname, fa->arg)); --} -- - /* -- * Find all objsets under name, and for each, call 'func(child_name, arg)'. -- * Perhaps change all callers to use dmu_objset_find_spa()? -+ * Find objsets under and including ddobj, call func(ds) on each. 
- */ - int --dmu_objset_find(char *name, int func(const char *, void *), void *arg, -- int flags) -+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, -+ int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) - { -- struct findarg fa; -- fa.func = func; -- fa.arg = arg; -- return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); -+ dsl_dir_t *dd; -+ dsl_dataset_t *ds; -+ zap_cursor_t zc; -+ zap_attribute_t *attr; -+ uint64_t thisobj; -+ int err; -+ -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); -+ if (err != 0) -+ return (err); -+ -+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ -+ if (dd->dd_myname[0] == '$') { -+ dsl_dir_rele(dd, FTAG); -+ return (0); -+ } -+ -+ thisobj = dd->dd_phys->dd_head_dataset_obj; -+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); -+ -+ /* -+ * Iterate over all children. -+ */ -+ if (flags & DS_FIND_CHILDREN) { -+ for (zap_cursor_init(&zc, dp->dp_meta_objset, -+ dd->dd_phys->dd_child_dir_zapobj); -+ zap_cursor_retrieve(&zc, attr) == 0; -+ (void) zap_cursor_advance(&zc)) { -+ ASSERT3U(attr->za_integer_length, ==, -+ sizeof (uint64_t)); -+ ASSERT3U(attr->za_num_integers, ==, 1); -+ -+ err = dmu_objset_find_dp(dp, attr->za_first_integer, -+ func, arg, flags); -+ if (err != 0) -+ break; -+ } -+ zap_cursor_fini(&zc); -+ -+ if (err != 0) { -+ dsl_dir_rele(dd, FTAG); -+ kmem_free(attr, sizeof (zap_attribute_t)); -+ return (err); -+ } -+ } -+ -+ /* -+ * Iterate over all snapshots. -+ */ -+ if (flags & DS_FIND_SNAPSHOTS) { -+ dsl_dataset_t *ds; -+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); -+ -+ if (err == 0) { -+ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; -+ dsl_dataset_rele(ds, FTAG); -+ -+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); -+ zap_cursor_retrieve(&zc, attr) == 0; -+ (void) zap_cursor_advance(&zc)) { -+ ASSERT3U(attr->za_integer_length, ==, -+ sizeof (uint64_t)); -+ ASSERT3U(attr->za_num_integers, ==, 1); -+ -+ err = dsl_dataset_hold_obj(dp, -+ attr->za_first_integer, FTAG, &ds); -+ if (err != 0) -+ break; -+ err = func(dp, ds, arg); -+ dsl_dataset_rele(ds, FTAG); -+ if (err != 0) -+ break; -+ } -+ zap_cursor_fini(&zc); -+ } -+ } -+ -+ dsl_dir_rele(dd, FTAG); -+ kmem_free(attr, sizeof (zap_attribute_t)); -+ -+ if (err != 0) -+ return (err); -+ -+ /* -+ * Apply to self. -+ */ -+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); -+ if (err != 0) -+ return (err); -+ err = func(dp, ds, arg); -+ dsl_dataset_rele(ds, FTAG); -+ return (err); - } -@@ -1644,10 +1631,14 @@ dmu_objset_find(char *name, int func(const char *, void *), void *arg, - /* -- * Find all objsets under name, call func on each -+ * Find all objsets under name, and for each, call 'func(child_name, arg)'. -+ * The dp_config_rwlock must not be held when this is called, and it -+ * will not be held when the callback is called. -+ * Therefore this function should only be used when the pool is not changing -+ * (e.g. in syncing context), or the callback can deal with the possible races. 
- */ --int --dmu_objset_find_spa(spa_t *spa, const char *name, -- int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) -+static int -+dmu_objset_find_impl(spa_t *spa, const char *name, -+ int func(const char *, void *), void *arg, int flags) - { - dsl_dir_t *dd; -- dsl_pool_t *dp; -+ dsl_pool_t *dp = spa_get_dsl(spa); - dsl_dataset_t *ds; -@@ -1659,7 +1650,9 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - -- if (name == NULL) -- name = spa_name(spa); -- err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); -- if (err) -+ dsl_pool_config_enter(dp, FTAG); -+ -+ err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); -+ if (err != 0) { -+ dsl_pool_config_exit(dp, FTAG); - return (err); -+ } - -@@ -1667,3 +1660,4 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - if (dd->dd_myname[0] == '$') { -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ dsl_pool_config_exit(dp, FTAG); - return (0); -@@ -1673,3 +1667,2 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - attr = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); -- dp = dd->dd_pool; - -@@ -1683,9 +1676,13 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - (void) zap_cursor_advance(&zc)) { -- ASSERT(attr->za_integer_length == sizeof (uint64_t)); -- ASSERT(attr->za_num_integers == 1); -+ ASSERT3U(attr->za_integer_length, ==, -+ sizeof (uint64_t)); -+ ASSERT3U(attr->za_num_integers, ==, 1); - - child = kmem_asprintf("%s/%s", name, attr->za_name); -- err = dmu_objset_find_spa(spa, child, func, arg, flags); -+ dsl_pool_config_exit(dp, FTAG); -+ err = dmu_objset_find_impl(spa, child, -+ func, arg, flags); -+ dsl_pool_config_enter(dp, FTAG); - strfree(child); -- if (err) -+ if (err != 0) - break; -@@ -1694,4 +1691,5 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - -- if (err) { -- dsl_dir_close(dd, FTAG); -+ if (err != 0) { -+ dsl_dir_rele(dd, FTAG); -+ dsl_pool_config_exit(dp, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); -@@ -1705,7 +1703,3 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - if (flags & DS_FIND_SNAPSHOTS) { -- if (!dsl_pool_sync_context(dp)) -- rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); -- if (!dsl_pool_sync_context(dp)) -- rw_exit(&dp->dp_config_rwlock); - -@@ -1718,5 +1712,5 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - (void) zap_cursor_advance(&zc)) { -- ASSERT(attr->za_integer_length == -+ ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); -- ASSERT(attr->za_num_integers == 1); -+ ASSERT3U(attr->za_num_integers, ==, 1); - -@@ -1724,6 +1718,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - name, attr->za_name); -- err = func(spa, attr->za_first_integer, -- child, arg); -+ dsl_pool_config_exit(dp, FTAG); -+ err = func(child, arg); -+ dsl_pool_config_enter(dp, FTAG); - strfree(child); -- if (err) -+ if (err != 0) - break; -@@ -1734,44 +1729,29 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); -+ dsl_pool_config_exit(dp, FTAG); - -- if (err) -+ if (err != 0) - return (err); - -- /* -- * Apply to self if appropriate. -- */ -- err = func(spa, thisobj, name, arg); -- return (err); -+ /* Apply to self. */ -+ return (func(name, arg)); - } - --/* ARGSUSED */ -+/* -+ * See comment above dmu_objset_find_impl(). 
-+ */ - int --dmu_objset_prefetch(const char *name, void *arg) -+dmu_objset_find(char *name, int func(const char *, void *), void *arg, -+ int flags) - { -- dsl_dataset_t *ds; -- -- if (dsl_dataset_hold(name, FTAG, &ds)) -- return (0); -- -- if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { -- mutex_enter(&ds->ds_opening_lock); -- if (ds->ds_objset == NULL) { -- uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; -- zbookmark_t zb; -- -- SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, -- ZB_ROOT_LEVEL, ZB_ROOT_BLKID); -- -- (void) arc_read(NULL, dsl_dataset_get_spa(ds), -- &ds->ds_phys->ds_bp, NULL, NULL, -- ZIO_PRIORITY_ASYNC_READ, -- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, -- &aflags, &zb); -- } -- mutex_exit(&ds->ds_opening_lock); -- } -+ spa_t *spa; -+ int error; - -- dsl_dataset_rele(ds, FTAG); -- return (0); -+ error = spa_open(name, &spa, FTAG); -+ if (error != 0) -+ return (error); -+ error = dmu_objset_find_impl(spa, name, func, arg, flags); -+ spa_close(spa, FTAG); -+ return (error); - } -@@ -1792,2 +1772,18 @@ dmu_objset_get_user(objset_t *os) - -+/* -+ * Determine name of filesystem, given name of snapshot. -+ * buf must be at least MAXNAMELEN bytes -+ */ -+int -+dmu_fsname(const char *snapname, char *buf) -+{ -+ char *atp = strchr(snapname, '@'); -+ if (atp == NULL) -+ return (SET_ERROR(EINVAL)); -+ if (atp - snapname >= MAXNAMELEN) -+ return (SET_ERROR(ENAMETOOLONG)); -+ (void) strlcpy(buf, snapname, atp - snapname + 1); -+ return (0); -+} -+ - #if defined(_KERNEL) && defined(HAVE_SPL) -@@ -1805,4 +1801,2 @@ EXPORT_SYMBOL(dmu_objset_create); - EXPORT_SYMBOL(dmu_objset_clone); --EXPORT_SYMBOL(dmu_objset_destroy); --EXPORT_SYMBOL(dmu_objset_snapshot); - EXPORT_SYMBOL(dmu_objset_stats); -@@ -1813,4 +1807,2 @@ EXPORT_SYMBOL(dmu_objset_fsid_guid); - EXPORT_SYMBOL(dmu_objset_find); --EXPORT_SYMBOL(dmu_objset_find_spa); --EXPORT_SYMBOL(dmu_objset_prefetch); - EXPORT_SYMBOL(dmu_objset_byteswap); -diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c -index 54e7597..9264fbb 100644 ---- a/module/zfs/dmu_send.c -+++ b/module/zfs/dmu_send.c -@@ -24,4 +24,4 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -50,2 +50,4 @@ - #include -+#include -+#include - -@@ -55,2 +57,3 @@ int zfs_send_corrupt_data = B_FALSE; - static char *dmu_recv_tag = "dmu_recv_tag"; -+static const char *recv_clone_name = "%recv"; - -@@ -108,2 +111,28 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - -+ /* -+ * When we receive a free record, dbuf_free_range() assumes -+ * that the receiving system doesn't have any dbufs in the range -+ * being freed. This is always true because there is a one-record -+ * constraint: we only send one WRITE record for any given -+ * object+offset. We know that the one-record constraint is -+ * true because we always send data in increasing order by -+ * object,offset. -+ * -+ * If the increasing-order constraint ever changes, we should find -+ * another way to assert that the one-record constraint is still -+ * satisfied. -+ */ -+ ASSERT(object > dsp->dsa_last_data_object || -+ (object == dsp->dsa_last_data_object && -+ offset > dsp->dsa_last_data_offset)); -+ -+ /* -+ * If we are doing a non-incremental send, then there can't -+ * be any data in the dataset we're receiving into. Therefore -+ * a free record would simply be a no-op. Save space by not -+ * sending it to begin with. 
-+ */ -+ if (!dsp->dsa_incremental) -+ return (0); -+ - if (length != -1ULL && offset + length < offset) -@@ -122,3 +151,3 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -146,3 +175,3 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -160,3 +189,3 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - } else { -@@ -174,2 +203,11 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, - -+ /* -+ * We send data in increasing object, offset order. -+ * See comment in dump_free() for details. -+ */ -+ ASSERT(object > dsp->dsa_last_data_object || -+ (object == dsp->dsa_last_data_object && -+ offset > dsp->dsa_last_data_offset)); -+ dsp->dsa_last_data_object = object; -+ dsp->dsa_last_data_offset = offset + blksz - 1; - -@@ -184,3 +222,3 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -204,5 +242,5 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - if (dump_bytes(dsp, data, blksz) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - return (0); -@@ -218,3 +256,3 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -230,5 +268,5 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - if (dump_bytes(dsp, data, blksz)) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - return (0); -@@ -241,2 +279,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) - -+ /* See comment in dump_free(). */ -+ if (!dsp->dsa_incremental) -+ return (0); -+ - /* -@@ -252,3 +294,3 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -267,3 +309,3 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -295,3 +337,3 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -312,13 +354,13 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - - if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - -- /* free anything past the end of the file */ -+ /* Free anything past the end of the file. 
*/ - if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * -- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) -- return (EINTR); -- if (dsp->dsa_err) -- return (EINTR); -+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) -+ return (SET_ERROR(EINTR)); -+ if (dsp->dsa_err != 0) -+ return (SET_ERROR(EINTR)); - return (0); -@@ -340,3 +382,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - if (issig(JUSTLOOKING) && issig(FORREAL)) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - -@@ -364,3 +406,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - &aflags, zb) != 0) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -371,3 +413,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - err = dump_dnode(dsp, dnobj, blk+i); -- if (err) -+ if (err != 0) - break; -@@ -383,3 +425,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - &aflags, zb) != 0) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -405,3 +447,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - } else { -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -418,8 +460,10 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - --int --dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, -- int outfd, vnode_t *vp, offset_t *off) -+/* -+ * Releases dp, ds, and fromds, using the specified tag. -+ */ -+static int -+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, -+ dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off) - { -- dsl_dataset_t *ds = tosnap->os_dsl_dataset; -- dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; -+ objset_t *os; - dmu_replay_record_t *drr; -@@ -429,29 +473,17 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - -- /* tosnap must be a snapshot */ -- if (ds->ds_phys->ds_next_snap_obj == 0) -- return (EINVAL); -- -- /* fromsnap must be an earlier snapshot from the same fs as tosnap */ -- if (fromds && (ds->ds_dir != fromds->ds_dir || -- fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) -- return (EXDEV); -- -- if (fromorigin) { -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- -- if (fromsnap) -- return (EINVAL); -- -- if (dsl_dir_is_clone(ds->ds_dir)) { -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dataset_hold_obj(dp, -- ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); -- rw_exit(&dp->dp_config_rwlock); -- if (err) -- return (err); -- } else { -- fromorigin = B_FALSE; -- } -+ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) { -+ dsl_dataset_rele(fromds, tag); -+ dsl_dataset_rele(ds, tag); -+ dsl_pool_rele(dp, tag); -+ return (SET_ERROR(EXDEV)); - } - -+ err = dmu_objset_from_ds(ds, &os); -+ if (err != 0) { -+ if (fromds != NULL) -+ dsl_dataset_rele(fromds, tag); -+ dsl_dataset_rele(ds, tag); -+ dsl_pool_rele(dp, tag); -+ return (err); -+ } - -@@ -464,9 +496,13 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - #ifdef _KERNEL -- if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { -+ if (dmu_objset_type(os) == DMU_OST_ZFS) { - uint64_t version; -- if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) { -+ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); -- return (EINVAL); -+ if (fromds != NULL) -+ dsl_dataset_rele(fromds, tag); -+ dsl_dataset_rele(ds, tag); -+ dsl_pool_rele(dp, tag); -+ return (SET_ERROR(EINVAL)); - } -- if (version == ZPL_VERSION_SA) { -+ if (version >= ZPL_VERSION_SA) { - DMU_SET_FEATUREFLAGS( -@@ -480,4 +516,4 @@ 
dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - ds->ds_phys->ds_creation_time; -- drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; -- if (fromorigin) -+ drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); -+ if (fromds != NULL && ds->ds_dir != fromds->ds_dir) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; -@@ -487,3 +523,3 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - -- if (fromds) -+ if (fromds != NULL) - drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; -@@ -491,6 +527,7 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - -- if (fromds) -+ if (fromds != NULL) { - fromtxg = fromds->ds_phys->ds_creation_txg; -- if (fromorigin) -- dsl_dataset_rele(fromds, FTAG); -+ dsl_dataset_rele(fromds, tag); -+ fromds = NULL; -+ } - -@@ -502,3 +539,3 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - dsp->dsa_proc = curproc; -- dsp->dsa_os = tosnap; -+ dsp->dsa_os = os; - dsp->dsa_off = off; -@@ -507,2 +544,3 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - dsp->dsa_pending_op = PENDING_NONE; -+ dsp->dsa_incremental = (fromtxg != 0); - -@@ -512,2 +550,5 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - -+ dsl_dataset_long_hold(ds, FTAG); -+ dsl_pool_rele(dp, tag); -+ - if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { -@@ -522,6 +563,6 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) -- err = EINTR; -+ err = SET_ERROR(EINTR); - -- if (err) { -- if (err == EINTR && dsp->dsa_err) -+ if (err != 0) { -+ if (err == EINTR && dsp->dsa_err != 0) - err = dsp->dsa_err; -@@ -548,2 +589,5 @@ out: - -+ dsl_dataset_long_rele(ds, FTAG); -+ dsl_dataset_rele(ds, tag); -+ - return (err); -@@ -552,35 +596,86 @@ out: - int --dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, -- uint64_t *sizep) -+dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, -+ int outfd, vnode_t *vp, offset_t *off) - { -- dsl_dataset_t *ds = tosnap->os_dsl_dataset; -- dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds; -+ dsl_dataset_t *fromds = NULL; - int err; -- uint64_t size, recordsize; - -- /* tosnap must be a snapshot */ -- if (ds->ds_phys->ds_next_snap_obj == 0) -- return (EINVAL); -- -- /* fromsnap must be an earlier snapshot from the same fs as tosnap */ -- if (fromds && (ds->ds_dir != fromds->ds_dir || -- fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) -- return (EXDEV); -- -- if (fromorigin) { -- if (fromsnap) -- return (EINVAL); -- -- if (dsl_dir_is_clone(ds->ds_dir)) { -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dataset_hold_obj(dp, -- ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); -- rw_exit(&dp->dp_config_rwlock); -- if (err) -- return (err); -- } else { -- fromorigin = B_FALSE; -+ err = dsl_pool_hold(pool, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ -+ err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (err); -+ } -+ -+ if (fromsnap != 0) { -+ err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); -+ if (err != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (err); -+ } -+ } -+ -+ return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off)); -+} -+ -+int -+dmu_send(const char *tosnap, const char *fromsnap, -+ int outfd, vnode_t *vp, offset_t *off) -+{ -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds; -+ dsl_dataset_t *fromds = NULL; -+ int err; -+ -+ if (strchr(tosnap, '@') == NULL) -+ return (SET_ERROR(EINVAL)); -+ if (fromsnap != NULL && strchr(fromsnap, '@') == NULL) -+ return (SET_ERROR(EINVAL)); -+ -+ err = dsl_pool_hold(tosnap, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ -+ err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (err); -+ } -+ -+ if (fromsnap != NULL) { -+ err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); -+ if (err != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (err); - } - } -+ return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off)); -+} -+ -+int -+dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) -+{ -+ int err; -+ uint64_t size, recordsize; -+ ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); -+ -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ /* tosnap must be a snapshot */ -+ if (!dsl_dataset_is_snapshot(ds)) -+ return (SET_ERROR(EINVAL)); -+ -+ /* -+ * fromsnap must be an earlier snapshot from the same fs as tosnap, -+ * or the origin's fs. 
-+ */ -+ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) -+ return (SET_ERROR(EXDEV)); - -@@ -593,5 +688,3 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - &used, &comp, &size); -- if (fromorigin) -- dsl_dataset_rele(fromds, FTAG); -- if (err) -+ if (err != 0) - return (err); -@@ -614,7 +707,4 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - */ -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_prop_get_ds(ds, "recordsize", -- sizeof (recordsize), 1, &recordsize, NULL); -- rw_exit(&dp->dp_config_rwlock); -- if (err) -+ err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); -+ if (err != 0) - return (err); -@@ -630,40 +720,73 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - --struct recvbeginsyncarg { -- const char *tofs; -- const char *tosnap; -- dsl_dataset_t *origin; -- uint64_t fromguid; -- dmu_objset_type_t type; -- void *tag; -- boolean_t force; -- uint64_t dsflags; -- char clonelastname[MAXNAMELEN]; -- dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ -- cred_t *cr; --}; -+typedef struct dmu_recv_begin_arg { -+ const char *drba_origin; -+ dmu_recv_cookie_t *drba_cookie; -+ cred_t *drba_cred; -+ uint64_t drba_snapobj; -+} dmu_recv_begin_arg_t; - --/* ARGSUSED */ - static int --recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) -+recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, -+ uint64_t fromguid) - { -- dsl_dir_t *dd = arg1; -- struct recvbeginsyncarg *rbsa = arg2; -- objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val; -- int err; -+ int error; -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ -+ /* temporary clone name must not exist */ -+ error = zap_lookup(dp->dp_meta_objset, -+ ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name, -+ 8, 1, &val); -+ if (error != ENOENT) -+ return (error == 0 ? EBUSY : error); - -- err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, -- strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); -+ /* new snapshot name must not exist */ -+ error = zap_lookup(dp->dp_meta_objset, -+ ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, -+ 8, 1, &val); -+ if (error != ENOENT) -+ return (error == 0 ? EEXIST : error); -+ -+ if (fromguid != 0) { -+ dsl_dataset_t *snap; -+ uint64_t obj = ds->ds_phys->ds_prev_snap_obj; -+ -+ /* Find snapshot in this dir that matches fromguid. */ -+ while (obj != 0) { -+ error = dsl_dataset_hold_obj(dp, obj, FTAG, -+ &snap); -+ if (error != 0) -+ return (SET_ERROR(ENODEV)); -+ if (snap->ds_dir != ds->ds_dir) { -+ dsl_dataset_rele(snap, FTAG); -+ return (SET_ERROR(ENODEV)); -+ } -+ if (snap->ds_phys->ds_guid == fromguid) -+ break; -+ obj = snap->ds_phys->ds_prev_snap_obj; -+ dsl_dataset_rele(snap, FTAG); -+ } -+ if (obj == 0) -+ return (SET_ERROR(ENODEV)); - -- if (err != ENOENT) -- return (err ? err : EEXIST); -+ if (drba->drba_cookie->drc_force) { -+ drba->drba_snapobj = obj; -+ } else { -+ /* -+ * If we are not forcing, there must be no -+ * changes since fromsnap. 
-+ */ -+ if (dsl_dataset_modified_since_snap(ds, snap)) { -+ dsl_dataset_rele(snap, FTAG); -+ return (SET_ERROR(ETXTBSY)); -+ } -+ drba->drba_snapobj = ds->ds_prev->ds_object; -+ } - -- if (rbsa->origin) { -- /* make sure it's a snap in the same pool */ -- if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) -- return (EXDEV); -- if (!dsl_dataset_is_snapshot(rbsa->origin)) -- return (EINVAL); -- if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) -- return (ENODEV); -+ dsl_dataset_rele(snap, FTAG); -+ } else { -+ /* if full, most recent snapshot must be $ORIGIN */ -+ if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) -+ return (SET_ERROR(ENODEV)); -+ drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj; - } -@@ -671,113 +794,141 @@ recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) - return (0); -+ - } - --static void --recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+static int -+dmu_recv_begin_check(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- struct recvbeginsyncarg *rbsa = arg2; -- uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; -- uint64_t dsobj; -- -- /* Create and open new dataset. */ -- dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, -- rbsa->origin, flags, rbsa->cr, tx); -- VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, -- B_TRUE, dmu_recv_tag, &rbsa->ds)); -+ dmu_recv_begin_arg_t *drba = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb; -+ uint64_t fromguid = drrb->drr_fromguid; -+ int flags = drrb->drr_flags; -+ int error; -+ dsl_dataset_t *ds; -+ const char *tofs = drba->drba_cookie->drc_tofs; - -- if (rbsa->origin == NULL) { -- (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, -- rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); -- } -+ /* already checked */ -+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - -- spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, -- dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); --} -+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == -+ DMU_COMPOUNDSTREAM || -+ drrb->drr_type >= DMU_OST_NUMTYPES || -+ ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) -+ return (SET_ERROR(EINVAL)); - --/* ARGSUSED */ --static int --recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- struct recvbeginsyncarg *rbsa = arg2; -- int err; -- uint64_t val; -+ /* Verify pool version supports SA if SA_SPILL feature set */ -+ if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & -+ DMU_BACKUP_FEATURE_SA_SPILL) && -+ spa_version(dp->dp_spa) < SPA_VERSION_SA) { -+ return (SET_ERROR(ENOTSUP)); -+ } - -- /* must not have any changes since most recent snapshot */ -- if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) -- return (ETXTBSY); -+ error = dsl_dataset_hold(dp, tofs, FTAG, &ds); -+ if (error == 0) { -+ /* target fs already exists; recv into temp clone */ - -- /* new snapshot name must not exist */ -- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, -- ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); -- if (err == 0) -- return (EEXIST); -- if (err != ENOENT) -- return (err); -+ /* Can't recv a clone into an existing fs */ -+ if (flags & DRR_FLAG_CLONE) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } - -- if (rbsa->fromguid) { -- /* if incremental, most recent snapshot must match fromguid */ -- if (ds->ds_prev == NULL) -- return (ENODEV); -+ error = recv_begin_check_existing_impl(drba, ds, fromguid); -+ dsl_dataset_rele(ds, FTAG); -+ } else if (error == ENOENT) { -+ /* 
target fs does not exist; must be a full backup or clone */ -+ char buf[MAXNAMELEN]; - - /* -- * most recent snapshot must match fromguid, or there are no -- * changes since the fromguid one -+ * If it's a non-clone incremental, we are missing the -+ * target fs, so fail the recv. - */ -- if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { -- uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; -- uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; -- while (obj != 0) { -- dsl_dataset_t *snap; -- err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -- obj, FTAG, &snap); -- if (err) -- return (ENODEV); -- if (snap->ds_phys->ds_creation_txg < birth) { -- dsl_dataset_rele(snap, FTAG); -- return (ENODEV); -- } -- if (snap->ds_phys->ds_guid == rbsa->fromguid) { -- dsl_dataset_rele(snap, FTAG); -- break; /* it's ok */ -- } -- obj = snap->ds_phys->ds_prev_snap_obj; -- dsl_dataset_rele(snap, FTAG); -+ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) -+ return (SET_ERROR(ENOENT)); -+ -+ /* Open the parent of tofs */ -+ ASSERT3U(strlen(tofs), <, MAXNAMELEN); -+ (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); -+ error = dsl_dataset_hold(dp, buf, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ -+ if (drba->drba_origin != NULL) { -+ dsl_dataset_t *origin; -+ error = dsl_dataset_hold(dp, drba->drba_origin, -+ FTAG, &origin); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } -+ if (!dsl_dataset_is_snapshot(origin)) { -+ dsl_dataset_rele(origin, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } -+ if (origin->ds_phys->ds_guid != fromguid) { -+ dsl_dataset_rele(origin, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENODEV)); - } -- if (obj == 0) -- return (ENODEV); -+ dsl_dataset_rele(origin, FTAG); - } -- } else { -- /* if full, most recent snapshot must be $ORIGIN */ -- if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) -- return (ENODEV); -+ dsl_dataset_rele(ds, FTAG); -+ error = 0; - } -- -- /* temporary clone name must not exist */ -- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, -- ds->ds_dir->dd_phys->dd_child_dir_zapobj, -- rbsa->clonelastname, 8, 1, &val); -- if (err == 0) -- return (EEXIST); -- if (err != ENOENT) -- return (err); -- -- return (0); -+ return (error); - } - --/* ARGSUSED */ - static void --recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ohds = arg1; -- struct recvbeginsyncarg *rbsa = arg2; -- dsl_pool_t *dp = ohds->ds_dir->dd_pool; -- dsl_dataset_t *cds; -- uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; -+ dmu_recv_begin_arg_t *drba = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb; -+ const char *tofs = drba->drba_cookie->drc_tofs; -+ dsl_dataset_t *ds, *newds; - uint64_t dsobj; -+ int error; -+ uint64_t crflags; -+ -+ crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 
-+ DS_FLAG_CI_DATASET : 0; -+ -+ error = dsl_dataset_hold(dp, tofs, FTAG, &ds); -+ if (error == 0) { -+ /* create temporary clone */ -+ dsl_dataset_t *snap = NULL; -+ if (drba->drba_snapobj != 0) { -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ drba->drba_snapobj, FTAG, &snap)); -+ } -+ dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, -+ snap, crflags, drba->drba_cred, tx); -+ dsl_dataset_rele(snap, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+ } else { -+ dsl_dir_t *dd; -+ const char *tail; -+ dsl_dataset_t *origin = NULL; -+ -+ VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); -+ -+ if (drba->drba_origin != NULL) { -+ VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, -+ FTAG, &origin)); -+ } -+ -+ /* Create new dataset. */ -+ dsobj = dsl_dataset_create_sync(dd, -+ strrchr(tofs, '/') + 1, -+ origin, crflags, drba->drba_cred, tx); -+ if (origin != NULL) -+ dsl_dataset_rele(origin, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ drba->drba_cookie->drc_newfs = B_TRUE; -+ } -+ VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); - -- /* create and open the temporary clone */ -- dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, -- ohds->ds_prev, flags, rbsa->cr, tx); -- VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); -+ dmu_buf_will_dirty(newds->ds_dbuf, tx); -+ newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - -@@ -787,23 +938,10 @@ recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { -+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { - (void) dmu_objset_create_impl(dp->dp_spa, -- cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); -+ newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); - } - -- rbsa->ds = cds; -+ drba->drba_cookie->drc_ds = newds; - -- spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, -- dp->dp_spa, tx, "dataset = %lld", dsobj); --} -- --static boolean_t --dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) --{ -- int featureflags; -- -- featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); -- -- /* Verify pool version supports SA if SA_SPILL feature set */ -- return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && -- (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); -+ spa_history_log_internal_ds(newds, "receive", tx, ""); - } -@@ -815,44 +953,7 @@ dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) - int --dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, -- boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) -+dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, -+ boolean_t force, char *origin, dmu_recv_cookie_t *drc) - { -- int err = 0; -- boolean_t byteswap; -- struct recvbeginsyncarg rbsa = { 0 }; -- uint64_t versioninfo; -- int flags; -- dsl_dataset_t *ds; -- -- if (drrb->drr_magic == DMU_BACKUP_MAGIC) -- byteswap = FALSE; -- else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) -- byteswap = TRUE; -- else -- return (EINVAL); -- -- rbsa.tofs = tofs; -- rbsa.tosnap = tosnap; -- rbsa.origin = origin ? 
origin->os_dsl_dataset : NULL; -- rbsa.fromguid = drrb->drr_fromguid; -- rbsa.type = drrb->drr_type; -- rbsa.tag = FTAG; -- rbsa.dsflags = 0; -- rbsa.cr = CRED(); -- versioninfo = drrb->drr_versioninfo; -- flags = drrb->drr_flags; -- -- if (byteswap) { -- rbsa.type = BSWAP_32(rbsa.type); -- rbsa.fromguid = BSWAP_64(rbsa.fromguid); -- versioninfo = BSWAP_64(versioninfo); -- flags = BSWAP_32(flags); -- } -- -- if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || -- rbsa.type >= DMU_OST_NUMTYPES || -- ((flags & DRR_FLAG_CLONE) && origin == NULL)) -- return (EINVAL); -- -- if (flags & DRR_FLAG_CI_DATA) -- rbsa.dsflags = DS_FLAG_CI_DATASET; -+ dmu_recv_begin_arg_t drba = { 0 }; -+ dmu_replay_record_t *drr; - -@@ -861,77 +962,37 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, - drc->drc_tosnap = tosnap; -- drc->drc_top_ds = top_ds; -+ drc->drc_tofs = tofs; - drc->drc_force = force; - -- /* -- * Process the begin in syncing context. -- */ -- -- /* open the dataset we are logically receiving into */ -- err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); -- if (err == 0) { -- if (dmu_recv_verify_features(ds, drrb)) { -- dsl_dataset_rele(ds, dmu_recv_tag); -- return (ENOTSUP); -- } -- /* target fs already exists; recv into temp clone */ -- -- /* Can't recv a clone into an existing fs */ -- if (flags & DRR_FLAG_CLONE) { -- dsl_dataset_rele(ds, dmu_recv_tag); -- return (EINVAL); -- } -- -- /* must not have an incremental recv already in progress */ -- if (!mutex_tryenter(&ds->ds_recvlock)) { -- dsl_dataset_rele(ds, dmu_recv_tag); -- return (EBUSY); -- } -- -- /* tmp clone name is: tofs/%tosnap" */ -- (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), -- "%%%s", tosnap); -- rbsa.force = force; -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- recv_existing_check, recv_existing_sync, ds, &rbsa, 5); -- if (err) { -- mutex_exit(&ds->ds_recvlock); -- dsl_dataset_rele(ds, dmu_recv_tag); -- return (err); -- } -- drc->drc_logical_ds = ds; -- drc->drc_real_ds = rbsa.ds; -- } else if (err == ENOENT) { -- /* target fs does not exist; must be a full backup or clone */ -- char *cp; -- -- /* -- * If it's a non-clone incremental, we are missing the -- * target fs, so fail the recv. 
-- */ -- if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) -- return (ENOENT); -- -- /* Open the parent of tofs */ -- cp = strrchr(tofs, '/'); -- *cp = '\0'; -- err = dsl_dataset_hold(tofs, FTAG, &ds); -- *cp = '/'; -- if (err) -- return (err); -+ if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) -+ drc->drc_byteswap = B_TRUE; -+ else if (drrb->drr_magic != DMU_BACKUP_MAGIC) -+ return (SET_ERROR(EINVAL)); - -- if (dmu_recv_verify_features(ds, drrb)) { -- dsl_dataset_rele(ds, FTAG); -- return (ENOTSUP); -- } -+ drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); -+ drr->drr_type = DRR_BEGIN; -+ drr->drr_u.drr_begin = *drc->drc_drrb; -+ if (drc->drc_byteswap) { -+ fletcher_4_incremental_byteswap(drr, -+ sizeof (dmu_replay_record_t), &drc->drc_cksum); -+ } else { -+ fletcher_4_incremental_native(drr, -+ sizeof (dmu_replay_record_t), &drc->drc_cksum); -+ } -+ kmem_free(drr, sizeof (dmu_replay_record_t)); - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); -- dsl_dataset_rele(ds, FTAG); -- if (err) -- return (err); -- drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; -- drc->drc_newfs = B_TRUE; -+ if (drc->drc_byteswap) { -+ drrb->drr_magic = BSWAP_64(drrb->drr_magic); -+ drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); -+ drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); -+ drrb->drr_type = BSWAP_32(drrb->drr_type); -+ drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); -+ drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); - } - -- return (err); -+ drba.drba_origin = origin; -+ drba.drba_cookie = drc; -+ drba.drba_cred = CRED(); -+ -+ return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, -+ &drba, 5)); - } -@@ -940,3 +1001,3 @@ struct restorearg { - int err; -- int byteswap; -+ boolean_t byteswap; - vnode_t *vp; -@@ -976,3 +1037,4 @@ free_guid_map_onexit(void *arg) - while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { -- dsl_dataset_rele(gmep->gme_ds, ca); -+ dsl_dataset_long_rele(gmep->gme_ds, gmep); -+ dsl_dataset_rele(gmep->gme_ds, gmep); - kmem_free(gmep, sizeof (guid_map_entry_t)); -@@ -1001,6 +1063,6 @@ restore_read(struct restorearg *ra, int len) - if (resid == len - done) -- ra->err = EINVAL; -+ ra->err = SET_ERROR(EINVAL); - ra->voff += len - done - resid; - done = len - resid; -- if (ra->err) -+ if (ra->err != 0) - return (NULL); -@@ -1115,3 +1177,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - drro->drr_bonuslen > DN_MAX_BONUSLEN) { -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1121,3 +1183,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - if (err != 0 && err != ENOENT) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1125,3 +1187,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); -- if (ra->err) -+ if (ra->err != 0) - return (ra->err); -@@ -1134,3 +1196,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_tx_abort(tx); -@@ -1148,4 +1210,4 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - } -- if (err) { -- return (EINVAL); -+ if (err != 0) { -+ return (SET_ERROR(EINVAL)); - } -@@ -1155,3 +1217,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_tx_abort(tx); -@@ 
-1192,3 +1254,3 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, - if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1202,4 +1264,4 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, - -- err = dmu_free_object(os, obj); -- if (err) -+ err = dmu_free_long_object(os, obj); -+ if (err != 0) - return (err); -@@ -1219,3 +1281,3 @@ restore_write(struct restorearg *ra, objset_t *os, - !DMU_OT_IS_VALID(drrw->drr_type)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1226,3 +1288,3 @@ restore_write(struct restorearg *ra, objset_t *os, - if (dmu_object_info(os, drrw->drr_object, NULL) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1233,3 +1295,3 @@ restore_write(struct restorearg *ra, objset_t *os, - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_tx_abort(tx); -@@ -1268,3 +1330,3 @@ restore_write_byref(struct restorearg *ra, objset_t *os, - if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1278,6 +1340,6 @@ restore_write_byref(struct restorearg *ra, objset_t *os, - &where)) == NULL) { -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } - if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else { -@@ -1296,3 +1358,3 @@ restore_write_byref(struct restorearg *ra, objset_t *os, - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_tx_abort(tx); -@@ -1317,3 +1379,3 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) - drrs->drr_length > SPA_MAXBLOCKSIZE) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1324,3 +1386,3 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) - if (dmu_object_info(os, drrs->drr_object, NULL) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1337,3 +1399,3 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_buf_rele(db, FTAG); -@@ -1366,6 +1428,6 @@ restore_free(struct restorearg *ra, objset_t *os, - drrf->drr_offset + drrf->drr_length < drrf->drr_offset) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (dmu_object_info(os, drrf->drr_object, NULL) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1376,2 +1438,12 @@ restore_free(struct restorearg *ra, objset_t *os, - -+/* used to destroy the drc_ds on error */ -+static void -+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) -+{ -+ char name[MAXNAMELEN]; -+ dsl_dataset_name(drc->drc_ds, name); -+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); -+ (void) dsl_destroy_head(name); -+} -+ - /* -@@ -1389,32 +1461,4 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - -- if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) -- ra.byteswap = TRUE; -- -- { -- /* compute checksum of drr_begin record */ -- dmu_replay_record_t *drr; -- drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); -- -- drr->drr_type = DRR_BEGIN; -- drr->drr_u.drr_begin = *drc->drc_drrb; -- if (ra.byteswap) { -- fletcher_4_incremental_byteswap(drr, -- sizeof (dmu_replay_record_t), &ra.cksum); -- } else { -- fletcher_4_incremental_native(drr, -- sizeof (dmu_replay_record_t), &ra.cksum); -- } -- kmem_free(drr, sizeof (dmu_replay_record_t)); -- } -- -- if (ra.byteswap) { -- struct drr_begin *drrb = drc->drc_drrb; -- drrb->drr_magic = 
BSWAP_64(drrb->drr_magic); -- drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); -- drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); -- drrb->drr_type = BSWAP_32(drrb->drr_type); -- drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); -- drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); -- } -- -+ ra.byteswap = drc->drc_byteswap; -+ ra.cksum = drc->drc_cksum; - ra.vp = vp; -@@ -1425,5 +1469,5 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - /* these were verified in dmu_recv_begin */ -- ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == -+ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, - DMU_SUBSTREAM); -- ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); -+ ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); - -@@ -1432,5 +1476,5 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - */ -- VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); -+ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); - -- ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); -+ ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); - -@@ -1443,3 +1487,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - if (cleanup_fd == -1) { -- ra.err = EBADF; -+ ra.err = SET_ERROR(EBADF); - goto out; -@@ -1447,3 +1491,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); -- if (ra.err) { -+ if (ra.err != 0) { - cleanup_fd = -1; -@@ -1461,3 +1505,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - action_handlep); -- if (ra.err) -+ if (ra.err != 0) - goto out; -@@ -1466,3 +1510,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - (void **)&ra.guid_to_ds_map); -- if (ra.err) -+ if (ra.err != 0) - goto out; -@@ -1480,3 +1524,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - if (issig(JUSTLOOKING) && issig(FORREAL)) { -- ra.err = EINTR; -+ ra.err = SET_ERROR(EINTR); - goto out; -@@ -1534,3 +1578,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) -- ra.err = ECKSUM; -+ ra.err = SET_ERROR(ECKSUM); - goto out; -@@ -1544,3 +1588,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - default: -- ra.err = EINVAL; -+ ra.err = SET_ERROR(EINVAL); - goto out; -@@ -1560,10 +1604,3 @@ out: - */ -- txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); -- -- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, -- B_FALSE); -- if (drc->drc_real_ds != drc->drc_logical_ds) { -- mutex_exit(&drc->drc_logical_ds->ds_recvlock); -- dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); -- } -+ dmu_recv_cleanup_ds(drc); - } -@@ -1575,15 +1612,61 @@ out: - --struct recvendsyncarg { -- char *tosnap; -- uint64_t creation_time; -- uint64_t toguid; --}; -- - static int --recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_recv_end_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- struct recvendsyncarg *resa = arg2; -+ dmu_recv_cookie_t *drc = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ int error; -+ -+ ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); -+ -+ if (!drc->drc_newfs) { -+ dsl_dataset_t *origin_head; -+ -+ error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); -+ if (error != 0) -+ return (error); -+ if (drc->drc_force) { -+ /* -+ * We will destroy any snapshots in tofs (i.e. 
before -+ * origin_head) that are after the origin (which is -+ * the snap before drc_ds, because drc_ds can not -+ * have any snaps of its own). -+ */ -+ uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; -+ while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { -+ dsl_dataset_t *snap; -+ error = dsl_dataset_hold_obj(dp, obj, FTAG, -+ &snap); -+ if (error != 0) -+ return (error); -+ if (snap->ds_dir != origin_head->ds_dir) -+ error = SET_ERROR(EINVAL); -+ if (error == 0) { -+ error = dsl_destroy_snapshot_check_impl( -+ snap, B_FALSE); -+ } -+ obj = snap->ds_phys->ds_prev_snap_obj; -+ dsl_dataset_rele(snap, FTAG); -+ if (error != 0) -+ return (error); -+ } -+ } -+ error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, -+ origin_head, drc->drc_force, drc->drc_owner, tx); -+ if (error != 0) { -+ dsl_dataset_rele(origin_head, FTAG); -+ return (error); -+ } -+ error = dsl_dataset_snapshot_check_impl(origin_head, -+ drc->drc_tosnap, tx, B_TRUE); -+ dsl_dataset_rele(origin_head, FTAG); -+ if (error != 0) -+ return (error); - -- return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); -+ error = dsl_destroy_head_check_impl(drc->drc_ds, 1); -+ } else { -+ error = dsl_dataset_snapshot_check_impl(drc->drc_ds, -+ drc->drc_tosnap, tx, B_TRUE); -+ } -+ return (error); - } -@@ -1591,17 +1674,81 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_recv_end_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- struct recvendsyncarg *resa = arg2; -+ dmu_recv_cookie_t *drc = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ -+ spa_history_log_internal_ds(drc->drc_ds, "finish receiving", -+ tx, "snap=%s", drc->drc_tosnap); -+ -+ if (!drc->drc_newfs) { -+ dsl_dataset_t *origin_head; -+ -+ VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, -+ &origin_head)); -+ -+ if (drc->drc_force) { -+ /* -+ * Destroy any snapshots of drc_tofs (origin_head) -+ * after the origin (the snap before drc_ds). 
-+ */ -+ uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; -+ while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { -+ dsl_dataset_t *snap; -+ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, -+ &snap)); -+ ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); -+ obj = snap->ds_phys->ds_prev_snap_obj; -+ dsl_destroy_snapshot_sync_impl(snap, -+ B_FALSE, tx); -+ dsl_dataset_rele(snap, FTAG); -+ } -+ } -+ VERIFY3P(drc->drc_ds->ds_prev, ==, -+ origin_head->ds_prev); -+ -+ dsl_dataset_clone_swap_sync_impl(drc->drc_ds, -+ origin_head, tx); -+ dsl_dataset_snapshot_sync_impl(origin_head, -+ drc->drc_tosnap, tx); -+ -+ /* set snapshot's creation time and guid */ -+ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); -+ origin_head->ds_prev->ds_phys->ds_creation_time = -+ drc->drc_drrb->drr_creation_time; -+ origin_head->ds_prev->ds_phys->ds_guid = -+ drc->drc_drrb->drr_toguid; -+ origin_head->ds_prev->ds_phys->ds_flags &= -+ ~DS_FLAG_INCONSISTENT; -+ -+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx); -+ origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -+ -+ dsl_dataset_rele(origin_head, FTAG); -+ dsl_destroy_head_sync_impl(drc->drc_ds, tx); -+ -+ if (drc->drc_owner != NULL) -+ VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); -+ } else { -+ dsl_dataset_t *ds = drc->drc_ds; - -- dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); -+ dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); - -- /* set snapshot's creation time and guid */ -- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); -- ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; -- ds->ds_prev->ds_phys->ds_guid = resa->toguid; -- ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -+ /* set snapshot's creation time and guid */ -+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); -+ ds->ds_prev->ds_phys->ds_creation_time = -+ drc->drc_drrb->drr_creation_time; -+ ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid; -+ ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; - -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -+ } -+ drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj; -+ /* -+ * Release the hold from dmu_recv_begin. This must be done before -+ * we return to open context, so that when we free the dataset's dnode, -+ * we can evict its bonus buffer. 
-+ */ -+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); -+ drc->drc_ds = NULL; - } -@@ -1609,6 +1756,5 @@ recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) - static int --add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) -+add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) - { -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; -+ dsl_pool_t *dp; - dsl_dataset_t *snapds; -@@ -1619,6 +1765,8 @@ add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); -+ err = dsl_pool_hold(name, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); -+ err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); - if (err == 0) { -- gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); - gmep->guid = snapds->ds_phys->ds_guid; -@@ -1626,5 +1774,8 @@ add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) - avl_add(guid_map, gmep); -+ dsl_dataset_long_hold(snapds, gmep); -+ } else { -+ kmem_free(gmep, sizeof (*gmep)); - } - -- rw_exit(&dp->dp_config_rwlock); -+ dsl_pool_rele(dp, FTAG); - return (err); -@@ -1632,2 +1783,4 @@ add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) - -+static int dmu_recv_end_modified_blocks = 3; -+ - static int -@@ -1635,38 +1788,24 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc) - { -- struct recvendsyncarg resa; -- dsl_dataset_t *ds = drc->drc_logical_ds; -- int err, myerr; -- -- if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { -- err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, -- drc->drc_force); -- if (err) -- goto out; -- } else { -- mutex_exit(&ds->ds_recvlock); -- dsl_dataset_rele(ds, dmu_recv_tag); -- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, -- B_FALSE); -- return (EBUSY); -- } -+ int error; -+ -+#ifdef _KERNEL -+ char *name; - -- resa.creation_time = drc->drc_drrb->drr_creation_time; -- resa.toguid = drc->drc_drrb->drr_toguid; -- resa.tosnap = drc->drc_tosnap; -+ /* -+ * We will be destroying the ds; make sure its origin is unmounted if -+ * necessary. -+ */ -+ name = kmem_alloc(MAXNAMELEN, KM_SLEEP); -+ dsl_dataset_name(drc->drc_ds, name); -+ zfs_destroy_unmount_origin(name); -+ kmem_free(name, MAXNAMELEN); -+#endif - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- recv_end_check, recv_end_sync, ds, &resa, 3); -- if (err) { -- /* swap back */ -- (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); -- } -+ error = dsl_sync_task(drc->drc_tofs, -+ dmu_recv_end_check, dmu_recv_end_sync, drc, -+ dmu_recv_end_modified_blocks); - --out: -- mutex_exit(&ds->ds_recvlock); -- if (err == 0 && drc->drc_guid_to_ds_map != NULL) -- (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); -- dsl_dataset_disown(ds, dmu_recv_tag); -- myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); -- ASSERT0(myerr); -- return (err); -+ if (error != 0) -+ dmu_recv_cleanup_ds(drc); -+ return (error); - } -@@ -1676,29 +1815,16 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc) - { -- struct recvendsyncarg resa; -- dsl_dataset_t *ds = drc->drc_logical_ds; -- int err; -- -- /* -- * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() -- * expects it to have a ds_user_ptr (and zil), but clone_swap() -- * can close it. 
-- */ -- txg_wait_synced(ds->ds_dir->dd_pool, 0); -- -- resa.creation_time = drc->drc_drrb->drr_creation_time; -- resa.toguid = drc->drc_drrb->drr_toguid; -- resa.tosnap = drc->drc_tosnap; -- -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- recv_end_check, recv_end_sync, ds, &resa, 3); -- if (err) { -- /* clean up the fs we just recv'd into */ -- (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); -- } else { -- if (drc->drc_guid_to_ds_map != NULL) -- (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); -- /* release the hold from dmu_recv_begin */ -- dsl_dataset_disown(ds, dmu_recv_tag); -+ int error; -+ -+ error = dsl_sync_task(drc->drc_tofs, -+ dmu_recv_end_check, dmu_recv_end_sync, drc, -+ dmu_recv_end_modified_blocks); -+ -+ if (error != 0) { -+ dmu_recv_cleanup_ds(drc); -+ } else if (drc->drc_guid_to_ds_map != NULL) { -+ (void) add_ds_to_guidmap(drc->drc_tofs, -+ drc->drc_guid_to_ds_map, -+ drc->drc_newsnapobj); - } -- return (err); -+ return (error); - } -@@ -1706,8 +1832,25 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc) - int --dmu_recv_end(dmu_recv_cookie_t *drc) -+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) - { -- if (drc->drc_logical_ds != drc->drc_real_ds) -- return (dmu_recv_existing_end(drc)); -- else -+ drc->drc_owner = owner; -+ -+ if (drc->drc_newfs) - return (dmu_recv_new_end(drc)); -+ else -+ return (dmu_recv_existing_end(drc)); - } -+ -+/* -+ * Return TRUE if this objset is currently being received into. -+ */ -+boolean_t -+dmu_objset_is_receiving(objset_t *os) -+{ -+ return (os->os_dsl_dataset != NULL && -+ os->os_dsl_dataset->ds_owner == dmu_recv_tag); -+} -+ -+#if defined(_KERNEL) -+module_param(zfs_send_corrupt_data, int, 0644); -+MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data"); -+#endif -diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c -index 1c39723..bd291c6 100644 ---- a/module/zfs/dmu_traverse.c -+++ b/module/zfs/dmu_traverse.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
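Both dmu_recv_existing_end() and dmu_recv_new_end() above are converted from the old dsl_sync_task_do()/sync-task-group interface to the newer dsl_sync_task() entry point. A minimal sketch of that pattern, assuming the single-argument check/sync prototypes this interface uses (example_check/example_sync and "pool/fs" are placeholders, not names from this patch):

    static int
    example_check(void *arg, dmu_tx_t *tx)
    {
            /* open context, pool config lock held: validate, return an errno */
            return (0);
    }

    static void
    example_sync(void *arg, dmu_tx_t *tx)
    {
            /* syncing context: apply the change in this txg */
    }

            /* called from open context; the last argument estimates blocks modified */
            error = dsl_sync_task("pool/fs", example_check, example_sync, arg, 3);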
- */ -@@ -63,2 +63,4 @@ typedef struct traverse_data { - -+#define TD_HARD(td) (td->td_flags & TRAVERSE_HARD) -+ - static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, -@@ -210,7 +212,4 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - { -- zbookmark_t czb; - int err = 0, lasterr = 0; - arc_buf_t *buf = NULL; -- prefetch_data_t *pd = td->td_pfd; -- boolean_t hard = td->td_flags & TRAVERSE_HARD; - boolean_t pause = B_FALSE; -@@ -236,12 +235,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - -- if (pd && !pd->pd_exited && -- ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || -+ if (td->td_pfd && !td->td_pfd->pd_exited && -+ ((td->td_pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || - BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { -- mutex_enter(&pd->pd_mtx); -- ASSERT(pd->pd_blks_fetched >= 0); -- while (pd->pd_blks_fetched == 0 && !pd->pd_exited) -- cv_wait(&pd->pd_cv, &pd->pd_mtx); -- pd->pd_blks_fetched--; -- cv_broadcast(&pd->pd_cv); -- mutex_exit(&pd->pd_mtx); -+ mutex_enter(&td->td_pfd->pd_mtx); -+ ASSERT(td->td_pfd->pd_blks_fetched >= 0); -+ while (td->td_pfd->pd_blks_fetched == 0 && -+ !td->td_pfd->pd_exited) -+ cv_wait(&td->td_pfd->pd_cv, &td->td_pfd->pd_mtx); -+ td->td_pfd->pd_blks_fetched--; -+ cv_broadcast(&td->td_pfd->pd_cv); -+ mutex_exit(&td->td_pfd->pd_mtx); - } -@@ -261,5 +261,5 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - uint32_t flags = ARC_WAIT; -- int i; -- blkptr_t *cbp; -- int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; -+ int32_t i; -+ int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; -+ zbookmark_t *czb; - -@@ -267,11 +267,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); -- if (err) -+ if (err != 0) - return (err); -- cbp = buf->b_data; -+ -+ czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE); - - for (i = 0; i < epb; i++) { -- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, -+ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); -- traverse_prefetch_metadata(td, &cbp[i], &czb); -+ traverse_prefetch_metadata(td, -+ &((blkptr_t *)buf->b_data)[i], czb); - } -@@ -280,8 +282,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - for (i = 0; i < epb; i++) { -- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, -+ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); -- err = traverse_visitbp(td, dnp, &cbp[i], &czb); -- if (err) { -- if (!hard) -+ err = traverse_visitbp(td, dnp, -+ &((blkptr_t *)buf->b_data)[i], czb); -+ if (err != 0) { -+ if (!TD_HARD(td)) - break; -@@ -290,6 +293,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - } -+ -+ kmem_free(czb, sizeof (zbookmark_t)); -+ - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; -- int i; -- int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; -+ int32_t i; -+ int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - -@@ -297,3 +303,3 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); -- if (err) -+ if (err != 0) - return (err); -@@ -310,4 +316,4 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - zb->zb_blkid * epb + i); -- if (err) { -- if (!hard) -+ if (err != 0) { -+ if (!TD_HARD(td)) - break; -@@ -323,3 +329,3 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); -- if (err) -+ if 
(err != 0) - return (err); -@@ -331,5 +337,5 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { -- prefetch_dnode_metadata(td, &osp->os_userused_dnode, -- zb->zb_objset, DMU_USERUSED_OBJECT); - prefetch_dnode_metadata(td, &osp->os_groupused_dnode, -+ zb->zb_objset, DMU_GROUPUSED_OBJECT); -+ prefetch_dnode_metadata(td, &osp->os_userused_dnode, - zb->zb_objset, DMU_USERUSED_OBJECT); -@@ -339,3 +345,3 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - DMU_META_DNODE_OBJECT); -- if (err && hard) { -+ if (err && TD_HARD(td)) { - lasterr = err; -@@ -344,7 +350,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { -- dnp = &osp->os_userused_dnode; -+ dnp = &osp->os_groupused_dnode; - err = traverse_dnode(td, dnp, zb->zb_objset, -- DMU_USERUSED_OBJECT); -+ DMU_GROUPUSED_OBJECT); - } -- if (err && hard) { -+ if (err && TD_HARD(td)) { - lasterr = err; -@@ -353,5 +359,5 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { -- dnp = &osp->os_groupused_dnode; -+ dnp = &osp->os_userused_dnode; - err = traverse_dnode(td, dnp, zb->zb_objset, -- DMU_GROUPUSED_OBJECT); -+ DMU_USERUSED_OBJECT); - } -@@ -363,3 +369,3 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - post: -- if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { -+ if (err == 0 && (td->td_flags & TRAVERSE_POST)) { - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); -@@ -371,3 +377,3 @@ post: - ASSERT3U(err, ==, ERESTART); -- ASSERT(!hard); -+ ASSERT(!TD_HARD(td)); - traverse_pause(td, zb); -@@ -402,3 +408,2 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - zbookmark_t czb; -- boolean_t hard = (td->td_flags & TRAVERSE_HARD); - -@@ -407,4 +412,4 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); -- if (err) { -- if (!hard) -+ if (err != 0) { -+ if (!TD_HARD(td)) - break; -@@ -417,4 +422,4 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); -- if (err) { -- if (!hard) -+ if (err != 0) { -+ if (!TD_HARD(td)) - return (err); -@@ -436,3 +441,3 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - if (pfd->pd_cancel) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - -@@ -500,5 +505,5 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - -- td = kmem_alloc(sizeof(traverse_data_t), KM_PUSHPAGE); -- pd = kmem_zalloc(sizeof(prefetch_data_t), KM_PUSHPAGE); -- czb = kmem_alloc(sizeof(zbookmark_t), KM_PUSHPAGE); -+ td = kmem_alloc(sizeof (traverse_data_t), KM_PUSHPAGE); -+ pd = kmem_zalloc(sizeof (prefetch_data_t), KM_PUSHPAGE); -+ czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE); - -@@ -519,11 +524,20 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - -+ SET_BOOKMARK(czb, td->td_objset, -+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); -+ - /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ -- if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { -- objset_t *os; -+ if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { -+ uint32_t flags = ARC_WAIT; -+ objset_phys_t *osp; -+ arc_buf_t *buf; - -- err = dmu_objset_from_ds(ds, &os); -- if (err) -+ err = arc_read(NULL, td->td_spa, rootbp, -+ arc_getbuf_func, &buf, -+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb); -+ if (err != 0) - return (err); - -- traverse_zil(td, &os->os_zil_header); -+ osp = buf->b_data; -+ traverse_zil(td, &osp->os_zil_header); -+ (void) arc_buf_remove_ref(buf, &buf); - } -@@ -535,4 +549,2 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - -- SET_BOOKMARK(czb, td->td_objset, -- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - err = traverse_visitbp(td, NULL, rootbp, czb); -@@ -549,5 +561,5 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - -- kmem_free(czb, sizeof(zbookmark_t)); -- kmem_free(pd, sizeof(struct prefetch_data)); -- kmem_free(td, sizeof(struct traverse_data)); -+ kmem_free(czb, sizeof (zbookmark_t)); -+ kmem_free(pd, sizeof (struct prefetch_data)); -+ kmem_free(td, sizeof (struct traverse_data)); - -@@ -593,3 +605,3 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - txg_start, NULL, flags, func, arg); -- if (err) -+ if (err != 0) - return (err); -@@ -602,3 +614,3 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - err = dmu_object_info(mos, obj, &doi); -- if (err) { -+ if (err != 0) { - if (!hard) -@@ -613,6 +625,6 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ dsl_pool_config_enter(dp, FTAG); - err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- if (err) { -+ dsl_pool_config_exit(dp, FTAG); -+ if (err != 0) { - if (!hard) -@@ -626,3 +638,3 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - dsl_dataset_rele(ds, FTAG); -- if (err) { -+ if (err != 0) { - if (!hard) -diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c -index fd71413..4f2fae2 100644 ---- a/module/zfs/dmu_tx.c -+++ b/module/zfs/dmu_tx.c -@@ -50,8 +50,7 @@ dmu_tx_stats_t dmu_tx_stats = { - { "dmu_tx_group", KSTAT_DATA_UINT64 }, -- { "dmu_tx_how", KSTAT_DATA_UINT64 }, - { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, - { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, -- { "dmu_tx_memory_inflight", KSTAT_DATA_UINT64 }, - { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, -- { "dmu_tx_write_limit", KSTAT_DATA_UINT64 }, -+ { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, -+ { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, - { "dmu_tx_quota", KSTAT_DATA_UINT64 }, -@@ -66,3 +65,3 @@ dmu_tx_create_dd(dsl_dir_t *dd) - tx->tx_dir = dd; -- if (dd) -+ if (dd != NULL) - tx->tx_pool = dd->dd_pool; -@@ -72,2 +71,3 @@ dmu_tx_create_dd(dsl_dir_t *dd) - offsetof(dmu_tx_callback_t, dcb_node)); -+ tx->tx_start = gethrtime(); - #ifdef DEBUG_DMU_TX -@@ -178,3 +178,3 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) - if (db == NULL) -- return (EIO); -+ return (SET_ERROR(EIO)); - err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); -@@ -389,3 +389,3 @@ out: - 2 * DMU_MAX_ACCESS) -- err = EFBIG; -+ err = SET_ERROR(EFBIG); - -@@ -467,3 +467,3 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) - -- if (blkid >= dn->dn_maxblkid) { -+ if (blkid > dn->dn_maxblkid) { - rw_exit(&dn->dn_struct_rwlock); -@@ -472,3 +472,3 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) - if (blkid 
+ nblks > dn->dn_maxblkid) -- nblks = dn->dn_maxblkid - blkid; -+ nblks = dn->dn_maxblkid - blkid + 1; - -@@ -606,4 +606,3 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) - dnode_t *dn; -- uint64_t start, end, i; -- int err, shift; -+ int err; - zio_t *zio; -@@ -617,10 +616,2 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) - dn = txh->txh_dnode; -- -- /* first block */ -- if (off != 0) -- dmu_tx_count_write(txh, off, 1); -- /* last block */ -- if (len != DMU_OBJECT_END) -- dmu_tx_count_write(txh, off+len, 1); -- - dmu_tx_count_dnode(txh); -@@ -632,12 +623,44 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) - -+ dmu_tx_count_dnode(txh); -+ -+ /* -+ * For i/o error checking, we read the first and last level-0 -+ * blocks if they are not aligned, and all the level-1 blocks. -+ * -+ * Note: dbuf_free_range() assumes that we have not instantiated -+ * any level-0 dbufs that will be completely freed. Therefore we must -+ * exercise care to not read or count the first and last blocks -+ * if they are blocksize-aligned. -+ */ -+ if (dn->dn_datablkshift == 0) { -+ if (off != 0 || len < dn->dn_datablksz) -+ dmu_tx_count_write(txh, 0, dn->dn_datablksz); -+ } else { -+ /* first block will be modified if it is not aligned */ -+ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) -+ dmu_tx_count_write(txh, off, 1); -+ /* last block will be modified if it is not aligned */ -+ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) -+ dmu_tx_count_write(txh, off+len, 1); -+ } -+ - /* -- * For i/o error checking, read the first and last level-0 -- * blocks, and all the level-1 blocks. The above count_write's -- * have already taken care of the level-0 blocks. -+ * Check level-1 blocks. - */ - if (dn->dn_nlevels > 1) { -- shift = dn->dn_datablkshift + dn->dn_indblkshift - -+ int shift = dn->dn_datablkshift + dn->dn_indblkshift - - SPA_BLKPTRSHIFT; -- start = off >> shift; -- end = dn->dn_datablkshift ? ((off+len) >> shift) : 0; -+ uint64_t start = off >> shift; -+ uint64_t end = (off + len) >> shift; -+ uint64_t i; -+ -+ ASSERT(dn->dn_indblkshift != 0); -+ -+ /* -+ * dnode_reallocate() can result in an object with indirect -+ * blocks having an odd data block size. In this case, -+ * just check the single block. -+ */ -+ if (dn->dn_datablkshift == 0) -+ start = end = 0; - -@@ -918,4 +941,140 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) - -+/* -+ * If we can't do 10 iops, something is wrong. Let us go ahead -+ * and hit zfs_dirty_data_max. -+ */ -+hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ -+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ -+ -+/* -+ * We delay transactions when we've determined that the backend storage -+ * isn't able to accommodate the rate of incoming writes. -+ * -+ * If there is already a transaction waiting, we delay relative to when -+ * that transaction finishes waiting. This way the calculated min_time -+ * is independent of the number of threads concurrently executing -+ * transactions. -+ * -+ * If we are the only waiter, wait relative to when the transaction -+ * started, rather than the current time. This credits the transaction for -+ * "time already served", e.g. reading indirect blocks. -+ * -+ * The minimum time for a transaction to take is calculated as: -+ * min_time = scale * (dirty - min) / (max - dirty) -+ * min_time is then capped at zfs_delay_max_ns. -+ * -+ * The delay has two degrees of freedom that can be adjusted via tunables. 
-+ * The percentage of dirty data at which we start to delay is defined by -+ * zfs_delay_min_dirty_percent. This should typically be at or above -+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to -+ * delay after writing at full speed has failed to keep up with the incoming -+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly -+ * speaking, this variable determines the amount of delay at the midpoint of -+ * the curve. -+ * -+ * delay -+ * 10ms +-------------------------------------------------------------*+ -+ * | *| -+ * 9ms + *+ -+ * | *| -+ * 8ms + *+ -+ * | * | -+ * 7ms + * + -+ * | * | -+ * 6ms + * + -+ * | * | -+ * 5ms + * + -+ * | * | -+ * 4ms + * + -+ * | * | -+ * 3ms + * + -+ * | * | -+ * 2ms + (midpoint) * + -+ * | | ** | -+ * 1ms + v *** + -+ * | zfs_delay_scale ----------> ******** | -+ * 0 +-------------------------------------*********----------------+ -+ * 0% <- zfs_dirty_data_max -> 100% -+ * -+ * Note that since the delay is added to the outstanding time remaining on the -+ * most recent transaction, the delay is effectively the inverse of IOPS. -+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve -+ * was chosen such that small changes in the amount of accumulated dirty data -+ * in the first 3/4 of the curve yield relatively small differences in the -+ * amount of delay. -+ * -+ * The effects can be easier to understand when the amount of delay is -+ * represented on a log scale: -+ * -+ * delay -+ * 100ms +-------------------------------------------------------------++ -+ * + + -+ * | | -+ * + *+ -+ * 10ms + *+ -+ * + ** + -+ * | (midpoint) ** | -+ * + | ** + -+ * 1ms + v **** + -+ * + zfs_delay_scale ----------> ***** + -+ * | **** | -+ * + **** + -+ * 100us + ** + -+ * + * + -+ * | * | -+ * + * + -+ * 10us + * + -+ * + + -+ * | | -+ * + + -+ * +--------------------------------------------------------------+ -+ * 0% <- zfs_dirty_data_max -> 100% -+ * -+ * Note here that only as the amount of dirty data approaches its limit does -+ * the delay start to increase rapidly. The goal of a properly tuned system -+ * should be to keep the amount of dirty data out of that range by first -+ * ensuring that the appropriate limits are set for the I/O scheduler to reach -+ * optimal throughput on the backend storage, and then by changing the value -+ * of zfs_delay_scale to increase the steepness of the curve. -+ */ -+static void -+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) -+{ -+ dsl_pool_t *dp = tx->tx_pool; -+ uint64_t delay_min_bytes = -+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; -+ hrtime_t wakeup, min_tx_time, now; -+ -+ if (dirty <= delay_min_bytes) -+ return; -+ -+ /* -+ * The caller has already waited until we are under the max. -+ * We make them pass us the amount of dirty data so we don't -+ * have to handle the case of it being >= the max, which could -+ * cause a divide-by-zero if it's == the max. 
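To make the curve above concrete, the stand-alone program below evaluates min_time = zfs_delay_scale * (dirty - delay_min) / (zfs_dirty_data_max - dirty), capped at zfs_delay_max_ns, for a few dirty levels. The tunable values used (4 GiB zfs_dirty_data_max, zfs_delay_min_dirty_percent = 60, zfs_delay_scale = 500000 ns) are assumed typical defaults, not values taken from this patch; with them the midpoint of the curve falls at 80% dirty and yields exactly 500 us per transaction, i.e. the ~2000 IOPS midpoint mentioned above.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            const uint64_t dirty_max = 4ULL << 30;              /* assumed 4 GiB */
            const uint64_t delay_min = dirty_max * 60 / 100;    /* 60% -> 2.4 GiB */
            const uint64_t scale = 500000;                      /* assumed 500 us */
            const uint64_t max_ns = 100000000;                  /* 100 ms cap */
            uint64_t pct;

            for (pct = 65; pct <= 95; pct += 5) {
                    uint64_t dirty = dirty_max * pct / 100;
                    uint64_t ns = scale * (dirty - delay_min) /
                        (dirty_max - dirty);
                    if (ns > max_ns)
                            ns = max_ns;
                    printf("%3llu%% dirty -> min_tx_time %llu us\n",
                        (unsigned long long)pct,
                        (unsigned long long)(ns / 1000));
            }
            return (0);
    }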
-+ */ -+ ASSERT3U(dirty, <, zfs_dirty_data_max); -+ -+ now = gethrtime(); -+ min_tx_time = zfs_delay_scale * -+ (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); -+ min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); -+ if (now > tx->tx_start + min_tx_time) -+ return; -+ -+ DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, -+ uint64_t, min_tx_time); -+ -+ mutex_enter(&dp->dp_lock); -+ wakeup = MAX(tx->tx_start + min_tx_time, -+ dp->dp_last_wakeup + min_tx_time); -+ dp->dp_last_wakeup = wakeup; -+ mutex_exit(&dp->dp_lock); -+ -+ zfs_sleep_until(wakeup); -+} -+ - static int --dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) -+dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) - { -@@ -947,4 +1106,11 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) - txg_how != TXG_WAIT) -- return (EIO); -+ return (SET_ERROR(EIO)); -+ -+ return (SET_ERROR(ERESTART)); -+ } - -+ if (!tx->tx_waited && -+ dsl_pool_need_dirty_delay(tx->tx_pool)) { -+ tx->tx_wait_dirty = B_TRUE; -+ DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); - return (ERESTART); -@@ -971,3 +1137,3 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) - DMU_TX_STAT_BUMP(dmu_tx_group); -- return (ERESTART); -+ return (SET_ERROR(ERESTART)); - } -@@ -988,11 +1154,2 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) - /* -- * NB: This check must be after we've held the dnodes, so that -- * the dmu_tx_unassign() logic will work properly -- */ -- if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) { -- DMU_TX_STAT_BUMP(dmu_tx_how); -- return (ERESTART); -- } -- -- /* - * If a snapshot has been taken since we made our estimates, -@@ -1051,2 +1208,6 @@ dmu_tx_unassign(dmu_tx_t *tx) - -+ /* -+ * Walk the transaction's hold list, removing the hold on the -+ * associated dnode, and notifying waiters if the refcount drops to 0. -+ */ - for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; -@@ -1078,3 +1239,3 @@ dmu_tx_unassign(dmu_tx_t *tx) - * a new one. This should be used when you're not holding locks. -- * If will only fail if we're truly out of space (or over quota). -+ * It will only fail if we're truly out of space (or over quota). - * -@@ -1085,10 +1246,9 @@ dmu_tx_unassign(dmu_tx_t *tx) - * -- * (3) A specific txg. Use this if you need to ensure that multiple -- * transactions all sync in the same txg. Like TXG_NOWAIT, it -- * returns ERESTART if it can't assign you into the requested txg. -+ * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait() -+ * has already been called on behalf of this operation (though -+ * most likely on a different tx). - */ - int --dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) -+dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) - { -- hrtime_t before, after; - int err; -@@ -1096,6 +1256,11 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) - ASSERT(tx->tx_txg == 0); -- ASSERT(txg_how != 0); -+ ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || -+ txg_how == TXG_WAITED); - ASSERT(!dsl_pool_sync_context(tx->tx_pool)); - -- before = gethrtime(); -+ if (txg_how == TXG_WAITED) -+ tx->tx_waited = B_TRUE; -+ -+ /* If we might wait, we must not hold the config lock. 
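The TXG_WAITED case above exists to support the usual assign/wait/retry idiom in callers such as zfs_write(). A minimal consumer-side sketch (fragment only, error handling trimmed; the dmu_tx_hold_write() hold is just an example of any hold):

            dmu_tx_t *tx;
            boolean_t waited = B_FALSE;
            int error;

    top:
            tx = dmu_tx_create(os);
            dmu_tx_hold_write(tx, object, offset, length);
            error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
            if (error != 0) {
                    if (error == ERESTART) {
                            waited = B_TRUE;
                            dmu_tx_wait(tx);        /* may apply dmu_tx_delay() */
                            dmu_tx_abort(tx);
                            goto top;
                    }
                    dmu_tx_abort(tx);
                    return (error);
            }
            /* ... perform the write ... */
            dmu_tx_commit(tx);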
*/ -+ ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); - -@@ -1112,7 +1277,2 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) - -- after = gethrtime(); -- -- dsl_pool_tx_assign_add_usecs(tx->tx_pool, -- (after - before) / NSEC_PER_USEC); -- - return (0); -@@ -1124,12 +1284,46 @@ dmu_tx_wait(dmu_tx_t *tx) - spa_t *spa = tx->tx_pool->dp_spa; -+ dsl_pool_t *dp = tx->tx_pool; -+ hrtime_t before; - - ASSERT(tx->tx_txg == 0); -+ ASSERT(!dsl_pool_config_held(tx->tx_pool)); - -- /* -- * It's possible that the pool has become active after this thread -- * has tried to obtain a tx. If that's the case then his -- * tx_lasttried_txg would not have been assigned. -- */ -- if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { -- txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); -+ before = gethrtime(); -+ -+ if (tx->tx_wait_dirty) { -+ uint64_t dirty; -+ -+ /* -+ * dmu_tx_try_assign() has determined that we need to wait -+ * because we've consumed much or all of the dirty buffer -+ * space. -+ */ -+ mutex_enter(&dp->dp_lock); -+ if (dp->dp_dirty_total >= zfs_dirty_data_max) -+ DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); -+ while (dp->dp_dirty_total >= zfs_dirty_data_max) -+ cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); -+ dirty = dp->dp_dirty_total; -+ mutex_exit(&dp->dp_lock); -+ -+ dmu_tx_delay(tx, dirty); -+ -+ tx->tx_wait_dirty = B_FALSE; -+ -+ /* -+ * Note: setting tx_waited only has effect if the caller -+ * used TX_WAIT. Otherwise they are going to destroy -+ * this tx and try again. The common case, zfs_write(), -+ * uses TX_WAIT. -+ */ -+ tx->tx_waited = B_TRUE; -+ } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { -+ /* -+ * If the pool is suspended we need to wait until it -+ * is resumed. Note that it's possible that the pool -+ * has become active after this thread has tried to -+ * obtain a tx. If that's the case then tx_lasttried_txg -+ * would not have been set. -+ */ -+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); - } else if (tx->tx_needassign_txh) { -@@ -1143,4 +1337,10 @@ dmu_tx_wait(dmu_tx_t *tx) - } else { -+ /* -+ * A dnode is assigned to the quiescing txg. Wait for its -+ * transaction to complete. -+ */ - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); - } -+ -+ spa_tx_assign_add_nsecs(spa, gethrtime() - before); - } -@@ -1171,2 +1371,6 @@ dmu_tx_commit(dmu_tx_t *tx) - -+ /* -+ * Go through the transaction's hold list and remove holds on -+ * associated dnodes, notifying waiters if no holds remain. -+ */ - while ((txh = list_head(&tx->tx_holds))) { -@@ -1252,2 +1456,9 @@ dmu_tx_get_txg(dmu_tx_t *tx) - -+dsl_pool_t * -+dmu_tx_pool(dmu_tx_t *tx) -+{ -+ ASSERT(tx->tx_pool != NULL); -+ return (tx->tx_pool); -+} -+ - void -diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c -index 1763bae..9bc9191 100644 ---- a/module/zfs/dmu_zfetch.c -+++ b/module/zfs/dmu_zfetch.c -@@ -25,2 +25,6 @@ - -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
-+ */ -+ - #include -@@ -50,3 +54,3 @@ unsigned long zfetch_array_rd_sz = 1024 * 1024; - /* forward decls for static routines */ --static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); -+static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *); - static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); -@@ -54,3 +58,3 @@ static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); - static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); --static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); -+static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int); - static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); -@@ -106,5 +110,5 @@ kstat_t *zfetch_ksp; - * -- * If no co-linear streams are found, return NULL. -+ * Returns whether co-linear streams were found. - */ --static int -+static boolean_t - dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) -@@ -136,3 +140,4 @@ dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) - z_walk->zst_offset = zh->zst_offset; -- z_walk->zst_direction = diff < 0 ? -1 : 1; -+ z_walk->zst_direction = diff < 0 ? -+ ZFETCH_BACKWARD : ZFETCH_FORWARD; - z_walk->zst_stride = -@@ -154,3 +159,4 @@ dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) - z_walk->zst_offset = zh->zst_offset; -- z_walk->zst_direction = diff < 0 ? -1 : 1; -+ z_walk->zst_direction = diff < 0 ? -+ ZFETCH_BACKWARD : ZFETCH_FORWARD; - z_walk->zst_stride = -@@ -289,3 +295,3 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) - for (i = 0; i < fetchsz; i++) { -- dbuf_prefetch(dn, blkid + i); -+ dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); - } -@@ -328,3 +334,3 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) - */ --static int -+static boolean_t - dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) -@@ -641,3 +647,3 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) - zstream_t *newstream; -- int fetched; -+ boolean_t fetched; - int inserted; -@@ -701,3 +707,4 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) - } -- newstream = kmem_zalloc(sizeof (zstream_t), KM_PUSHPAGE); -+ newstream = -+ kmem_zalloc(sizeof (zstream_t), KM_PUSHPAGE); - } -@@ -741,2 +748 @@ MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read"); - #endif -- -diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c -index d8d6651..25c7775 100644 ---- a/module/zfs/dnode.c -+++ b/module/zfs/dnode.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -76,3 +76,7 @@ dnode_cons(void *arg, void *unused, int kmflag) - -- refcount_create(&dn->dn_holds); -+ /* -+ * Every dbuf has a reference, and dropping a tracked reference is -+ * O(number of references), so don't track dn_holds. 
-+ */ -+ refcount_create_untracked(&dn->dn_holds); - refcount_create(&dn->dn_tx_holds); -@@ -115,2 +119,3 @@ dnode_cons(void *arg, void *unused, int kmflag) - dn->dn_dbufs_count = 0; -+ dn->dn_unlisted_l0_blkid = 0; - list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), -@@ -167,2 +172,3 @@ dnode_dest(void *arg, void *unused) - ASSERT0(dn->dn_dbufs_count); -+ ASSERT0(dn->dn_unlisted_l0_blkid); - list_destroy(&dn->dn_dbufs); -@@ -470,2 +476,3 @@ dnode_destroy(dnode_t *dn) - dn->dn_id_flags = 0; -+ dn->dn_unlisted_l0_blkid = 0; - -@@ -701,2 +708,3 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) - ndn->dn_dbufs_count = odn->dn_dbufs_count; -+ ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid; - ndn->dn_bonus = odn->dn_bonus; -@@ -735,2 +743,3 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) - odn->dn_dbufs_count = 0; -+ odn->dn_unlisted_l0_blkid = 0; - odn->dn_bonus = NULL; -@@ -1029,8 +1038,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, - if (dn == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - type = dn->dn_type; - if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - DNODE_VERIFY(dn); -@@ -1042,3 +1051,3 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, - if (object == 0 || object >= DN_MAX_OBJECT) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1060,3 +1069,3 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, - if (db == NULL) -- return (EIO); -+ return (SET_ERROR(EIO)); - err = dbuf_read(db, NULL, DB_RF_CANFAIL); -@@ -1369,3 +1378,3 @@ fail: - rw_exit(&dn->dn_struct_rwlock); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1522,3 +1531,3 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) - -- if (len == -1ULL) { -+ if (len == DMU_OBJECT_END) { - len = UINT64_MAX - off; -@@ -1782,5 +1791,4 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) - /* -- * Call when we think we're going to write/free space in open context. -- * Be conservative (ie. OK to write less than this or free more than -- * this, but don't write more or free less). -+ * Call when we think we're going to write/free space in open context to track -+ * the amount of memory in use by the currently open txg. - */ -@@ -1791,10 +1799,10 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) - dsl_dataset_t *ds = os->os_dsl_dataset; -+ int64_t aspace = spa_get_asize(os->os_spa, space); - -- if (space > 0) -- space = spa_get_asize(os->os_spa, space); -- -- if (ds) -- dsl_dir_willuse_space(ds->ds_dir, space, tx); -+ if (ds != NULL) { -+ dsl_dir_willuse_space(ds->ds_dir, aspace, tx); -+ dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); -+ } - -- dmu_tx_willuse_space(tx, space); -+ dmu_tx_willuse_space(tx, aspace); - } -@@ -1802,10 +1810,12 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) - /* -- * This function scans a block at the indicated "level" looking for -- * a hole or data (depending on 'flags'). If level > 0, then we are -- * scanning an indirect block looking at its pointers. If level == 0, -- * then we are looking at a block of dnodes. If we don't find what we -- * are looking for in the block, we return ESRCH. Otherwise, return -- * with *offset pointing to the beginning (if searching forwards) or -- * end (if searching backwards) of the range covered by the block -- * pointer we matched on (or dnode). 
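The contract spelled out above (ESRCH when nothing matches, otherwise *offset moved to the edge of the matched range) is what hole/data-seeking callers of dnode_next_offset() depend on; the rewritten comment that follows restates the same contract. A hedged caller sketch: the first three parameters match the prototype visible later in this diff, while the DNODE_FIND_HOLE flag and the trailing minlvl/blkfill/txg arguments are assumptions:

            uint64_t off = start;
            int error;

            /* advance 'off' to the next hole at or after 'start' */
            error = dnode_next_offset(dn, DNODE_FIND_HOLE, &off, 1, 1, 0);
            if (error == ESRCH)
                    off = object_end;       /* no hole found: report end of object */
            else if (error != 0)
                    return (error);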
-+ * Scans a block at the indicated "level" looking for a hole or data, -+ * depending on 'flags'. -+ * -+ * If level > 0, then we are scanning an indirect block looking at its -+ * pointers. If level == 0, then we are looking at a block of dnodes. -+ * -+ * If we don't find what we are looking for in the block, we return ESRCH. -+ * Otherwise, return with *offset pointing to the beginning (if searching -+ * forwards) or end (if searching backwards) of the range covered by the -+ * block pointer we matched on (or dnode). - * -@@ -1855,3 +1865,3 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - */ -- return (ESRCH); -+ return (SET_ERROR(ESRCH)); - } -@@ -1871,3 +1881,3 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - */ -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } else if (lvl == 0) { -@@ -1884,3 +1894,3 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - if (i < 0 || i == blkfill) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } else { -@@ -1916,3 +1926,3 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - if (i < 0 || i >= epb) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } -@@ -1960,3 +1970,3 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, - if (dn->dn_phys->dn_nlevels == 0) { -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - goto out; -@@ -1969,3 +1979,3 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, - } else { -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } -@@ -1990,3 +2000,3 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, - initial_offset < *offset : initial_offset > *offset)) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - out: -diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c -index 76e6037..0ff25d2 100644 ---- a/module/zfs/dnode_sync.c -+++ b/module/zfs/dnode_sync.c -@@ -304,3 +304,3 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, - /* -- * free_range: Traverse the indicated range of the provided file -+ * Traverse the indicated range of the provided file - * and "free" all the blocks contained there. -@@ -372,3 +372,3 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) - /* -- * Try to kick all the dnodes dbufs out of the cache... -+ * Try to kick all the dnode's dbufs out of the cache... - */ -@@ -483,2 +483,3 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); -+ ASSERT3P(dn->dn_bonus, ==, NULL); - -diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c -index d9e8bd3..5eb8c01 100644 ---- a/module/zfs/dsl_dataset.c -+++ b/module/zfs/dsl_dataset.c -@@ -22,4 +22,5 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2014 RackTop Systems. 
- */ -@@ -47,8 +48,4 @@ - #include -- --static char *dsl_reaper = "the grim reaper"; -- --static dsl_checkfunc_t dsl_dataset_destroy_begin_check; --static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; --static dsl_syncfunc_t dsl_dataset_set_reservation_sync; -+#include -+#include - -@@ -65,5 +62,2 @@ static dsl_syncfunc_t dsl_dataset_set_reservation_sync; - --#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) -- -- - /* -@@ -111,5 +105,4 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) - } -- dmu_buf_will_dirty(ds->ds_dbuf, tx); - -- mutex_enter(&ds->ds_dir->dd_lock); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_lock); -@@ -125,3 +118,2 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) - DD_USED_REFRSRV, DD_USED_HEAD, tx); -- mutex_exit(&ds->ds_dir->dd_lock); - } -@@ -162,3 +154,2 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, - -- mutex_enter(&ds->ds_dir->dd_lock); - mutex_enter(&ds->ds_lock); -@@ -173,3 +164,2 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, - DD_USED_REFRSRV, DD_USED_HEAD, tx); -- mutex_exit(&ds->ds_dir->dd_lock); - } else { -@@ -258,3 +248,3 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - -- ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); -+ ASSERT(ds->ds_owner == NULL); - -@@ -266,3 +256,3 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - if (ds->ds_prev) { -- dsl_dataset_drop_ref(ds->ds_prev, ds); -+ dsl_dataset_rele(ds->ds_prev, ds); - ds->ds_prev = NULL; -@@ -271,10 +261,6 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - bplist_destroy(&ds->ds_pending_deadlist); -- if (db != NULL) { -+ if (ds->ds_phys->ds_deadlist_obj != 0) - dsl_deadlist_close(&ds->ds_deadlist); -- } else { -- ASSERT(ds->ds_deadlist.dl_dbuf == NULL); -- ASSERT(!ds->ds_deadlist.dl_oldfmt); -- } - if (ds->ds_dir) -- dsl_dir_close(ds->ds_dir, ds); -+ dsl_dir_rele(ds->ds_dir, ds); - -@@ -283,6 +269,4 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - mutex_destroy(&ds->ds_lock); -- mutex_destroy(&ds->ds_recvlock); - mutex_destroy(&ds->ds_opening_lock); -- rw_destroy(&ds->ds_rwlock); -- cv_destroy(&ds->ds_exclusive_cv); -+ refcount_destroy(&ds->ds_longholds); - -@@ -291,3 +275,3 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - --static int -+int - dsl_dataset_get_snapname(dsl_dataset_t *ds) -@@ -307,3 +291,3 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) - FTAG, &headdbuf); -- if (err) -+ if (err != 0) - return (err); -@@ -336,4 +320,4 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) - --static int --dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) -+int -+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) - { -@@ -357,4 +341,4 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) - --static int --dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, -+int -+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **dsp) -@@ -367,7 +351,6 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - -- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || -- dsl_pool_sync_context(dp)); -+ ASSERT(dsl_pool_config_held(dp)); - - err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); -- if (err) -+ if (err != 0) - return (err); -@@ -376,4 +359,6 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dmu_object_info_from_db(dbuf, &doi); -- if (doi.doi_type != DMU_OT_DSL_DATASET) -- return (EINVAL); -+ if 
(doi.doi_type != DMU_OT_DSL_DATASET) { -+ dmu_buf_rele(dbuf, tag); -+ return (SET_ERROR(EINVAL)); -+ } - -@@ -390,8 +375,5 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); -- mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); -- -- rw_init(&ds->ds_rwlock, NULL, RW_DEFAULT, NULL); -- cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); -+ refcount_create(&ds->ds_longholds); - -@@ -405,11 +387,9 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - if (err == 0) { -- err = dsl_dir_open_obj(dp, -+ err = dsl_dir_hold_obj(dp, - ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); - } -- if (err) { -+ if (err != 0) { - mutex_destroy(&ds->ds_lock); -- mutex_destroy(&ds->ds_recvlock); - mutex_destroy(&ds->ds_opening_lock); -- rw_destroy(&ds->ds_rwlock); -- cv_destroy(&ds->ds_exclusive_cv); -+ refcount_destroy(&ds->ds_longholds); - bplist_destroy(&ds->ds_pending_deadlist); -@@ -423,4 +403,4 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - ds->ds_snapname[0] = '\0'; -- if (ds->ds_phys->ds_prev_snap_obj) { -- err = dsl_dataset_get_ref(dp, -+ if (ds->ds_phys->ds_prev_snap_obj != 0) { -+ err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, -@@ -440,25 +420,10 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - if (err == 0 && !dsl_dataset_is_snapshot(ds)) { -- /* -- * In sync context, we're called with either no lock -- * or with the write lock. If we're not syncing, -- * we're always called with the read lock held. -- */ -- boolean_t need_lock = -- !RW_WRITE_HELD(&dp->dp_config_rwlock) && -- dsl_pool_sync_context(dp); -- -- if (need_lock) -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- -- err = dsl_prop_get_ds(ds, -- "refreservation", sizeof (uint64_t), 1, -- &ds->ds_reserved, NULL); -+ err = dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), -+ &ds->ds_reserved); - if (err == 0) { -- err = dsl_prop_get_ds(ds, -- "refquota", sizeof (uint64_t), 1, -- &ds->ds_quota, NULL); -+ err = dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), -+ &ds->ds_quota); - } -- -- if (need_lock) -- rw_exit(&dp->dp_config_rwlock); - } else { -@@ -467,7 +432,4 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - -- if (err == 0) { -- winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, -- dsl_dataset_evict); -- } -- if (err || winner) { -+ if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds, -+ &ds->ds_phys, dsl_dataset_evict)) != NULL) { - bplist_destroy(&ds->ds_pending_deadlist); -@@ -475,11 +437,9 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - if (ds->ds_prev) -- dsl_dataset_drop_ref(ds->ds_prev, ds); -- dsl_dir_close(ds->ds_dir, ds); -+ dsl_dataset_rele(ds->ds_prev, ds); -+ dsl_dir_rele(ds->ds_dir, ds); - mutex_destroy(&ds->ds_lock); -- mutex_destroy(&ds->ds_recvlock); - mutex_destroy(&ds->ds_opening_lock); -- rw_destroy(&ds->ds_rwlock); -- cv_destroy(&ds->ds_exclusive_cv); -+ refcount_destroy(&ds->ds_longholds); - kmem_free(ds, sizeof (dsl_dataset_t)); -- if (err) { -+ if (err != 0) { - dmu_buf_rele(dbuf, tag); -@@ -498,9 +458,2 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); -- mutex_enter(&ds->ds_lock); -- if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { -- mutex_exit(&ds->ds_lock); -- 
dmu_buf_rele(ds->ds_dbuf, tag); -- return (ENOENT); -- } -- mutex_exit(&ds->ds_lock); - *dsp = ds; -@@ -509,87 +462,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - --static int --dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) --{ -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- -- /* -- * In syncing context we don't want the rwlock lock: there -- * may be an existing writer waiting for sync phase to -- * finish. We don't need to worry about such writers, since -- * sync phase is single-threaded, so the writer can't be -- * doing anything while we are active. -- */ -- if (dsl_pool_sync_context(dp)) { -- ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); -- return (0); -- } -- -- /* -- * Normal users will hold the ds_rwlock as a READER until they -- * are finished (i.e., call dsl_dataset_rele()). "Owners" will -- * drop their READER lock after they set the ds_owner field. -- * -- * If the dataset is being destroyed, the destroy thread will -- * obtain a WRITER lock for exclusive access after it's done its -- * open-context work and then change the ds_owner to -- * dsl_reaper once destruction is assured. So threads -- * may block here temporarily, until the "destructability" of -- * the dataset is determined. -- */ -- ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); -- mutex_enter(&ds->ds_lock); -- while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { -- rw_exit(&dp->dp_config_rwlock); -- cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); -- if (DSL_DATASET_IS_DESTROYED(ds)) { -- mutex_exit(&ds->ds_lock); -- dsl_dataset_drop_ref(ds, tag); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- return (ENOENT); -- } -- /* -- * The dp_config_rwlock lives above the ds_lock. And -- * we need to check DSL_DATASET_IS_DESTROYED() while -- * holding the ds_lock, so we have to drop and reacquire -- * the ds_lock here. 
-- */ -- mutex_exit(&ds->ds_lock); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- mutex_enter(&ds->ds_lock); -- } -- mutex_exit(&ds->ds_lock); -- return (0); --} -- --int --dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, -- dsl_dataset_t **dsp) --{ -- int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); -- -- if (err) -- return (err); -- return (dsl_dataset_hold_ref(*dsp, tag)); --} -- - int --dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, -+dsl_dataset_hold(dsl_pool_t *dp, const char *name, - void *tag, dsl_dataset_t **dsp) - { -- int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); -- if (err) -- return (err); -- if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { -- dsl_dataset_rele(*dsp, tag); -- *dsp = NULL; -- return (EBUSY); -- } -- return (0); --} -- --int --dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) --{ - dsl_dir_t *dd; -- dsl_pool_t *dp; - const char *snapname; -@@ -598,17 +471,12 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - -- err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); -- if (err) -+ err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); -+ if (err != 0) - return (err); - -- dp = dd->dd_pool; -+ ASSERT(dsl_pool_config_held(dp)); - obj = dd->dd_phys->dd_head_dataset_obj; -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- if (obj) -- err = dsl_dataset_get_ref(dp, obj, tag, dsp); -+ if (obj != 0) -+ err = dsl_dataset_hold_obj(dp, obj, tag, dsp); - else -- err = ENOENT; -- if (err) -- goto out; -- -- err = dsl_dataset_hold_ref(*dsp, tag); -+ err = SET_ERROR(ENOENT); - -@@ -616,3 +484,3 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - if (err == 0 && snapname != NULL) { -- dsl_dataset_t *ds = NULL; -+ dsl_dataset_t *ds; - -@@ -620,4 +488,4 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - dsl_dataset_rele(*dsp, tag); -- err = ENOENT; -- goto out; -+ dsl_dir_rele(dd, FTAG); -+ return (SET_ERROR(ENOENT)); - } -@@ -627,8 +495,6 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - if (err == 0) -- err = dsl_dataset_get_ref(dp, obj, tag, &ds); -+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds); - dsl_dataset_rele(*dsp, tag); - -- ASSERT3U((err == 0), ==, (ds != NULL)); -- -- if (ds) { -+ if (err == 0) { - mutex_enter(&ds->ds_lock); -@@ -638,9 +504,7 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - mutex_exit(&ds->ds_lock); -- err = dsl_dataset_hold_ref(ds, tag); -- *dsp = err ? 
NULL : ds; -+ *dsp = ds; - } - } --out: -- rw_exit(&dp->dp_config_rwlock); -- dsl_dir_close(dd, FTAG); -+ -+ dsl_dir_rele(dd, FTAG); - return (err); -@@ -649,11 +513,12 @@ out: - int --dsl_dataset_own(const char *name, boolean_t inconsistentok, -+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, - void *tag, dsl_dataset_t **dsp) - { -- int err = dsl_dataset_hold(name, tag, dsp); -- if (err) -+ int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); -+ if (err != 0) - return (err); -- if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { -+ if (!dsl_dataset_tryown(*dsp, tag)) { - dsl_dataset_rele(*dsp, tag); -- return (EBUSY); -+ *dsp = NULL; -+ return (SET_ERROR(EBUSY)); - } -@@ -662,2 +527,45 @@ dsl_dataset_own(const char *name, boolean_t inconsistentok, - -+int -+dsl_dataset_own(dsl_pool_t *dp, const char *name, -+ void *tag, dsl_dataset_t **dsp) -+{ -+ int err = dsl_dataset_hold(dp, name, tag, dsp); -+ if (err != 0) -+ return (err); -+ if (!dsl_dataset_tryown(*dsp, tag)) { -+ dsl_dataset_rele(*dsp, tag); -+ return (SET_ERROR(EBUSY)); -+ } -+ return (0); -+} -+ -+/* -+ * See the comment above dsl_pool_hold() for details. In summary, a long -+ * hold is used to prevent destruction of a dataset while the pool hold -+ * is dropped, allowing other concurrent operations (e.g. spa_sync()). -+ * -+ * The dataset and pool must be held when this function is called. After it -+ * is called, the pool hold may be released while the dataset is still held -+ * and accessed. -+ */ -+void -+dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) -+{ -+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); -+ (void) refcount_add(&ds->ds_longholds, tag); -+} -+ -+void -+dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) -+{ -+ (void) refcount_remove(&ds->ds_longholds, tag); -+} -+ -+/* Return B_TRUE if there are any long holds on this dataset. 
*/ -+boolean_t -+dsl_dataset_long_held(dsl_dataset_t *ds) -+{ -+ return (!refcount_is_zero(&ds->ds_longholds)); -+} -+ - void -@@ -669,3 +577,3 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) - dsl_dir_name(ds->ds_dir, name); -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -+ VERIFY0(dsl_dataset_get_snapname(ds)); - if (ds->ds_snapname[0]) { -@@ -687,33 +595,2 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) - --static int --dsl_dataset_namelen(dsl_dataset_t *ds) --{ -- int result; -- -- if (ds == NULL) { -- result = 3; /* "mos" */ -- } else { -- result = dsl_dir_namelen(ds->ds_dir); -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -- if (ds->ds_snapname[0]) { -- ++result; /* adding one for the @-sign */ -- if (!MUTEX_HELD(&ds->ds_lock)) { -- mutex_enter(&ds->ds_lock); -- result += strlen(ds->ds_snapname); -- mutex_exit(&ds->ds_lock); -- } else { -- result += strlen(ds->ds_snapname); -- } -- } -- } -- -- return (result); --} -- --void --dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) --{ -- dmu_buf_rele(ds->ds_dbuf, tag); --} -- - void -@@ -721,6 +598,3 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag) - { -- if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { -- rw_exit(&ds->ds_rwlock); -- } -- dsl_dataset_drop_ref(ds, tag); -+ dmu_buf_rele(ds->ds_dbuf, tag); - } -@@ -730,4 +604,3 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag) - { -- ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || -- (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); -+ ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL); - -@@ -735,9 +608,6 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag) - ds->ds_owner = NULL; -- if (RW_WRITE_HELD(&ds->ds_rwlock)) { -- rw_exit(&ds->ds_rwlock); -- cv_broadcast(&ds->ds_exclusive_cv); -- } - mutex_exit(&ds->ds_lock); -- if (ds->ds_dbuf) -- dsl_dataset_drop_ref(ds, tag); -+ dsl_dataset_long_rele(ds, tag); -+ if (ds->ds_dbuf != NULL) -+ dsl_dataset_rele(ds, tag); - else -@@ -747,3 +617,3 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag) - boolean_t --dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) -+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) - { -@@ -752,7 +622,5 @@ dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) - mutex_enter(&ds->ds_lock); -- if (ds->ds_owner == NULL && -- (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { -+ if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { - ds->ds_owner = tag; -- if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) -- rw_exit(&ds->ds_rwlock); -+ dsl_dataset_long_hold(ds, tag); - gotit = TRUE; -@@ -763,10 +631,2 @@ dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) - --void --dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) --{ -- ASSERT3P(owner, ==, ds->ds_owner); -- if (!RW_WRITE_HELD(&ds->ds_rwlock)) -- rw_enter(&ds->ds_rwlock, RW_WRITER); --} -- - uint64_t -@@ -791,3 +651,3 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); -- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); -+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); -@@ -809,3 +669,3 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - } else { -- dsl_dataset_t *ohds; -+ dsl_dataset_t *ohds; /* head of the origin snapshot */ - -@@ -826,3 +686,3 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, -+ VERIFY0(dsl_dataset_hold_obj(dp, - origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); -@@ 
-838,5 +698,4 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - } -- VERIFY(0 == zap_add_int(mos, -- origin->ds_phys->ds_next_clones_obj, -- dsobj, tx)); -+ VERIFY0(zap_add_int(mos, -+ origin->ds_phys->ds_next_clones_obj, dsobj, tx)); - } -@@ -852,3 +711,3 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - } -- VERIFY3U(0, ==, zap_add_int(mos, -+ VERIFY0(zap_add_int(mos, - origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); -@@ -868,2 +727,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - -+static void -+dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) -+{ -+ objset_t *os; -+ -+ VERIFY0(dmu_objset_from_ds(ds, &os)); -+ bzero(&os->os_zil_header, sizeof (os->os_zil_header)); -+ dsl_dataset_dirty(ds, tx); -+} -+ - uint64_t -@@ -876,2 +745,3 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - -+ ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(lastname[0] != '@'); -@@ -879,5 +749,6 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); -- VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); -+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); - -- dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); -+ dsobj = dsl_dataset_create_sync_dd(dd, origin, -+ flags & ~DS_CREATE_FLAG_NODIRTY, tx); - -@@ -885,3 +756,3 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); - -@@ -891,10 +762,7 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - */ -- if (origin != NULL) { -+ if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { - dsl_dataset_t *ds; -- objset_t *os; - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -- VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); -- bzero(&os->os_zil_header, sizeof (os->os_zil_header)); -- dsl_dataset_dirty(ds, tx); -+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -+ dsl_dataset_zero_zil(ds, tx); - dsl_dataset_rele(ds, FTAG); -@@ -906,331 +774,2 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - /* -- * The snapshots must all be in the same pool. 
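The dsl_dataset_long_hold()/dsl_dataset_long_rele() pair introduced a few hunks above is meant to be used as sketched below, following the comment there: take the pool hold and dataset hold first, add the long hold, then drop the pool hold while the long-running work proceeds. This is an assembled example, not code from this patch; error handling is omitted and 'name' stands for any dataset name:

            dsl_pool_t *dp;
            dsl_dataset_t *ds;

            VERIFY0(dsl_pool_hold(name, FTAG, &dp));
            VERIFY0(dsl_dataset_hold(dp, name, FTAG, &ds));
            dsl_dataset_long_hold(ds, FTAG);    /* keeps ds from being destroyed */
            dsl_pool_rele(dp, FTAG);            /* lets spa_sync() etc. proceed */

            /* ... long-running work that dereferences ds ... */

            dsl_dataset_long_rele(ds, FTAG);
            dsl_dataset_rele(ds, FTAG);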
-- */ --int --dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) --{ -- int err; -- dsl_sync_task_t *dst; -- spa_t *spa; -- nvpair_t *pair; -- dsl_sync_task_group_t *dstg; -- -- pair = nvlist_next_nvpair(snaps, NULL); -- if (pair == NULL) -- return (0); -- -- err = spa_open(nvpair_name(pair), &spa, FTAG); -- if (err) -- return (err); -- dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- -- for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -- pair = nvlist_next_nvpair(snaps, pair)) { -- dsl_dataset_t *ds; -- -- err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); -- if (err == 0) { -- struct dsl_ds_destroyarg *dsda; -- -- dsl_dataset_make_exclusive(ds, dstg); -- dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), -- KM_SLEEP); -- dsda->ds = ds; -- dsda->defer = defer; -- dsl_sync_task_create(dstg, dsl_dataset_destroy_check, -- dsl_dataset_destroy_sync, dsda, dstg, 0); -- } else if (err == ENOENT) { -- err = 0; -- } else { -- (void) strcpy(failed, nvpair_name(pair)); -- break; -- } -- } -- -- if (err == 0) -- err = dsl_sync_task_group_wait(dstg); -- -- for (dst = list_head(&dstg->dstg_tasks); dst; -- dst = list_next(&dstg->dstg_tasks, dst)) { -- struct dsl_ds_destroyarg *dsda = dst->dst_arg1; -- dsl_dataset_t *ds = dsda->ds; -- -- /* -- * Return the file system name that triggered the error -- */ -- if (dst->dst_err) { -- dsl_dataset_name(ds, failed); -- } -- ASSERT3P(dsda->rm_origin, ==, NULL); -- dsl_dataset_disown(ds, dstg); -- kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); -- } -- -- dsl_sync_task_group_destroy(dstg); -- spa_close(spa, FTAG); -- return (err); -- --} -- --static boolean_t --dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) --{ -- boolean_t might_destroy = B_FALSE; -- -- mutex_enter(&ds->ds_lock); -- if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && -- DS_IS_DEFER_DESTROY(ds)) -- might_destroy = B_TRUE; -- mutex_exit(&ds->ds_lock); -- -- return (might_destroy); --} -- --/* -- * If we're removing a clone, and these three conditions are true: -- * 1) the clone's origin has no other children -- * 2) the clone's origin has no user references -- * 3) the clone's origin has been marked for deferred destruction -- * Then, prepare to remove the origin as part of this sync task group. -- */ --static int --dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) --{ -- dsl_dataset_t *ds = dsda->ds; -- dsl_dataset_t *origin = ds->ds_prev; -- -- if (dsl_dataset_might_destroy_origin(origin)) { -- char *name; -- int namelen; -- int error; -- -- namelen = dsl_dataset_namelen(origin) + 1; -- name = kmem_alloc(namelen, KM_SLEEP); -- dsl_dataset_name(origin, name); --#ifdef _KERNEL -- error = zfs_unmount_snap(name, NULL); -- if (error) { -- kmem_free(name, namelen); -- return (error); -- } --#endif -- error = dsl_dataset_own(name, B_TRUE, tag, &origin); -- kmem_free(name, namelen); -- if (error) -- return (error); -- dsda->rm_origin = origin; -- dsl_dataset_make_exclusive(origin, tag); -- } -- -- return (0); --} -- --/* -- * ds must be opened as OWNER. On return (whether successful or not), -- * ds will be closed and caller can no longer dereference it. 
-- */ --int --dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) --{ -- int err; -- dsl_sync_task_group_t *dstg; -- objset_t *os; -- dsl_dir_t *dd; -- uint64_t obj; -- struct dsl_ds_destroyarg dsda = { 0 }; -- dsl_dataset_t *dummy_ds; -- -- dsda.ds = ds; -- -- if (dsl_dataset_is_snapshot(ds)) { -- /* Destroying a snapshot is simpler */ -- dsl_dataset_make_exclusive(ds, tag); -- -- dsda.defer = defer; -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_destroy_check, dsl_dataset_destroy_sync, -- &dsda, tag, 0); -- ASSERT3P(dsda.rm_origin, ==, NULL); -- goto out; -- } else if (defer) { -- err = EINVAL; -- goto out; -- } -- -- dd = ds->ds_dir; -- dummy_ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); -- dummy_ds->ds_dir = dd; -- dummy_ds->ds_object = ds->ds_object; -- -- if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { -- /* -- * Check for errors and mark this ds as inconsistent, in -- * case we crash while freeing the objects. -- */ -- err = dsl_sync_task_do(dd->dd_pool, -- dsl_dataset_destroy_begin_check, -- dsl_dataset_destroy_begin_sync, ds, NULL, 0); -- if (err) -- goto out_free; -- -- err = dmu_objset_from_ds(ds, &os); -- if (err) -- goto out_free; -- -- /* -- * Remove all objects while in the open context so that -- * there is less work to do in the syncing context. -- */ -- for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, -- ds->ds_phys->ds_prev_snap_txg)) { -- /* -- * Ignore errors, if there is not enough disk space -- * we will deal with it in dsl_dataset_destroy_sync(). -- */ -- (void) dmu_free_object(os, obj); -- } -- if (err != ESRCH) -- goto out_free; -- -- /* -- * Sync out all in-flight IO. -- */ -- txg_wait_synced(dd->dd_pool, 0); -- -- /* -- * If we managed to free all the objects in open -- * context, the user space accounting should be zero. -- */ -- if (ds->ds_phys->ds_bp.blk_fill == 0 && -- dmu_objset_userused_enabled(os)) { -- ASSERTV(uint64_t count); -- -- ASSERT(zap_count(os, DMU_USERUSED_OBJECT, -- &count) != 0 || count == 0); -- ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, -- &count) != 0 || count == 0); -- } -- } -- -- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); -- err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); -- rw_exit(&dd->dd_pool->dp_config_rwlock); -- -- if (err) -- goto out_free; -- -- /* -- * Blow away the dsl_dir + head dataset. -- */ -- dsl_dataset_make_exclusive(ds, tag); -- /* -- * If we're removing a clone, we might also need to remove its -- * origin. -- */ -- do { -- dsda.need_prep = B_FALSE; -- if (dsl_dir_is_clone(dd)) { -- err = dsl_dataset_origin_rm_prep(&dsda, tag); -- if (err) { -- dsl_dir_close(dd, FTAG); -- goto out_free; -- } -- } -- -- dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); -- dsl_sync_task_create(dstg, dsl_dataset_destroy_check, -- dsl_dataset_destroy_sync, &dsda, tag, 0); -- dsl_sync_task_create(dstg, dsl_dir_destroy_check, -- dsl_dir_destroy_sync, dummy_ds, FTAG, 0); -- err = dsl_sync_task_group_wait(dstg); -- dsl_sync_task_group_destroy(dstg); -- -- /* -- * We could be racing against 'zfs release' or 'zfs destroy -d' -- * on the origin snap, in which case we can get EBUSY if we -- * needed to destroy the origin snap but were not ready to -- * do so. 
-- */ -- if (dsda.need_prep) { -- ASSERT(err == EBUSY); -- ASSERT(dsl_dir_is_clone(dd)); -- ASSERT(dsda.rm_origin == NULL); -- } -- } while (dsda.need_prep); -- -- if (dsda.rm_origin != NULL) -- dsl_dataset_disown(dsda.rm_origin, tag); -- -- /* if it is successful, dsl_dir_destroy_sync will close the dd */ -- if (err) -- dsl_dir_close(dd, FTAG); -- --out_free: -- kmem_free(dummy_ds, sizeof (dsl_dataset_t)); --out: -- dsl_dataset_disown(ds, tag); -- return (err); --} -- --blkptr_t * --dsl_dataset_get_blkptr(dsl_dataset_t *ds) --{ -- return (&ds->ds_phys->ds_bp); --} -- --void --dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) --{ -- ASSERT(dmu_tx_is_syncing(tx)); -- /* If it's the meta-objset, set dp_meta_rootbp */ -- if (ds == NULL) { -- tx->tx_pool->dp_meta_rootbp = *bp; -- } else { -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_phys->ds_bp = *bp; -- } --} -- --spa_t * --dsl_dataset_get_spa(dsl_dataset_t *ds) --{ -- return (ds->ds_dir->dd_pool->dp_spa); --} -- --void --dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) --{ -- dsl_pool_t *dp; -- -- if (ds == NULL) /* this is the meta-objset */ -- return; -- -- ASSERT(ds->ds_objset != NULL); -- -- if (ds->ds_phys->ds_next_snap_obj != 0) -- panic("dirtying snapshot!"); -- -- dp = ds->ds_dir->dd_pool; -- -- if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { -- /* up the hold count until we can be written out */ -- dmu_buf_add_ref(ds->ds_dbuf, ds); -- } --} -- --boolean_t --dsl_dataset_is_dirty(dsl_dataset_t *ds) --{ -- int t; -- -- for (t = 0; t < TXG_SIZE; t++) { -- if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, -- ds, t)) -- return (B_TRUE); -- } -- return (B_FALSE); --} -- --/* - * The unique space in the head dataset can be calculated by subtracting -@@ -1242,3 +781,3 @@ dsl_dataset_is_dirty(dsl_dataset_t *ds) - */ --static void -+void - dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) -@@ -1266,232 +805,6 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) - --struct killarg { -- dsl_dataset_t *ds; -- dmu_tx_t *tx; --}; -- --/* ARGSUSED */ --static int --kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, -- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) --{ -- struct killarg *ka = arg; -- dmu_tx_t *tx = ka->tx; -- -- if (bp == NULL) -- return (0); -- -- if (zb->zb_level == ZB_ZIL_LEVEL) { -- ASSERT(zilog != NULL); -- /* -- * It's a block in the intent log. It has no -- * accounting, so just free it. -- */ -- dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); -- } else { -- ASSERT(zilog == NULL); -- ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); -- (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); -- } -- -- return (0); --} -- --/* ARGSUSED */ --static int --dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -- uint64_t count; -- int err; -- -- /* -- * Can't delete a head dataset if there are snapshots of it. -- * (Except if the only snapshots are from the branch we cloned -- * from.) -- */ -- if (ds->ds_prev != NULL && -- ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) -- return (EBUSY); -- -- /* -- * This is really a dsl_dir thing, but check it here so that -- * we'll be less likely to leave this dataset inconsistent & -- * nearly destroyed. 
-- */ -- err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); -- if (err) -- return (err); -- if (count != 0) -- return (EEXIST); -- -- return (0); --} -- --/* ARGSUSED */ --static void --dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- -- /* Mark it as inconsistent on-disk, in case we crash */ -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; -- -- spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, -- "dataset = %llu", ds->ds_object); --} -- --static int --dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, -+void -+dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, - dmu_tx_t *tx) - { -- dsl_dataset_t *ds = dsda->ds; -- dsl_dataset_t *ds_prev = ds->ds_prev; -- -- if (dsl_dataset_might_destroy_origin(ds_prev)) { -- struct dsl_ds_destroyarg ndsda = {0}; -- -- /* -- * If we're not prepared to remove the origin, don't remove -- * the clone either. -- */ -- if (dsda->rm_origin == NULL) { -- dsda->need_prep = B_TRUE; -- return (EBUSY); -- } -- -- ndsda.ds = ds_prev; -- ndsda.is_origin_rm = B_TRUE; -- return (dsl_dataset_destroy_check(&ndsda, tag, tx)); -- } -- -- /* -- * If we're not going to remove the origin after all, -- * undo the open context setup. -- */ -- if (dsda->rm_origin != NULL) { -- dsl_dataset_disown(dsda->rm_origin, tag); -- dsda->rm_origin = NULL; -- } -- -- return (0); --} -- --/* -- * If you add new checks here, you may need to add -- * additional checks to the "temporary" case in -- * snapshot_check() in dmu_objset.c. -- */ --/* ARGSUSED */ --int --dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- struct dsl_ds_destroyarg *dsda = arg1; -- dsl_dataset_t *ds = dsda->ds; -- -- /* we have an owner hold, so noone else can destroy us */ -- ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); -- -- /* -- * Only allow deferred destroy on pools that support it. -- * NOTE: deferred destroy is only supported on snapshots. -- */ -- if (dsda->defer) { -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < -- SPA_VERSION_USERREFS) -- return (ENOTSUP); -- ASSERT(dsl_dataset_is_snapshot(ds)); -- return (0); -- } -- -- /* -- * Can't delete a head dataset if there are snapshots of it. -- * (Except if the only snapshots are from the branch we cloned -- * from.) -- */ -- if (ds->ds_prev != NULL && -- ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) -- return (EBUSY); -- -- /* -- * If we made changes this txg, traverse_dsl_dataset won't find -- * them. Try again. -- */ -- if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) -- return (EAGAIN); -- -- if (dsl_dataset_is_snapshot(ds)) { -- /* -- * If this snapshot has an elevated user reference count, -- * we can't destroy it yet. -- */ -- if (ds->ds_userrefs > 0 && !dsda->releasing) -- return (EBUSY); -- -- mutex_enter(&ds->ds_lock); -- /* -- * Can't delete a branch point. However, if we're destroying -- * a clone and removing its origin due to it having a user -- * hold count of 0 and having been marked for deferred destroy, -- * it's OK for the origin to have a single clone. -- */ -- if (ds->ds_phys->ds_num_children > -- (dsda->is_origin_rm ? 2 : 1)) { -- mutex_exit(&ds->ds_lock); -- return (EEXIST); -- } -- mutex_exit(&ds->ds_lock); -- } else if (dsl_dir_is_clone(ds->ds_dir)) { -- return (dsl_dataset_origin_check(dsda, arg2, tx)); -- } -- -- /* XXX we should do some i/o error checking... 
*/ -- return (0); --} -- --struct refsarg { -- kmutex_t lock; -- boolean_t gone; -- kcondvar_t cv; --}; -- --/* ARGSUSED */ --static void --dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) --{ -- struct refsarg *arg = argv; -- -- mutex_enter(&arg->lock); -- arg->gone = TRUE; -- cv_signal(&arg->cv); -- mutex_exit(&arg->lock); --} -- --static void --dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) --{ -- struct refsarg arg; -- -- mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); -- cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); -- arg.gone = FALSE; -- (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, -- dsl_dataset_refs_gone); -- dmu_buf_rele(ds->ds_dbuf, tag); -- mutex_enter(&arg.lock); -- while (!arg.gone) -- cv_wait(&arg.cv, &arg.lock); -- ASSERT(arg.gone); -- mutex_exit(&arg.lock); -- ds->ds_dbuf = NULL; -- ds->ds_phys = NULL; -- mutex_destroy(&arg.lock); -- cv_destroy(&arg.cv); --} -- --static void --remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) --{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -@@ -1512,6 +825,5 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) - */ -- if (err != ENOENT) { -+ if (err != ENOENT) - VERIFY0(err); -- } -- ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, -+ ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, - &count)); -@@ -1520,121 +832,26 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) - --static void --dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) --{ -- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -- zap_cursor_t zc; -- zap_attribute_t za; -- -- /* -- * If it is the old version, dd_clones doesn't exist so we can't -- * find the clones, but deadlist_remove_key() is a no-op so it -- * doesn't matter. 
-- */ -- if (ds->ds_dir->dd_phys->dd_clones == 0) -- return; -- -- for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); -- zap_cursor_retrieve(&zc, &za) == 0; -- zap_cursor_advance(&zc)) { -- dsl_dataset_t *clone; - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -- za.za_first_integer, FTAG, &clone)); -- if (clone->ds_dir->dd_origin_txg > mintxg) { -- dsl_deadlist_remove_key(&clone->ds_deadlist, -- mintxg, tx); -- dsl_dataset_remove_clones_key(clone, mintxg, tx); -- } -- dsl_dataset_rele(clone, FTAG); -- } -- zap_cursor_fini(&zc); -+blkptr_t * -+dsl_dataset_get_blkptr(dsl_dataset_t *ds) -+{ -+ return (&ds->ds_phys->ds_bp); - } - --struct process_old_arg { -- dsl_dataset_t *ds; -- dsl_dataset_t *ds_prev; -- boolean_t after_branch_point; -- zio_t *pio; -- uint64_t used, comp, uncomp; --}; -- --static int --process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -+void -+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) - { -- struct process_old_arg *poa = arg; -- dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; -- -- if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { -- dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); -- if (poa->ds_prev && !poa->after_branch_point && -- bp->blk_birth > -- poa->ds_prev->ds_phys->ds_prev_snap_txg) { -- poa->ds_prev->ds_phys->ds_unique_bytes += -- bp_get_dsize_sync(dp->dp_spa, bp); -- } -+ ASSERT(dmu_tx_is_syncing(tx)); -+ /* If it's the meta-objset, set dp_meta_rootbp */ -+ if (ds == NULL) { -+ tx->tx_pool->dp_meta_rootbp = *bp; - } else { -- poa->used += bp_get_dsize_sync(dp->dp_spa, bp); -- poa->comp += BP_GET_PSIZE(bp); -- poa->uncomp += BP_GET_UCSIZE(bp); -- dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_bp = *bp; - } -- return (0); - } - --static void --process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, -- dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) --{ -- struct process_old_arg poa = { 0 }; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- -- ASSERT(ds->ds_deadlist.dl_oldfmt); -- ASSERT(ds_next->ds_deadlist.dl_oldfmt); -- -- poa.ds = ds; -- poa.ds_prev = ds_prev; -- poa.after_branch_point = after_branch_point; -- poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -- VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, -- process_old_cb, &poa, tx)); -- VERIFY0(zio_wait(poa.pio)); -- ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); -- -- /* change snapused */ -- dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -- -poa.used, -poa.comp, -poa.uncomp, tx); -- -- /* swap next's deadlist to our deadlist */ -- dsl_deadlist_close(&ds->ds_deadlist); -- dsl_deadlist_close(&ds_next->ds_deadlist); -- SWITCH64(ds_next->ds_phys->ds_deadlist_obj, -- ds->ds_phys->ds_deadlist_obj); -- dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); -- dsl_deadlist_open(&ds_next->ds_deadlist, mos, -- ds_next->ds_phys->ds_deadlist_obj); --} -- --static int --old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) -+spa_t * -+dsl_dataset_get_spa(dsl_dataset_t *ds) - { -- int err; -- struct killarg ka; -- -- /* -- * Free everything that we point to (that's born after -- * the previous snapshot, if we are a clone) -- * -- * NB: this should be very quick, because we already -- * freed all the objects in open context. 
-- */ -- ka.ds = ds; -- ka.tx = tx; -- err = traverse_dataset(ds, -- ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, -- kill_blkptr, &ka); -- ASSERT0(err); -- ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); -- -- return (err); -+ return (ds->ds_dir->dd_pool->dp_spa); - } -@@ -1642,355 +859,33 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) - void --dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) -+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) - { -- struct dsl_ds_destroyarg *dsda = arg1; -- dsl_dataset_t *ds = dsda->ds; -- int err = 0; -- int after_branch_point = FALSE; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- dsl_dataset_t *ds_prev = NULL; -- boolean_t wont_destroy; -- uint64_t obj; -- -- wont_destroy = (dsda->defer && -- (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); -- -- ASSERT(ds->ds_owner || wont_destroy); -- ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); -- ASSERT(ds->ds_prev == NULL || -- ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); -- ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); -+ dsl_pool_t *dp; - -- if (wont_destroy) { -- ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; -+ if (ds == NULL) /* this is the meta-objset */ - return; -- } -- -- /* signal any waiters that this dataset is going away */ -- mutex_enter(&ds->ds_lock); -- ds->ds_owner = dsl_reaper; -- cv_broadcast(&ds->ds_exclusive_cv); -- mutex_exit(&ds->ds_lock); -- -- /* Remove our reservation */ -- if (ds->ds_reserved != 0) { -- dsl_prop_setarg_t psa; -- uint64_t value = 0; -- -- dsl_prop_setarg_init_uint64(&psa, "refreservation", -- (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), -- &value); -- psa.psa_effective_value = 0; /* predict default value */ -- -- dsl_dataset_set_reservation_sync(ds, &psa, tx); -- ASSERT0(ds->ds_reserved); -- } -- -- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); -- -- dsl_scan_ds_destroyed(ds, tx); -- -- obj = ds->ds_object; -- -- if (ds->ds_phys->ds_prev_snap_obj != 0) { -- if (ds->ds_prev) { -- ds_prev = ds->ds_prev; -- } else { -- VERIFY(0 == dsl_dataset_hold_obj(dp, -- ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); -- } -- after_branch_point = -- (ds_prev->ds_phys->ds_next_snap_obj != obj); -- -- dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); -- if (after_branch_point && -- ds_prev->ds_phys->ds_next_clones_obj != 0) { -- remove_from_next_clones(ds_prev, obj, tx); -- if (ds->ds_phys->ds_next_snap_obj != 0) { -- VERIFY(0 == zap_add_int(mos, -- ds_prev->ds_phys->ds_next_clones_obj, -- ds->ds_phys->ds_next_snap_obj, tx)); -- } -- } -- if (after_branch_point && -- ds->ds_phys->ds_next_snap_obj == 0) { -- /* This clone is toast. */ -- ASSERT(ds_prev->ds_phys->ds_num_children > 1); -- ds_prev->ds_phys->ds_num_children--; -- -- /* -- * If the clone's origin has no other clones, no -- * user holds, and has been marked for deferred -- * deletion, then we should have done the necessary -- * destroy setup for it. 
-- */ -- if (ds_prev->ds_phys->ds_num_children == 1 && -- ds_prev->ds_userrefs == 0 && -- DS_IS_DEFER_DESTROY(ds_prev)) { -- ASSERT3P(dsda->rm_origin, !=, NULL); -- } else { -- ASSERT3P(dsda->rm_origin, ==, NULL); -- } -- } else if (!after_branch_point) { -- ds_prev->ds_phys->ds_next_snap_obj = -- ds->ds_phys->ds_next_snap_obj; -- } -- } -- -- if (dsl_dataset_is_snapshot(ds)) { -- dsl_dataset_t *ds_next; -- uint64_t old_unique; -- uint64_t used = 0, comp = 0, uncomp = 0; -- -- VERIFY(0 == dsl_dataset_hold_obj(dp, -- ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); -- ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); -- -- old_unique = ds_next->ds_phys->ds_unique_bytes; -- -- dmu_buf_will_dirty(ds_next->ds_dbuf, tx); -- ds_next->ds_phys->ds_prev_snap_obj = -- ds->ds_phys->ds_prev_snap_obj; -- ds_next->ds_phys->ds_prev_snap_txg = -- ds->ds_phys->ds_prev_snap_txg; -- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, -- ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); -- -- -- if (ds_next->ds_deadlist.dl_oldfmt) { -- process_old_deadlist(ds, ds_prev, ds_next, -- after_branch_point, tx); -- } else { -- /* Adjust prev's unique space. */ -- if (ds_prev && !after_branch_point) { -- dsl_deadlist_space_range(&ds_next->ds_deadlist, -- ds_prev->ds_phys->ds_prev_snap_txg, -- ds->ds_phys->ds_prev_snap_txg, -- &used, &comp, &uncomp); -- ds_prev->ds_phys->ds_unique_bytes += used; -- } -- -- /* Adjust snapused. */ -- dsl_deadlist_space_range(&ds_next->ds_deadlist, -- ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, -- &used, &comp, &uncomp); -- dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -- -used, -comp, -uncomp, tx); -- -- /* Move blocks to be freed to pool's free list. */ -- dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, -- &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, -- tx); -- dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, -- DD_USED_HEAD, used, comp, uncomp, tx); -- -- /* Merge our deadlist into next's and free it. */ -- dsl_deadlist_merge(&ds_next->ds_deadlist, -- ds->ds_phys->ds_deadlist_obj, tx); -- } -- dsl_deadlist_close(&ds->ds_deadlist); -- dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); -- -- /* Collapse range in clone heads */ -- dsl_dataset_remove_clones_key(ds, -- ds->ds_phys->ds_creation_txg, tx); -- -- if (dsl_dataset_is_snapshot(ds_next)) { -- dsl_dataset_t *ds_nextnext; -- dsl_dataset_t *hds; -- -- /* -- * Update next's unique to include blocks which -- * were previously shared by only this snapshot -- * and it. Those blocks will be born after the -- * prev snap and before this snap, and will have -- * died after the next snap and before the one -- * after that (ie. be on the snap after next's -- * deadlist). -- */ -- VERIFY(0 == dsl_dataset_hold_obj(dp, -- ds_next->ds_phys->ds_next_snap_obj, -- FTAG, &ds_nextnext)); -- dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, -- ds->ds_phys->ds_prev_snap_txg, -- ds->ds_phys->ds_creation_txg, -- &used, &comp, &uncomp); -- ds_next->ds_phys->ds_unique_bytes += used; -- dsl_dataset_rele(ds_nextnext, FTAG); -- ASSERT3P(ds_next->ds_prev, ==, NULL); -- -- /* Collapse range in this head. 
*/ -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, -- ds->ds_dir->dd_phys->dd_head_dataset_obj, -- FTAG, &hds)); -- dsl_deadlist_remove_key(&hds->ds_deadlist, -- ds->ds_phys->ds_creation_txg, tx); -- dsl_dataset_rele(hds, FTAG); -- -- } else { -- ASSERT3P(ds_next->ds_prev, ==, ds); -- dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); -- ds_next->ds_prev = NULL; -- if (ds_prev) { -- VERIFY(0 == dsl_dataset_get_ref(dp, -- ds->ds_phys->ds_prev_snap_obj, -- ds_next, &ds_next->ds_prev)); -- } -- -- dsl_dataset_recalc_head_uniq(ds_next); -- -- /* -- * Reduce the amount of our unconsmed refreservation -- * being charged to our parent by the amount of -- * new unique data we have gained. -- */ -- if (old_unique < ds_next->ds_reserved) { -- int64_t mrsdelta; -- uint64_t new_unique = -- ds_next->ds_phys->ds_unique_bytes; -- -- ASSERT(old_unique <= new_unique); -- mrsdelta = MIN(new_unique - old_unique, -- ds_next->ds_reserved - old_unique); -- dsl_dir_diduse_space(ds->ds_dir, -- DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); -- } -- } -- dsl_dataset_rele(ds_next, FTAG); -- } else { -- zfeature_info_t *async_destroy = -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; -- objset_t *os; -- -- /* -- * There's no next snapshot, so this is a head dataset. -- * Destroy the deadlist. Unless it's a clone, the -- * deadlist should be empty. (If it's a clone, it's -- * safe to ignore the deadlist contents.) -- */ -- dsl_deadlist_close(&ds->ds_deadlist); -- dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); -- ds->ds_phys->ds_deadlist_obj = 0; -- -- VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); -- -- if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { -- err = old_synchronous_dataset_destroy(ds, tx); -- } else { -- /* -- * Move the bptree into the pool's list of trees to -- * clean up and update space accounting information. -- */ -- uint64_t used, comp, uncomp; -- -- zil_destroy_sync(dmu_objset_zil(os), tx); -- -- if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { -- spa_feature_incr(dp->dp_spa, async_destroy, tx); -- dp->dp_bptree_obj = bptree_alloc(mos, tx); -- VERIFY(zap_add(mos, -- DMU_POOL_DIRECTORY_OBJECT, -- DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, -- &dp->dp_bptree_obj, tx) == 0); -- } - -- used = ds->ds_dir->dd_phys->dd_used_bytes; -- comp = ds->ds_dir->dd_phys->dd_compressed_bytes; -- uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; -- -- ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || -- ds->ds_phys->ds_unique_bytes == used); -- -- bptree_add(mos, dp->dp_bptree_obj, -- &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, -- used, comp, uncomp, tx); -- dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, -- -used, -comp, -uncomp, tx); -- dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -- used, comp, uncomp, tx); -- } -+ ASSERT(ds->ds_objset != NULL); - -- if (ds->ds_prev != NULL) { -- if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { -- VERIFY3U(0, ==, zap_remove_int(mos, -- ds->ds_prev->ds_dir->dd_phys->dd_clones, -- ds->ds_object, tx)); -- } -- dsl_dataset_rele(ds->ds_prev, ds); -- ds->ds_prev = ds_prev = NULL; -- } -- } -+ if (ds->ds_phys->ds_next_snap_obj != 0) -+ panic("dirtying snapshot!"); - -- /* -- * This must be done after the dsl_traverse(), because it will -- * re-open the objset. 
-- */ -- if (ds->ds_objset) { -- dmu_objset_evict(ds->ds_objset); -- ds->ds_objset = NULL; -- } -+ dp = ds->ds_dir->dd_pool; - -- if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { -- /* Erase the link in the dir */ -- dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); -- ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; -- ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); -- err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); -- ASSERT(err == 0); -- } else { -- /* remove from snapshot namespace */ -- dsl_dataset_t *ds_head; -- ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); -- VERIFY(0 == dsl_dataset_hold_obj(dp, -- ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); -- VERIFY(0 == dsl_dataset_get_snapname(ds)); --#ifdef ZFS_DEBUG -- { -- uint64_t val; -- -- err = dsl_dataset_snap_lookup(ds_head, -- ds->ds_snapname, &val); -- ASSERT0(err); -- ASSERT3U(val, ==, obj); -- } --#endif -- err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); -- ASSERT(err == 0); -- dsl_dataset_rele(ds_head, FTAG); -+ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { -+ /* up the hold count until we can be written out */ -+ dmu_buf_add_ref(ds->ds_dbuf, ds); - } -+} - -- if (ds_prev && ds->ds_prev != ds_prev) -- dsl_dataset_rele(ds_prev, FTAG); -- -- spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); -- spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, -- "dataset = %llu", ds->ds_object); -- -- if (ds->ds_phys->ds_next_clones_obj != 0) { -- ASSERTV(uint64_t count); -- ASSERT(0 == zap_count(mos, -- ds->ds_phys->ds_next_clones_obj, &count) && count == 0); -- VERIFY(0 == dmu_object_free(mos, -- ds->ds_phys->ds_next_clones_obj, tx)); -- } -- if (ds->ds_phys->ds_props_obj != 0) -- VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); -- if (ds->ds_phys->ds_userrefs_obj != 0) -- VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); -- dsl_dir_close(ds->ds_dir, ds); -- ds->ds_dir = NULL; -- dsl_dataset_drain_refs(ds, tag); -- VERIFY(0 == dmu_object_free(mos, obj, tx)); -- -- if (dsda->rm_origin) { -- /* -- * Remove the origin of the clone we just destroyed. -- */ -- struct dsl_ds_destroyarg ndsda = {0}; -+boolean_t -+dsl_dataset_is_dirty(dsl_dataset_t *ds) -+{ -+ int t; - -- ndsda.ds = dsda->rm_origin; -- dsl_dataset_destroy_sync(&ndsda, tag, tx); -+ for (t = 0; t < TXG_SIZE; t++) { -+ if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, -+ ds, t)) -+ return (B_TRUE); - } -+ return (B_FALSE); - } -@@ -2013,6 +908,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) - if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - - /* -- * Propogate any reserved space for this snapshot to other -+ * Propagate any reserved space for this snapshot to other - * snapshot checks in this sync group. 
-@@ -2025,10 +920,20 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) - -+typedef struct dsl_dataset_snapshot_arg { -+ nvlist_t *ddsa_snaps; -+ nvlist_t *ddsa_props; -+ nvlist_t *ddsa_errors; -+} dsl_dataset_snapshot_arg_t; -+ - int --dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, -+ dmu_tx_t *tx, boolean_t recv) - { -- dsl_dataset_t *ds = arg1; -- const char *snapname = arg2; -- int err; -+ int error; - uint64_t value; - -+ ds->ds_trysnap_txg = tx->tx_txg; -+ -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); -+ - /* -@@ -2038,25 +943,29 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) - if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - - /* -- * Check for conflicting name snapshot name. -+ * Check for conflicting snapshot name. - */ -- err = dsl_dataset_snap_lookup(ds, snapname, &value); -- if (err == 0) -- return (EEXIST); -- if (err != ENOENT) -- return (err); -+ error = dsl_dataset_snap_lookup(ds, snapname, &value); -+ if (error == 0) -+ return (SET_ERROR(EEXIST)); -+ if (error != ENOENT) -+ return (error); - - /* -- * Check that the dataset's name is not too long. Name consists -- * of the dataset's length + 1 for the @-sign + snapshot name's length -+ * We don't allow taking snapshots of inconsistent datasets, such as -+ * those into which we are currently receiving. However, if we are -+ * creating this snapshot as part of a receive, this check will be -+ * executed atomically with respect to the completion of the receive -+ * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this -+ * case we ignore this, knowing it will be fixed up for us shortly in -+ * dmu_recv_end_sync(). 
- */ -- if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ if (!recv && DS_IS_INCONSISTENT(ds)) -+ return (SET_ERROR(EBUSY)); - -- err = dsl_dataset_snapshot_reserve_space(ds, tx); -- if (err) -- return (err); -+ error = dsl_dataset_snapshot_reserve_space(ds, tx); -+ if (error != 0) -+ return (error); - -- ds->ds_trysnap_txg = tx->tx_txg; - return (0); -@@ -2064,7 +973,50 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) - -+static int -+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_snapshot_arg_t *ddsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ int rv = 0; -+ -+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { -+ int error = 0; -+ dsl_dataset_t *ds; -+ char *name, *atp; -+ char dsname[MAXNAMELEN]; -+ -+ name = nvpair_name(pair); -+ if (strlen(name) >= MAXNAMELEN) -+ error = SET_ERROR(ENAMETOOLONG); -+ if (error == 0) { -+ atp = strchr(name, '@'); -+ if (atp == NULL) -+ error = SET_ERROR(EINVAL); -+ if (error == 0) -+ (void) strlcpy(dsname, name, atp - name + 1); -+ } -+ if (error == 0) -+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds); -+ if (error == 0) { -+ error = dsl_dataset_snapshot_check_impl(ds, -+ atp + 1, tx, B_FALSE); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ -+ if (error != 0) { -+ if (ddsa->ddsa_errors != NULL) { -+ fnvlist_add_int32(ddsa->ddsa_errors, -+ name, error); -+ } -+ rv = error; -+ } -+ } -+ return (rv); -+} -+ - void --dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, -+ dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- const char *snapname = arg2; - dsl_pool_t *dp = ds->ds_dir->dd_pool; -@@ -2074,5 +1026,16 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - objset_t *mos = dp->dp_meta_objset; -- int err; -+ ASSERTV(static zil_header_t zero_zil); -+ ASSERTV(objset_t *os); -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ /* -+ * If we are on an old pool, the zil must not be active, in which -+ * case it will be zeroed. Usually zil_suspend() accomplishes this. 
-+ */ -+ ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || -+ dmu_objset_from_ds(ds, &os) != 0 || -+ bcmp(&os->os_phys->os_zil_header, &zero_zil, -+ sizeof (zero_zil)) == 0); - -- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - -@@ -2088,3 +1051,3 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); -- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); -+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); -@@ -2123,5 +1086,5 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } else if (next_clones_obj != 0) { -- remove_from_next_clones(ds->ds_prev, -+ dsl_dataset_remove_from_next_clones(ds->ds_prev, - dsphys->ds_next_snap_obj, tx); -- VERIFY3U(0, ==, zap_add_int(mos, -+ VERIFY0(zap_add_int(mos, - next_clones_obj, dsobj, tx)); -@@ -2144,5 +1107,2 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - dmu_buf_will_dirty(ds->ds_dbuf, tx); -- zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", -- ds->ds_dir->dd_myname, snapname, dsobj, -- ds->ds_phys->ds_prev_snap_txg); - ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, -@@ -2161,9 +1121,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, -- snapname, 8, 1, &dsobj, tx); -- ASSERT(err == 0); -+ VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, -+ snapname, 8, 1, &dsobj, tx)); - - if (ds->ds_prev) -- dsl_dataset_drop_ref(ds->ds_prev, ds); -- VERIFY(0 == dsl_dataset_get_ref(dp, -+ dsl_dataset_rele(ds->ds_prev, ds); -+ VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); -@@ -2174,6 +1133,208 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, -- "dataset = %llu", dsobj); -+ spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); - } - -+static void -+dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_snapshot_arg_t *ddsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ -+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { -+ dsl_dataset_t *ds; -+ char *name, *atp; -+ char dsname[MAXNAMELEN]; -+ -+ name = nvpair_name(pair); -+ atp = strchr(name, '@'); -+ (void) strlcpy(dsname, name, atp - name + 1); -+ VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); -+ -+ dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); -+ if (ddsa->ddsa_props != NULL) { -+ dsl_props_set_sync_impl(ds->ds_prev, -+ ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); -+ } -+ dsl_dataset_rele(ds, FTAG); -+ } -+} -+ -+/* -+ * The snapshots must all be in the same pool. -+ * All-or-nothing: if there are any failures, nothing will be modified. 
-+ */ -+int -+dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) -+{ -+ dsl_dataset_snapshot_arg_t ddsa; -+ nvpair_t *pair; -+ boolean_t needsuspend; -+ int error; -+ spa_t *spa; -+ char *firstname; -+ nvlist_t *suspended = NULL; -+ -+ pair = nvlist_next_nvpair(snaps, NULL); -+ if (pair == NULL) -+ return (0); -+ firstname = nvpair_name(pair); -+ -+ error = spa_open(firstname, &spa, FTAG); -+ if (error != 0) -+ return (error); -+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); -+ spa_close(spa, FTAG); -+ -+ if (needsuspend) { -+ suspended = fnvlist_alloc(); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { -+ char fsname[MAXNAMELEN]; -+ char *snapname = nvpair_name(pair); -+ char *atp; -+ void *cookie; -+ -+ atp = strchr(snapname, '@'); -+ if (atp == NULL) { -+ error = SET_ERROR(EINVAL); -+ break; -+ } -+ (void) strlcpy(fsname, snapname, atp - snapname + 1); -+ -+ error = zil_suspend(fsname, &cookie); -+ if (error != 0) -+ break; -+ fnvlist_add_uint64(suspended, fsname, -+ (uintptr_t)cookie); -+ } -+ } -+ -+ ddsa.ddsa_snaps = snaps; -+ ddsa.ddsa_props = props; -+ ddsa.ddsa_errors = errors; -+ -+ if (error == 0) { -+ error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, -+ dsl_dataset_snapshot_sync, &ddsa, -+ fnvlist_num_pairs(snaps) * 3); -+ } -+ -+ if (suspended != NULL) { -+ for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(suspended, pair)) { -+ zil_resume((void *)(uintptr_t) -+ fnvpair_value_uint64(pair)); -+ } -+ fnvlist_free(suspended); -+ } -+ -+#ifdef _KERNEL -+ if (error == 0) { -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { -+ char *snapname = nvpair_name(pair); -+ zvol_create_minors(snapname); -+ } -+ } -+#endif -+ -+ return (error); -+} -+ -+typedef struct dsl_dataset_snapshot_tmp_arg { -+ const char *ddsta_fsname; -+ const char *ddsta_snapname; -+ minor_t ddsta_cleanup_minor; -+ const char *ddsta_htag; -+} dsl_dataset_snapshot_tmp_arg_t; -+ -+static int -+dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ -+ error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, -+ tx, B_FALSE); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } -+ -+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENOTSUP)); -+ } -+ error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, -+ B_TRUE, tx); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } -+ -+ dsl_dataset_rele(ds, FTAG); -+ return (0); -+} -+ -+static void -+dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); -+ -+ dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); -+ dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, -+ ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); -+ dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); -+ -+ dsl_dataset_rele(ds, FTAG); -+} -+ -+int -+dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, -+ minor_t cleanup_minor, const char 
*htag) -+{ -+ dsl_dataset_snapshot_tmp_arg_t ddsta; -+ int error; -+ spa_t *spa; -+ boolean_t needsuspend; -+ void *cookie; -+ -+ ddsta.ddsta_fsname = fsname; -+ ddsta.ddsta_snapname = snapname; -+ ddsta.ddsta_cleanup_minor = cleanup_minor; -+ ddsta.ddsta_htag = htag; -+ -+ error = spa_open(fsname, &spa, FTAG); -+ if (error != 0) -+ return (error); -+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); -+ spa_close(spa, FTAG); -+ -+ if (needsuspend) { -+ error = zil_suspend(fsname, &cookie); -+ if (error != 0) -+ return (error); -+ } -+ -+ error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, -+ dsl_dataset_snapshot_tmp_sync, &ddsta, 3); -+ -+ if (needsuspend) -+ zil_resume(cookie); -+ return (error); -+} -+ -+ - void -@@ -2202,11 +1363,9 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) - zap_attribute_t za; -- nvlist_t *propval; -- nvlist_t *val; -+ nvlist_t *propval = fnvlist_alloc(); -+ nvlist_t *val = fnvlist_alloc(); - -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); -- VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); -+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - - /* -- * There may me missing entries in ds_next_clones_obj -+ * There may be missing entries in ds_next_clones_obj - * due to a bug in a previous version of the code. -@@ -2215,8 +1374,7 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) - if (ds->ds_phys->ds_next_clones_obj != 0) { -- ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, -+ VERIFY0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, - &count)); - } -- if (count != ds->ds_phys->ds_num_children - 1) { -+ if (count != ds->ds_phys->ds_num_children - 1) - goto fail; -- } - for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); -@@ -2226,20 +1384,6 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) - char buf[ZFS_MAXNAMELEN]; -- /* -- * Even though we hold the dp_config_rwlock, the dataset -- * may fail to open, returning ENOENT. If there is a -- * thread concurrently attempting to destroy this -- * dataset, it will have the ds_rwlock held for -- * RW_WRITER. Our call to dsl_dataset_hold_obj() -> -- * dsl_dataset_hold_ref() will fail its -- * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the -- * dp_config_rwlock, and wait for the destroy progress -- * and signal ds_exclusive_cv. If the destroy was -- * successful, we will see that -- * DSL_DATASET_IS_DESTROYED(), and return ENOENT. 
-- */ -- if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -- za.za_first_integer, FTAG, &clone) != 0) -- continue; -+ VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -+ za.za_first_integer, FTAG, &clone)); - dsl_dir_name(clone->ds_dir, buf); -- VERIFY(nvlist_add_boolean(val, buf) == 0); -+ fnvlist_add_boolean(val, buf); - dsl_dataset_rele(clone, FTAG); -@@ -2247,5 +1391,4 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) - zap_cursor_fini(&zc); -- VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); -- VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), -- propval) == 0); -+ fnvlist_add_nvlist(propval, ZPROP_VALUE, val); -+ fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval); - fail: -@@ -2253,3 +1396,2 @@ fail: - nvlist_free(propval); -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - } -@@ -2260,4 +1402,22 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) - uint64_t refd, avail, uobjs, aobjs, ratio; -+ ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); - -- dsl_dir_stats(ds->ds_dir, nv); -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : -+ (ds->ds_phys->ds_uncompressed_bytes * 100 / -+ ds->ds_phys->ds_compressed_bytes); -+ -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, -+ ds->ds_phys->ds_uncompressed_bytes); -+ -+ if (dsl_dataset_is_snapshot(ds)) { -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, -+ ds->ds_phys->ds_unique_bytes); -+ get_clones_stat(ds, nv); -+ } else { -+ dsl_dir_stats(ds->ds_dir, nv); -+ } - -@@ -2292,6 +1452,4 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) - -- rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); -- rw_exit(&dp->dp_config_rwlock); - if (err == 0) { -@@ -2307,18 +1465,2 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) - -- ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : -- (ds->ds_phys->ds_uncompressed_bytes * 100 / -- ds->ds_phys->ds_compressed_bytes); -- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); -- -- if (ds->ds_phys->ds_next_snap_obj) { -- /* -- * This is a snapshot; override the dd's space used with -- * our unique space and compression ratio. -- */ -- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, -- ds->ds_phys->ds_unique_bytes); -- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); -- -- get_clones_stat(ds, nv); -- } - } -@@ -2328,2 +1470,5 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) - { -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ ASSERT(dsl_pool_config_held(dp)); -+ - stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; -@@ -2331,3 +1476,4 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) - stat->dds_guid = ds->ds_phys->ds_guid; -- if (ds->ds_phys->ds_next_snap_obj) { -+ stat->dds_origin[0] = '\0'; -+ if (dsl_dataset_is_snapshot(ds)) { - stat->dds_is_snapshot = B_TRUE; -@@ -2337,17 +1483,12 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) - stat->dds_num_clones = 0; -- } - -- /* clone origin is really a dsl_dir thing... 
*/ -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- if (dsl_dir_is_clone(ds->ds_dir)) { -- dsl_dataset_t *ods; -+ if (dsl_dir_is_clone(ds->ds_dir)) { -+ dsl_dataset_t *ods; - -- VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, -- ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); -- dsl_dataset_name(ods, stat->dds_origin); -- dsl_dataset_drop_ref(ods, FTAG); -- } else { -- stat->dds_origin[0] = '\0'; -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); -+ dsl_dataset_name(ods, stat->dds_origin); -+ dsl_dataset_rele(ods, FTAG); -+ } - } -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - } -@@ -2384,13 +1525,10 @@ dsl_dataset_space(dsl_dataset_t *ds, - boolean_t --dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) -+dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) - { -- ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); -- -- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || -- dsl_pool_sync_context(dp)); -- if (ds->ds_prev == NULL) -+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); -+ if (snap == NULL) - return (B_FALSE); - if (ds->ds_phys->ds_bp.blk_birth > -- ds->ds_prev->ds_phys->ds_creation_txg) { -- objset_t *os, *os_prev; -+ snap->ds_phys->ds_creation_txg) { -+ objset_t *os, *os_snap; - /* -@@ -2402,6 +1540,6 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) - return (B_TRUE); -- if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) -+ if (dmu_objset_from_ds(snap, &os_snap) != 0) - return (B_TRUE); - return (bcmp(&os->os_phys->os_meta_dnode, -- &os_prev->os_phys->os_meta_dnode, -+ &os_snap->os_phys->os_meta_dnode, - sizeof (os->os_phys->os_meta_dnode)) != 0); -@@ -2411,61 +1549,128 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) - -+typedef struct dsl_dataset_rename_snapshot_arg { -+ const char *ddrsa_fsname; -+ const char *ddrsa_oldsnapname; -+ const char *ddrsa_newsnapname; -+ boolean_t ddrsa_recursive; -+ dmu_tx_t *ddrsa_tx; -+} dsl_dataset_rename_snapshot_arg_t; -+ - /* ARGSUSED */ - static int --dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, -+ dsl_dataset_t *hds, void *arg) - { -- dsl_dataset_t *ds = arg1; -- char *newsnapname = arg2; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_dataset_t *hds; -+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; -+ int error; - uint64_t val; -- int err; - -- err = dsl_dataset_hold_obj(dd->dd_pool, -- dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); -- if (err) -- return (err); -- -- /* new name better not be in use */ -- err = dsl_dataset_snap_lookup(hds, newsnapname, &val); -- dsl_dataset_rele(hds, FTAG); -+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); -+ if (error != 0) { -+ /* ignore nonexistent snapshots */ -+ return (error == ENOENT ? 
0 : error); -+ } - -- if (err == 0) -- err = EEXIST; -- else if (err == ENOENT) -- err = 0; -+ /* new name should not exist */ -+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); -+ if (error == 0) -+ error = SET_ERROR(EEXIST); -+ else if (error == ENOENT) -+ error = 0; - - /* dataset name + 1 for the "@" + the new snapshot name must fit */ -- if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) -- err = ENAMETOOLONG; -+ if (dsl_dir_namelen(hds->ds_dir) + 1 + -+ strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN) -+ error = SET_ERROR(ENAMETOOLONG); - -- return (err); -+ return (error); - } - --static void --dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+static int -+dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- const char *newsnapname = arg2; -- dsl_dir_t *dd = ds->ds_dir; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; -- int err; -+ int error; - -- ASSERT(ds->ds_phys->ds_next_snap_obj != 0); -+ error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); -+ if (error != 0) -+ return (error); - -- VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, -- dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); -+ if (ddrsa->ddrsa_recursive) { -+ error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, -+ dsl_dataset_rename_snapshot_check_impl, ddrsa, -+ DS_FIND_CHILDREN); -+ } else { -+ error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); -+ } -+ dsl_dataset_rele(hds, FTAG); -+ return (error); -+} - -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -- err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); -- ASSERT0(err); -+static int -+dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, -+ dsl_dataset_t *hds, void *arg) -+{ -+#ifdef _KERNEL -+ char *oldname, *newname; -+#endif -+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; -+ dsl_dataset_t *ds; -+ uint64_t val; -+ dmu_tx_t *tx = ddrsa->ddrsa_tx; -+ int error; -+ -+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); -+ ASSERT(error == 0 || error == ENOENT); -+ if (error == ENOENT) { -+ /* ignore nonexistent snapshots */ -+ return (0); -+ } -+ -+ VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); -+ -+ /* log before we change the name */ -+ spa_history_log_internal_ds(ds, "rename", tx, -+ "-> @%s", ddrsa->ddrsa_newsnapname); -+ -+ VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx)); - mutex_enter(&ds->ds_lock); -- (void) strcpy(ds->ds_snapname, newsnapname); -+ (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); - mutex_exit(&ds->ds_lock); -- err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, -- ds->ds_snapname, 8, 1, &ds->ds_object, tx); -- ASSERT0(err); -+ VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, -+ ds->ds_snapname, 8, 1, &ds->ds_object, tx)); -+ -+#ifdef _KERNEL -+ oldname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); -+ newname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); -+ snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, -+ ddrsa->ddrsa_oldsnapname); -+ snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, -+ ddrsa->ddrsa_newsnapname); -+ zvol_rename_minors(oldname, newname); -+ kmem_free(newname, MAXPATHLEN); -+ kmem_free(oldname, MAXPATHLEN); -+#endif - -- spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, -- "dataset = %llu", ds->ds_object); -+ dsl_dataset_rele(ds, FTAG); -+ return (0); -+} -+ -+static void 
-+dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *hds; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); -+ ddrsa->ddrsa_tx = tx; -+ if (ddrsa->ddrsa_recursive) { -+ VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, -+ dsl_dataset_rename_snapshot_sync_impl, ddrsa, -+ DS_FIND_CHILDREN)); -+ } else { -+ VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); -+ } - dsl_dataset_rele(hds, FTAG); -@@ -2473,43 +1678,44 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - --struct renamesnaparg { -- dsl_sync_task_group_t *dstg; -- char failed[MAXPATHLEN]; -- char *oldsnap; -- char *newsnap; --}; -+int -+dsl_dataset_rename_snapshot(const char *fsname, -+ const char *oldsnapname, const char *newsnapname, boolean_t recursive) -+{ -+ dsl_dataset_rename_snapshot_arg_t ddrsa; -+ -+ ddrsa.ddrsa_fsname = fsname; -+ ddrsa.ddrsa_oldsnapname = oldsnapname; -+ ddrsa.ddrsa_newsnapname = newsnapname; -+ ddrsa.ddrsa_recursive = recursive; -+ -+ return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, -+ dsl_dataset_rename_snapshot_sync, &ddrsa, 1)); -+} - -+/* -+ * If we're doing an ownership handoff, we need to make sure that there is -+ * only one long hold on the dataset. We're not allowed to change anything here -+ * so we don't permanently release the long hold or regular hold here. We want -+ * to do this only when syncing to avoid the dataset unexpectedly going away -+ * when we release the long hold. -+ */ - static int --dsl_snapshot_rename_one(const char *name, void *arg) -+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) - { -- struct renamesnaparg *ra = arg; -- dsl_dataset_t *ds = NULL; -- char *snapname; -- int err; -+ boolean_t held; - -- snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); -- (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); - -- /* -- * For recursive snapshot renames the parent won't be changing -- * so we just pass name for both the to/from argument. -- */ -- err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); -- if (err != 0) { -- strfree(snapname); -- return (err == ENOENT ? 0 : err); -+ if (owner != NULL) { -+ VERIFY3P(ds->ds_owner, ==, owner); -+ dsl_dataset_long_rele(ds, owner); - } - --#ifdef _KERNEL -- /* -- * For all filesystems undergoing rename, we'll need to unmount it. -- */ -- (void) zfs_unmount_snap(snapname, NULL); --#endif -- err = dsl_dataset_hold(snapname, ra->dstg, &ds); -- strfree(snapname); -- if (err != 0) -- return (err == ENOENT ? 
0 : err); -+ held = dsl_dataset_long_held(ds); - -- dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, -- dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); -+ if (owner != NULL) -+ dsl_dataset_long_hold(ds, owner); -+ -+ if (held) -+ return (SET_ERROR(EBUSY)); - -@@ -2518,65 +1724,67 @@ dsl_snapshot_rename_one(const char *name, void *arg) - -+typedef struct dsl_dataset_rollback_arg { -+ const char *ddra_fsname; -+ void *ddra_owner; -+ nvlist_t *ddra_result; -+} dsl_dataset_rollback_arg_t; -+ - static int --dsl_recursive_rename(char *oldname, const char *newname) -+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) - { -- int err; -- struct renamesnaparg *ra; -- dsl_sync_task_t *dst; -- spa_t *spa; -- char *cp, *fsname = spa_strdup(oldname); -- int len = strlen(oldname) + 1; -+ dsl_dataset_rollback_arg_t *ddra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int64_t unused_refres_delta; -+ int error; - -- /* truncate the snapshot name to get the fsname */ -- cp = strchr(fsname, '@'); -- *cp = '\0'; -+ error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds); -+ if (error != 0) -+ return (error); - -- err = spa_open(fsname, &spa, FTAG); -- if (err) { -- kmem_free(fsname, len); -- return (err); -+ /* must not be a snapshot */ -+ if (dsl_dataset_is_snapshot(ds)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); - } -- ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); -- ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- -- ra->oldsnap = strchr(oldname, '@') + 1; -- ra->newsnap = strchr(newname, '@') + 1; -- *ra->failed = '\0'; - -- err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, -- DS_FIND_CHILDREN); -- kmem_free(fsname, len); -- -- if (err == 0) { -- err = dsl_sync_task_group_wait(ra->dstg); -+ /* must have a most recent snapshot */ -+ if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); - } - -- for (dst = list_head(&ra->dstg->dstg_tasks); dst; -- dst = list_next(&ra->dstg->dstg_tasks, dst)) { -- dsl_dataset_t *ds = dst->dst_arg1; -- if (dst->dst_err) { -- dsl_dir_name(ds->ds_dir, ra->failed); -- (void) strlcat(ra->failed, "@", sizeof (ra->failed)); -- (void) strlcat(ra->failed, ra->newsnap, -- sizeof (ra->failed)); -- } -- dsl_dataset_rele(ds, ra->dstg); -+ error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); - } - -- if (err) -- (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); -- -- dsl_sync_task_group_destroy(ra->dstg); -- kmem_free(ra, sizeof (struct renamesnaparg)); -- spa_close(spa, FTAG); -- return (err); --} -+ /* -+ * Check if the snap we are rolling back to uses more than -+ * the refquota. -+ */ -+ if (ds->ds_quota != 0 && -+ ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EDQUOT)); -+ } - --static int --dsl_valid_rename(const char *oldname, void *arg) --{ -- int delta = *(int *)arg; -+ /* -+ * When we do the clone swap, we will temporarily use more space -+ * due to the refreservation (the head will no longer have any -+ * unique space, so the entire amount of the refreservation will need -+ * to be free). We will immediately destroy the clone, freeing -+ * this space, but the freeing happens over many txg's. 
-+ */ -+ unused_refres_delta = (int64_t)MIN(ds->ds_reserved, -+ ds->ds_phys->ds_unique_bytes); - -- if (strlen(oldname) + delta >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ if (unused_refres_delta > 0 && -+ unused_refres_delta > -+ dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENOSPC)); -+ } - -+ dsl_dataset_rele(ds, FTAG); - return (0); -@@ -2584,60 +1792,54 @@ dsl_valid_rename(const char *oldname, void *arg) - --#pragma weak dmu_objset_rename = dsl_dataset_rename --int --dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) -+static void -+dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd; -- dsl_dataset_t *ds; -- const char *tail; -- int err; -- -- err = dsl_dir_open(oldname, FTAG, &dd, &tail); -- if (err) -- return (err); -+ dsl_dataset_rollback_arg_t *ddra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds, *clone; -+ uint64_t cloneobj; -+ char namebuf[ZFS_MAXNAMELEN]; - -- if (tail == NULL) { -- int delta = strlen(newname) - strlen(oldname); -+ VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); - -- /* if we're growing, validate child name lengths */ -- if (delta > 0) -- err = dmu_objset_find(oldname, dsl_valid_rename, -- &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -+ dsl_dataset_name(ds->ds_prev, namebuf); -+ fnvlist_add_string(ddra->ddra_result, "target", namebuf); - -- if (err == 0) -- err = dsl_dir_rename(dd, newname); -- dsl_dir_close(dd, FTAG); -- return (err); -- } -+ cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", -+ ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); - -- if (tail[0] != '@') { -- /* the name ended in a nonexistent component */ -- dsl_dir_close(dd, FTAG); -- return (ENOENT); -- } -+ VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); - -- dsl_dir_close(dd, FTAG); -+ dsl_dataset_clone_swap_sync_impl(clone, ds, tx); -+ dsl_dataset_zero_zil(ds, tx); - -- /* new name must be snapshot in same filesystem */ -- tail = strchr(newname, '@'); -- if (tail == NULL) -- return (EINVAL); -- tail++; -- if (strncmp(oldname, newname, tail - newname) != 0) -- return (EXDEV); -+ dsl_destroy_head_sync_impl(clone, tx); - -- if (recursive) { -- err = dsl_recursive_rename(oldname, newname); -- } else { -- err = dsl_dataset_hold(oldname, FTAG, &ds); -- if (err) -- return (err); -+ dsl_dataset_rele(clone, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+} - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_snapshot_rename_check, -- dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); -+/* -+ * Rolls back the given filesystem or volume to the most recent snapshot. -+ * The name of the most recent snapshot will be returned under key "target" -+ * in the result nvlist. -+ * -+ * If owner != NULL: -+ * - The existing dataset MUST be owned by the specified owner at entry -+ * - Upon return, dataset will still be held by the same owner, whether we -+ * succeed or not. -+ * -+ * This mode is required any time the existing filesystem is mounted. See -+ * notes above zfs_suspend_fs() for further details. 
-+ */ -+int -+dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result) -+{ -+ dsl_dataset_rollback_arg_t ddra; - -- dsl_dataset_rele(ds, FTAG); -- } -+ ddra.ddra_fsname = fsname; -+ ddra.ddra_owner = owner; -+ ddra.ddra_result = result; - -- return (err); -+ return (dsl_sync_task(fsname, dsl_dataset_rollback_check, -+ dsl_dataset_rollback_sync, &ddra, 1)); - } -@@ -2649,18 +1851,24 @@ struct promotenode { - --struct promotearg { -+typedef struct dsl_dataset_promote_arg { -+ const char *ddpa_clonename; -+ dsl_dataset_t *ddpa_clone; - list_t shared_snaps, origin_snaps, clone_snaps; -- dsl_dataset_t *origin_origin; -+ dsl_dataset_t *origin_origin; /* origin of the origin */ - uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; - char *err_ds; --}; -+} dsl_dataset_promote_arg_t; - - static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); -+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, -+ void *tag); -+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); - - static int --dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *hds = arg1; -- struct promotearg *pa = arg2; -- struct promotenode *snap = list_head(&pa->shared_snaps); -- dsl_dataset_t *origin_ds = snap->ds; -+ dsl_dataset_promote_arg_t *ddpa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *hds; -+ struct promotenode *snap; -+ dsl_dataset_t *origin_ds; - int err; -@@ -2668,15 +1876,27 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - -- /* Check that it is a real clone */ -- if (!dsl_dir_is_clone(hds->ds_dir)) -- return (EINVAL); -+ err = promote_hold(ddpa, dp, FTAG); -+ if (err != 0) -+ return (err); -+ -+ hds = ddpa->ddpa_clone; - -- /* Since this is so expensive, don't do the preliminary check */ -- if (!dmu_tx_is_syncing(tx)) -+ if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { -+ promote_rele(ddpa, FTAG); -+ return (SET_ERROR(EXDEV)); -+ } -+ -+ /* -+ * Compute and check the amount of space to transfer. Since this is -+ * so expensive, don't do the preliminary check. -+ */ -+ if (!dmu_tx_is_syncing(tx)) { -+ promote_rele(ddpa, FTAG); - return (0); -+ } - -- if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) -- return (EXDEV); -+ snap = list_head(&ddpa->shared_snaps); -+ origin_ds = snap->ds; - - /* compute origin's new unique space */ -- snap = list_tail(&pa->clone_snaps); -+ snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); -@@ -2684,3 +1904,3 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, -- &pa->unique, &unused, &unused); -+ &ddpa->unique, &unused, &unused); - -@@ -2690,3 +1910,3 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - * Compute space to transfer. 
Consider the incremental changes -- * to used for each snapshot: -+ * to used by each snapshot: - * (my used) = (prev's used) + (blocks born) - (blocks killed) -@@ -2701,7 +1921,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- pa->used = origin_ds->ds_phys->ds_referenced_bytes; -- pa->comp = origin_ds->ds_phys->ds_compressed_bytes; -- pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; -- for (snap = list_head(&pa->shared_snaps); snap; -- snap = list_next(&pa->shared_snaps, snap)) { -+ ddpa->used = origin_ds->ds_phys->ds_referenced_bytes; -+ ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes; -+ ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; -+ for (snap = list_head(&ddpa->shared_snaps); snap; -+ snap = list_next(&ddpa->shared_snaps, snap)) { - uint64_t val, dlused, dlcomp, dluncomp; -@@ -2709,7 +1929,17 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - -+ /* -+ * If there are long holds, we won't be able to evict -+ * the objset. -+ */ -+ if (dsl_dataset_long_held(ds)) { -+ err = SET_ERROR(EBUSY); -+ goto out; -+ } -+ - /* Check that the snapshot name does not conflict */ -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -+ VERIFY0(dsl_dataset_get_snapname(ds)); - err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); - if (err == 0) { -- err = EEXIST; -+ (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname); -+ err = SET_ERROR(EEXIST); - goto out; -@@ -2725,5 +1955,5 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - &dlused, &dlcomp, &dluncomp); -- pa->used += dlused; -- pa->comp += dlcomp; -- pa->uncomp += dluncomp; -+ ddpa->used += dlused; -+ ddpa->comp += dlcomp; -+ ddpa->uncomp += dluncomp; - } -@@ -2734,6 +1964,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (pa->origin_origin) { -- pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; -- pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; -- pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; -+ if (ddpa->origin_origin) { -+ ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes; -+ ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes; -+ ddpa->uncomp -= -+ ddpa->origin_origin->ds_phys->ds_uncompressed_bytes; - } -@@ -2742,5 +1973,5 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, -- pa->used); -- if (err) -- return (err); -+ ddpa->used); -+ if (err != 0) -+ goto out; - -@@ -2762,24 +1993,23 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- snap = list_head(&pa->origin_snaps); -- err = snaplist_space(&pa->shared_snaps, -- snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); -- if (err) -- return (err); -+ snap = list_head(&ddpa->origin_snaps); -+ err = snaplist_space(&ddpa->shared_snaps, -+ snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); -+ if (err != 0) -+ goto out; - -- err = snaplist_space(&pa->clone_snaps, -+ err = snaplist_space(&ddpa->clone_snaps, - snap->ds->ds_dir->dd_origin_txg, &space); -- if (err) -- return (err); -- pa->cloneusedsnap += space; -+ if (err != 0) -+ goto out; -+ ddpa->cloneusedsnap += space; - } - if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { -- err = snaplist_space(&pa->origin_snaps, -- origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); -- if (err) -- return (err); -+ err = snaplist_space(&ddpa->origin_snaps, -+ origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap); -+ if (err != 0) -+ goto out; 
- } - -- return (0); - out: -- pa->err_ds = snap->ds->ds_snapname; -+ promote_rele(ddpa, FTAG); - return (err); -@@ -2788,11 +2018,11 @@ out: - static void --dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *hds = arg1; -- struct promotearg *pa = arg2; -- struct promotenode *snap = list_head(&pa->shared_snaps); -- dsl_dataset_t *origin_ds = snap->ds; -+ dsl_dataset_promote_arg_t *ddpa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *hds; -+ struct promotenode *snap; -+ dsl_dataset_t *origin_ds; - dsl_dataset_t *origin_head; -- dsl_dir_t *dd = hds->ds_dir; -- dsl_pool_t *dp = hds->ds_dir->dd_pool; -+ dsl_dir_t *dd; - dsl_dir_t *odd = NULL; -@@ -2801,5 +2031,12 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); -+ VERIFY0(promote_hold(ddpa, dp, FTAG)); -+ hds = ddpa->ddpa_clone; -+ -+ ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE); - -- snap = list_head(&pa->origin_snaps); -+ snap = list_head(&ddpa->shared_snaps); -+ origin_ds = snap->ds; -+ dd = hds->ds_dir; -+ -+ snap = list_head(&ddpa->origin_snaps); - origin_head = snap->ds; -@@ -2810,3 +2047,3 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, -+ VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, - NULL, FTAG, &odd)); -@@ -2816,3 +2053,3 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; -- snap = list_tail(&pa->clone_snaps); -+ snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); -@@ -2822,4 +2059,5 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (origin_ds->ds_phys->ds_next_clones_obj) { -- remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); -- VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, -+ dsl_dataset_remove_from_next_clones(origin_ds, -+ snap->ds->ds_object, tx); -+ VERIFY0(zap_add_int(dp->dp_meta_objset, - origin_ds->ds_phys->ds_next_clones_obj, -@@ -2840,10 +2078,10 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { -- VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, -+ VERIFY0(zap_remove_int(dp->dp_meta_objset, - odd->dd_phys->dd_clones, hds->ds_object, tx)); -- VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, -- pa->origin_origin->ds_dir->dd_phys->dd_clones, -+ VERIFY0(zap_add_int(dp->dp_meta_objset, -+ ddpa->origin_origin->ds_dir->dd_phys->dd_clones, - hds->ds_object, tx)); - -- VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, -- pa->origin_origin->ds_dir->dd_phys->dd_clones, -+ VERIFY0(zap_remove_int(dp->dp_meta_objset, -+ ddpa->origin_origin->ds_dir->dd_phys->dd_clones, - origin_head->ds_object, tx)); -@@ -2853,5 +2091,4 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } -- VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, -+ VERIFY0(zap_add_int(dp->dp_meta_objset, - dd->dd_phys->dd_clones, origin_head->ds_object, tx)); -- - } -@@ -2859,7 +2096,11 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - /* move snapshots to this dir */ -- for (snap = list_head(&pa->shared_snaps); snap; -- snap = list_next(&pa->shared_snaps, snap)) { -+ for (snap = list_head(&ddpa->shared_snaps); snap; -+ snap = list_next(&ddpa->shared_snaps, snap)) { - dsl_dataset_t *ds = snap->ds; - -- /* unregister 
props as dsl_dir is changing */ -+ /* -+ * Property callbacks are registered to a particular -+ * dsl_dir. Since ours is changing, evict the objset -+ * so that they will be unregistered from the old dsl_dir. -+ */ - if (ds->ds_objset) { -@@ -2868,7 +2109,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } -+ - /* move snap name entry */ -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -- VERIFY(0 == dsl_dataset_snap_remove(origin_head, -+ VERIFY0(dsl_dataset_get_snapname(ds)); -+ VERIFY0(dsl_dataset_snap_remove(origin_head, - ds->ds_snapname, tx)); -- VERIFY(0 == zap_add(dp->dp_meta_objset, -+ VERIFY0(zap_add(dp->dp_meta_objset, - hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, -@@ -2881,4 +2123,4 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - ASSERT3P(ds->ds_dir, ==, odd); -- dsl_dir_close(ds->ds_dir, ds); -- VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, -+ dsl_dir_rele(ds->ds_dir, ds); -+ VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, - NULL, ds, &ds->ds_dir)); -@@ -2906,3 +2148,3 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, -+ VERIFY0(dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &cnds)); -@@ -2910,6 +2152,6 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- VERIFY3U(zap_remove_int(dp->dp_meta_objset, -- odd->dd_phys->dd_clones, o, tx), ==, 0); -- VERIFY3U(zap_add_int(dp->dp_meta_objset, -- dd->dd_phys->dd_clones, o, tx), ==, 0); -+ VERIFY0(zap_remove_int(dp->dp_meta_objset, -+ odd->dd_phys->dd_clones, o, tx)); -+ VERIFY0(zap_add_int(dp->dp_meta_objset, -+ dd->dd_phys->dd_clones, o, tx)); - dsl_dataset_rele(cnds, FTAG); -@@ -2919,3 +2161,3 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- ASSERT0(dsl_prop_numcb(ds)); -+ ASSERT(!dsl_prop_hascb(ds)); - } -@@ -2929,28 +2171,27 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- delta = pa->cloneusedsnap - -+ delta = ddpa->cloneusedsnap - - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, >=, 0); -- ASSERT3U(pa->used, >=, delta); -+ ASSERT3U(ddpa->used, >=, delta); - dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(dd, DD_USED_HEAD, -- pa->used - delta, pa->comp, pa->uncomp, tx); -+ ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); - -- delta = pa->originusedsnap - -+ delta = ddpa->originusedsnap - - odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, <=, 0); -- ASSERT3U(pa->used, >=, -delta); -+ ASSERT3U(ddpa->used, >=, -delta); - dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(odd, DD_USED_HEAD, -- -pa->used - delta, -pa->comp, -pa->uncomp, tx); -+ -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); - -- origin_ds->ds_phys->ds_unique_bytes = pa->unique; -+ origin_ds->ds_phys->ds_unique_bytes = ddpa->unique; - - /* log history record */ -- spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, -- "dataset = %llu", hds->ds_object); -+ spa_history_log_internal_ds(hds, "promote", tx, ""); - -- dsl_dir_close(odd, FTAG); -+ dsl_dir_rele(odd, FTAG); -+ promote_rele(ddpa, FTAG); - } - --static char *snaplist_tag = "snaplist"; - /* -@@ -2962,4 +2203,4 @@ static char *snaplist_tag = "snaplist"; - static int --snaplist_make(dsl_pool_t *dp, boolean_t own, -- uint64_t first_obj, uint64_t last_obj, list_t *l) -+snaplist_make(dsl_pool_t *dp, -+ uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) - { -@@ -2967,4 +2208,2 @@ snaplist_make(dsl_pool_t *dp, boolean_t own, 
- -- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); -- - list_create(l, sizeof (struct promotenode), -@@ -2977,19 +2216,6 @@ snaplist_make(dsl_pool_t *dp, boolean_t own, - -- if (own) { -- err = dsl_dataset_own_obj(dp, obj, -- 0, snaplist_tag, &ds); -- if (err == 0) -- dsl_dataset_make_exclusive(ds, snaplist_tag); -- } else { -- err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); -- } -- if (err == ENOENT) { -- /* lost race with snapshot destroy */ -- struct promotenode *last = list_tail(l); -- ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); -- obj = last->ds->ds_phys->ds_prev_snap_obj; -- continue; -- } else if (err) { -+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds); -+ ASSERT(err != ENOENT); -+ if (err != 0) - return (err); -- } - -@@ -2998,3 +2224,3 @@ snaplist_make(dsl_pool_t *dp, boolean_t own, - -- snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); -+ snap = kmem_alloc(sizeof (*snap), KM_PUSHPAGE); - snap->ds = ds; -@@ -3023,3 +2249,3 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) - static void --snaplist_destroy(list_t *l, boolean_t own) -+snaplist_destroy(list_t *l, void *tag) - { -@@ -3027,3 +2253,3 @@ snaplist_destroy(list_t *l, boolean_t own) - -- if (!l || !list_link_active(&l->list_head)) -+ if (l == NULL || !list_link_active(&l->list_head)) - return; -@@ -3032,7 +2258,4 @@ snaplist_destroy(list_t *l, boolean_t own) - list_remove(l, snap); -- if (own) -- dsl_dataset_disown(snap->ds, snaplist_tag); -- else -- dsl_dataset_rele(snap->ds, snaplist_tag); -- kmem_free(snap, sizeof (struct promotenode)); -+ dsl_dataset_rele(snap->ds, tag); -+ kmem_free(snap, sizeof (*snap)); - } -@@ -3041,63 +2264,37 @@ snaplist_destroy(list_t *l, boolean_t own) - --/* -- * Promote a clone. Nomenclature note: -- * "clone" or "cds": the original clone which is being promoted -- * "origin" or "ods": the snapshot which is originally clone's origin -- * "origin head" or "ohds": the dataset which is the head -- * (filesystem/volume) for the origin -- * "origin origin": the origin of the origin's filesystem (typically -- * NULL, indicating that the clone is not a clone of a clone). -- */ --int --dsl_dataset_promote(const char *name, char *conflsnap) -+static int -+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) - { -- dsl_dataset_t *ds; -+ int error; - dsl_dir_t *dd; -- dsl_pool_t *dp; -- dmu_object_info_t doi; -- struct promotearg pa; - struct promotenode *snap; -- int err; - -- bzero(&pa, sizeof(struct promotearg)); -- err = dsl_dataset_hold(name, FTAG, &ds); -- if (err) -- return (err); -- dd = ds->ds_dir; -- dp = dd->dd_pool; -- -- err = dmu_object_info(dp->dp_meta_objset, -- ds->ds_phys->ds_snapnames_zapobj, &doi); -- if (err) { -- dsl_dataset_rele(ds, FTAG); -- return (err); -- } -+ error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, -+ &ddpa->ddpa_clone); -+ if (error != 0) -+ return (error); -+ dd = ddpa->ddpa_clone->ds_dir; - -- if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { -- dsl_dataset_rele(ds, FTAG); -- return (EINVAL); -+ if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) || -+ !dsl_dir_is_clone(dd)) { -+ dsl_dataset_rele(ddpa->ddpa_clone, tag); -+ return (SET_ERROR(EINVAL)); - } - -- /* -- * We are going to inherit all the snapshots taken before our -- * origin (i.e., our new origin will be our parent's origin). -- * Take ownership of them so that we can rename them into our -- * namespace. 
-- */ -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- -- err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, -- &pa.shared_snaps); -- if (err != 0) -+ error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj, -+ &ddpa->shared_snaps, tag); -+ if (error != 0) - goto out; - -- err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); -- if (err != 0) -+ error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, -+ &ddpa->clone_snaps, tag); -+ if (error != 0) - goto out; - -- snap = list_head(&pa.shared_snaps); -+ snap = list_head(&ddpa->shared_snaps); - ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); -- err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, -- snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); -- if (err != 0) -+ error = snaplist_make(dp, dd->dd_phys->dd_origin_obj, -+ snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, -+ &ddpa->origin_snaps, tag); -+ if (error != 0) - goto out; -@@ -3105,86 +2302,109 @@ dsl_dataset_promote(const char *name, char *conflsnap) - if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { -- err = dsl_dataset_hold_obj(dp, -+ error = dsl_dataset_hold_obj(dp, - snap->ds->ds_dir->dd_phys->dd_origin_obj, -- FTAG, &pa.origin_origin); -- if (err != 0) -+ tag, &ddpa->origin_origin); -+ if (error != 0) - goto out; - } -- - out: -- rw_exit(&dp->dp_config_rwlock); -+ if (error != 0) -+ promote_rele(ddpa, tag); -+ return (error); -+} -+ -+static void -+promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) -+{ -+ snaplist_destroy(&ddpa->shared_snaps, tag); -+ snaplist_destroy(&ddpa->clone_snaps, tag); -+ snaplist_destroy(&ddpa->origin_snaps, tag); -+ if (ddpa->origin_origin != NULL) -+ dsl_dataset_rele(ddpa->origin_origin, tag); -+ dsl_dataset_rele(ddpa->ddpa_clone, tag); -+} -+ -+/* -+ * Promote a clone. -+ * -+ * If it fails due to a conflicting snapshot name, "conflsnap" will be filled -+ * in with the name. (It must be at least MAXNAMELEN bytes long.) -+ */ -+int -+dsl_dataset_promote(const char *name, char *conflsnap) -+{ -+ dsl_dataset_promote_arg_t ddpa = { 0 }; -+ uint64_t numsnaps; -+ int error; -+ objset_t *os; - - /* -- * Add in 128x the snapnames zapobj size, since we will be moving -- * a bunch of snapnames to the promoted ds, and dirtying their -- * bonus buffers. -+ * We will modify space proportional to the number of -+ * snapshots. Compute numsnaps. 
- */ -- if (err == 0) { -- err = dsl_sync_task_do(dp, dsl_dataset_promote_check, -- dsl_dataset_promote_sync, ds, &pa, -- 2 + 2 * doi.doi_physical_blocks_512); -- if (err && pa.err_ds && conflsnap) -- (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); -- } -+ error = dmu_objset_hold(name, FTAG, &os); -+ if (error != 0) -+ return (error); -+ error = zap_count(dmu_objset_pool(os)->dp_meta_objset, -+ dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps); -+ dmu_objset_rele(os, FTAG); -+ if (error != 0) -+ return (error); - -- snaplist_destroy(&pa.shared_snaps, B_TRUE); -- snaplist_destroy(&pa.clone_snaps, B_FALSE); -- snaplist_destroy(&pa.origin_snaps, B_FALSE); -- if (pa.origin_origin) -- dsl_dataset_rele(pa.origin_origin, FTAG); -- dsl_dataset_rele(ds, FTAG); -- return (err); --} -+ ddpa.ddpa_clonename = name; -+ ddpa.err_ds = conflsnap; - --struct cloneswaparg { -- dsl_dataset_t *cds; /* clone dataset */ -- dsl_dataset_t *ohds; /* origin's head dataset */ -- boolean_t force; -- int64_t unused_refres_delta; /* change in unconsumed refreservation */ --}; -+ return (dsl_sync_task(name, dsl_dataset_promote_check, -+ dsl_dataset_promote_sync, &ddpa, 2 + numsnaps)); -+} - --/* ARGSUSED */ --static int --dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) -+int -+dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, -+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) - { -- struct cloneswaparg *csa = arg1; -+ int64_t unused_refres_delta; - - /* they should both be heads */ -- if (dsl_dataset_is_snapshot(csa->cds) || -- dsl_dataset_is_snapshot(csa->ohds)) -- return (EINVAL); -+ if (dsl_dataset_is_snapshot(clone) || -+ dsl_dataset_is_snapshot(origin_head)) -+ return (SET_ERROR(EINVAL)); - -- /* the branch point should be just before them */ -- if (csa->cds->ds_prev != csa->ohds->ds_prev) -- return (EINVAL); -+ /* if we are not forcing, the branch point should be just before them */ -+ if (!force && clone->ds_prev != origin_head->ds_prev) -+ return (SET_ERROR(EINVAL)); - -- /* cds should be the clone (unless they are unrelated) */ -- if (csa->cds->ds_prev != NULL && -- csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && -- csa->ohds->ds_object != -- csa->cds->ds_prev->ds_phys->ds_next_snap_obj) -- return (EINVAL); -+ /* clone should be the clone (unless they are unrelated) */ -+ if (clone->ds_prev != NULL && -+ clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && -+ origin_head->ds_dir != clone->ds_prev->ds_dir) -+ return (SET_ERROR(EINVAL)); - - /* the clone should be a child of the origin */ -- if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) -- return (EINVAL); -- -- /* ohds shouldn't be modified unless 'force' */ -- if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) -- return (ETXTBSY); -- -- /* adjust amount of any unconsumed refreservation */ -- csa->unused_refres_delta = -- (int64_t)MIN(csa->ohds->ds_reserved, -- csa->ohds->ds_phys->ds_unique_bytes) - -- (int64_t)MIN(csa->ohds->ds_reserved, -- csa->cds->ds_phys->ds_unique_bytes); -- -- if (csa->unused_refres_delta > 0 && -- csa->unused_refres_delta > -- dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) -- return (ENOSPC); -- -- if (csa->ohds->ds_quota != 0 && -- csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) -- return (EDQUOT); -+ if (clone->ds_dir->dd_parent != origin_head->ds_dir) -+ return (SET_ERROR(EINVAL)); -+ -+ /* origin_head shouldn't be modified unless 'force' */ -+ if (!force && -+ dsl_dataset_modified_since_snap(origin_head, 
origin_head->ds_prev)) -+ return (SET_ERROR(ETXTBSY)); -+ -+ /* origin_head should have no long holds (e.g. is not mounted) */ -+ if (dsl_dataset_handoff_check(origin_head, owner, tx)) -+ return (SET_ERROR(EBUSY)); -+ -+ /* check amount of any unconsumed refreservation */ -+ unused_refres_delta = -+ (int64_t)MIN(origin_head->ds_reserved, -+ origin_head->ds_phys->ds_unique_bytes) - -+ (int64_t)MIN(origin_head->ds_reserved, -+ clone->ds_phys->ds_unique_bytes); -+ -+ if (unused_refres_delta > 0 && -+ unused_refres_delta > -+ dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) -+ return (SET_ERROR(ENOSPC)); -+ -+ /* clone can't be over the head's refquota */ -+ if (origin_head->ds_quota != 0 && -+ clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota) -+ return (SET_ERROR(EDQUOT)); - -@@ -3193,26 +2413,33 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) - --/* ARGSUSED */ --static void --dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+void -+dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, -+ dsl_dataset_t *origin_head, dmu_tx_t *tx) - { -- struct cloneswaparg *csa = arg1; -- dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ int64_t unused_refres_delta; - -- ASSERT(csa->cds->ds_reserved == 0); -- ASSERT(csa->ohds->ds_quota == 0 || -- csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); -+ ASSERT(clone->ds_reserved == 0); -+ ASSERT(origin_head->ds_quota == 0 || -+ clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota); -+ ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); - -- dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); -- dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); -+ dmu_buf_will_dirty(clone->ds_dbuf, tx); -+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - -- if (csa->cds->ds_objset != NULL) { -- dmu_objset_evict(csa->cds->ds_objset); -- csa->cds->ds_objset = NULL; -+ if (clone->ds_objset != NULL) { -+ dmu_objset_evict(clone->ds_objset); -+ clone->ds_objset = NULL; - } - -- if (csa->ohds->ds_objset != NULL) { -- dmu_objset_evict(csa->ohds->ds_objset); -- csa->ohds->ds_objset = NULL; -+ if (origin_head->ds_objset != NULL) { -+ dmu_objset_evict(origin_head->ds_objset); -+ origin_head->ds_objset = NULL; - } - -+ unused_refres_delta = -+ (int64_t)MIN(origin_head->ds_reserved, -+ origin_head->ds_phys->ds_unique_bytes) - -+ (int64_t)MIN(origin_head->ds_reserved, -+ clone->ds_phys->ds_unique_bytes); -+ - /* -@@ -3220,4 +2447,4 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (csa->cds->ds_prev) { -- dsl_dataset_t *origin = csa->cds->ds_prev; -+ if (clone->ds_prev) { -+ dsl_dataset_t *origin = clone->ds_prev; - uint64_t comp, uncomp; -@@ -3225,3 +2452,3 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - dmu_buf_will_dirty(origin->ds_dbuf, tx); -- dsl_deadlist_space_range(&csa->cds->ds_deadlist, -+ dsl_deadlist_space_range(&clone->ds_deadlist, - origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, -@@ -3233,5 +2460,5 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - blkptr_t tmp; -- tmp = csa->ohds->ds_phys->ds_bp; -- csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; -- csa->cds->ds_phys->ds_bp = tmp; -+ tmp = origin_head->ds_phys->ds_bp; -+ origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp; -+ clone->ds_phys->ds_bp = tmp; - } -@@ -3244,21 +2471,21 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- ASSERT3U(csa->cds->ds_dir->dd_phys-> -+ ASSERT3U(clone->ds_dir->dd_phys-> - 
dd_used_breakdown[DD_USED_SNAP], ==, 0); - -- dsl_deadlist_space(&csa->cds->ds_deadlist, -+ dsl_deadlist_space(&clone->ds_deadlist, - &cdl_used, &cdl_comp, &cdl_uncomp); -- dsl_deadlist_space(&csa->ohds->ds_deadlist, -+ dsl_deadlist_space(&origin_head->ds_deadlist, - &odl_used, &odl_comp, &odl_uncomp); - -- dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - -- (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); -- dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - -- (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); -- duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + -+ dused = clone->ds_phys->ds_referenced_bytes + cdl_used - -+ (origin_head->ds_phys->ds_referenced_bytes + odl_used); -+ dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp - -+ (origin_head->ds_phys->ds_compressed_bytes + odl_comp); -+ duncomp = clone->ds_phys->ds_uncompressed_bytes + - cdl_uncomp - -- (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); -+ (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp); - -- dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, -+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, - dused, dcomp, duncomp, tx); -- dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, -+ dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, - -dused, -dcomp, -duncomp, tx); -@@ -3271,9 +2498,9 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- dsl_deadlist_space_range(&csa->cds->ds_deadlist, -- csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, -+ dsl_deadlist_space_range(&clone->ds_deadlist, -+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX, - &cdl_used, &cdl_comp, &cdl_uncomp); -- dsl_deadlist_space_range(&csa->ohds->ds_deadlist, -- csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, -+ dsl_deadlist_space_range(&origin_head->ds_deadlist, -+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX, - &odl_used, &odl_comp, &odl_uncomp); -- dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, -+ dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, - DD_USED_HEAD, DD_USED_SNAP, tx); -@@ -3282,14 +2509,14 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - /* swap ds_*_bytes */ -- SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, -- csa->cds->ds_phys->ds_referenced_bytes); -- SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, -- csa->cds->ds_phys->ds_compressed_bytes); -- SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, -- csa->cds->ds_phys->ds_uncompressed_bytes); -- SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, -- csa->cds->ds_phys->ds_unique_bytes); -+ SWITCH64(origin_head->ds_phys->ds_referenced_bytes, -+ clone->ds_phys->ds_referenced_bytes); -+ SWITCH64(origin_head->ds_phys->ds_compressed_bytes, -+ clone->ds_phys->ds_compressed_bytes); -+ SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes, -+ clone->ds_phys->ds_uncompressed_bytes); -+ SWITCH64(origin_head->ds_phys->ds_unique_bytes, -+ clone->ds_phys->ds_unique_bytes); - - /* apply any parent delta for change in unconsumed refreservation */ -- dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, -- csa->unused_refres_delta, 0, 0, tx); -+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, -+ unused_refres_delta, 0, 0, tx); - -@@ -3298,52 +2525,15 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- dsl_deadlist_close(&csa->cds->ds_deadlist); -- dsl_deadlist_close(&csa->ohds->ds_deadlist); -- SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, -- csa->cds->ds_phys->ds_deadlist_obj); -- dsl_deadlist_open(&csa->cds->ds_deadlist, 
dp->dp_meta_objset, -- csa->cds->ds_phys->ds_deadlist_obj); -- dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, -- csa->ohds->ds_phys->ds_deadlist_obj); -- -- dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); --} -+ dsl_deadlist_close(&clone->ds_deadlist); -+ dsl_deadlist_close(&origin_head->ds_deadlist); -+ SWITCH64(origin_head->ds_phys->ds_deadlist_obj, -+ clone->ds_phys->ds_deadlist_obj); -+ dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, -+ clone->ds_phys->ds_deadlist_obj); -+ dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, -+ origin_head->ds_phys->ds_deadlist_obj); - --/* -- * Swap 'clone' with its origin head datasets. Used at the end of "zfs -- * recv" into an existing fs to swizzle the file system to the new -- * version, and by "zfs rollback". Can also be used to swap two -- * independent head datasets if neither has any snapshots. -- */ --int --dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, -- boolean_t force) --{ -- struct cloneswaparg csa; -- int error; -+ dsl_scan_ds_clone_swapped(origin_head, clone, tx); - -- ASSERT(clone->ds_owner); -- ASSERT(origin_head->ds_owner); --retry: -- /* -- * Need exclusive access for the swap. If we're swapping these -- * datasets back after an error, we already hold the locks. -- */ -- if (!RW_WRITE_HELD(&clone->ds_rwlock)) -- rw_enter(&clone->ds_rwlock, RW_WRITER); -- if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && -- !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { -- rw_exit(&clone->ds_rwlock); -- rw_enter(&origin_head->ds_rwlock, RW_WRITER); -- if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { -- rw_exit(&origin_head->ds_rwlock); -- goto retry; -- } -- } -- csa.cds = clone; -- csa.ohds = origin_head; -- csa.force = force; -- error = dsl_sync_task_do(clone->ds_dir->dd_pool, -- dsl_dataset_clone_swap_check, -- dsl_dataset_clone_swap_sync, &csa, NULL, 9); -- return (error); -+ spa_history_log_internal_ds(clone, "clone swap", tx, -+ "parent=%s", origin_head->ds_dir->dd_myname); - } -@@ -3357,3 +2547,2 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) - { -- spa_t *spa; - dsl_pool_t *dp; -@@ -3362,7 +2551,8 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) - -- if ((error = spa_open(pname, &spa, FTAG)) != 0) -+ error = dsl_pool_hold(pname, FTAG, &dp); -+ if (error != 0) - return (error); -- dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { -+ -+ error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); -+ if (error == 0) { - dsl_dataset_name(ds, buf); -@@ -3370,4 +2560,3 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) - } -- rw_exit(&dp->dp_config_rwlock); -- spa_close(spa, FTAG); -+ dsl_pool_rele(dp, FTAG); - -@@ -3415,7 +2604,5 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - ds->ds_phys->ds_referenced_bytes < ds->ds_quota) -- error = ERESTART; -+ error = SET_ERROR(ERESTART); - else -- error = EDQUOT; -- -- DMU_TX_STAT_BUMP(dmu_tx_quota); -+ error = SET_ERROR(EDQUOT); - } -@@ -3426,23 +2613,51 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - -+typedef struct dsl_dataset_set_qr_arg { -+ const char *ddsqra_name; -+ zprop_source_t ddsqra_source; -+ uint64_t ddsqra_value; -+} dsl_dataset_set_qr_arg_t; -+ -+ - /* ARGSUSED */ - static int --dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = 
arg2; -- int err; -+ dsl_dataset_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ uint64_t newval; - -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) -- return (ENOTSUP); -+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) -+ return (SET_ERROR(ENOTSUP)); - -- if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) -- return (err); -+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); -+ if (error != 0) -+ return (error); - -- if (psa->psa_effective_value == 0) -+ if (dsl_dataset_is_snapshot(ds)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } -+ -+ error = dsl_prop_predict(ds->ds_dir, -+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } -+ -+ if (newval == 0) { -+ dsl_dataset_rele(ds, FTAG); - return (0); -+ } - -- if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || -- psa->psa_effective_value < ds->ds_reserved) -- return (ENOSPC); -+ if (newval < ds->ds_phys->ds_referenced_bytes || -+ newval < ds->ds_reserved) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENOSPC)); -+ } - -+ dsl_dataset_rele(ds, FTAG); - return (0); -@@ -3450,18 +2665,25 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) - --extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); -- --void --dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+static void -+dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value = psa->psa_effective_value; -+ dsl_dataset_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ uint64_t newval; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); -+ -+ dsl_prop_set_sync_impl(ds, -+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), -+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, -+ &ddsqra->ddsqra_value, tx); - -- dsl_prop_set_sync(ds, psa, tx); -- DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); -+ VERIFY0(dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); - -- if (ds->ds_quota != effective_value) { -+ if (ds->ds_quota != newval) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_quota = effective_value; -+ ds->ds_quota = newval; - } -+ dsl_dataset_rele(ds, FTAG); - } -@@ -3469,26 +2691,13 @@ dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) - int --dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) -+dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, -+ uint64_t refquota) - { -- dsl_dataset_t *ds; -- dsl_prop_setarg_t psa; -- int err; -- -- dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); -- -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -+ dsl_dataset_set_qr_arg_t ddsqra; - -- /* -- * If someone removes a file, then tries to set the quota, we -- * want to make sure the file freeing takes effect. 
-- */ -- txg_wait_open(ds->ds_dir->dd_pool, 0); -+ ddsqra.ddsqra_name = dsname; -+ ddsqra.ddsqra_source = source; -+ ddsqra.ddsqra_value = refquota; - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, -- ds, &psa, 0); -- -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, -+ dsl_dataset_set_refquota_sync, &ddsqra, 0)); - } -@@ -3496,21 +2705,29 @@ dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) - static int --dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value; -- uint64_t unique; -- int err; -+ dsl_dataset_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ uint64_t newval, unique; - -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < -- SPA_VERSION_REFRESERVATION) -- return (ENOTSUP); -+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) -+ return (SET_ERROR(ENOTSUP)); - -- if (dsl_dataset_is_snapshot(ds)) -- return (EINVAL); -+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); -+ if (error != 0) -+ return (error); - -- if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) -- return (err); -+ if (dsl_dataset_is_snapshot(ds)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } - -- effective_value = psa->psa_effective_value; -+ error = dsl_prop_predict(ds->ds_dir, -+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } - -@@ -3520,4 +2737,6 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (!dmu_tx_is_syncing(tx)) -+ if (!dmu_tx_is_syncing(tx)) { -+ dsl_dataset_rele(ds, FTAG); - return (0); -+ } - -@@ -3529,13 +2748,15 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - -- if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { -- uint64_t delta = MAX(unique, effective_value) - -+ if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { -+ uint64_t delta = MAX(unique, newval) - - MAX(unique, ds->ds_reserved); - -- if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) -- return (ENOSPC); -- if (ds->ds_quota > 0 && -- effective_value > ds->ds_quota) -- return (ENOSPC); -+ if (delta > -+ dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || -+ (ds->ds_quota > 0 && newval > ds->ds_quota)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENOSPC)); -+ } - } - -+ dsl_dataset_rele(ds, FTAG); - return (0); -@@ -3543,8 +2764,7 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - --static void --dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+void -+dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, -+ zprop_source_t source, uint64_t value, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value = psa->psa_effective_value; -+ uint64_t newval; - uint64_t unique; -@@ -3552,7 +2772,9 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- dsl_prop_set_sync(ds, psa, tx); -- DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); -+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), -+ source, sizeof (value), 1, &value, tx); - -- 
dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ VERIFY0(dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); - -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_dir->dd_lock); -@@ -3561,5 +2783,5 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - unique = ds->ds_phys->ds_unique_bytes; -- delta = MAX(0, (int64_t)(effective_value - unique)) - -+ delta = MAX(0, (int64_t)(newval - unique)) - - MAX(0, (int64_t)(ds->ds_reserved - unique)); -- ds->ds_reserved = effective_value; -+ ds->ds_reserved = newval; - mutex_exit(&ds->ds_lock); -@@ -3570,596 +2792,28 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - --int --dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, -- uint64_t reservation) --{ -- dsl_dataset_t *ds; -- dsl_prop_setarg_t psa; -- int err; -- -- dsl_prop_setarg_init_uint64(&psa, "refreservation", source, -- &reservation); -- -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -- -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_set_reservation_check, -- dsl_dataset_set_reservation_sync, ds, &psa, 0); -- -- dsl_dataset_rele(ds, FTAG); -- return (err); --} -- --typedef struct zfs_hold_cleanup_arg { -- dsl_pool_t *dp; -- uint64_t dsobj; -- char htag[MAXNAMELEN]; --} zfs_hold_cleanup_arg_t; -- - static void --dsl_dataset_user_release_onexit(void *arg) --{ -- zfs_hold_cleanup_arg_t *ca = arg; -- -- (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, -- B_TRUE); -- kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); --} -- --void --dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, -- minor_t minor) --{ -- zfs_hold_cleanup_arg_t *ca; -- -- ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); -- ca->dp = ds->ds_dir->dd_pool; -- ca->dsobj = ds->ds_object; -- (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); -- VERIFY3U(0, ==, zfs_onexit_add_cb(minor, -- dsl_dataset_user_release_onexit, ca, NULL)); --} -- --/* -- * If you add new checks here, you may need to add -- * additional checks to the "temporary" case in -- * snapshot_check() in dmu_objset.c. -- */ --static int --dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- struct dsl_ds_holdarg *ha = arg2; -- char *htag = ha->htag; -- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -- int error = 0; -- -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) -- return (ENOTSUP); -- -- if (!dsl_dataset_is_snapshot(ds)) -- return (EINVAL); -- -- /* tags must be unique */ -- mutex_enter(&ds->ds_lock); -- if (ds->ds_phys->ds_userrefs_obj) { -- error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, -- 8, 1, tx); -- if (error == 0) -- error = EEXIST; -- else if (error == ENOENT) -- error = 0; -- } -- mutex_exit(&ds->ds_lock); -- -- if (error == 0 && ha->temphold && -- strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) -- error = E2BIG; -- -- return (error); --} -- --void --dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- struct dsl_ds_holdarg *ha = arg2; -- char *htag = ha->htag; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- uint64_t now = gethrestime_sec(); -- uint64_t zapobj; -- -- mutex_enter(&ds->ds_lock); -- if (ds->ds_phys->ds_userrefs_obj == 0) { -- /* -- * This is the first user hold for this dataset. Create -- * the userrefs zap object. 
-- */ -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- zapobj = ds->ds_phys->ds_userrefs_obj = -- zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); -- } else { -- zapobj = ds->ds_phys->ds_userrefs_obj; -- } -- ds->ds_userrefs++; -- mutex_exit(&ds->ds_lock); -- -- VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); -- -- if (ha->temphold) { -- VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, -- htag, &now, tx)); -- } -- -- spa_history_log_internal(LOG_DS_USER_HOLD, -- dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, -- (int)ha->temphold, ds->ds_object); --} -- --static int --dsl_dataset_user_hold_one(const char *dsname, void *arg) --{ -- struct dsl_ds_holdarg *ha = arg; -+ dsl_dataset_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; -- int error; -- char *name; -- -- /* alloc a buffer to hold dsname@snapname plus terminating NULL */ -- name = kmem_asprintf("%s@%s", dsname, ha->snapname); -- error = dsl_dataset_hold(name, ha->dstg, &ds); -- strfree(name); -- if (error == 0) { -- ha->gotone = B_TRUE; -- dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, -- dsl_dataset_user_hold_sync, ds, ha, 0); -- } else if (error == ENOENT && ha->recursive) { -- error = 0; -- } else { -- (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); -- } -- return (error); --} -- --int --dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, -- boolean_t temphold) --{ -- struct dsl_ds_holdarg *ha; -- int error; -- -- ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); -- ha->htag = htag; -- ha->temphold = temphold; -- error = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, -- ds, ha, 0); -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- -- return (error); --} -- --int --dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, -- boolean_t recursive, boolean_t temphold, int cleanup_fd) --{ -- struct dsl_ds_holdarg *ha; -- dsl_sync_task_t *dst; -- spa_t *spa; -- int error; -- minor_t minor = 0; -- -- if (cleanup_fd != -1) { -- /* Currently we only support cleanup-on-exit of tempholds. */ -- if (!temphold) -- return (EINVAL); -- error = zfs_onexit_fd_hold(cleanup_fd, &minor); -- if (error) -- return (error); -- } -- -- ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); -- -- (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); -- -- error = spa_open(dsname, &spa, FTAG); -- if (error) { -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- if (cleanup_fd != -1) -- zfs_onexit_fd_rele(cleanup_fd); -- return (error); -- } -- -- ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- ha->htag = htag; -- ha->snapname = snapname; -- ha->recursive = recursive; -- ha->temphold = temphold; -- -- if (recursive) { -- error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, -- ha, DS_FIND_CHILDREN); -- } else { -- error = dsl_dataset_user_hold_one(dsname, ha); -- } -- if (error == 0) -- error = dsl_sync_task_group_wait(ha->dstg); -- -- for (dst = list_head(&ha->dstg->dstg_tasks); dst; -- dst = list_next(&ha->dstg->dstg_tasks, dst)) { -- dsl_dataset_t *ds = dst->dst_arg1; - -- if (dst->dst_err) { -- dsl_dataset_name(ds, ha->failed); -- *strchr(ha->failed, '@') = '\0'; -- } else if (error == 0 && minor != 0 && temphold) { -- /* -- * If this hold is to be released upon process exit, -- * register that action now. 
-- */ -- dsl_register_onexit_hold_cleanup(ds, htag, minor); -- } -- dsl_dataset_rele(ds, ha->dstg); -- } -- -- if (error == 0 && recursive && !ha->gotone) -- error = ENOENT; -- -- if (error) -- (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); -- -- dsl_sync_task_group_destroy(ha->dstg); -- -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- spa_close(spa, FTAG); -- if (cleanup_fd != -1) -- zfs_onexit_fd_rele(cleanup_fd); -- return (error); --} -- --struct dsl_ds_releasearg { -- dsl_dataset_t *ds; -- const char *htag; -- boolean_t own; /* do we own or just hold ds? */ --}; -- --static int --dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, -- boolean_t *might_destroy) --{ -- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -- uint64_t zapobj; -- uint64_t tmp; -- int error; -- -- *might_destroy = B_FALSE; -- -- mutex_enter(&ds->ds_lock); -- zapobj = ds->ds_phys->ds_userrefs_obj; -- if (zapobj == 0) { -- /* The tag can't possibly exist */ -- mutex_exit(&ds->ds_lock); -- return (ESRCH); -- } -- -- /* Make sure the tag exists */ -- error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); -- if (error) { -- mutex_exit(&ds->ds_lock); -- if (error == ENOENT) -- error = ESRCH; -- return (error); -- } -- -- if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && -- DS_IS_DEFER_DESTROY(ds)) -- *might_destroy = B_TRUE; -- -- mutex_exit(&ds->ds_lock); -- return (0); --} -- --static int --dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) --{ -- struct dsl_ds_releasearg *ra = arg1; -- dsl_dataset_t *ds = ra->ds; -- boolean_t might_destroy; -- int error; -- -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) -- return (ENOTSUP); -- -- error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); -- if (error) -- return (error); -- -- if (might_destroy) { -- struct dsl_ds_destroyarg dsda = {0}; -- -- if (dmu_tx_is_syncing(tx)) { -- /* -- * If we're not prepared to remove the snapshot, -- * we can't allow the release to happen right now. 
-- */ -- if (!ra->own) -- return (EBUSY); -- } -- dsda.ds = ds; -- dsda.releasing = B_TRUE; -- return (dsl_dataset_destroy_check(&dsda, tag, tx)); -- } -- -- return (0); --} -- --static void --dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) --{ -- struct dsl_ds_releasearg *ra = arg1; -- dsl_dataset_t *ds = ra->ds; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- uint64_t zapobj; -- uint64_t dsobj = ds->ds_object; -- uint64_t refs; -- int error; -- -- mutex_enter(&ds->ds_lock); -- ds->ds_userrefs--; -- refs = ds->ds_userrefs; -- mutex_exit(&ds->ds_lock); -- error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); -- VERIFY(error == 0 || error == ENOENT); -- zapobj = ds->ds_phys->ds_userrefs_obj; -- VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); -- -- spa_history_log_internal(LOG_DS_USER_RELEASE, -- dp->dp_spa, tx, "<%s> %lld dataset = %llu", -- ra->htag, (longlong_t)refs, dsobj); -- -- if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && -- DS_IS_DEFER_DESTROY(ds)) { -- struct dsl_ds_destroyarg dsda = {0}; -- -- ASSERT(ra->own); -- dsda.ds = ds; -- dsda.releasing = B_TRUE; -- /* We already did the destroy_check */ -- dsl_dataset_destroy_sync(&dsda, tag, tx); -- } --} -- --static int --dsl_dataset_user_release_one(const char *dsname, void *arg) --{ -- struct dsl_ds_holdarg *ha = arg; -- struct dsl_ds_releasearg *ra; -- dsl_dataset_t *ds; -- int error; -- void *dtag = ha->dstg; -- char *name; -- boolean_t own = B_FALSE; -- boolean_t might_destroy; -- -- /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ -- name = kmem_asprintf("%s@%s", dsname, ha->snapname); -- error = dsl_dataset_hold(name, dtag, &ds); -- strfree(name); -- if (error == ENOENT && ha->recursive) -- return (0); -- (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); -- if (error) -- return (error); -- -- ha->gotone = B_TRUE; -- -- ASSERT(dsl_dataset_is_snapshot(ds)); -- -- error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); -- if (error) { -- dsl_dataset_rele(ds, dtag); -- return (error); -- } -- -- if (might_destroy) { --#ifdef _KERNEL -- name = kmem_asprintf("%s@%s", dsname, ha->snapname); -- error = zfs_unmount_snap(name, NULL); -- strfree(name); -- if (error) { -- dsl_dataset_rele(ds, dtag); -- return (error); -- } --#endif -- if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { -- dsl_dataset_rele(ds, dtag); -- return (EBUSY); -- } else { -- own = B_TRUE; -- dsl_dataset_make_exclusive(ds, dtag); -- } -- } -- -- ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); -- ra->ds = ds; -- ra->htag = ha->htag; -- ra->own = own; -- dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, -- dsl_dataset_user_release_sync, ra, dtag, 0); -- -- return (0); --} -- --int --dsl_dataset_user_release(char *dsname, char *snapname, char *htag, -- boolean_t recursive) --{ -- struct dsl_ds_holdarg *ha; -- dsl_sync_task_t *dst; -- spa_t *spa; -- int error; -- --top: -- ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); -- -- (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); -- -- error = spa_open(dsname, &spa, FTAG); -- if (error) { -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- return (error); -- } -- -- ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- ha->htag = htag; -- ha->snapname = snapname; -- ha->recursive = recursive; -- if (recursive) { -- error = dmu_objset_find(dsname, dsl_dataset_user_release_one, -- ha, DS_FIND_CHILDREN); -- } else { -- error = 
dsl_dataset_user_release_one(dsname, ha); -- } -- if (error == 0) -- error = dsl_sync_task_group_wait(ha->dstg); -- -- for (dst = list_head(&ha->dstg->dstg_tasks); dst; -- dst = list_next(&ha->dstg->dstg_tasks, dst)) { -- struct dsl_ds_releasearg *ra = dst->dst_arg1; -- dsl_dataset_t *ds = ra->ds; -- -- if (dst->dst_err) -- dsl_dataset_name(ds, ha->failed); -- -- if (ra->own) -- dsl_dataset_disown(ds, ha->dstg); -- else -- dsl_dataset_rele(ds, ha->dstg); -- -- kmem_free(ra, sizeof (struct dsl_ds_releasearg)); -- } -- -- if (error == 0 && recursive && !ha->gotone) -- error = ENOENT; -- -- if (error && error != EBUSY) -- (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); -- -- dsl_sync_task_group_destroy(ha->dstg); -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- spa_close(spa, FTAG); -- -- /* -- * We can get EBUSY if we were racing with deferred destroy and -- * dsl_dataset_user_release_check() hadn't done the necessary -- * open context setup. We can also get EBUSY if we're racing -- * with destroy and that thread is the ds_owner. Either way -- * the busy condition should be transient, and we should retry -- * the release operation. -- */ -- if (error == EBUSY) -- goto top; -- -- return (error); --} -- --/* -- * Called at spa_load time (with retry == B_FALSE) to release a stale -- * temporary user hold. Also called by the onexit code (with retry == B_TRUE). -- */ --int --dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, -- boolean_t retry) --{ -- dsl_dataset_t *ds; -- char *snap; -- char *name; -- int namelen; -- int error; -- -- do { -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- if (error) -- return (error); -- namelen = dsl_dataset_namelen(ds)+1; -- name = kmem_alloc(namelen, KM_SLEEP); -- dsl_dataset_name(ds, name); -- dsl_dataset_rele(ds, FTAG); -- -- snap = strchr(name, '@'); -- *snap = '\0'; -- ++snap; -- error = dsl_dataset_user_release(name, snap, htag, B_FALSE); -- kmem_free(name, namelen); -- -- /* -- * The object can't have been destroyed because we have a hold, -- * but it might have been renamed, resulting in ENOENT. Retry -- * if we've been requested to do so. -- * -- * It would be nice if we could use the dsobj all the way -- * through and avoid ENOENT entirely. But we might need to -- * unmount the snapshot, and there's currently no way to lookup -- * a vfsp using a ZFS object id. -- */ -- } while ((error == ENOENT) && retry); -- -- return (error); --} -- --int --dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) --{ -- dsl_dataset_t *ds; -- int err; -- -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -- -- VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); -- if (ds->ds_phys->ds_userrefs_obj != 0) { -- zap_attribute_t *za; -- zap_cursor_t zc; -- -- za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); -- for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, -- ds->ds_phys->ds_userrefs_obj); -- zap_cursor_retrieve(&zc, za) == 0; -- zap_cursor_advance(&zc)) { -- VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, -- za->za_first_integer)); -- } -- zap_cursor_fini(&zc); -- kmem_free(za, sizeof (zap_attribute_t)); -- } -+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); -+ dsl_dataset_set_refreservation_sync_impl(ds, -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); - dsl_dataset_rele(ds, FTAG); -- return (0); - } - --/* -- * Note, this function is used as the callback for dmu_objset_find(). 
We -- * always return 0 so that we will continue to find and process -- * inconsistent datasets, even if we encounter an error trying to -- * process one of them. -- */ --/* ARGSUSED */ - int --dsl_destroy_inconsistent(const char *dsname, void *arg) -+dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, -+ uint64_t refreservation) - { -- dsl_dataset_t *ds; -+ dsl_dataset_set_qr_arg_t ddsqra; - -- if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { -- if (DS_IS_INCONSISTENT(ds)) -- (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); -- else -- dsl_dataset_disown(ds, FTAG); -- } -- return (0); --} -+ ddsqra.ddsqra_name = dsname; -+ ddsqra.ddsqra_source = source; -+ ddsqra.ddsqra_value = refreservation; - -+ return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, -+ dsl_dataset_set_refreservation_sync, &ddsqra, 0)); -+} - -@@ -4190,2 +2844,4 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - -+ ASSERT(dsl_pool_config_held(dp)); -+ - *usedp = 0; -@@ -4202,3 +2858,2 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); - snapobj = new->ds_object; -@@ -4246,3 +2901,3 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - if (snapobj == 0) { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - break; -@@ -4251,3 +2906,2 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - } -- rw_exit(&dp->dp_config_rwlock); - return (err); -@@ -4289,3 +2943,3 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - lastsnap->ds_phys->ds_creation_txg) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -4293,3 +2947,2 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); - snapobj = lastsnap->ds_phys->ds_next_snap_obj; -@@ -4314,3 +2967,2 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - } -- rw_exit(&dp->dp_config_rwlock); - return (err); -@@ -4318,4 +2970,40 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - -+/* -+ * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. -+ * For example, they could both be snapshots of the same filesystem, and -+ * 'earlier' is before 'later'. Or 'earlier' could be the origin of -+ * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's -+ * filesystem. Or 'earlier' could be the origin's origin. 
-+ */ -+boolean_t -+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier) -+{ -+ dsl_pool_t *dp = later->ds_dir->dd_pool; -+ int error; -+ boolean_t ret; -+ dsl_dataset_t *origin; -+ -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ if (earlier->ds_phys->ds_creation_txg >= -+ later->ds_phys->ds_creation_txg) -+ return (B_FALSE); -+ -+ if (later->ds_dir == earlier->ds_dir) -+ return (B_TRUE); -+ if (!dsl_dir_is_clone(later->ds_dir)) -+ return (B_FALSE); -+ -+ if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) -+ return (B_TRUE); -+ error = dsl_dataset_hold_obj(dp, -+ later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin); -+ if (error != 0) -+ return (B_FALSE); -+ ret = dsl_dataset_is_before(origin, earlier); -+ dsl_dataset_rele(origin, FTAG); -+ return (ret); -+} -+ - #if defined(_KERNEL) && defined(HAVE_SPL) --EXPORT_SYMBOL(dmu_snapshots_destroy_nvl); - EXPORT_SYMBOL(dsl_dataset_hold); -@@ -4327,18 +3015,10 @@ EXPORT_SYMBOL(dsl_dataset_rele); - EXPORT_SYMBOL(dsl_dataset_disown); --EXPORT_SYMBOL(dsl_dataset_drop_ref); - EXPORT_SYMBOL(dsl_dataset_tryown); --EXPORT_SYMBOL(dsl_dataset_make_exclusive); - EXPORT_SYMBOL(dsl_dataset_create_sync); - EXPORT_SYMBOL(dsl_dataset_create_sync_dd); --EXPORT_SYMBOL(dsl_dataset_destroy); --EXPORT_SYMBOL(dsl_dataset_destroy_check); --EXPORT_SYMBOL(dsl_dataset_destroy_sync); - EXPORT_SYMBOL(dsl_dataset_snapshot_check); - EXPORT_SYMBOL(dsl_dataset_snapshot_sync); --EXPORT_SYMBOL(dsl_dataset_rename); - EXPORT_SYMBOL(dsl_dataset_promote); --EXPORT_SYMBOL(dsl_dataset_clone_swap); - EXPORT_SYMBOL(dsl_dataset_user_hold); - EXPORT_SYMBOL(dsl_dataset_user_release); --EXPORT_SYMBOL(dsl_dataset_user_release_tmp); - EXPORT_SYMBOL(dsl_dataset_get_holds); -@@ -4347,3 +3027,3 @@ EXPORT_SYMBOL(dsl_dataset_set_blkptr); - EXPORT_SYMBOL(dsl_dataset_get_spa); --EXPORT_SYMBOL(dsl_dataset_modified_since_lastsnap); -+EXPORT_SYMBOL(dsl_dataset_modified_since_snap); - EXPORT_SYMBOL(dsl_dataset_space_written); -@@ -4362,6 +3042,4 @@ EXPORT_SYMBOL(dsl_dsobj_to_dsname); - EXPORT_SYMBOL(dsl_dataset_check_quota); --EXPORT_SYMBOL(dsl_dataset_set_quota); --EXPORT_SYMBOL(dsl_dataset_set_quota_sync); --EXPORT_SYMBOL(dsl_dataset_set_reservation); --EXPORT_SYMBOL(dsl_destroy_inconsistent); -+EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl); -+EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl); - #endif -diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c -index 294932c..99670df 100644 ---- a/module/zfs/dsl_deleg.c -+++ b/module/zfs/dsl_deleg.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -109,3 +109,3 @@ dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) - if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -141,6 +141,6 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) - type != ZFS_DELEG_USER_SETS) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - - if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -149,11 +149,20 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) - -+typedef struct dsl_deleg_arg { -+ const char *dda_name; -+ nvlist_t *dda_nvlist; -+} dsl_deleg_arg_t; -+ - static void --dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- nvlist_t *nvp = arg2; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -+ dsl_deleg_arg_t *dda = arg; -+ dsl_dir_t *dd; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; - nvpair_t *whopair = NULL; -- uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; -+ uint64_t zapobj; -+ -+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); - -+ zapobj = dd->dd_phys->dd_deleg_zapobj; - if (zapobj == 0) { -@@ -164,3 +173,3 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- while ((whopair = nvlist_next_nvpair(nvp, whopair))) { -+ while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) { - const char *whokey = nvpair_name(whopair); -@@ -170,3 +179,3 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); -+ perms = fnvpair_value_nvlist(whopair); - -@@ -183,8 +192,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - perm, 8, 1, &n, tx) == 0); -- spa_history_log_internal(LOG_DS_PERM_UPDATE, -- dd->dd_pool->dp_spa, tx, -- "%s %s dataset = %llu", whokey, perm, -- dd->dd_phys->dd_head_dataset_obj); -+ spa_history_log_internal_dd(dd, "permission update", tx, -+ "%s %s", whokey, perm); - } - } -+ dsl_dir_rele(dd, FTAG); - } -@@ -192,14 +200,19 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - static void --dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- nvlist_t *nvp = arg2; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -+ dsl_deleg_arg_t *dda = arg; -+ dsl_dir_t *dd; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; - nvpair_t *whopair = NULL; -- uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; -+ uint64_t zapobj; - -- if (zapobj == 0) -+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); -+ zapobj = dd->dd_phys->dd_deleg_zapobj; -+ if (zapobj == 0) { -+ dsl_dir_rele(dd, FTAG); - return; -+ } - -- while ((whopair = nvlist_next_nvpair(nvp, whopair))) { -+ while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) { - const char *whokey = nvpair_name(whopair); -@@ -215,6 +228,4 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } -- spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE, -- dd->dd_pool->dp_spa, tx, -- "%s dataset = %llu", whokey, -- dd->dd_phys->dd_head_dataset_obj); -+ spa_history_log_internal_dd(dd, "permission who remove", -+ tx, "%s", whokey); - continue; -@@ -236,37 +247,40 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } -- spa_history_log_internal(LOG_DS_PERM_REMOVE, -- dd->dd_pool->dp_spa, tx, -- "%s %s dataset = %llu", whokey, perm, -- dd->dd_phys->dd_head_dataset_obj); -+ spa_history_log_internal_dd(dd, "permission 
remove", tx, -+ "%s %s", whokey, perm); - } - } -+ dsl_dir_rele(dd, FTAG); - } - --int --dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) -+static int -+dsl_deleg_check(void *arg, dmu_tx_t *tx) - { -+ dsl_deleg_arg_t *dda = arg; - dsl_dir_t *dd; - int error; -- nvpair_t *whopair = NULL; -- int blocks_modified = 0; - -- error = dsl_dir_open(ddname, FTAG, &dd, NULL); -- if (error) -- return (error); -- -- if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < -+ if (spa_version(dmu_tx_pool(tx)->dp_spa) < - SPA_VERSION_DELEGATED_PERMS) { -- dsl_dir_close(dd, FTAG); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } - -- while ((whopair = nvlist_next_nvpair(nvp, whopair))) -- blocks_modified++; -+ error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL); -+ if (error == 0) -+ dsl_dir_rele(dd, FTAG); -+ return (error); -+} - -- error = dsl_sync_task_do(dd->dd_pool, NULL, -- unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, -- dd, nvp, blocks_modified); -- dsl_dir_close(dd, FTAG); -+int -+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) -+{ -+ dsl_deleg_arg_t dda; - -- return (error); -+ /* nvp must already have been verified to be valid */ -+ -+ dda.dda_name = ddname; -+ dda.dda_nvlist = nvp; -+ -+ return (dsl_sync_task(ddname, dsl_deleg_check, -+ unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, -+ &dda, fnvlist_num_pairs(nvp))); - } -@@ -301,5 +315,11 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- error = dsl_dir_open(ddname, FTAG, &startdd, NULL); -- if (error) -+ error = dsl_pool_hold(ddname, FTAG, &dp); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); - return (error); -+ } - -@@ -308,6 +328,6 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); -- za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); -- basezc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); -- baseza = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); -+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); -+ basezc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); -+ baseza = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - source = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, KM_SLEEP); -@@ -315,3 +335,2 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- rw_enter(&dp->dp_config_rwlock, RW_READER); - for (dd = startdd; dd != NULL; dd = dd->dd_parent) { -@@ -320,11 +339,8 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- if (dd->dd_phys->dd_deleg_zapobj && -- (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, -- &n) == 0) && n) { -- VERIFY(nvlist_alloc(&sp_nvp, -- NV_UNIQUE_NAME, KM_SLEEP) == 0); -- } else { -+ if (dd->dd_phys->dd_deleg_zapobj == 0 || -+ zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 || -+ n == 0) - continue; -- } - -+ sp_nvp = fnvlist_alloc(); - for (zap_cursor_init(basezc, mos, -@@ -338,4 +354,3 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- VERIFY(nvlist_alloc(&perms_nvp, -- NV_UNIQUE_NAME, KM_SLEEP) == 0); -+ perms_nvp = fnvlist_alloc(); - for (zap_cursor_init(zc, mos, baseza->za_first_integer); -@@ -343,9 +358,7 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - zap_cursor_advance(zc)) { -- VERIFY(nvlist_add_boolean(perms_nvp, -- za->za_name) == 0); -+ fnvlist_add_boolean(perms_nvp, za->za_name); - } - zap_cursor_fini(zc); -- VERIFY(nvlist_add_nvlist(sp_nvp, baseza->za_name, -- perms_nvp) == 0); 
-- nvlist_free(perms_nvp); -+ fnvlist_add_nvlist(sp_nvp, baseza->za_name, perms_nvp); -+ fnvlist_free(perms_nvp); - } -@@ -355,14 +368,14 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - dsl_dir_name(dd, source); -- VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); -+ fnvlist_add_nvlist(*nvp, source, sp_nvp); - nvlist_free(sp_nvp); - } -- rw_exit(&dp->dp_config_rwlock); - - kmem_free(source, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); -- kmem_free(baseza, sizeof(zap_attribute_t)); -- kmem_free(basezc, sizeof(zap_cursor_t)); -- kmem_free(za, sizeof(zap_attribute_t)); -- kmem_free(zc, sizeof(zap_cursor_t)); -+ kmem_free(baseza, sizeof (zap_attribute_t)); -+ kmem_free(basezc, sizeof (zap_cursor_t)); -+ kmem_free(za, sizeof (zap_attribute_t)); -+ kmem_free(zc, sizeof (zap_cursor_t)); - -- dsl_dir_close(startdd, FTAG); -+ dsl_dir_rele(startdd, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (0); -@@ -415,3 +428,3 @@ dsl_check_access(objset_t *mos, uint64_t zapobj, - if (error == ENOENT) -- error = EPERM; -+ error = SET_ERROR(EPERM); - } -@@ -460,3 +473,3 @@ dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, - -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -535,8 +548,6 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, - /* -- * Check if user has requested permission. If descendent is set, must have -- * descendent perms. -+ * Check if user has requested permission. - */ - int --dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, -- cred_t *cr) -+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) - { -@@ -555,3 +566,3 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, - if (dsl_delegation_on(mos) == B_FALSE) -- return (ECANCELED); -+ return (SET_ERROR(ECANCELED)); - -@@ -559,5 +570,5 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, - SPA_VERSION_DELEGATED_PERMS) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -- if (dsl_dataset_is_snapshot(ds) || descendent) { -+ if (dsl_dataset_is_snapshot(ds)) { - /* -@@ -574,3 +585,3 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ ASSERT(dsl_pool_config_held(dp)); - for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, -@@ -633,5 +644,4 @@ again: - } -- error = EPERM; -+ error = SET_ERROR(EPERM); - success: -- rw_exit(&dp->dp_config_rwlock); - -@@ -647,2 +657,3 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) - { -+ dsl_pool_t *dp; - dsl_dataset_t *ds; -@@ -650,8 +661,11 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) - -- error = dsl_dataset_hold(dsname, FTAG, &ds); -- if (error) -+ error = dsl_pool_hold(dsname, FTAG, &dp); -+ if (error != 0) - return (error); -- -- error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); -- dsl_dataset_rele(ds, FTAG); -+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds); -+ if (error == 0) { -+ error = dsl_deleg_access_impl(ds, perm, cr); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ dsl_pool_rele(dp, FTAG); - -diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c -new file mode 100644 -index 0000000..351165d ---- /dev/null -+++ b/module/zfs/dsl_destroy.c -@@ -0,0 +1,950 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. 
-+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+typedef struct dmu_snapshots_destroy_arg { -+ nvlist_t *dsda_snaps; -+ nvlist_t *dsda_successful_snaps; -+ boolean_t dsda_defer; -+ nvlist_t *dsda_errlist; -+} dmu_snapshots_destroy_arg_t; -+ -+int -+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) -+{ -+ if (!dsl_dataset_is_snapshot(ds)) -+ return (SET_ERROR(EINVAL)); -+ -+ if (dsl_dataset_long_held(ds)) -+ return (SET_ERROR(EBUSY)); -+ -+ /* -+ * Only allow deferred destroy on pools that support it. -+ * NOTE: deferred destroy is only supported on snapshots. -+ */ -+ if (defer) { -+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < -+ SPA_VERSION_USERREFS) -+ return (SET_ERROR(ENOTSUP)); -+ return (0); -+ } -+ -+ /* -+ * If this snapshot has an elevated user reference count, -+ * we can't destroy it yet. -+ */ -+ if (ds->ds_userrefs > 0) -+ return (SET_ERROR(EBUSY)); -+ -+ /* -+ * Can't delete a branch point. -+ */ -+ if (ds->ds_phys->ds_num_children > 1) -+ return (SET_ERROR(EEXIST)); -+ -+ return (0); -+} -+ -+static int -+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) -+{ -+ dmu_snapshots_destroy_arg_t *dsda = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ int error = 0; -+ -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); -+ -+ for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) { -+ dsl_dataset_t *ds; -+ -+ error = dsl_dataset_hold(dp, nvpair_name(pair), -+ FTAG, &ds); -+ -+ /* -+ * If the snapshot does not exist, silently ignore it -+ * (it's "already destroyed"). 
-+ */ -+ if (error == ENOENT) -+ continue; -+ -+ if (error == 0) { -+ error = dsl_destroy_snapshot_check_impl(ds, -+ dsda->dsda_defer); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ -+ if (error == 0) { -+ fnvlist_add_boolean(dsda->dsda_successful_snaps, -+ nvpair_name(pair)); -+ } else { -+ fnvlist_add_int32(dsda->dsda_errlist, -+ nvpair_name(pair), error); -+ } -+ } -+ -+ pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL); -+ if (pair != NULL) -+ return (fnvpair_value_int32(pair)); -+ -+ return (0); -+} -+ -+struct process_old_arg { -+ dsl_dataset_t *ds; -+ dsl_dataset_t *ds_prev; -+ boolean_t after_branch_point; -+ zio_t *pio; -+ uint64_t used, comp, uncomp; -+}; -+ -+static int -+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -+{ -+ struct process_old_arg *poa = arg; -+ dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; -+ -+ if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { -+ dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); -+ if (poa->ds_prev && !poa->after_branch_point && -+ bp->blk_birth > -+ poa->ds_prev->ds_phys->ds_prev_snap_txg) { -+ poa->ds_prev->ds_phys->ds_unique_bytes += -+ bp_get_dsize_sync(dp->dp_spa, bp); -+ } -+ } else { -+ poa->used += bp_get_dsize_sync(dp->dp_spa, bp); -+ poa->comp += BP_GET_PSIZE(bp); -+ poa->uncomp += BP_GET_UCSIZE(bp); -+ dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); -+ } -+ return (0); -+} -+ -+static void -+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, -+ dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) -+{ -+ struct process_old_arg poa = { 0 }; -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ objset_t *mos = dp->dp_meta_objset; -+ uint64_t deadlist_obj; -+ -+ ASSERT(ds->ds_deadlist.dl_oldfmt); -+ ASSERT(ds_next->ds_deadlist.dl_oldfmt); -+ -+ poa.ds = ds; -+ poa.ds_prev = ds_prev; -+ poa.after_branch_point = after_branch_point; -+ poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -+ VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, -+ process_old_cb, &poa, tx)); -+ VERIFY0(zio_wait(poa.pio)); -+ ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); -+ -+ /* change snapused */ -+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -+ -poa.used, -poa.comp, -poa.uncomp, tx); -+ -+ /* swap next's deadlist to our deadlist */ -+ dsl_deadlist_close(&ds->ds_deadlist); -+ dsl_deadlist_close(&ds_next->ds_deadlist); -+ deadlist_obj = ds->ds_phys->ds_deadlist_obj; -+ ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj; -+ ds_next->ds_phys->ds_deadlist_obj = deadlist_obj; -+ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); -+ dsl_deadlist_open(&ds_next->ds_deadlist, mos, -+ ds_next->ds_phys->ds_deadlist_obj); -+} -+ -+static void -+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) -+{ -+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -+ zap_cursor_t *zc; -+ zap_attribute_t *za; -+ -+ /* -+ * If it is the old version, dd_clones doesn't exist so we can't -+ * find the clones, but dsl_deadlist_remove_key() is a no-op so it -+ * doesn't matter. 
-+ */ -+ if (ds->ds_dir->dd_phys->dd_clones == 0) -+ return; -+ -+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_PUSHPAGE); -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); -+ -+ for (zap_cursor_init(zc, mos, ds->ds_dir->dd_phys->dd_clones); -+ zap_cursor_retrieve(zc, za) == 0; -+ zap_cursor_advance(zc)) { -+ dsl_dataset_t *clone; -+ -+ VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -+ za->za_first_integer, FTAG, &clone)); -+ if (clone->ds_dir->dd_origin_txg > mintxg) { -+ dsl_deadlist_remove_key(&clone->ds_deadlist, -+ mintxg, tx); -+ dsl_dataset_remove_clones_key(clone, mintxg, tx); -+ } -+ dsl_dataset_rele(clone, FTAG); -+ } -+ zap_cursor_fini(zc); -+ -+ kmem_free(za, sizeof (zap_attribute_t)); -+ kmem_free(zc, sizeof (zap_cursor_t)); -+} -+ -+void -+dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) -+{ -+#ifdef ZFS_DEBUG -+ int err; -+#endif -+ int after_branch_point = FALSE; -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ objset_t *mos = dp->dp_meta_objset; -+ dsl_dataset_t *ds_prev = NULL; -+ uint64_t obj, old_unique, used = 0, comp = 0, uncomp = 0; -+ dsl_dataset_t *ds_next, *ds_head, *hds; -+ -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); -+ ASSERT(refcount_is_zero(&ds->ds_longholds)); -+ -+ if (defer && -+ (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) { -+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; -+ spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); -+ return; -+ } -+ -+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); -+ -+ /* We need to log before removing it from the namespace. */ -+ spa_history_log_internal_ds(ds, "destroy", tx, ""); -+ -+ dsl_scan_ds_destroyed(ds, tx); -+ -+ obj = ds->ds_object; -+ -+ if (ds->ds_phys->ds_prev_snap_obj != 0) { -+ ASSERT3P(ds->ds_prev, ==, NULL); -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); -+ after_branch_point = -+ (ds_prev->ds_phys->ds_next_snap_obj != obj); -+ -+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); -+ if (after_branch_point && -+ ds_prev->ds_phys->ds_next_clones_obj != 0) { -+ dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); -+ if (ds->ds_phys->ds_next_snap_obj != 0) { -+ VERIFY0(zap_add_int(mos, -+ ds_prev->ds_phys->ds_next_clones_obj, -+ ds->ds_phys->ds_next_snap_obj, tx)); -+ } -+ } -+ if (!after_branch_point) { -+ ds_prev->ds_phys->ds_next_snap_obj = -+ ds->ds_phys->ds_next_snap_obj; -+ } -+ } -+ -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); -+ ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); -+ -+ old_unique = ds_next->ds_phys->ds_unique_bytes; -+ -+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx); -+ ds_next->ds_phys->ds_prev_snap_obj = -+ ds->ds_phys->ds_prev_snap_obj; -+ ds_next->ds_phys->ds_prev_snap_txg = -+ ds->ds_phys->ds_prev_snap_txg; -+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, -+ ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); -+ -+ if (ds_next->ds_deadlist.dl_oldfmt) { -+ process_old_deadlist(ds, ds_prev, ds_next, -+ after_branch_point, tx); -+ } else { -+ /* Adjust prev's unique space. */ -+ if (ds_prev && !after_branch_point) { -+ dsl_deadlist_space_range(&ds_next->ds_deadlist, -+ ds_prev->ds_phys->ds_prev_snap_txg, -+ ds->ds_phys->ds_prev_snap_txg, -+ &used, &comp, &uncomp); -+ ds_prev->ds_phys->ds_unique_bytes += used; -+ } -+ -+ /* Adjust snapused. 
*/ -+ dsl_deadlist_space_range(&ds_next->ds_deadlist, -+ ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, -+ &used, &comp, &uncomp); -+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -+ -used, -comp, -uncomp, tx); -+ -+ /* Move blocks to be freed to pool's free list. */ -+ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, -+ &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, -+ tx); -+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, -+ DD_USED_HEAD, used, comp, uncomp, tx); -+ -+ /* Merge our deadlist into next's and free it. */ -+ dsl_deadlist_merge(&ds_next->ds_deadlist, -+ ds->ds_phys->ds_deadlist_obj, tx); -+ } -+ dsl_deadlist_close(&ds->ds_deadlist); -+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_deadlist_obj = 0; -+ -+ /* Collapse range in clone heads */ -+ dsl_dataset_remove_clones_key(ds, -+ ds->ds_phys->ds_creation_txg, tx); -+ -+ if (dsl_dataset_is_snapshot(ds_next)) { -+ dsl_dataset_t *ds_nextnext; -+ -+ /* -+ * Update next's unique to include blocks which -+ * were previously shared by only this snapshot -+ * and it. Those blocks will be born after the -+ * prev snap and before this snap, and will have -+ * died after the next snap and before the one -+ * after that (ie. be on the snap after next's -+ * deadlist). -+ */ -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext)); -+ dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, -+ ds->ds_phys->ds_prev_snap_txg, -+ ds->ds_phys->ds_creation_txg, -+ &used, &comp, &uncomp); -+ ds_next->ds_phys->ds_unique_bytes += used; -+ dsl_dataset_rele(ds_nextnext, FTAG); -+ ASSERT3P(ds_next->ds_prev, ==, NULL); -+ -+ /* Collapse range in this head. */ -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds)); -+ dsl_deadlist_remove_key(&hds->ds_deadlist, -+ ds->ds_phys->ds_creation_txg, tx); -+ dsl_dataset_rele(hds, FTAG); -+ -+ } else { -+ ASSERT3P(ds_next->ds_prev, ==, ds); -+ dsl_dataset_rele(ds_next->ds_prev, ds_next); -+ ds_next->ds_prev = NULL; -+ if (ds_prev) { -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_phys->ds_prev_snap_obj, -+ ds_next, &ds_next->ds_prev)); -+ } -+ -+ dsl_dataset_recalc_head_uniq(ds_next); -+ -+ /* -+ * Reduce the amount of our unconsumed refreservation -+ * being charged to our parent by the amount of -+ * new unique data we have gained. -+ */ -+ if (old_unique < ds_next->ds_reserved) { -+ int64_t mrsdelta; -+ uint64_t new_unique = -+ ds_next->ds_phys->ds_unique_bytes; -+ -+ ASSERT(old_unique <= new_unique); -+ mrsdelta = MIN(new_unique - old_unique, -+ ds_next->ds_reserved - old_unique); -+ dsl_dir_diduse_space(ds->ds_dir, -+ DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); -+ } -+ } -+ dsl_dataset_rele(ds_next, FTAG); -+ -+ /* -+ * This must be done after the dsl_traverse(), because it will -+ * re-open the objset. 
-+ */ -+ if (ds->ds_objset) { -+ dmu_objset_evict(ds->ds_objset); -+ ds->ds_objset = NULL; -+ } -+ -+ /* remove from snapshot namespace */ -+ ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); -+ VERIFY0(dsl_dataset_get_snapname(ds)); -+#ifdef ZFS_DEBUG -+ { -+ uint64_t val; -+ -+ err = dsl_dataset_snap_lookup(ds_head, -+ ds->ds_snapname, &val); -+ ASSERT0(err); -+ ASSERT3U(val, ==, obj); -+ } -+#endif -+ VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx)); -+ dsl_dataset_rele(ds_head, FTAG); -+ -+ if (ds_prev != NULL) -+ dsl_dataset_rele(ds_prev, FTAG); -+ -+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); -+ -+ if (ds->ds_phys->ds_next_clones_obj != 0) { -+ ASSERTV(uint64_t count); -+ ASSERT0(zap_count(mos, -+ ds->ds_phys->ds_next_clones_obj, &count) && count == 0); -+ VERIFY0(dmu_object_free(mos, -+ ds->ds_phys->ds_next_clones_obj, tx)); -+ } -+ if (ds->ds_phys->ds_props_obj != 0) -+ VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); -+ if (ds->ds_phys->ds_userrefs_obj != 0) -+ VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); -+ dsl_dir_rele(ds->ds_dir, ds); -+ ds->ds_dir = NULL; -+ VERIFY0(dmu_object_free(mos, obj, tx)); -+} -+ -+static void -+dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) -+{ -+ dmu_snapshots_destroy_arg_t *dsda = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ -+ for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL); -+ pair != NULL; -+ pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) { -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); -+ -+ dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx); -+ dsl_dataset_rele(ds, FTAG); -+ } -+} -+ -+/* -+ * The semantics of this function are described in the comment above -+ * lzc_destroy_snaps(). To summarize: -+ * -+ * The snapshots must all be in the same pool. -+ * -+ * Snapshots that don't exist will be silently ignored (considered to be -+ * "already deleted"). -+ * -+ * On success, all snaps will be destroyed and this will return 0. -+ * On failure, no snaps will be destroyed, the errlist will be filled in, -+ * and this will return an errno. 
-+ */ -+int -+dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, -+ nvlist_t *errlist) -+{ -+ dmu_snapshots_destroy_arg_t dsda; -+ int error; -+ nvpair_t *pair; -+ -+ pair = nvlist_next_nvpair(snaps, NULL); -+ if (pair == NULL) -+ return (0); -+ -+ dsda.dsda_snaps = snaps; -+ VERIFY0(nvlist_alloc(&dsda.dsda_successful_snaps, -+ NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ dsda.dsda_defer = defer; -+ dsda.dsda_errlist = errlist; -+ -+ error = dsl_sync_task(nvpair_name(pair), -+ dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync, -+ &dsda, 0); -+ fnvlist_free(dsda.dsda_successful_snaps); -+ -+ return (error); -+} -+ -+int -+dsl_destroy_snapshot(const char *name, boolean_t defer) -+{ -+ int error; -+ nvlist_t *nvl; -+ nvlist_t *errlist; -+ -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ VERIFY0(nvlist_alloc(&errlist, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ -+ fnvlist_add_boolean(nvl, name); -+ error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); -+ fnvlist_free(errlist); -+ fnvlist_free(nvl); -+ return (error); -+} -+ -+struct killarg { -+ dsl_dataset_t *ds; -+ dmu_tx_t *tx; -+}; -+ -+/* ARGSUSED */ -+static int -+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, -+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) -+{ -+ struct killarg *ka = arg; -+ dmu_tx_t *tx = ka->tx; -+ -+ if (bp == NULL) -+ return (0); -+ -+ if (zb->zb_level == ZB_ZIL_LEVEL) { -+ ASSERT(zilog != NULL); -+ /* -+ * It's a block in the intent log. It has no -+ * accounting, so just free it. -+ */ -+ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); -+ } else { -+ ASSERT(zilog == NULL); -+ ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); -+ (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); -+ } -+ -+ return (0); -+} -+ -+static void -+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) -+{ -+ struct killarg ka; -+ -+ /* -+ * Free everything that we point to (that's born after -+ * the previous snapshot, if we are a clone) -+ * -+ * NB: this should be very quick, because we already -+ * freed all the objects in open context. -+ */ -+ ka.ds = ds; -+ ka.tx = tx; -+ VERIFY0(traverse_dataset(ds, -+ ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, -+ kill_blkptr, &ka)); -+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); -+} -+ -+typedef struct dsl_destroy_head_arg { -+ const char *ddha_name; -+} dsl_destroy_head_arg_t; -+ -+int -+dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) -+{ -+ int error; -+ uint64_t count; -+ objset_t *mos; -+ -+ if (dsl_dataset_is_snapshot(ds)) -+ return (SET_ERROR(EINVAL)); -+ -+ if (refcount_count(&ds->ds_longholds) != expected_holds) -+ return (SET_ERROR(EBUSY)); -+ -+ mos = ds->ds_dir->dd_pool->dp_meta_objset; -+ -+ /* -+ * Can't delete a head dataset if there are snapshots of it. -+ * (Except if the only snapshots are from the branch we cloned -+ * from.) -+ */ -+ if (ds->ds_prev != NULL && -+ ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) -+ return (SET_ERROR(EBUSY)); -+ -+ /* -+ * Can't delete if there are children of this fs. -+ */ -+ error = zap_count(mos, -+ ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); -+ if (error != 0) -+ return (error); -+ if (count != 0) -+ return (SET_ERROR(EEXIST)); -+ -+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && -+ ds->ds_prev->ds_phys->ds_num_children == 2 && -+ ds->ds_prev->ds_userrefs == 0) { -+ /* We need to remove the origin snapshot as well. 
*/ -+ if (!refcount_is_zero(&ds->ds_prev->ds_longholds)) -+ return (SET_ERROR(EBUSY)); -+ } -+ return (0); -+} -+ -+static int -+dsl_destroy_head_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_destroy_head_arg_t *ddha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ -+ error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_destroy_head_check_impl(ds, 0); -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+} -+ -+static void -+dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) -+{ -+ dsl_dir_t *dd; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; -+ dd_used_t t; -+ -+ ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); -+ -+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); -+ -+ ASSERT0(dd->dd_phys->dd_head_dataset_obj); -+ -+ /* -+ * Remove our reservation. The impl() routine avoids setting the -+ * actual property, which would require the (already destroyed) ds. -+ */ -+ dsl_dir_set_reservation_sync_impl(dd, 0, tx); -+ -+ ASSERT0(dd->dd_phys->dd_used_bytes); -+ ASSERT0(dd->dd_phys->dd_reserved); -+ for (t = 0; t < DD_USED_NUM; t++) -+ ASSERT0(dd->dd_phys->dd_used_breakdown[t]); -+ -+ VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); -+ VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); -+ VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); -+ VERIFY0(zap_remove(mos, -+ dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); -+ -+ dsl_dir_rele(dd, FTAG); -+ VERIFY0(dmu_object_free(mos, ddobj, tx)); -+} -+ -+void -+dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) -+{ -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; -+ uint64_t obj, ddobj, prevobj = 0; -+ boolean_t rmorigin; -+ zfeature_info_t *async_destroy; -+ objset_t *os; -+ -+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); -+ ASSERT(ds->ds_prev == NULL || -+ ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); -+ ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ /* We need to log before removing it from the namespace. */ -+ spa_history_log_internal_ds(ds, "destroy", tx, ""); -+ -+ rmorigin = (dsl_dir_is_clone(ds->ds_dir) && -+ DS_IS_DEFER_DESTROY(ds->ds_prev) && -+ ds->ds_prev->ds_phys->ds_num_children == 2 && -+ ds->ds_prev->ds_userrefs == 0); -+ -+ /* Remove our reservation */ -+ if (ds->ds_reserved != 0) { -+ dsl_dataset_set_refreservation_sync_impl(ds, -+ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), -+ 0, tx); -+ ASSERT0(ds->ds_reserved); -+ } -+ -+ dsl_scan_ds_destroyed(ds, tx); -+ -+ obj = ds->ds_object; -+ -+ if (ds->ds_phys->ds_prev_snap_obj != 0) { -+ /* This is a clone */ -+ ASSERT(ds->ds_prev != NULL); -+ ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj); -+ ASSERT0(ds->ds_phys->ds_next_snap_obj); -+ -+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); -+ if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) { -+ dsl_dataset_remove_from_next_clones(ds->ds_prev, -+ obj, tx); -+ } -+ -+ ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1); -+ ds->ds_prev->ds_phys->ds_num_children--; -+ } -+ -+ async_destroy = -+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; -+ -+ /* -+ * Destroy the deadlist. Unless it's a clone, the -+ * deadlist should be empty. (If it's a clone, it's -+ * safe to ignore the deadlist contents.) 
-+ */ -+ dsl_deadlist_close(&ds->ds_deadlist); -+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_deadlist_obj = 0; -+ -+ VERIFY0(dmu_objset_from_ds(ds, &os)); -+ -+ if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { -+ old_synchronous_dataset_destroy(ds, tx); -+ } else { -+ /* -+ * Move the bptree into the pool's list of trees to -+ * clean up and update space accounting information. -+ */ -+ uint64_t used, comp, uncomp; -+ -+ zil_destroy_sync(dmu_objset_zil(os), tx); -+ -+ if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { -+ dsl_scan_t *scn = dp->dp_scan; -+ -+ spa_feature_incr(dp->dp_spa, async_destroy, tx); -+ dp->dp_bptree_obj = bptree_alloc(mos, tx); -+ VERIFY0(zap_add(mos, -+ DMU_POOL_DIRECTORY_OBJECT, -+ DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, -+ &dp->dp_bptree_obj, tx)); -+ ASSERT(!scn->scn_async_destroying); -+ scn->scn_async_destroying = B_TRUE; -+ } -+ -+ used = ds->ds_dir->dd_phys->dd_used_bytes; -+ comp = ds->ds_dir->dd_phys->dd_compressed_bytes; -+ uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; -+ -+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || -+ ds->ds_phys->ds_unique_bytes == used); -+ -+ bptree_add(mos, dp->dp_bptree_obj, -+ &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, -+ used, comp, uncomp, tx); -+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, -+ -used, -comp, -uncomp, tx); -+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -+ used, comp, uncomp, tx); -+ } -+ -+ if (ds->ds_prev != NULL) { -+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { -+ VERIFY0(zap_remove_int(mos, -+ ds->ds_prev->ds_dir->dd_phys->dd_clones, -+ ds->ds_object, tx)); -+ } -+ prevobj = ds->ds_prev->ds_object; -+ dsl_dataset_rele(ds->ds_prev, ds); -+ ds->ds_prev = NULL; -+ } -+ -+ /* -+ * This must be done after the dsl_traverse(), because it will -+ * re-open the objset. 
-+ */ -+ if (ds->ds_objset) { -+ dmu_objset_evict(ds->ds_objset); -+ ds->ds_objset = NULL; -+ } -+ -+ /* Erase the link in the dir */ -+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); -+ ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; -+ ddobj = ds->ds_dir->dd_object; -+ ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); -+ VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx)); -+ -+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); -+ -+ ASSERT0(ds->ds_phys->ds_next_clones_obj); -+ ASSERT0(ds->ds_phys->ds_props_obj); -+ ASSERT0(ds->ds_phys->ds_userrefs_obj); -+ dsl_dir_rele(ds->ds_dir, ds); -+ ds->ds_dir = NULL; -+ VERIFY0(dmu_object_free(mos, obj, tx)); -+ -+ dsl_dir_destroy_sync(ddobj, tx); -+ -+ if (rmorigin) { -+ dsl_dataset_t *prev; -+ VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); -+ dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); -+ dsl_dataset_rele(prev, FTAG); -+ } -+} -+ -+static void -+dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_destroy_head_arg_t *ddha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); -+ dsl_destroy_head_sync_impl(ds, tx); -+ dsl_dataset_rele(ds, FTAG); -+} -+ -+static void -+dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_destroy_head_arg_t *ddha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); -+ -+ /* Mark it as inconsistent on-disk, in case we crash */ -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; -+ -+ spa_history_log_internal_ds(ds, "destroy begin", tx, ""); -+ dsl_dataset_rele(ds, FTAG); -+} -+ -+int -+dsl_destroy_head(const char *name) -+{ -+ dsl_destroy_head_arg_t ddha; -+ int error; -+ spa_t *spa; -+ boolean_t isenabled; -+ -+#ifdef _KERNEL -+ zfs_destroy_unmount_origin(name); -+#endif -+ -+ error = spa_open(name, &spa, FTAG); -+ if (error != 0) -+ return (error); -+ isenabled = spa_feature_is_enabled(spa, -+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]); -+ spa_close(spa, FTAG); -+ -+ ddha.ddha_name = name; -+ -+ if (!isenabled) { -+ objset_t *os; -+ -+ error = dsl_sync_task(name, dsl_destroy_head_check, -+ dsl_destroy_head_begin_sync, &ddha, 0); -+ if (error != 0) -+ return (error); -+ -+ /* -+ * Head deletion is processed in one txg on old pools; -+ * remove the objects from open context so that the txg sync -+ * is not too long. -+ */ -+ error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); -+ if (error == 0) { -+ uint64_t obj; -+ uint64_t prev_snap_txg = -+ dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg; -+ for (obj = 0; error == 0; -+ error = dmu_object_next(os, &obj, FALSE, -+ prev_snap_txg)) -+ (void) dmu_free_long_object(os, obj); -+ /* sync out all frees */ -+ txg_wait_synced(dmu_objset_pool(os), 0); -+ dmu_objset_disown(os, FTAG); -+ } -+ } -+ -+ return (dsl_sync_task(name, dsl_destroy_head_check, -+ dsl_destroy_head_sync, &ddha, 0)); -+} -+ -+/* -+ * Note, this function is used as the callback for dmu_objset_find(). We -+ * always return 0 so that we will continue to find and process -+ * inconsistent datasets, even if we encounter an error trying to -+ * process one of them. 
-+ */ -+/* ARGSUSED */ -+int -+dsl_destroy_inconsistent(const char *dsname, void *arg) -+{ -+ objset_t *os; -+ -+ if (dmu_objset_hold(dsname, FTAG, &os) == 0) { -+ boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os)); -+ dmu_objset_rele(os, FTAG); -+ if (inconsistent) -+ (void) dsl_destroy_head(dsname); -+ } -+ return (0); -+} -+ -+ -+#if defined(_KERNEL) && defined(HAVE_SPL) -+EXPORT_SYMBOL(dsl_destroy_head); -+EXPORT_SYMBOL(dsl_destroy_head_sync_impl); -+EXPORT_SYMBOL(dsl_dataset_user_hold_check_one); -+EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl); -+EXPORT_SYMBOL(dsl_destroy_inconsistent); -+EXPORT_SYMBOL(dsl_dataset_user_release_tmp); -+EXPORT_SYMBOL(dsl_destroy_head_check_impl); -+#endif -diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c -index 69f68c2..fb7cd2c 100644 ---- a/module/zfs/dsl_dir.c -+++ b/module/zfs/dsl_dir.c -@@ -22,2 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Martin Matuska. All rights reserved. - */ -@@ -38,2 +40,3 @@ - #include -+#include - #include "zfs_namecheck.h" -@@ -41,4 +44,2 @@ - static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); --static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); -- - -@@ -49,4 +50,4 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) - dsl_dir_t *dd = arg; -- ASSERTV(dsl_pool_t *dp = dd->dd_pool;) - int t; -+ ASSERTV(dsl_pool_t *dp = dd->dd_pool); - -@@ -59,3 +60,3 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) - if (dd->dd_parent) -- dsl_dir_close(dd->dd_parent, dd); -+ dsl_dir_rele(dd->dd_parent, dd); - -@@ -73,3 +74,3 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) - int --dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, -+dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **ddp) -@@ -80,7 +81,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - -- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || -- dsl_pool_sync_context(dp)); -+ ASSERT(dsl_pool_config_held(dp)); - - err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); -- if (err) -+ if (err != 0) - return (err); -@@ -111,5 +111,5 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - if (dd->dd_phys->dd_parent_obj) { -- err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, -+ err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj, - NULL, dd, &dd->dd_parent); -- if (err) -+ if (err != 0) - goto errout; -@@ -130,3 +130,3 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - } -- if (err) -+ if (err != 0) - goto errout; -@@ -147,3 +147,3 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); -- if (err) -+ if (err != 0) - goto errout; -@@ -159,3 +159,3 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - if (dd->dd_parent) -- dsl_dir_close(dd->dd_parent, dd); -+ dsl_dir_rele(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); -@@ -186,3 +186,3 @@ errout: - if (dd->dd_parent) -- dsl_dir_close(dd->dd_parent, dd); -+ dsl_dir_rele(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); -@@ -194,3 +194,3 @@ errout: - void --dsl_dir_close(dsl_dir_t *dd, void *tag) -+dsl_dir_rele(dsl_dir_t *dd, void *tag) - { -@@ -251,4 +251,5 @@ getcomponent(const char *path, char *component, const char **nextp) - char *p; -+ - if ((path == NULL) || (path[0] == '\0')) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - /* This would be a good place to reserve some namespace... 
*/ -@@ -257,3 +258,3 @@ getcomponent(const char *path, char *component, const char **nextp) - /* two separators in a row */ -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -267,5 +268,5 @@ getcomponent(const char *path, char *component, const char **nextp) - (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - if (strlen(path) >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - (void) strcpy(component, path); -@@ -273,6 +274,6 @@ getcomponent(const char *path, char *component, const char **nextp) - } else if (p[0] == '/') { -- if (p-path >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ if (p - path >= MAXNAMELEN) -+ return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); -- component[p-path] = '\0'; -+ component[p - path] = '\0'; - p++; -@@ -284,9 +285,9 @@ getcomponent(const char *path, char *component, const char **nextp) - if (strchr(path, '/')) -- return (EINVAL); -- if (p-path >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(EINVAL)); -+ if (p - path >= MAXNAMELEN) -+ return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); -- component[p-path] = '\0'; -+ component[p - path] = '\0'; - } else { -- ASSERT(!"invalid p"); -+ panic("invalid p=%p", (void *)p); - } -@@ -297,7 +298,10 @@ getcomponent(const char *path, char *component, const char **nextp) - /* -- * same as dsl_dir_open, ignore the first component of name and use the -- * spa instead -+ * Return the dsl_dir_t, and possibly the last component which couldn't -+ * be found in *tail. The name must be in the specified dsl_pool_t. This -+ * thread must hold the dp_config_rwlock for the pool. Returns NULL if the -+ * path is bogus, or if tail==NULL and we couldn't parse the whole name. -+ * (*tail)[0] == '@' means that the last component is a snapshot. - */ - int --dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, -+dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, - dsl_dir_t **ddp, const char **tailp) -@@ -305,10 +309,6 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - char *buf; -- const char *next, *nextnext = NULL; -+ const char *spaname, *next, *nextnext = NULL; - int err; - dsl_dir_t *dd; -- dsl_pool_t *dp; - uint64_t ddobj; -- int openedspa = FALSE; -- -- dprintf("%s\n", name); - -@@ -316,24 +316,16 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - err = getcomponent(name, buf, &next); -- if (err) -+ if (err != 0) - goto error; -- if (spa == NULL) { -- err = spa_open(buf, &spa, FTAG); -- if (err) { -- dprintf("spa_open(%s) failed\n", buf); -- goto error; -- } -- openedspa = TRUE; - -- /* XXX this assertion belongs in spa_open */ -- ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); -+ /* Make sure the name is in the specified pool. 
*/ -+ spaname = spa_name(dp->dp_spa); -+ if (strcmp(buf, spaname) != 0) { -+ err = SET_ERROR(EINVAL); -+ goto error; - } - -- dp = spa_get_dsl(spa); -+ ASSERT(dsl_pool_config_held(dp)); - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); -- if (err) { -- rw_exit(&dp->dp_config_rwlock); -- if (openedspa) -- spa_close(spa, FTAG); -+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); -+ if (err != 0) { - goto error; -@@ -344,3 +336,3 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - err = getcomponent(next, buf, &nextnext); -- if (err) -+ if (err != 0) - break; -@@ -355,3 +347,3 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - buf, sizeof (ddobj), 1, &ddobj); -- if (err) { -+ if (err != 0) { - if (err == ENOENT) -@@ -361,6 +353,6 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - -- err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); -- if (err) -+ err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); -+ if (err != 0) - break; -- dsl_dir_close(dd, tag); -+ dsl_dir_rele(dd, tag); - dd = child_ds; -@@ -368,8 +360,5 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - } -- rw_exit(&dp->dp_config_rwlock); - -- if (err) { -- dsl_dir_close(dd, tag); -- if (openedspa) -- spa_close(spa, FTAG); -+ if (err != 0) { -+ dsl_dir_rele(dd, tag); - goto error; -@@ -384,10 +373,8 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - /* bad path name */ -- dsl_dir_close(dd, tag); -+ dsl_dir_rele(dd, tag); - dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } -- if (tailp) -+ if (tailp != NULL) - *tailp = next; -- if (openedspa) -- spa_close(spa, FTAG); - *ddp = dd; -@@ -398,14 +385,2 @@ error: - --/* -- * Return the dsl_dir_t, and possibly the last component which couldn't -- * be found in *tail. Return NULL if the path is bogus, or if -- * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' -- * means that the last component is a snapshot. -- */ --int --dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) --{ -- return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); --} -- - uint64_t -@@ -447,73 +422,2 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, - --/* ARGSUSED */ --int --dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_pool_t *dp = dd->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- int err; -- uint64_t count; -- -- /* -- * There should be exactly two holds, both from -- * dsl_dataset_destroy: one on the dd directory, and one on its -- * head ds. If there are more holds, then a concurrent thread is -- * performing a lookup inside this dir while we're trying to destroy -- * it. To minimize this possibility, we perform this check only -- * in syncing context and fail the operation if we encounter -- * additional holds. The dp_config_rwlock ensures that nobody else -- * opens it after we check. 
-- */ -- if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2) -- return (EBUSY); -- -- err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); -- if (err) -- return (err); -- if (count != 0) -- return (EEXIST); -- -- return (0); --} -- --void --dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -- dsl_prop_setarg_t psa; -- uint64_t value = 0; -- uint64_t obj; -- dd_used_t t; -- -- ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); -- ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); -- -- /* Remove our reservation. */ -- dsl_prop_setarg_init_uint64(&psa, "reservation", -- (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), -- &value); -- psa.psa_effective_value = 0; /* predict default value */ -- -- dsl_dir_set_reservation_sync(ds, &psa, tx); -- -- ASSERT0(dd->dd_phys->dd_used_bytes); -- ASSERT0(dd->dd_phys->dd_reserved); -- for (t = 0; t < DD_USED_NUM; t++) -- ASSERT0(dd->dd_phys->dd_used_breakdown[t]); -- -- VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); -- VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); -- VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); -- VERIFY(0 == zap_remove(mos, -- dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); -- -- obj = dd->dd_object; -- dsl_dir_close(dd, tag); -- VERIFY(0 == dmu_object_free(mos, obj, tx)); --} -- - boolean_t -@@ -540,2 +444,4 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) - dd->dd_phys->dd_compressed_bytes)); -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, -+ dd->dd_phys->dd_uncompressed_bytes); - if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { -@@ -553,3 +459,2 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) - -- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - if (dsl_dir_is_clone(dd)) { -@@ -558,3 +463,3 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) - -- VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, -+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_origin_obj, FTAG, &ds)); -@@ -564,3 +469,2 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) - } -- rw_exit(&dd->dd_pool->dp_config_rwlock); - } -@@ -574,3 +478,3 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) - -- if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { -+ if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { - /* up the hold count until we can be written out */ -@@ -688,3 +592,2 @@ struct tempreserve { - list_node_t tr_node; -- dsl_pool_t *tr_dp; - dsl_dir_t *tr_ds; -@@ -734,2 +637,3 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, - mutex_exit(&dd->dd_lock); -+ DMU_TX_STAT_BUMP(dmu_tx_quota); - return (error); -@@ -782,3 +686,4 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, - mutex_exit(&dd->dd_lock); -- return (retval); -+ DMU_TX_STAT_BUMP(dmu_tx_quota); -+ return (SET_ERROR(retval)); - } -@@ -839,10 +744,17 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - list_insert_tail(tr_list, tr); -- -- err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); - } else { - if (err == EAGAIN) { -- txg_delay(dd->dd_pool, tx->tx_txg, 1); -- err = ERESTART; -+ /* -+ * If arc_memory_throttle() detected that pageout -+ * is running and we are low on memory, we delay new -+ * non-pageout transactions to give pageout an -+ * advantage. -+ * -+ * It is unfortunate to be delaying while the caller's -+ * locks are held. 
-+ */ -+ txg_delay(dd->dd_pool, tx->tx_txg, -+ MSEC2NSEC(10), MSEC2NSEC(10)); -+ err = SET_ERROR(ERESTART); - } -- dsl_pool_memory_pressure(dd->dd_pool); - } -@@ -850,9 +762,2 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - if (err == 0) { -- struct tempreserve *tr; -- -- tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE); -- tr->tr_dp = dd->dd_pool; -- tr->tr_size = asize; -- list_insert_tail(tr_list, tr); -- - err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, -@@ -861,3 +766,3 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - -- if (err) -+ if (err != 0) - dsl_dir_tempreserve_clear(tr_list, tx); -@@ -885,6 +790,4 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) - -- while ((tr = list_head(tr_list))) { -- if (tr->tr_dp) { -- dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx); -- } else if (tr->tr_ds) { -+ while ((tr = list_head(tr_list)) != NULL) { -+ if (tr->tr_ds) { - mutex_enter(&tr->tr_ds->dd_lock); -@@ -904,4 +807,14 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) - --static void --dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) -+/* -+ * This should be called from open context when we think we're going to write -+ * or free space, for example when dirtying data. Be conservative; it's okay -+ * to write less space or free more, but we don't want to write more or free -+ * less than the amount specified. -+ * -+ * NOTE: The behavior of this function is identical to the Illumos / FreeBSD -+ * version however it has been adjusted to use an iterative rather then -+ * recursive algorithm to minimize stack usage. -+ */ -+void -+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) - { -@@ -910,28 +823,18 @@ dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) - -- mutex_enter(&dd->dd_lock); -- if (space > 0) -- dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; -- -- est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; -- parent_space = parent_delta(dd, est_used, space); -- mutex_exit(&dd->dd_lock); -+ do { -+ mutex_enter(&dd->dd_lock); -+ if (space > 0) -+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - -- /* Make sure that we clean up dd_space_to* */ -- dsl_dir_dirty(dd, tx); -+ est_used = dsl_dir_space_towrite(dd) + -+ dd->dd_phys->dd_used_bytes; -+ parent_space = parent_delta(dd, est_used, space); -+ mutex_exit(&dd->dd_lock); - -- /* XXX this is potentially expensive and unnecessary... */ -- if (parent_space && dd->dd_parent) -- dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx); --} -+ /* Make sure that we clean up dd_space_to* */ -+ dsl_dir_dirty(dd, tx); - --/* -- * Call in open context when we think we're going to write/free space, -- * eg. when dirtying data. Be conservative (ie. OK to write less than -- * this or free more than this, but don't write more or free less). -- */ --void --dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) --{ -- dsl_pool_willuse_space(dd->dd_pool, space, tx); -- dsl_dir_willuse_space_impl(dd, space, tx); -+ dd = dd->dd_parent; -+ space = parent_space; -+ } while (space && dd); - } -@@ -944,2 +847,10 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - int64_t accounted_delta; -+ -+ /* -+ * dsl_dataset_set_refreservation_sync_impl() calls this with -+ * dd_lock held, so that it can atomically update -+ * ds->ds_reserved and the dsl_dir accounting, so that -+ * dsl_dataset_check_quota() can see dataset and dir accounting -+ * consistently. 
-+ */ - boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); -@@ -949,2 +860,4 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - -+ dmu_buf_will_dirty(dd->dd_dbuf, tx); -+ - if (needlock) -@@ -957,3 +870,2 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); -- dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_used_bytes += used; -@@ -992,4 +904,2 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - { -- boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); -- - ASSERT(dmu_tx_is_syncing(tx)); -@@ -1001,4 +911,4 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - -- if (needlock) -- mutex_enter(&dd->dd_lock); -+ dmu_buf_will_dirty(dd->dd_dbuf, tx); -+ mutex_enter(&dd->dd_lock); - ASSERT(delta > 0 ? -@@ -1007,25 +917,39 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); -- dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_used_breakdown[oldtype] -= delta; - dd->dd_phys->dd_used_breakdown[newtype] += delta; -- if (needlock) -- mutex_exit(&dd->dd_lock); -+ mutex_exit(&dd->dd_lock); - } - -+typedef struct dsl_dir_set_qr_arg { -+ const char *ddsqra_name; -+ zprop_source_t ddsqra_source; -+ uint64_t ddsqra_value; -+} dsl_dir_set_qr_arg_t; -+ - static int --dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_prop_setarg_t *psa = arg2; -- int err; -- uint64_t towrite; -+ dsl_dir_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ uint64_t towrite, newval; - -- if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) -- return (err); -+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_prop_predict(ds->ds_dir, "quota", -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } - -- if (psa->psa_effective_value == 0) -+ if (newval == 0) { -+ dsl_dataset_rele(ds, FTAG); - return (0); -+ } - -- mutex_enter(&dd->dd_lock); -+ mutex_enter(&ds->ds_dir->dd_lock); - /* -@@ -1036,30 +960,41 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- towrite = dsl_dir_space_towrite(dd); -+ towrite = dsl_dir_space_towrite(ds->ds_dir); - if ((dmu_tx_is_syncing(tx) || towrite == 0) && -- (psa->psa_effective_value < dd->dd_phys->dd_reserved || -- psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { -- err = ENOSPC; -+ (newval < ds->ds_dir->dd_phys->dd_reserved || -+ newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) { -+ error = SET_ERROR(ENOSPC); - } -- mutex_exit(&dd->dd_lock); -- return (err); -+ mutex_exit(&ds->ds_dir->dd_lock); -+ dsl_dataset_rele(ds, FTAG); -+ return (error); - } - --extern dsl_syncfunc_t dsl_prop_set_sync; -- - static void --dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value = psa->psa_effective_value; -+ dsl_dir_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ uint64_t newval; - -- dsl_prop_set_sync(ds, psa, tx); -- DSL_PROP_CHECK_PREDICTION(dd, psa); -+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - -- dmu_buf_will_dirty(dd->dd_dbuf, tx); -+ if 
(spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { -+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), -+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, -+ &ddsqra->ddsqra_value, tx); - -- mutex_enter(&dd->dd_lock); -- dd->dd_phys->dd_quota = effective_value; -- mutex_exit(&dd->dd_lock); -+ VERIFY0(dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); -+ } else { -+ newval = ddsqra->ddsqra_value; -+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", -+ zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); -+ } -+ -+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); -+ mutex_enter(&ds->ds_dir->dd_lock); -+ ds->ds_dir->dd_phys->dd_quota = newval; -+ mutex_exit(&ds->ds_dir->dd_lock); -+ dsl_dataset_rele(ds, FTAG); - } -@@ -1069,33 +1004,10 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) - { -- dsl_dir_t *dd; -- dsl_dataset_t *ds; -- dsl_prop_setarg_t psa; -- int err; -+ dsl_dir_set_qr_arg_t ddsqra; - -- dsl_prop_setarg_init_uint64(&psa, "quota", source, "a); -+ ddsqra.ddsqra_name = ddname; -+ ddsqra.ddsqra_source = source; -+ ddsqra.ddsqra_value = quota; - -- err = dsl_dataset_hold(ddname, FTAG, &ds); -- if (err) -- return (err); -- -- err = dsl_dir_open(ddname, FTAG, &dd, NULL); -- if (err) { -- dsl_dataset_rele(ds, FTAG); -- return (err); -- } -- -- ASSERT(ds->ds_dir == dd); -- -- /* -- * If someone removes a file, then tries to set the quota, we want to -- * make sure the file freeing takes effect. -- */ -- txg_wait_open(dd->dd_pool, 0); -- -- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, -- dsl_dir_set_quota_sync, ds, &psa, 0); -- -- dsl_dir_close(dd, FTAG); -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ return (dsl_sync_task(ddname, dsl_dir_set_quota_check, -+ dsl_dir_set_quota_sync, &ddsqra, 0)); - } -@@ -1103,15 +1015,15 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) - int --dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value; -- uint64_t used, avail; -- int err; -- -- if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) -- return (err); -+ dsl_dir_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ dsl_dir_t *dd; -+ uint64_t newval, used, avail; -+ int error; - -- effective_value = psa->psa_effective_value; -+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ dd = ds->ds_dir; - -@@ -1121,4 +1033,14 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (!dmu_tx_is_syncing(tx)) -+ if (!dmu_tx_is_syncing(tx)) { -+ dsl_dataset_rele(ds, FTAG); - return (0); -+ } -+ -+ error = dsl_prop_predict(ds->ds_dir, -+ zfs_prop_to_name(ZFS_PROP_RESERVATION), -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } - -@@ -1135,23 +1057,19 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - -- if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { -- uint64_t delta = MAX(used, effective_value) - -+ if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) { -+ uint64_t delta = MAX(used, newval) - - MAX(used, dd->dd_phys->dd_reserved); - -- if (delta > avail) -- return (ENOSPC); -- if (dd->dd_phys->dd_quota > 0 && -- effective_value > 
dd->dd_phys->dd_quota) -- return (ENOSPC); -+ if (delta > avail || -+ (dd->dd_phys->dd_quota > 0 && -+ newval > dd->dd_phys->dd_quota)) -+ error = SET_ERROR(ENOSPC); - } - -- return (0); -+ dsl_dataset_rele(ds, FTAG); -+ return (error); - } - --static void --dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+void -+dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value = psa->psa_effective_value; - uint64_t used; -@@ -1159,5 +1077,2 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- dsl_prop_set_sync(ds, psa, tx); -- DSL_PROP_CHECK_PREDICTION(dd, psa); -- - dmu_buf_will_dirty(dd->dd_dbuf, tx); -@@ -1166,5 +1081,4 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - used = dd->dd_phys->dd_used_bytes; -- delta = MAX(used, effective_value) - -- MAX(used, dd->dd_phys->dd_reserved); -- dd->dd_phys->dd_reserved = effective_value; -+ delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved); -+ dd->dd_phys->dd_reserved = value; - -@@ -1178,31 +1092,43 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - --int --dsl_dir_set_reservation(const char *ddname, zprop_source_t source, -- uint64_t reservation) -+static void -+dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd; -+ dsl_dir_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; -- dsl_prop_setarg_t psa; -- int err; -+ uint64_t newval; - -- dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); -+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - -- err = dsl_dataset_hold(ddname, FTAG, &ds); -- if (err) -- return (err); -+ if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { -+ dsl_prop_set_sync_impl(ds, -+ zfs_prop_to_name(ZFS_PROP_RESERVATION), -+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, -+ &ddsqra->ddsqra_value, tx); - -- err = dsl_dir_open(ddname, FTAG, &dd, NULL); -- if (err) { -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ VERIFY0(dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); -+ } else { -+ newval = ddsqra->ddsqra_value; -+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", -+ zfs_prop_to_name(ZFS_PROP_RESERVATION), -+ (longlong_t)newval); - } - -- ASSERT(ds->ds_dir == dd); -+ dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); -+ dsl_dataset_rele(ds, FTAG); -+} -+ -+int -+dsl_dir_set_reservation(const char *ddname, zprop_source_t source, -+ uint64_t reservation) -+{ -+ dsl_dir_set_qr_arg_t ddsqra; - -- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, -- dsl_dir_set_reservation_sync, ds, &psa, 0); -+ ddsqra.ddsqra_name = ddname; -+ ddsqra.ddsqra_source = source; -+ ddsqra.ddsqra_value = reservation; - -- dsl_dir_close(dd, FTAG); -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, -+ dsl_dir_set_reservation_sync, &ddsqra, 0)); - } -@@ -1238,36 +1164,70 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) - --struct renamearg { -- dsl_dir_t *newparent; -- const char *mynewname; --}; -+typedef struct dsl_dir_rename_arg { -+ const char *ddra_oldname; -+ const char *ddra_newname; -+} dsl_dir_rename_arg_t; - -+/* ARGSUSED */ - static int --dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) - { -- 
dsl_dir_t *dd = arg1; -- struct renamearg *ra = arg2; -- dsl_pool_t *dp = dd->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- int err; -- uint64_t val; -+ int *deltap = arg; -+ char namebuf[MAXNAMELEN]; - -- /* -- * There should only be one reference, from dmu_objset_rename(). -- * Fleeting holds are also possible (eg, from "zfs list" getting -- * stats), but any that are present in open context will likely -- * be gone by syncing context, so only fail from syncing -- * context. -- */ -- if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1) -- return (EBUSY); -- -- /* check for existing name */ -- err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, -- ra->mynewname, 8, 1, &val); -- if (err == 0) -- return (EEXIST); -- if (err != ENOENT) -- return (err); -+ dsl_dataset_name(ds, namebuf); - -- if (ra->newparent != dd->dd_parent) { -+ if (strlen(namebuf) + *deltap >= MAXNAMELEN) -+ return (SET_ERROR(ENAMETOOLONG)); -+ return (0); -+} -+ -+static int -+dsl_dir_rename_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dir_rename_arg_t *ddra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *dd, *newparent; -+ const char *mynewname; -+ int error; -+ int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); -+ -+ /* target dir should exist */ -+ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); -+ if (error != 0) -+ return (error); -+ -+ /* new parent should exist */ -+ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, -+ &newparent, &mynewname); -+ if (error != 0) { -+ dsl_dir_rele(dd, FTAG); -+ return (error); -+ } -+ -+ /* can't rename to different pool */ -+ if (dd->dd_pool != newparent->dd_pool) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (SET_ERROR(ENXIO)); -+ } -+ -+ /* new name should not already exist */ -+ if (mynewname == NULL) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (SET_ERROR(EEXIST)); -+ } -+ -+ /* if the name length is growing, validate child name lengths */ -+ if (delta > 0) { -+ error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, -+ &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -+ if (error != 0) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (error); -+ } -+ } -+ -+ if (newparent != dd->dd_parent) { - /* is there enough space? 
*/ -@@ -1277,10 +1237,19 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) - /* no rename into our descendant */ -- if (closest_common_ancestor(dd, ra->newparent) == dd) -- return (EINVAL); -+ if (closest_common_ancestor(dd, newparent) == dd) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } - -- if ((err = dsl_dir_transfer_possible(dd->dd_parent, -- ra->newparent, myspace))) -- return (err); -+ error = dsl_dir_transfer_possible(dd->dd_parent, -+ newparent, myspace); -+ if (error != 0) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (error); -+ } - } - -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); - return (0); -@@ -1289,13 +1258,20 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- struct renamearg *ra = arg2; -- dsl_pool_t *dp = dd->dd_pool; -+ dsl_dir_rename_arg_t *ddra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *dd, *newparent; -+ const char *mynewname; -+ int error; - objset_t *mos = dp->dp_meta_objset; -- int err; - -- ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); -+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); -+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, -+ &mynewname)); - -- if (ra->newparent != dd->dd_parent) { -+ /* Log this before we change the name. */ -+ spa_history_log_internal_dd(dd, "rename", tx, -+ "-> %s", ddra->ddra_newname); -+ -+ if (newparent != dd->dd_parent) { - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -@@ -1304,3 +1280,3 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -dd->dd_phys->dd_uncompressed_bytes, tx); -- dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, -+ dsl_dir_diduse_space(newparent, DD_USED_CHILD, - dd->dd_phys->dd_used_bytes, -@@ -1315,3 +1291,3 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -unused_rsrv, 0, 0, tx); -- dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, -+ dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, - unused_rsrv, 0, 0, tx); -@@ -1323,19 +1299,24 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - /* remove from old parent zapobj */ -- err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, -+ error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, - dd->dd_myname, tx); -- ASSERT0(err); -+ ASSERT0(error); - -- (void) strcpy(dd->dd_myname, ra->mynewname); -- dsl_dir_close(dd->dd_parent, dd); -- dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; -- VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, -- ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); -+ (void) strcpy(dd->dd_myname, mynewname); -+ dsl_dir_rele(dd->dd_parent, dd); -+ dd->dd_phys->dd_parent_obj = newparent->dd_object; -+ VERIFY0(dsl_dir_hold_obj(dp, -+ newparent->dd_object, NULL, dd, &dd->dd_parent)); - - /* add to new parent zapobj */ -- err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, -- dd->dd_myname, 8, 1, &dd->dd_object, tx); -- ASSERT0(err); -+ VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj, -+ dd->dd_myname, 8, 1, &dd->dd_object, tx)); -+ -+#ifdef _KERNEL -+ zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); -+#endif - -- spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, -- tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); -+ dsl_prop_notify_all(dd); -+ -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); 
- } -@@ -1343,30 +1324,11 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - int --dsl_dir_rename(dsl_dir_t *dd, const char *newname) -+dsl_dir_rename(const char *oldname, const char *newname) - { -- struct renamearg ra; -- int err; -- -- /* new parent should exist */ -- err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); -- if (err) -- return (err); -- -- /* can't rename to different pool */ -- if (dd->dd_pool != ra.newparent->dd_pool) { -- err = ENXIO; -- goto out; -- } -- -- /* new name should not already exist */ -- if (ra.mynewname == NULL) { -- err = EEXIST; -- goto out; -- } -+ dsl_dir_rename_arg_t ddra; - -- err = dsl_sync_task_do(dd->dd_pool, -- dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); -+ ddra.ddra_oldname = oldname; -+ ddra.ddra_newname = newname; - --out: -- dsl_dir_close(ra.newparent, FTAG); -- return (err); -+ return (dsl_sync_task(oldname, -+ dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3)); - } -@@ -1384,3 +1346,3 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) - if (avail < space) -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - -@@ -1415,4 +1377,2 @@ EXPORT_SYMBOL(dsl_dir_set_quota); - EXPORT_SYMBOL(dsl_dir_set_reservation); --EXPORT_SYMBOL(dsl_dir_open); --EXPORT_SYMBOL(dsl_dir_close); - #endif -diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c -index 7795d80..0ef5071 100644 ---- a/module/zfs/dsl_pool.c -+++ b/module/zfs/dsl_pool.c -@@ -22,3 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ -@@ -45,210 +46,86 @@ - #include -+#include - --int zfs_no_write_throttle = 0; --int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ --int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ --int zfs_txg_history = 60; /* statistics for the last N txgs */ -- --unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ --unsigned long zfs_write_limit_max = 0; /* max data payload per txg */ --unsigned long zfs_write_limit_inflated = 0; --unsigned long zfs_write_limit_override = 0; -- --kmutex_t zfs_write_limit_lock; -- --static pgcnt_t old_physmem = 0; -- --static void --dsl_pool_tx_assign_init(dsl_pool_t *dp, unsigned int ndata) --{ -- kstat_named_t *ks; -- char name[KSTAT_STRLEN]; -- int i, data_size = ndata * sizeof(kstat_named_t); -- -- (void) snprintf(name, KSTAT_STRLEN, "dmu_tx_assign-%s", -- spa_name(dp->dp_spa)); -- -- dp->dp_tx_assign_size = ndata; -- -- if (data_size) -- dp->dp_tx_assign_buckets = kmem_alloc(data_size, KM_SLEEP); -- else -- dp->dp_tx_assign_buckets = NULL; -- -- for (i = 0; i < dp->dp_tx_assign_size; i++) { -- ks = &dp->dp_tx_assign_buckets[i]; -- ks->data_type = KSTAT_DATA_UINT64; -- ks->value.ui64 = 0; -- (void) snprintf(ks->name, KSTAT_STRLEN, "%u us", 1 << i); -- } -- -- dp->dp_tx_assign_kstat = kstat_create("zfs", 0, name, "misc", -- KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); -- -- if (dp->dp_tx_assign_kstat) { -- dp->dp_tx_assign_kstat->ks_data = dp->dp_tx_assign_buckets; -- dp->dp_tx_assign_kstat->ks_ndata = dp->dp_tx_assign_size; -- dp->dp_tx_assign_kstat->ks_data_size = data_size; -- kstat_install(dp->dp_tx_assign_kstat); -- } --} -- --static void --dsl_pool_tx_assign_destroy(dsl_pool_t *dp) --{ -- if (dp->dp_tx_assign_buckets) -- kmem_free(dp->dp_tx_assign_buckets, -- dp->dp_tx_assign_size * 
sizeof(kstat_named_t)); -- -- if (dp->dp_tx_assign_kstat) -- kstat_delete(dp->dp_tx_assign_kstat); --} -- --void --dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs) --{ -- uint64_t idx = 0; -- -- while (((1 << idx) < usecs) && (idx < dp->dp_tx_assign_size - 1)) -- idx++; -- -- atomic_inc_64(&dp->dp_tx_assign_buckets[idx].value.ui64); --} -- --static int --dsl_pool_txg_history_update(kstat_t *ksp, int rw) --{ -- dsl_pool_t *dp = ksp->ks_private; -- txg_history_t *th; -- int i = 0; -- -- if (rw == KSTAT_WRITE) -- return (EACCES); -- -- if (ksp->ks_data) -- kmem_free(ksp->ks_data, ksp->ks_data_size); -- -- mutex_enter(&dp->dp_lock); -- -- ksp->ks_ndata = dp->dp_txg_history_size; -- ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t); -- if (ksp->ks_data_size > 0) -- ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE); -- -- /* Traversed oldest to youngest for the most readable kstat output */ -- for (th = list_tail(&dp->dp_txg_history); th != NULL; -- th = list_prev(&dp->dp_txg_history, th)) { -- mutex_enter(&th->th_lock); -- ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size); -- memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t)); -- i += sizeof(kstat_txg_t); -- mutex_exit(&th->th_lock); -- } -- -- mutex_exit(&dp->dp_lock); -- -- return (0); --} -- --static void --dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg) --{ -- char name[KSTAT_STRLEN]; -- -- list_create(&dp->dp_txg_history, sizeof (txg_history_t), -- offsetof(txg_history_t, th_link)); -- dsl_pool_txg_history_add(dp, txg); -- -- (void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa)); -- dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc", -- KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL); -- if (dp->dp_txg_kstat) { -- dp->dp_txg_kstat->ks_data = NULL; -- dp->dp_txg_kstat->ks_private = dp; -- dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update; -- kstat_install(dp->dp_txg_kstat); -- } --} -- --static void --dsl_pool_txg_history_destroy(dsl_pool_t *dp) --{ -- txg_history_t *th; -- -- if (dp->dp_txg_kstat) { -- if (dp->dp_txg_kstat->ks_data) -- kmem_free(dp->dp_txg_kstat->ks_data, -- dp->dp_txg_kstat->ks_data_size); -- -- kstat_delete(dp->dp_txg_kstat); -- } -- -- mutex_enter(&dp->dp_lock); -- while ((th = list_remove_head(&dp->dp_txg_history))) { -- dp->dp_txg_history_size--; -- mutex_destroy(&th->th_lock); -- kmem_free(th, sizeof(txg_history_t)); -- } -- -- ASSERT3U(dp->dp_txg_history_size, ==, 0); -- list_destroy(&dp->dp_txg_history); -- mutex_exit(&dp->dp_lock); --} -- --txg_history_t * --dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg) --{ -- txg_history_t *th, *rm; -- -- th = kmem_zalloc(sizeof(txg_history_t), KM_PUSHPAGE); -- mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL); -- th->th_kstat.txg = txg; -- th->th_kstat.state = TXG_STATE_OPEN; -- th->th_kstat.birth = gethrtime(); -- -- mutex_enter(&dp->dp_lock); -- -- list_insert_head(&dp->dp_txg_history, th); -- dp->dp_txg_history_size++; -- -- while (dp->dp_txg_history_size > zfs_txg_history) { -- dp->dp_txg_history_size--; -- rm = list_remove_tail(&dp->dp_txg_history); -- mutex_destroy(&rm->th_lock); -- kmem_free(rm, sizeof(txg_history_t)); -- } -- -- mutex_exit(&dp->dp_lock); -+/* -+ * ZFS Write Throttle -+ * ------------------ -+ * -+ * ZFS must limit the rate of incoming writes to the rate at which it is able -+ * to sync data modifications to the backend storage. 
Throttling by too much -+ * creates an artificial limit; throttling by too little can only be sustained -+ * for short periods and would lead to highly lumpy performance. On a per-pool -+ * basis, ZFS tracks the amount of modified (dirty) data. As operations change -+ * data, the amount of dirty data increases; as ZFS syncs out data, the amount -+ * of dirty data decreases. When the amount of dirty data exceeds a -+ * predetermined threshold further modifications are blocked until the amount -+ * of dirty data decreases (as data is synced out). -+ * -+ * The limit on dirty data is tunable, and should be adjusted according to -+ * both the IO capacity and available memory of the system. The larger the -+ * window, the more ZFS is able to aggregate and amortize metadata (and data) -+ * changes. However, memory is a limited resource, and allowing for more dirty -+ * data comes at the cost of keeping other useful data in memory (for example -+ * ZFS data cached by the ARC). -+ * -+ * Implementation -+ * -+ * As buffers are modified dsl_pool_willuse_space() increments both the per- -+ * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of -+ * dirty space used; dsl_pool_dirty_space() decrements those values as data -+ * is synced out from dsl_pool_sync(). While only the poolwide value is -+ * relevant, the per-txg value is useful for debugging. The tunable -+ * zfs_dirty_data_max determines the dirty space limit. Once that value is -+ * exceeded, new writes are halted until space frees up. -+ * -+ * The zfs_dirty_data_sync tunable dictates the threshold at which we -+ * ensure that there is a txg syncing (see the comment in txg.c for a full -+ * description of transaction group stages). -+ * -+ * The IO scheduler uses both the dirty space limit and current amount of -+ * dirty data as inputs. Those values affect the number of concurrent IOs ZFS -+ * issues. See the comment in vdev_queue.c for details of the IO scheduler. -+ * -+ * The delay is also calculated based on the amount of dirty data. See the -+ * comment above dmu_tx_delay() for details. -+ */ - -- return (th); --} -+/* -+ * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, -+ * capped at zfs_dirty_data_max_max. It can also be overridden with a module -+ * parameter. -+ */ -+unsigned long zfs_dirty_data_max = 0; -+unsigned long zfs_dirty_data_max_max = 0; -+int zfs_dirty_data_max_percent = 10; -+int zfs_dirty_data_max_max_percent = 25; - - /* -- * Traversed youngest to oldest because lookups are only done for open -- * or syncing txgs which are guaranteed to be at the head of the list. -- * The txg_history_t structure will be returned locked. -+ * If there is at least this much dirty data, push out a txg. - */ --txg_history_t * --dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg) --{ -- txg_history_t *th; -+unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024; - -- mutex_enter(&dp->dp_lock); -- for (th = list_head(&dp->dp_txg_history); th != NULL; -- th = list_next(&dp->dp_txg_history, th)) { -- if (th->th_kstat.txg == txg) { -- mutex_enter(&th->th_lock); -- break; -- } -- } -- mutex_exit(&dp->dp_lock); -+/* -+ * Once there is this amount of dirty data, the dmu_tx_delay() will kick in -+ * and delay each transaction. -+ * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. -+ */ -+int zfs_delay_min_dirty_percent = 60; - -- return (th); --} -+/* -+ * This controls how quickly the delay approaches infinity. 
-+ * Larger values cause it to delay more for a given amount of dirty data. -+ * Therefore larger values will cause there to be less dirty data for a -+ * given throughput. -+ * -+ * For the smoothest delay, this value should be about 1 billion divided -+ * by the maximum number of operations per second. This will smoothly -+ * handle between 10x and 1/10th this number. -+ * -+ * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the -+ * multiply in dmu_tx_delay(). -+ */ -+unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; - --void --dsl_pool_txg_history_put(txg_history_t *th) --{ -- mutex_exit(&th->th_lock); --} -+hrtime_t zfs_throttle_delay = MSEC2NSEC(10); -+hrtime_t zfs_throttle_resolution = MSEC2NSEC(10); - -@@ -266,3 +143,3 @@ dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) - -- return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); -+ return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); - } -@@ -278,4 +155,3 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) - dp->dp_meta_rootbp = *bp; -- rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); -- dp->dp_write_limit = zfs_write_limit_min; -+ rrw_init(&dp->dp_config_rwlock, B_TRUE); - txg_init(dp, txg); -@@ -289,5 +165,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) - txg_list_create(&dp->dp_sync_tasks, -- offsetof(dsl_sync_task_group_t, dstg_node)); -+ offsetof(dsl_sync_task_t, dst_node)); - - mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); -+ cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); - -@@ -296,5 +173,2 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) - -- dsl_pool_txg_history_init(dp, txg); -- dsl_pool_tx_assign_init(dp, 32); -- - return (dp); -@@ -326,3 +200,3 @@ dsl_pool_open(dsl_pool_t *dp) - -- rw_enter(&dp->dp_config_rwlock, RW_WRITER); -+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, -@@ -333,3 +207,3 @@ dsl_pool_open(dsl_pool_t *dp) - -- err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, -+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir); -@@ -354,3 +228,3 @@ dsl_pool_open(dsl_pool_t *dp) - } -- dsl_dir_close(dd, dp); -+ dsl_dir_rele(dd, dp); - if (err) -@@ -369,3 +243,3 @@ dsl_pool_open(dsl_pool_t *dp) - goto out; -- VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, -+ VERIFY0(bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); -@@ -402,3 +276,3 @@ dsl_pool_open(dsl_pool_t *dp) - out: -- rw_exit(&dp->dp_config_rwlock); -+ rrw_exit(&dp->dp_config_rwlock, FTAG); - return (err); -@@ -409,5 +283,5 @@ dsl_pool_close(dsl_pool_t *dp) - { -- /* drop our references from dsl_pool_open() */ -- - /* -+ * Drop our references from dsl_pool_open(). 
-+ * - * Since we held the origin_snap from "syncing" context (which -@@ -417,9 +291,9 @@ dsl_pool_close(dsl_pool_t *dp) - if (dp->dp_origin_snap) -- dsl_dataset_drop_ref(dp->dp_origin_snap, dp); -+ dsl_dataset_rele(dp->dp_origin_snap, dp); - if (dp->dp_mos_dir) -- dsl_dir_close(dp->dp_mos_dir, dp); -+ dsl_dir_rele(dp->dp_mos_dir, dp); - if (dp->dp_free_dir) -- dsl_dir_close(dp->dp_free_dir, dp); -+ dsl_dir_rele(dp->dp_free_dir, dp); - if (dp->dp_root_dir) -- dsl_dir_close(dp->dp_root_dir, dp); -+ dsl_dir_rele(dp->dp_root_dir, dp); - -@@ -439,5 +313,3 @@ dsl_pool_close(dsl_pool_t *dp) - dsl_scan_fini(dp); -- dsl_pool_tx_assign_destroy(dp); -- dsl_pool_txg_history_destroy(dp); -- rw_destroy(&dp->dp_config_rwlock); -+ rrw_destroy(&dp->dp_config_rwlock); - mutex_destroy(&dp->dp_lock); -@@ -459,2 +331,4 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - -+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); -+ - /* create and open the MOS (meta-objset) */ -@@ -469,3 +343,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - /* Initialize scan structures */ -- VERIFY3U(0, ==, dsl_scan_init(dp, txg)); -+ VERIFY0(dsl_scan_init(dp, txg)); - -@@ -473,3 +347,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); -- VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, -+ VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir)); -@@ -478,3 +352,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); -- VERIFY(0 == dsl_pool_open_special_dir(dp, -+ VERIFY0(dsl_pool_open_special_dir(dp, - MOS_DIR_NAME, &dp->dp_mos_dir)); -@@ -485,3 +359,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - FREE_DIR_NAME, tx); -- VERIFY(0 == dsl_pool_open_special_dir(dp, -+ VERIFY0(dsl_pool_open_special_dir(dp, - FREE_DIR_NAME, &dp->dp_free_dir)); -@@ -492,3 +366,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); -- VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, -+ VERIFY0(bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); -@@ -503,3 +377,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - /* create the root objset */ -- VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); -+ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); - VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds, -@@ -513,2 +387,4 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - -+ rrw_exit(&dp->dp_config_rwlock, FTAG); -+ - return (dp); -@@ -535,6 +411,3 @@ deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - dsl_deadlist_t *dl = arg; -- dsl_pool_t *dp = dmu_objset_pool(dl->dl_os); -- rw_enter(&dp->dp_config_rwlock, RW_READER); - dsl_deadlist_insert(dl, bp, tx); -- rw_exit(&dp->dp_config_rwlock); - return (0); -@@ -542,2 +415,30 @@ deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - -+static void -+dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) -+{ -+ zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -+ dmu_objset_sync(dp->dp_meta_objset, zio, tx); -+ VERIFY0(zio_wait(zio)); -+ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); -+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); -+} -+ -+static void -+dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) -+{ -+ ASSERT(MUTEX_HELD(&dp->dp_lock)); -+ -+ if (delta < 0) -+ ASSERT3U(-delta, <=, 
dp->dp_dirty_total); -+ -+ dp->dp_dirty_total += delta; -+ -+ /* -+ * Note: we signal even when increasing dp_dirty_total. -+ * This ensures forward progress -- each thread wakes the next waiter. -+ */ -+ if (dp->dp_dirty_total <= zfs_dirty_data_max) -+ cv_signal(&dp->dp_spaceavail_cv); -+} -+ - void -@@ -550,5 +451,2 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - objset_t *mos = dp->dp_meta_objset; -- hrtime_t start, write_time; -- uint64_t data_written; -- int err; - list_t synced_datasets; -@@ -558,17 +456,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - -- /* -- * We need to copy dp_space_towrite() before doing -- * dsl_sync_task_group_sync(), because -- * dsl_dataset_snapshot_reserve_space() will increase -- * dp_space_towrite but not actually write anything. -- */ -- data_written = dp->dp_space_towrite[txg & TXG_MASK]; -- - tx = dmu_tx_create_assigned(dp, txg); - -- dp->dp_read_overhead = 0; -- start = gethrtime(); -- -+ /* -+ * Write out all dirty blocks of dirty datasets. -+ */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -- while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) { -+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { - /* -@@ -582,8 +472,12 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - } -- DTRACE_PROBE(pool_sync__1setup); -- err = zio_wait(zio); -+ VERIFY0(zio_wait(zio)); - -- write_time = gethrtime() - start; -- ASSERT(err == 0); -- DTRACE_PROBE(pool_sync__2rootzio); -+ /* -+ * We have written all of the accounted dirty data, so our -+ * dp_space_towrite should now be zero. However, some seldom-used -+ * code paths do not adhere to this (e.g. dbuf_undirty(), also -+ * rounding error in dbuf_write_physdone). -+ * Shore up the accounting of any dirtied space now. -+ */ -+ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); - -@@ -593,5 +487,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - */ -- for (ds = list_head(&synced_datasets); ds; -- ds = list_next(&synced_datasets, ds)) -+ for (ds = list_head(&synced_datasets); ds != NULL; -+ ds = list_next(&synced_datasets, ds)) { - dmu_objset_do_userquota_updates(ds->ds_objset, tx); -+ } - -@@ -605,3 +500,3 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -- while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) { -+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { - ASSERT(list_link_active(&ds->ds_synced_link)); -@@ -610,3 +505,3 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - } -- err = zio_wait(zio); -+ VERIFY0(zio_wait(zio)); - -@@ -617,6 +512,5 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - * - move dead blocks from the pending deadlist to the on-disk deadlist -- * - clean up zil records - * - release hold from dsl_dataset_dirty() - */ -- while ((ds = list_remove_head(&synced_datasets))) { -+ while ((ds = list_remove_head(&synced_datasets)) != NULL) { - ASSERTV(objset_t *os = ds->ds_objset); -@@ -628,6 +522,5 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - -- start = gethrtime(); -- while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg))) -+ while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { - dsl_dir_sync(dd, tx); -- write_time += gethrtime() - start; -+ } - -@@ -649,16 +542,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - -- start = gethrtime(); - if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || - list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { -- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -- 
dmu_objset_sync(mos, zio, tx); -- err = zio_wait(zio); -- ASSERT(err == 0); -- dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); -- spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); -+ dsl_pool_sync_mos(dp, tx); - } -- write_time += gethrtime() - start; -- DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, -- hrtime_t, dp->dp_read_overhead); -- write_time -= dp->dp_read_overhead; - -@@ -673,5 +556,4 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - */ -- DTRACE_PROBE(pool_sync__3task); - if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { -- dsl_sync_task_group_t *dstg; -+ dsl_sync_task_t *dst; - /* -@@ -680,5 +562,5 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - */ -- ASSERT(spa_sync_pass(dp->dp_spa) == 1); -- while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) -- dsl_sync_task_group_sync(dstg, tx); -+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); -+ while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) -+ dsl_sync_task_sync(dst, tx); - } -@@ -687,43 +569,3 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - -- dp->dp_space_towrite[txg & TXG_MASK] = 0; -- ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); -- -- /* -- * If the write limit max has not been explicitly set, set it -- * to a fraction of available physical memory (default 1/8th). -- * Note that we must inflate the limit because the spa -- * inflates write sizes to account for data replication. -- * Check this each sync phase to catch changing memory size. -- */ -- if (physmem != old_physmem && zfs_write_limit_shift) { -- mutex_enter(&zfs_write_limit_lock); -- old_physmem = physmem; -- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; -- zfs_write_limit_inflated = MAX(zfs_write_limit_min, -- spa_get_asize(dp->dp_spa, zfs_write_limit_max)); -- mutex_exit(&zfs_write_limit_lock); -- } -- -- /* -- * Attempt to keep the sync time consistent by adjusting the -- * amount of write traffic allowed into each transaction group. -- * Weight the throughput calculation towards the current value: -- * thru = 3/4 old_thru + 1/4 new_thru -- * -- * Note: write_time is in nanosecs, so write_time/MICROSEC -- * yields millisecs -- */ -- ASSERT(zfs_write_limit_min > 0); -- if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { -- uint64_t throughput = data_written / (write_time / MICROSEC); -- -- if (dp->dp_throughput) -- dp->dp_throughput = throughput / 4 + -- 3 * dp->dp_throughput / 4; -- else -- dp->dp_throughput = throughput; -- dp->dp_write_limit = MIN(zfs_write_limit_inflated, -- MAX(zfs_write_limit_min, -- dp->dp_throughput * zfs_txg_synctime_ms)); -- } -+ DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); - } -@@ -734,6 +576,5 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) - zilog_t *zilog; -- dsl_dataset_t *ds; - - while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) { -- ds = dmu_objset_ds(zilog->zl_os); -+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - zil_clean(zilog, txg); -@@ -779,44 +620,15 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) - --int --dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) -+boolean_t -+dsl_pool_need_dirty_delay(dsl_pool_t *dp) - { -- uint64_t reserved = 0; -- uint64_t write_limit = (zfs_write_limit_override ? -- zfs_write_limit_override : dp->dp_write_limit); -- -- if (zfs_no_write_throttle) { -- atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -- space); -- return (0); -- } -- -- /* -- * Check to see if we have exceeded the maximum allowed IO for -- * this transaction group. 
We can do this without locks since -- * a little slop here is ok. Note that we do the reserved check -- * with only half the requested reserve: this is because the -- * reserve requests are worst-case, and we really don't want to -- * throttle based off of worst-case estimates. -- */ -- if (write_limit > 0) { -- reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] -- + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; -- -- if (reserved && reserved > write_limit) { -- DMU_TX_STAT_BUMP(dmu_tx_write_limit); -- return (ERESTART); -- } -- } -- -- atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); -- -- /* -- * If this transaction group is over 7/8ths capacity, delay -- * the caller 1 clock tick. This will slow down the "fill" -- * rate until the sync process can catch up with us. -- */ -- if (reserved && reserved > (write_limit - (write_limit >> 3))) -- txg_delay(dp, tx->tx_txg, 1); -+ uint64_t delay_min_bytes = -+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; -+ boolean_t rv; - -- return (0); -+ mutex_enter(&dp->dp_lock); -+ if (dp->dp_dirty_total > zfs_dirty_data_sync) -+ txg_kick(dp); -+ rv = (dp->dp_dirty_total > delay_min_bytes); -+ mutex_exit(&dp->dp_lock); -+ return (rv); - } -@@ -824,6 +636,10 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) - void --dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) -+dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) - { -- ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); -- atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); -+ if (space > 0) { -+ mutex_enter(&dp->dp_lock); -+ dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; -+ dsl_pool_dirty_delta(dp, space); -+ mutex_exit(&dp->dp_lock); -+ } - } -@@ -831,26 +647,18 @@ dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) - void --dsl_pool_memory_pressure(dsl_pool_t *dp) -+dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) - { -- uint64_t space_inuse = 0; -- int i; -- -- if (dp->dp_write_limit == zfs_write_limit_min) -+ ASSERT3S(space, >=, 0); -+ if (space == 0) - return; - -- for (i = 0; i < TXG_SIZE; i++) { -- space_inuse += dp->dp_space_towrite[i]; -- space_inuse += dp->dp_tempreserved[i]; -- } -- dp->dp_write_limit = MAX(zfs_write_limit_min, -- MIN(dp->dp_write_limit, space_inuse / 4)); --} -- --void --dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) --{ -- if (space > 0) { -- mutex_enter(&dp->dp_lock); -- dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; -- mutex_exit(&dp->dp_lock); -+ mutex_enter(&dp->dp_lock); -+ if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { -+ /* XXX writing something we didn't dirty? 
*/ -+ space = dp->dp_dirty_pertxg[txg & TXG_MASK]; - } -+ ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); -+ dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; -+ ASSERT3U(dp->dp_dirty_total, >=, space); -+ dsl_pool_dirty_delta(dp, -space); -+ mutex_exit(&dp->dp_lock); - } -@@ -859,3 +667,3 @@ dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) - static int --upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -+upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) - { -@@ -864,5 +672,4 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - int err; -- dsl_pool_t *dp = spa_get_dsl(spa); - -- err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); -+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) -@@ -892,3 +699,3 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - */ -- ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); -+ ASSERT0(prev->ds_phys->ds_bp.blk_birth); - -@@ -912,3 +719,3 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - ASSERT(ds->ds_prev == NULL); -- VERIFY(0 == dsl_dataset_hold_obj(dp, -+ VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); -@@ -917,4 +724,4 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - -- ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); -- ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); -+ ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object); -+ ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object); - -@@ -926,3 +733,3 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - } -- VERIFY(0 == zap_add_int(dp->dp_meta_objset, -+ VERIFY0(zap_add_int(dp->dp_meta_objset, - prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); -@@ -941,3 +748,3 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) - -- VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, -+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, - tx, DS_FIND_CHILDREN)); -@@ -947,15 +754,11 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) - static int --upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -+upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) - { - dmu_tx_t *tx = arg; -- dsl_dataset_t *ds; -- dsl_pool_t *dp = spa_get_dsl(spa); - objset_t *mos = dp->dp_meta_objset; - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -- -- if (ds->ds_dir->dd_phys->dd_origin_obj) { -+ if (ds->ds_dir->dd_phys->dd_origin_obj != 0) { - dsl_dataset_t *origin; - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, -+ VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); -@@ -968,4 +771,4 @@ upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - -- VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, -- origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); -+ VERIFY0(zap_add_int(dp->dp_meta_objset, -+ origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx)); - -@@ -973,4 +776,2 @@ upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - } -- -- dsl_dataset_rele(ds, FTAG); - return (0); -@@ -986,3 +787,3 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); -- VERIFY(0 == dsl_pool_open_special_dir(dp, -+ VERIFY0(dsl_pool_open_special_dir(dp, - FREE_DIR_NAME, 
&dp->dp_free_dir)); -@@ -996,8 +797,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) - SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); -- VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, -+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); -- VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, -- dp->dp_meta_objset, obj)); -+ VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); - -- VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, -+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); -@@ -1013,13 +813,12 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) - ASSERT(dp->dp_origin_snap == NULL); -+ ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); - - /* create the origin dir, ds, & snap-ds */ -- rw_enter(&dp->dp_config_rwlock, RW_WRITER); - dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, - NULL, 0, kcred, tx); -- VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -- dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx); -- VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, -+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -+ dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); -+ VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - dp, &dp->dp_origin_snap)); - dsl_dataset_rele(ds, FTAG); -- rw_exit(&dp->dp_config_rwlock); - } -@@ -1043,2 +842,3 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) - uint64_t zapobj = dp->dp_tmp_userrefs_obj; -+ nvlist_t *holds; - -@@ -1048,2 +848,4 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) - -+ holds = fnvlist_alloc(); -+ - for (zap_cursor_init(&zc, mos, zapobj); -@@ -1052,3 +854,3 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) - char *htag; -- uint64_t dsobj; -+ nvlist_t *tags; - -@@ -1057,5 +859,13 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) - ++htag; -- dsobj = strtonum(za.za_name, NULL); -- (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE); -+ if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { -+ tags = fnvlist_alloc(); -+ fnvlist_add_boolean(tags, htag); -+ fnvlist_add_nvlist(holds, za.za_name, tags); -+ fnvlist_free(tags); -+ } else { -+ fnvlist_add_boolean(tags, htag); -+ } - } -+ dsl_dataset_user_release_tmp(dp, holds); -+ fnvlist_free(holds); - zap_cursor_fini(&zc); -@@ -1080,3 +890,3 @@ static int - dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, -- const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) -+ const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) - { -@@ -1099,3 +909,3 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, - } else { -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1105,3 +915,3 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, - if (holding) -- error = zap_add(mos, zapobj, name, 8, 1, now, tx); -+ error = zap_add(mos, zapobj, name, 8, 1, &now, tx); - else -@@ -1118,3 +928,3 @@ int - dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, -- uint64_t *now, dmu_tx_t *tx) -+ uint64_t now, dmu_tx_t *tx) - { -@@ -1130,3 +940,3 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - { -- return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, -+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, - tx, B_FALSE)); -@@ -1134,26 +944,134 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - -+/* -+ * DSL Pool Configuration Lock -+ * 
-+ * The dp_config_rwlock protects against changes to DSL state (e.g. dataset -+ * creation / destruction / rename / property setting). It must be held for -+ * read to hold a dataset or dsl_dir. I.e. you must call -+ * dsl_pool_config_enter() or dsl_pool_hold() before calling -+ * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock -+ * must be held continuously until all datasets and dsl_dirs are released. -+ * -+ * The only exception to this rule is that if a "long hold" is placed on -+ * a dataset, then the dp_config_rwlock may be dropped while the dataset -+ * is still held. The long hold will prevent the dataset from being -+ * destroyed -- the destroy will fail with EBUSY. A long hold can be -+ * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset -+ * (by calling dsl_{dataset,objset}_{try}own{_obj}). -+ * -+ * Legitimate long-holders (including owners) should be long-running, cancelable -+ * tasks that should cause "zfs destroy" to fail. This includes DMU -+ * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), -+ * "zfs send", and "zfs diff". There are several other long-holders whose -+ * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). -+ * -+ * The usual formula for long-holding would be: -+ * dsl_pool_hold() -+ * dsl_dataset_hold() -+ * ... perform checks ... -+ * dsl_dataset_long_hold() -+ * dsl_pool_rele() -+ * ... perform long-running task ... -+ * dsl_dataset_long_rele() -+ * dsl_dataset_rele() -+ * -+ * Note that when the long hold is released, the dataset is still held but -+ * the pool is not held. The dataset may change arbitrarily during this time -+ * (e.g. it could be destroyed). Therefore you shouldn't do anything to the -+ * dataset except release it. -+ * -+ * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only -+ * or modifying operations. -+ * -+ * Modifying operations should generally use dsl_sync_task(). The synctask -+ * infrastructure enforces proper locking strategy with respect to the -+ * dp_config_rwlock. See the comment above dsl_sync_task() for details. -+ * -+ * Read-only operations will manually hold the pool, then the dataset, obtain -+ * information from the dataset, then release the pool and dataset. -+ * dmu_objset_{hold,rele}() are convenience routines that also do the pool -+ * hold/rele. -+ */ -+ -+int -+dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) -+{ -+ spa_t *spa; -+ int error; -+ -+ error = spa_open(name, &spa, tag); -+ if (error == 0) { -+ *dp = spa_get_dsl(spa); -+ dsl_pool_config_enter(*dp, tag); -+ } -+ return (error); -+} -+ -+void -+dsl_pool_rele(dsl_pool_t *dp, void *tag) -+{ -+ dsl_pool_config_exit(dp, tag); -+ spa_close(dp->dp_spa, tag); -+} -+ -+void -+dsl_pool_config_enter(dsl_pool_t *dp, void *tag) -+{ -+ /* -+ * We use a "reentrant" reader-writer lock, but not reentrantly. -+ * -+ * The rrwlock can (with the track_all flag) track all reading threads, -+ * which is very useful for debugging which code path failed to release -+ * the lock, and for verifying that the *current* thread does hold -+ * the lock. -+ * -+ * (Unlike a rwlock, which knows that N threads hold it for -+ * read, but not *which* threads, so rw_held(RW_READER) returns TRUE -+ * if any thread holds it for read, even if this thread doesn't). 
-+ */ -+ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); -+ rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); -+} -+ -+void -+dsl_pool_config_exit(dsl_pool_t *dp, void *tag) -+{ -+ rrw_exit(&dp->dp_config_rwlock, tag); -+} -+ -+boolean_t -+dsl_pool_config_held(dsl_pool_t *dp) -+{ -+ return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); -+} -+ - #if defined(_KERNEL) && defined(HAVE_SPL) --module_param(zfs_no_write_throttle, int, 0644); --MODULE_PARM_DESC(zfs_no_write_throttle, "Disable write throttling"); -+EXPORT_SYMBOL(dsl_pool_config_enter); -+EXPORT_SYMBOL(dsl_pool_config_exit); - --module_param(zfs_write_limit_shift, int, 0444); --MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg"); -+/* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ -+module_param(zfs_dirty_data_max_percent, int, 0444); -+MODULE_PARM_DESC(zfs_dirty_data_max_percent, "percent of ram can be dirty"); - --module_param(zfs_txg_synctime_ms, int, 0644); --MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync"); -+/* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */ -+module_param(zfs_dirty_data_max_max_percent, int, 0444); -+MODULE_PARM_DESC(zfs_dirty_data_max_max_percent, -+ "zfs_dirty_data_max upper bound as % of RAM"); - --module_param(zfs_txg_history, int, 0644); --MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); -+module_param(zfs_delay_min_dirty_percent, int, 0644); -+MODULE_PARM_DESC(zfs_delay_min_dirty_percent, "transaction delay threshold"); - --module_param(zfs_write_limit_min, ulong, 0444); --MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit"); -+module_param(zfs_dirty_data_max, ulong, 0644); -+MODULE_PARM_DESC(zfs_dirty_data_max, "determines the dirty space limit"); - --module_param(zfs_write_limit_max, ulong, 0444); --MODULE_PARM_DESC(zfs_write_limit_max, "Max txg write limit"); -+/* zfs_dirty_data_max_max only applied at module load in arc_init(). */ -+module_param(zfs_dirty_data_max_max, ulong, 0444); -+MODULE_PARM_DESC(zfs_dirty_data_max_max, -+ "zfs_dirty_data_max upper bound in bytes"); - --module_param(zfs_write_limit_inflated, ulong, 0444); --MODULE_PARM_DESC(zfs_write_limit_inflated, "Inflated txg write limit"); -+module_param(zfs_dirty_data_sync, ulong, 0644); -+MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data"); - --module_param(zfs_write_limit_override, ulong, 0444); --MODULE_PARM_DESC(zfs_write_limit_override, "Override txg write limit"); -+module_param(zfs_delay_scale, ulong, 0644); -+MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity"); - #endif -diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c -index e44a948..079ef97 100644 ---- a/module/zfs/dsl_prop.c -+++ b/module/zfs/dsl_prop.c -@@ -22,2 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Martin Matuska. All rights reserved. 
- */ -@@ -53,3 +55,3 @@ dodefault(const char *propname, int intsz, int numints, void *buf) - (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -57,3 +59,3 @@ dodefault(const char *propname, int intsz, int numints, void *buf) - if (intsz != 1) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - (void) strncpy(buf, zfs_prop_default_string(prop), -@@ -62,3 +64,3 @@ dodefault(const char *propname, int intsz, int numints, void *buf) - if (intsz != 8 || numints < 1) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - -@@ -83,3 +85,3 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - -- ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); -+ ASSERT(dsl_pool_config_held(dd->dd_pool)); - -@@ -98,4 +100,2 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - for (; dd != NULL; dd = dd->dd_parent) { -- ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); -- - if (dd != target || snapshot) { -@@ -147,3 +147,3 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - */ -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } -@@ -168,3 +168,3 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, - -- ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); -+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); -@@ -232,3 +232,2 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, - dsl_dir_t *dd = ds->ds_dir; -- dsl_pool_t *dp = dd->dd_pool; - uint64_t value; -@@ -236,14 +235,9 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, - int err; -- int need_rwlock; -+ ASSERTV(dsl_pool_t *dp = dd->dd_pool); - -- need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock); -- if (need_rwlock) -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ ASSERT(dsl_pool_config_held(dp)); - -- err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL); -- if (err != 0) { -- if (need_rwlock) -- rw_exit(&dp->dp_config_rwlock); -+ err = dsl_prop_get_int_ds(ds, propname, &value); -+ if (err != 0) - return (err); -- } - -@@ -260,5 +254,2 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, - cbr->cbr_func(cbr->cbr_arg, value); -- -- if (need_rwlock) -- rw_exit(&dp->dp_config_rwlock); - return (0); -@@ -270,15 +261,14 @@ dsl_prop_get(const char *dsname, const char *propname, - { -- dsl_dataset_t *ds; -- int err; -+ objset_t *os; -+ int error; - -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -+ error = dmu_objset_hold(dsname, FTAG, &os); -+ if (error != 0) -+ return (error); - -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint); -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); -+ error = dsl_prop_get_ds(dmu_objset_ds(os), propname, -+ intsz, numints, buf, setpoint); - -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ dmu_objset_rele(os, FTAG); -+ return (error); - } -@@ -300,13 +290,7 @@ dsl_prop_get_integer(const char *ddname, const char *propname, - --void --dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, -- zprop_source_t source, uint64_t *value) -+int -+dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, -+ uint64_t *valuep) - { -- psa->psa_name = propname; -- psa->psa_source = source; -- psa->psa_intsz = 8; -- psa->psa_numints = 1; -- psa->psa_value = value; -- -- psa->psa_effective_value = -1ULL; -+ return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); - } -@@ -324,7 +308,6 @@ 
dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, - int --dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) -+dsl_prop_predict(dsl_dir_t *dd, const char *propname, -+ zprop_source_t source, uint64_t value, uint64_t *newvalp) - { -- const char *propname = psa->psa_name; - zfs_prop_t prop = zfs_name_to_prop(propname); -- zprop_source_t source = psa->psa_source; - objset_t *mos; -@@ -360,9 +343,8 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) - /* Revert to the received value, if any. */ -- err = zap_lookup(mos, zapobj, recvdstr, 8, 1, -- &psa->psa_effective_value); -+ err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); - if (err == ENOENT) -- psa->psa_effective_value = 0; -+ *newvalp = 0; - break; - case ZPROP_SRC_LOCAL: -- psa->psa_effective_value = *(uint64_t *)psa->psa_value; -+ *newvalp = value; - break; -@@ -373,6 +355,5 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) - */ -- err = zap_lookup(mos, zapobj, propname, 8, 1, -- &psa->psa_effective_value); -+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); - if (err == ENOENT) -- psa->psa_effective_value = *(uint64_t *)psa->psa_value; -+ *newvalp = value; - break; -@@ -383,9 +364,8 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) - */ -- err = zap_lookup(mos, zapobj, propname, 8, 1, -- &psa->psa_effective_value); -+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); - if (err == ENOENT) -- psa->psa_effective_value = 0; -+ *newvalp = 0; - break; - default: -- cmn_err(CE_PANIC, "unexpected property source: %d", source); -+ panic("unexpected property source: %d", source); - } -@@ -400,38 +380,5 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) - --#ifdef ZFS_DEBUG --void --dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa) --{ -- zfs_prop_t prop = zfs_name_to_prop(psa->psa_name); -- uint64_t intval; -- char setpoint[MAXNAMELEN]; -- uint64_t version = spa_version(dd->dd_pool->dp_spa); -- int err; -- -- if (version < SPA_VERSION_RECVD_PROPS) { -- switch (prop) { -- case ZFS_PROP_QUOTA: -- case ZFS_PROP_RESERVATION: -- return; -- default: -- break; -- } -- } -- -- err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval, -- setpoint, B_FALSE); -- if (err == 0 && intval != psa->psa_effective_value) { -- cmn_err(CE_PANIC, "%s property, source: %x, " -- "predicted effective value: %llu, " -- "actual effective value: %llu (setpoint: %s)", -- psa->psa_name, psa->psa_source, -- (unsigned long long)psa->psa_effective_value, -- (unsigned long long)intval, setpoint); -- } --} --#endif -- - /* - * Unregister this callback. Return 0 on success, ENOENT if ddname is -- * invalid, ENOMSG if no matching callback registered. -+ * invalid, or ENOMSG if no matching callback registered. - */ -@@ -456,3 +403,3 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, - mutex_exit(&dd->dd_lock); -- return (ENOMSG); -+ return (SET_ERROR(ENOMSG)); - } -@@ -467,21 +414,53 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, - --/* -- * Return the number of callbacks that are registered for this dataset. 
-- */ --int --dsl_prop_numcb(dsl_dataset_t *ds) -+boolean_t -+dsl_prop_hascb(dsl_dataset_t *ds) - { - dsl_dir_t *dd = ds->ds_dir; -+ boolean_t rv = B_FALSE; - dsl_prop_cb_record_t *cbr; -- int num = 0; - - mutex_enter(&dd->dd_lock); -- for (cbr = list_head(&dd->dd_prop_cbs); -- cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { -- if (cbr->cbr_ds == ds) -- num++; -+ for (cbr = list_head(&dd->dd_prop_cbs); cbr; -+ cbr = list_next(&dd->dd_prop_cbs, cbr)) { -+ if (cbr->cbr_ds == ds) { -+ rv = B_TRUE; -+ break; -+ } - } - mutex_exit(&dd->dd_lock); -+ return (rv); -+} - -- return (num); -+/* ARGSUSED */ -+static int -+dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -+{ -+ dsl_dir_t *dd = ds->ds_dir; -+ dsl_prop_cb_record_t *cbr; -+ -+ mutex_enter(&dd->dd_lock); -+ for (cbr = list_head(&dd->dd_prop_cbs); cbr; -+ cbr = list_next(&dd->dd_prop_cbs, cbr)) { -+ uint64_t value; -+ -+ if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname, -+ sizeof (value), 1, &value, NULL) == 0) -+ cbr->cbr_func(cbr->cbr_arg, value); -+ } -+ mutex_exit(&dd->dd_lock); -+ -+ return (0); -+} -+ -+/* -+ * Update all property values for ddobj & its descendants. This is used -+ * when renaming the dir. -+ */ -+void -+dsl_prop_notify_all(dsl_dir_t *dd) -+{ -+ dsl_pool_t *dp = dd->dd_pool; -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, -+ NULL, DS_FIND_CHILDREN); - } -@@ -499,4 +478,4 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - -- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); -- err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); - if (err) -@@ -511,3 +490,3 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - if (err == 0) { -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); - return; -@@ -546,3 +525,3 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - zap_cursor_fini(&zc); -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); - } -@@ -550,6 +529,6 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - void --dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, -+ zprop_source_t source, int intsz, int numints, const void *value, -+ dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = arg2; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -@@ -558,3 +537,3 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - char valbuf[32]; -- char *valstr = NULL; -+ const char *valstr = NULL; - char *inheritstr; -@@ -564,4 +543,2 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); -- const char *propname = psa->psa_name; -- zprop_source_t source = psa->psa_source; - -@@ -583,6 +560,2 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (version < SPA_VERSION_RECVD_PROPS) { -- zfs_prop_t prop = zfs_name_to_prop(propname); -- if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) -- return; -- - if (source & ZPROP_SRC_NONE) -@@ -615,4 +588,4 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - ASSERT(err == 0 || err == ENOENT); -- VERIFY(0 == zap_update(mos, zapobj, propname, -- psa->psa_intsz, psa->psa_numints, psa->psa_value, tx)); -+ VERIFY0(zap_update(mos, zapobj, propname, -+ intsz, numints, value, tx)); - break; -@@ -627,8 +600,6 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (version >= 
SPA_VERSION_RECVD_PROPS && -- dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, -- NULL) == 0) { -+ dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { - dummy = 0; -- err = zap_update(mos, zapobj, inheritstr, -- 8, 1, &dummy, tx); -- ASSERT(err == 0); -+ VERIFY0(zap_update(mos, zapobj, inheritstr, -+ 8, 1, &dummy, tx)); - } -@@ -640,3 +611,3 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - err = zap_update(mos, zapobj, recvdstr, -- psa->psa_intsz, psa->psa_numints, psa->psa_value, tx); -+ intsz, numints, value, tx); - ASSERT(err == 0); -@@ -670,3 +641,3 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (isint) { -- VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL)); -+ VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); - -@@ -697,3 +668,3 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (source == ZPROP_SRC_LOCAL) { -- valstr = (char *)psa->psa_value; -+ valstr = value; - } else { -@@ -706,7 +677,5 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- spa_history_log_internal((source == ZPROP_SRC_NONE || -- source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT : -- LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, -- "%s=%s dataset = %llu", propname, -- (valstr == NULL ? "" : valstr), ds->ds_object); -+ spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE || -+ source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx, -+ "%s=%s", propname, (valstr == NULL ? "" : valstr)); - -@@ -716,61 +685,26 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - --void --dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+int -+dsl_prop_set_int(const char *dsname, const char *propname, -+ zprop_source_t source, uint64_t value) - { -- dsl_dataset_t *ds = arg1; -- dsl_props_arg_t *pa = arg2; -- nvlist_t *props = pa->pa_props; -- dsl_prop_setarg_t psa; -- nvpair_t *elem = NULL; -- -- psa.psa_source = pa->pa_source; -- -- while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { -- nvpair_t *pair = elem; -- -- psa.psa_name = nvpair_name(pair); -- -- if (nvpair_type(pair) == DATA_TYPE_NVLIST) { -- /* -- * dsl_prop_get_all_impl() returns properties in this -- * format. 
-- */ -- nvlist_t *attrs; -- VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); -- VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, -- &pair) == 0); -- } -+ nvlist_t *nvl = fnvlist_alloc(); -+ int error; - -- if (nvpair_type(pair) == DATA_TYPE_STRING) { -- VERIFY(nvpair_value_string(pair, -- (char **)&psa.psa_value) == 0); -- psa.psa_intsz = 1; -- psa.psa_numints = strlen(psa.psa_value) + 1; -- } else { -- uint64_t intval; -- VERIFY(nvpair_value_uint64(pair, &intval) == 0); -- psa.psa_intsz = sizeof (intval); -- psa.psa_numints = 1; -- psa.psa_value = &intval; -- } -- dsl_prop_set_sync(ds, &psa, tx); -- } -+ fnvlist_add_uint64(nvl, propname, value); -+ error = dsl_props_set(dsname, source, nvl); -+ fnvlist_free(nvl); -+ return (error); - } - --void --dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, -- dmu_tx_t *tx) -+int -+dsl_prop_set_string(const char *dsname, const char *propname, -+ zprop_source_t source, const char *value) - { -- objset_t *mos = dd->dd_pool->dp_meta_objset; -- uint64_t zapobj = dd->dd_phys->dd_props_zapobj; -- -- ASSERT(dmu_tx_is_syncing(tx)); -- -- VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx)); -- -- dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); -+ nvlist_t *nvl = fnvlist_alloc(); -+ int error; - -- spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, -- "%s=%llu dataset = %llu", name, (u_longlong_t)val, -- dd->dd_phys->dd_head_dataset_obj); -+ fnvlist_add_string(nvl, propname, value); -+ error = dsl_props_set(dsname, source, nvl); -+ fnvlist_free(nvl); -+ return (error); - } -@@ -778,50 +712,25 @@ dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - int --dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source, -- int intsz, int numints, const void *buf) -+dsl_prop_inherit(const char *dsname, const char *propname, -+ zprop_source_t source) - { -- dsl_dataset_t *ds; -- uint64_t version; -- int err; -- dsl_prop_setarg_t psa; -- -- /* -- * We must do these checks before we get to the syncfunc, since -- * it can't fail. -- */ -- if (strlen(propname) >= ZAP_MAXNAMELEN) -- return (ENAMETOOLONG); -- -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -- -- version = spa_version(ds->ds_dir->dd_pool->dp_spa); -- if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? 
-- ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { -- dsl_dataset_rele(ds, FTAG); -- return (E2BIG); -- } -- if (dsl_dataset_is_snapshot(ds) && -- version < SPA_VERSION_SNAP_PROPS) { -- dsl_dataset_rele(ds, FTAG); -- return (ENOTSUP); -- } -+ nvlist_t *nvl = fnvlist_alloc(); -+ int error; - -- psa.psa_name = propname; -- psa.psa_source = source; -- psa.psa_intsz = intsz; -- psa.psa_numints = numints; -- psa.psa_value = buf; -- psa.psa_effective_value = -1ULL; -- -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- NULL, dsl_prop_set_sync, ds, &psa, 2); -- -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ fnvlist_add_boolean(nvl, propname); -+ error = dsl_props_set(dsname, source, nvl); -+ fnvlist_free(nvl); -+ return (error); - } - --int --dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) -+typedef struct dsl_props_set_arg { -+ const char *dpsa_dsname; -+ zprop_source_t dpsa_source; -+ nvlist_t *dpsa_props; -+} dsl_props_set_arg_t; -+ -+static int -+dsl_props_set_check(void *arg, dmu_tx_t *tx) - { -+ dsl_props_set_arg_t *dpsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; -@@ -829,19 +738,16 @@ dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) - nvpair_t *elem = NULL; -- dsl_props_arg_t pa; - int err; - -- if ((err = dsl_dataset_hold(dsname, FTAG, &ds))) -+ err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); -+ if (err != 0) - return (err); -- /* -- * Do these checks before the syncfunc, since it can't fail. -- */ -+ - version = spa_version(ds->ds_dir->dd_pool->dp_spa); -- while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { -+ while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { - if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { - dsl_dataset_rele(ds, FTAG); -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - } - if (nvpair_type(elem) == DATA_TYPE_STRING) { -- char *valstr; -- VERIFY(nvpair_value_string(elem, &valstr) == 0); -+ char *valstr = fnvpair_value_string(elem); - if (strlen(valstr) >= (version < -@@ -855,16 +761,79 @@ dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) - -- if (dsl_dataset_is_snapshot(ds) && -- version < SPA_VERSION_SNAP_PROPS) { -+ if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) { - dsl_dataset_rele(ds, FTAG); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -+ dsl_dataset_rele(ds, FTAG); -+ return (0); -+} -+ -+void -+dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, -+ nvlist_t *props, dmu_tx_t *tx) -+{ -+ nvpair_t *elem = NULL; -+ -+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { -+ nvpair_t *pair = elem; - -- pa.pa_props = props; -- pa.pa_source = source; -+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) { -+ /* -+ * dsl_prop_get_all_impl() returns properties in this -+ * format. 
-+ */ -+ nvlist_t *attrs = fnvpair_value_nvlist(pair); -+ pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); -+ } - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- NULL, dsl_props_set_sync, ds, &pa, 2); -+ if (nvpair_type(pair) == DATA_TYPE_STRING) { -+ const char *value = fnvpair_value_string(pair); -+ dsl_prop_set_sync_impl(ds, nvpair_name(pair), -+ source, 1, strlen(value) + 1, value, tx); -+ } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { -+ uint64_t intval = fnvpair_value_uint64(pair); -+ dsl_prop_set_sync_impl(ds, nvpair_name(pair), -+ source, sizeof (intval), 1, &intval, tx); -+ } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { -+ dsl_prop_set_sync_impl(ds, nvpair_name(pair), -+ source, 0, 0, NULL, tx); -+ } else { -+ panic("invalid nvpair type"); -+ } -+ } -+} -+ -+static void -+dsl_props_set_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_props_set_arg_t *dpsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; - -+ VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); -+ dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); - dsl_dataset_rele(ds, FTAG); -- return (err); -+} -+ -+/* -+ * All-or-nothing; if any prop can't be set, nothing will be modified. -+ */ -+int -+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) -+{ -+ dsl_props_set_arg_t dpsa; -+ int nblks = 0; -+ -+ dpsa.dpsa_dsname = dsname; -+ dpsa.dpsa_source = source; -+ dpsa.dpsa_props = props; -+ -+ /* -+ * If the source includes NONE, then we will only be removing entries -+ * from the ZAP object. In that case don't check for ENOSPC. -+ */ -+ if ((source & ZPROP_SRC_NONE) == 0) -+ nblks = 2 * fnvlist_num_pairs(props); -+ -+ return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, -+ &dpsa, nblks)); - } -@@ -1018,3 +987,3 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ ASSERT(dsl_pool_config_held(dp)); - -@@ -1043,3 +1012,2 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, - out: -- rw_exit(&dp->dp_config_rwlock); - return (err); -@@ -1048,29 +1016,24 @@ out: - boolean_t --dsl_prop_get_hasrecvd(objset_t *os) -+dsl_prop_get_hasrecvd(const char *dsname) - { -- dsl_dataset_t *ds = os->os_dsl_dataset; -- int rc; - uint64_t dummy; - -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL); -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); -- ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); -- return (rc == 0); -+ return (0 == -+ dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); - } - --static void --dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) -+static int -+dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) - { -- dsl_dataset_t *ds = os->os_dsl_dataset; -- uint64_t dummy = 0; -- dsl_prop_setarg_t psa; -- -- if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS) -- return; -+ uint64_t version; -+ spa_t *spa; -+ int error = 0; - -- dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy); -+ VERIFY0(spa_open(dsname, &spa, FTAG)); -+ version = spa_version(spa); -+ spa_close(spa, FTAG); - -- (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, -- dsl_prop_set_sync, ds, &psa, 2); -+ if (version >= SPA_VERSION_RECVD_PROPS) -+ error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); -+ return (error); - } -@@ -1081,10 +1044,9 @@ dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) - */ --void 
--dsl_prop_set_hasrecvd(objset_t *os) -+int -+dsl_prop_set_hasrecvd(const char *dsname) - { -- if (dsl_prop_get_hasrecvd(os)) { -- ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); -- return; -- } -- dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL); -+ int error = 0; -+ if (!dsl_prop_get_hasrecvd(dsname)) -+ error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); -+ return (error); - } -@@ -1092,5 +1054,5 @@ dsl_prop_set_hasrecvd(objset_t *os) - void --dsl_prop_unset_hasrecvd(objset_t *os) -+dsl_prop_unset_hasrecvd(const char *dsname) - { -- dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE); -+ VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); - } -@@ -1104,4 +1066,7 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) - int --dsl_prop_get_received(objset_t *os, nvlist_t **nvp) -+dsl_prop_get_received(const char *dsname, nvlist_t **nvp) - { -+ objset_t *os; -+ int error; -+ - /* -@@ -1111,5 +1076,11 @@ dsl_prop_get_received(objset_t *os, nvlist_t **nvp) - */ -- dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ? -+ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? - DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); -- return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags)); -+ -+ error = dmu_objset_hold(dsname, FTAG, &os); -+ if (error != 0) -+ return (error); -+ error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); -+ dmu_objset_rele(os, FTAG); -+ return (error); - } -@@ -1159,4 +1130,2 @@ EXPORT_SYMBOL(dsl_prop_register); - EXPORT_SYMBOL(dsl_prop_unregister); --EXPORT_SYMBOL(dsl_prop_numcb); --EXPORT_SYMBOL(dsl_prop_set); - EXPORT_SYMBOL(dsl_prop_get); -@@ -1166,3 +1135,9 @@ EXPORT_SYMBOL(dsl_prop_get_received); - EXPORT_SYMBOL(dsl_prop_get_ds); -+EXPORT_SYMBOL(dsl_prop_get_int_ds); - EXPORT_SYMBOL(dsl_prop_get_dd); -+EXPORT_SYMBOL(dsl_props_set); -+EXPORT_SYMBOL(dsl_prop_set_int); -+EXPORT_SYMBOL(dsl_prop_set_string); -+EXPORT_SYMBOL(dsl_prop_inherit); -+EXPORT_SYMBOL(dsl_prop_predict); - EXPORT_SYMBOL(dsl_prop_nvlist_add_uint64); -diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c -index 34a4f03..7807f84 100644 ---- a/module/zfs/dsl_scan.c -+++ b/module/zfs/dsl_scan.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -55,3 +55,3 @@ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); - static scan_cb_t dsl_scan_scrub_cb; --static dsl_syncfunc_t dsl_scan_cancel_sync; -+static void dsl_scan_cancel_sync(void *, dmu_tx_t *); - static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); -@@ -93,2 +93,11 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) - -+ /* -+ * It's possible that we're resuming a scan after a reboot so -+ * make sure that the scan_async_destroying flag is initialized -+ * appropriately. -+ */ -+ ASSERT(!scn->scn_async_destroying); -+ scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, -+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]); -+ - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, -@@ -116,2 +125,38 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) - &scn->scn_phys); -+ /* -+ * Detect if the pool contains the signature of #2094. If it -+ * does properly update the scn->scn_phys structure and notify -+ * the administrator by setting an errata for the pool. 
-+ */ -+ if (err == EOVERFLOW) { -+ uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1]; -+ VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24); -+ VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==, -+ (23 * sizeof (uint64_t))); -+ -+ err = zap_lookup(dp->dp_meta_objset, -+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, -+ sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp); -+ if (err == 0) { -+ uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS]; -+ -+ if (overflow & ~DSL_SCAN_FLAGS_MASK || -+ scn->scn_async_destroying) { -+ spa->spa_errata = -+ ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY; -+ return (EOVERFLOW); -+ } -+ -+ bcopy(zaptmp, &scn->scn_phys, -+ SCAN_PHYS_NUMINTS * sizeof (uint64_t)); -+ scn->scn_phys.scn_flags = overflow; -+ -+ /* Required scrub already in progress. */ -+ if (scn->scn_phys.scn_state == DSS_FINISHED || -+ scn->scn_phys.scn_state == DSS_CANCELED) -+ spa->spa_errata = -+ ZPOOL_ERRATA_ZOL_2094_SCRUB; -+ } -+ } -+ - if (err == ENOENT) -@@ -152,8 +197,8 @@ dsl_scan_fini(dsl_pool_t *dp) - static int --dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_scan_setup_check(void *arg, dmu_tx_t *tx) - { -- dsl_scan_t *scn = arg1; -+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - if (scn->scn_phys.scn_state == DSS_SCANNING) -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - -@@ -162,8 +207,7 @@ dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) - --/* ARGSUSED */ - static void --dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) - { -- dsl_scan_t *scn = arg1; -- pool_scan_func_t *funcp = arg2; -+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; -+ pool_scan_func_t *funcp = arg; - dmu_object_type_t ot = 0; -@@ -184,2 +228,3 @@ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) - scn->scn_restart_txg = 0; -+ scn->scn_done_txg = 0; - spa_scan_stat_init(spa); -@@ -194,5 +239,7 @@ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) - &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { -- spa_event_notify(spa, NULL, FM_EREPORT_ZFS_RESILVER_START); -+ spa_event_notify(spa, NULL, -+ FM_EREPORT_ZFS_RESILVER_START); - } else { -- spa_event_notify(spa, NULL, FM_EREPORT_ZFS_SCRUB_START); -+ spa_event_notify(spa, NULL, -+ FM_EREPORT_ZFS_SCRUB_START); - } -@@ -226,3 +273,3 @@ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- spa_history_log_internal(LOG_POOL_SCAN, spa, tx, -+ spa_history_log_internal(spa, "scan setup", tx, - "func=%u mintxg=%llu maxtxg=%llu", -@@ -275,3 +322,3 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) - -- spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx, -+ spa_history_log_internal(spa, "scan done", tx, - "complete=%u", complete); -@@ -310,2 +357,5 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) - scn->scn_phys.scn_end_time = gethrestime_sec(); -+ -+ if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) -+ spa->spa_errata = 0; - } -@@ -314,8 +364,8 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) - static int --dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) - { -- dsl_scan_t *scn = arg1; -+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - if (scn->scn_phys.scn_state != DSS_SCANNING) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - return (0); -@@ -325,5 +375,5 @@ dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) - { -- dsl_scan_t *scn = arg1; -+ dsl_scan_t 
*scn = dmu_tx_pool(tx)->dp_scan; - -@@ -336,8 +386,4 @@ dsl_scan_cancel(dsl_pool_t *dp) - { -- boolean_t complete = B_FALSE; -- int err; -- -- err = dsl_sync_task_do(dp, dsl_scan_cancel_check, -- dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); -- return (err); -+ return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, -+ dsl_scan_cancel_sync, NULL, 3)); - } -@@ -377,3 +423,3 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) - { -- VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, -+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, -@@ -407,3 +453,3 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) - if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || -- (elapsed_nanosecs / MICROSEC > mintime && -+ (NSEC2MSEC(elapsed_nanosecs) > mintime && - txg_sync_waiting(scn->scn_dp)) || -@@ -774,3 +820,3 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, - */ -- if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) { -+ if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { - scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); -@@ -780,3 +826,3 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, - out: -- kmem_free(bp_toread, sizeof(blkptr_t)); -+ kmem_free(bp_toread, sizeof (blkptr_t)); - } -@@ -961,3 +1007,3 @@ struct enqueue_clones_arg { - static int --enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -+enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) - { -@@ -966,6 +1012,8 @@ enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - int err; -- dsl_pool_t *dp = spa->spa_dsl_pool; - dsl_scan_t *scn = dp->dp_scan; - -- err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); -+ if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj) -+ return (0); -+ -+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) -@@ -973,17 +1021,15 @@ enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - -- if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { -- while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { -- dsl_dataset_t *prev; -- err = dsl_dataset_hold_obj(dp, -- ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); -+ while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { -+ dsl_dataset_t *prev; -+ err = dsl_dataset_hold_obj(dp, -+ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - -- dsl_dataset_rele(ds, FTAG); -- if (err) -- return (err); -- ds = prev; -- } -- VERIFY(zap_add_int_key(dp->dp_meta_objset, -- scn->scn_phys.scn_queue_obj, ds->ds_object, -- ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); -+ dsl_dataset_rele(ds, FTAG); -+ if (err) -+ return (err); -+ ds = prev; - } -+ VERIFY(zap_add_int_key(dp->dp_meta_objset, -+ scn->scn_phys.scn_queue_obj, ds->ds_object, -+ ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); - dsl_dataset_rele(ds, FTAG); -@@ -1077,6 +1123,6 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) - if (usenext) { -- VERIFY(zap_join_key(dp->dp_meta_objset, -+ VERIFY0(zap_join_key(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, - scn->scn_phys.scn_queue_obj, -- ds->ds_phys->ds_creation_txg, tx) == 0); -+ ds->ds_phys->ds_creation_txg, tx)); - } else { -@@ -1086,4 +1132,4 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) - -- (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, -- NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); -+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, -+ enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); - } -@@ -1097,3 +1143,3 @@ out: - static int 
--enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -+enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) - { -@@ -1102,6 +1148,5 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - int err; -- dsl_pool_t *dp = spa->spa_dsl_pool; - dsl_scan_t *scn = dp->dp_scan; - -- err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); -+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) -@@ -1226,3 +1271,3 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - if (ddp->ddp_phys_birth == 0 || -- ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg) -+ ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) - continue; -@@ -1263,4 +1308,4 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { -- VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, -- NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); -+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, -+ enqueue_cb, tx, DS_FIND_CHILDREN)); - } else { -@@ -1288,4 +1333,4 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) - bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t)); -- zc = kmem_alloc(sizeof(zap_cursor_t), KM_PUSHPAGE); -- za = kmem_alloc(sizeof(zap_attribute_t), KM_PUSHPAGE); -+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_PUSHPAGE); -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); - -@@ -1323,4 +1368,4 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) - out: -- kmem_free(za, sizeof(zap_attribute_t)); -- kmem_free(zc, sizeof(zap_cursor_t)); -+ kmem_free(za, sizeof (zap_attribute_t)); -+ kmem_free(zc, sizeof (zap_cursor_t)); - } -@@ -1332,5 +1377,8 @@ dsl_scan_free_should_pause(dsl_scan_t *scn) - -+ if (zfs_recover) -+ return (B_FALSE); -+ - elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; - return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || -- (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && -+ (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && - txg_sync_waiting(scn->scn_dp)) || -@@ -1347,3 +1395,3 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - if (dsl_scan_free_should_pause(scn)) -- return (ERESTART); -+ return (SET_ERROR(ERESTART)); - } -@@ -1370,9 +1418,6 @@ dsl_scan_active(dsl_scan_t *scn) - -- if (scn->scn_phys.scn_state == DSS_SCANNING) -+ if (scn->scn_phys.scn_state == DSS_SCANNING || -+ scn->scn_async_destroying) - return (B_TRUE); - -- if (spa_feature_is_active(spa, -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { -- return (B_TRUE); -- } - if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { -@@ -1404,3 +1449,3 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - func, tx->tx_txg); -- dsl_scan_setup_sync(scn, &func, tx); -+ dsl_scan_setup_sync(&func, tx); - } -@@ -1432,2 +1477,3 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { -+ ASSERT(scn->scn_async_destroying); - scn->scn_is_bptree = B_TRUE; -@@ -1438,17 +1484,18 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - scn, tx); -- VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); -- if (err != 0) -- return; -- -- /* disable async destroy feature */ -- spa_feature_decr(spa, -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); -- ASSERT(!spa_feature_is_active(spa, -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); -- VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, -- DMU_POOL_DIRECTORY_OBJECT, -- DMU_POOL_BPTREE_OBJ, tx)); -- VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, -- dp->dp_bptree_obj, tx)); -- dp->dp_bptree_obj = 0; -+ VERIFY0(zio_wait(scn->scn_zio_root)); -+ 
-+ if (err == 0) { -+ zfeature_info_t *feat = &spa_feature_table -+ [SPA_FEATURE_ASYNC_DESTROY]; -+ /* finished; deactivate async destroy feature */ -+ spa_feature_decr(spa, feat, tx); -+ ASSERT(!spa_feature_is_active(spa, feat)); -+ VERIFY0(zap_remove(dp->dp_meta_objset, -+ DMU_POOL_DIRECTORY_OBJECT, -+ DMU_POOL_BPTREE_OBJ, tx)); -+ VERIFY0(bptree_free(dp->dp_meta_objset, -+ dp->dp_bptree_obj, tx)); -+ dp->dp_bptree_obj = 0; -+ scn->scn_async_destroying = B_FALSE; -+ } - } -@@ -1459,3 +1506,3 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - (longlong_t) -- (gethrtime() - scn->scn_sync_start_time) / MICROSEC, -+ NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), - (longlong_t)tx->tx_txg); -@@ -1475,2 +1522,12 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - -+ if (scn->scn_done_txg == tx->tx_txg) { -+ ASSERT(!scn->scn_pausing); -+ /* finished with scan. */ -+ zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); -+ dsl_scan_done(scn, B_TRUE, tx); -+ ASSERT3U(spa->spa_scrub_inflight, ==, 0); -+ dsl_scan_sync_state(scn, tx); -+ return; -+ } -+ - if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= -@@ -1499,3 +1556,5 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - NULL, ZIO_FLAG_CANFAIL); -+ dsl_pool_config_enter(dp, FTAG); - dsl_scan_visit(scn, tx); -+ dsl_pool_config_exit(dp, FTAG); - (void) zio_wait(scn->scn_zio_root); -@@ -1505,8 +1564,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - (longlong_t)scn->scn_visited_this_txg, -- (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); -+ (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); - - if (!scn->scn_pausing) { -- /* finished with scan. */ -- zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg); -- dsl_scan_done(scn, B_TRUE, tx); -+ scn->scn_done_txg = tx->tx_txg + 1; -+ zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", -+ tx->tx_txg, scn->scn_done_txg); - } -@@ -1634,3 +1693,2 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, - int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; -- int zio_priority = 0; - int scan_delay = 0; -@@ -1647,8 +1705,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, - zio_flags |= ZIO_FLAG_SCRUB; -- zio_priority = ZIO_PRIORITY_SCRUB; - needs_io = B_TRUE; - scan_delay = zfs_scrub_delay; -- } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { -+ } else { -+ ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); - zio_flags |= ZIO_FLAG_RESILVER; -- zio_priority = ZIO_PRIORITY_RESILVER; - needs_io = B_FALSE; -@@ -1710,3 +1767,3 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, - zio_nowait(zio_read(NULL, spa, bp, data, size, -- dsl_scan_scrub_done, NULL, zio_priority, -+ dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, - zio_flags, zb)); -@@ -1736,4 +1793,4 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) - -- return (dsl_sync_task_do(dp, dsl_scan_setup_check, -- dsl_scan_setup_sync, dp->dp_scan, &func, 0)); -+ return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, -+ dsl_scan_setup_sync, &func, 0)); - } -diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c -index 75eb507..5f345f4 100644 ---- a/module/zfs/dsl_synctask.c -+++ b/module/zfs/dsl_synctask.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -35,3 +36,3 @@ - static int --dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_null_checkfunc(void *arg, dmu_tx_t *tx) - { -@@ -40,78 +41,64 @@ dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) - --dsl_sync_task_group_t * --dsl_sync_task_group_create(dsl_pool_t *dp) --{ -- dsl_sync_task_group_t *dstg; -- -- dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP); -- list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), -- offsetof(dsl_sync_task_t, dst_node)); -- dstg->dstg_pool = dp; -- -- return (dstg); --} -- --void --dsl_sync_task_create(dsl_sync_task_group_t *dstg, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified) --{ -- dsl_sync_task_t *dst; -- -- if (checkfunc == NULL) -- checkfunc = dsl_null_checkfunc; -- dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP); -- dst->dst_checkfunc = checkfunc; -- dst->dst_syncfunc = syncfunc; -- dst->dst_arg1 = arg1; -- dst->dst_arg2 = arg2; -- list_insert_tail(&dstg->dstg_tasks, dst); -- -- dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT; --} -- -+/* -+ * Called from open context to perform a callback in syncing context. Waits -+ * for the operation to complete. -+ * -+ * The checkfunc will be called from open context as a preliminary check -+ * which can quickly fail. If it succeeds, it will be called again from -+ * syncing context. The checkfunc should generally be designed to work -+ * properly in either context, but if necessary it can check -+ * dmu_tx_is_syncing(tx). -+ * -+ * The synctask infrastructure enforces proper locking strategy with respect -+ * to the dp_config_rwlock -- the lock will always be held when the callbacks -+ * are called. It will be held for read during the open-context (preliminary) -+ * call to the checkfunc, and then held for write from syncing context during -+ * the calls to the check and sync funcs. -+ * -+ * A dataset or pool name can be passed as the first argument. Typically, -+ * the check func will hold, check the return value of the hold, and then -+ * release the dataset. The sync func will VERIFYO(hold()) the dataset. -+ * This is safe because no changes can be made between the check and sync funcs, -+ * and the sync func will only be called if the check func successfully opened -+ * the dataset. -+ */ - int --dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) -+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, -+ dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified) - { -+ spa_t *spa; - dmu_tx_t *tx; -- uint64_t txg; -- dsl_sync_task_t *dst; -+ int err; -+ dsl_sync_task_t dst = { { { NULL } } }; -+ dsl_pool_t *dp; - --top: -- tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir); -- VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); -- -- txg = dmu_tx_get_txg(tx); -- -- /* Do a preliminary error check. */ -- dstg->dstg_err = 0; -- rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER); -- for (dst = list_head(&dstg->dstg_tasks); dst; -- dst = list_next(&dstg->dstg_tasks, dst)) { --#ifdef ZFS_DEBUG -- /* -- * Only check half the time, otherwise, the sync-context -- * check will almost never fail. 
-- */ -- if (spa_get_random(2) == 0) -- continue; --#endif -- dst->dst_err = -- dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); -- if (dst->dst_err) -- dstg->dstg_err = dst->dst_err; -- } -- rw_exit(&dstg->dstg_pool->dp_config_rwlock); -+ err = spa_open(pool, &spa, FTAG); -+ if (err != 0) -+ return (err); -+ dp = spa_get_dsl(spa); - -- if (dstg->dstg_err) { -+top: -+ tx = dmu_tx_create_dd(dp->dp_mos_dir); -+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); -+ -+ dst.dst_pool = dp; -+ dst.dst_txg = dmu_tx_get_txg(tx); -+ dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT; -+ dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc; -+ dst.dst_syncfunc = syncfunc; -+ dst.dst_arg = arg; -+ dst.dst_error = 0; -+ dst.dst_nowaiter = B_FALSE; -+ -+ dsl_pool_config_enter(dp, FTAG); -+ err = dst.dst_checkfunc(arg, tx); -+ dsl_pool_config_exit(dp, FTAG); -+ -+ if (err != 0) { - dmu_tx_commit(tx); -- return (dstg->dstg_err); -+ spa_close(spa, FTAG); -+ return (err); - } - -- /* -- * We don't generally have many sync tasks, so pay the price of -- * add_tail to get the tasks executed in the right order. -- */ -- VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, -- dstg, txg)); -+ VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg)); - -@@ -119,6 +106,6 @@ top: - -- txg_wait_synced(dstg->dstg_pool, txg); -+ txg_wait_synced(dp, dst.dst_txg); - -- if (dstg->dstg_err == EAGAIN) { -- txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE); -+ if (dst.dst_error == EAGAIN) { -+ txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); - goto top; -@@ -126,3 +113,4 @@ top: - -- return (dstg->dstg_err); -+ spa_close(spa, FTAG); -+ return (dst.dst_error); - } -@@ -130,36 +118,29 @@ top: - void --dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) -+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, -+ int blocks_modified, dmu_tx_t *tx) - { -- uint64_t txg; -- -- dstg->dstg_nowaiter = B_TRUE; -- txg = dmu_tx_get_txg(tx); -- /* -- * We don't generally have many sync tasks, so pay the price of -- * add_tail to get the tasks executed in the right order. -- */ -- VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, -- dstg, txg)); --} -+ dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); - --void --dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) --{ -- dsl_sync_task_t *dst; -+ dst->dst_pool = dp; -+ dst->dst_txg = dmu_tx_get_txg(tx); -+ dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; -+ dst->dst_checkfunc = dsl_null_checkfunc; -+ dst->dst_syncfunc = syncfunc; -+ dst->dst_arg = arg; -+ dst->dst_error = 0; -+ dst->dst_nowaiter = B_TRUE; - -- while ((dst = list_head(&dstg->dstg_tasks))) { -- list_remove(&dstg->dstg_tasks, dst); -- kmem_free(dst, sizeof (dsl_sync_task_t)); -- } -- kmem_free(dstg, sizeof (dsl_sync_task_group_t)); -+ VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg)); - } - -+/* -+ * Called in syncing context to execute the synctask. -+ */ - void --dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) -+dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) - { -- dsl_sync_task_t *dst; -- dsl_pool_t *dp = dstg->dstg_pool; -+ dsl_pool_t *dp = dst->dst_pool; - uint64_t quota, used; - -- ASSERT0(dstg->dstg_err); -+ ASSERT0(dst->dst_error); - -@@ -175,4 +156,6 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) - /* MOS space is triple-dittoed, so we multiply by 3. 
*/ -- if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) { -- dstg->dstg_err = ENOSPC; -+ if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) { -+ dst->dst_error = SET_ERROR(ENOSPC); -+ if (dst->dst_nowaiter) -+ kmem_free(dst, sizeof (*dst)); - return; -@@ -181,60 +164,11 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) - /* -- * Check for errors by calling checkfuncs. -+ * Check for errors by calling checkfunc. - */ -- rw_enter(&dp->dp_config_rwlock, RW_WRITER); -- for (dst = list_head(&dstg->dstg_tasks); dst; -- dst = list_next(&dstg->dstg_tasks, dst)) { -- dst->dst_err = -- dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); -- if (dst->dst_err) -- dstg->dstg_err = dst->dst_err; -- } -- -- if (dstg->dstg_err == 0) { -- /* -- * Execute sync tasks. -- */ -- for (dst = list_head(&dstg->dstg_tasks); dst; -- dst = list_next(&dstg->dstg_tasks, dst)) { -- dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); -- } -- } -- rw_exit(&dp->dp_config_rwlock); -- -- if (dstg->dstg_nowaiter) -- dsl_sync_task_group_destroy(dstg); --} -- --int --dsl_sync_task_do(dsl_pool_t *dp, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified) --{ -- dsl_sync_task_group_t *dstg; -- int err; -- -- ASSERT(spa_writeable(dp->dp_spa)); -- -- dstg = dsl_sync_task_group_create(dp); -- dsl_sync_task_create(dstg, checkfunc, syncfunc, -- arg1, arg2, blocks_modified); -- err = dsl_sync_task_group_wait(dstg); -- dsl_sync_task_group_destroy(dstg); -- return (err); --} -- --void --dsl_sync_task_do_nowait(dsl_pool_t *dp, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) --{ -- dsl_sync_task_group_t *dstg; -- -- if (!spa_writeable(dp->dp_spa)) -- return; -- -- dstg = dsl_sync_task_group_create(dp); -- dsl_sync_task_create(dstg, checkfunc, syncfunc, -- arg1, arg2, blocks_modified); -- dsl_sync_task_group_nowait(dstg, tx); -+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); -+ dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx); -+ if (dst->dst_error == 0) -+ dst->dst_syncfunc(dst->dst_arg, tx); -+ rrw_exit(&dp->dp_config_rwlock, FTAG); -+ if (dst->dst_nowaiter) -+ kmem_free(dst, sizeof (*dst)); - } -@@ -242,4 +176,4 @@ dsl_sync_task_do_nowait(dsl_pool_t *dp, - #if defined(_KERNEL) && defined(HAVE_SPL) --EXPORT_SYMBOL(dsl_sync_task_do); --EXPORT_SYMBOL(dsl_sync_task_do_nowait); -+EXPORT_SYMBOL(dsl_sync_task); -+EXPORT_SYMBOL(dsl_sync_task_nowait); - #endif -diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c -new file mode 100644 -index 0000000..e24ed64 ---- /dev/null -+++ b/module/zfs/dsl_userhold.c -@@ -0,0 +1,675 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+typedef struct dsl_dataset_user_hold_arg { -+ nvlist_t *dduha_holds; -+ nvlist_t *dduha_chkholds; -+ nvlist_t *dduha_errlist; -+ minor_t dduha_minor; -+} dsl_dataset_user_hold_arg_t; -+ -+/* -+ * If you add new checks here, you may need to add additional checks to the -+ * "temporary" case in snapshot_check() in dmu_objset.c. -+ */ -+int -+dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, -+ boolean_t temphold, dmu_tx_t *tx) -+{ -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; -+ int error = 0; -+ -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ if (strlen(htag) > MAXNAMELEN) -+ return (SET_ERROR(E2BIG)); -+ /* Tempholds have a more restricted length */ -+ if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) -+ return (SET_ERROR(E2BIG)); -+ -+ /* tags must be unique (if ds already exists) */ -+ if (ds != NULL && ds->ds_phys->ds_userrefs_obj != 0) { -+ uint64_t value; -+ -+ error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, -+ htag, 8, 1, &value); -+ if (error == 0) -+ error = SET_ERROR(EEXIST); -+ else if (error == ENOENT) -+ error = 0; -+ } -+ -+ return (error); -+} -+ -+static int -+dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_user_hold_arg_t *dduha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ -+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) -+ return (SET_ERROR(ENOTSUP)); -+ -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); -+ -+ for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { -+ dsl_dataset_t *ds; -+ int error = 0; -+ char *htag, *name; -+ -+ /* must be a snapshot */ -+ name = nvpair_name(pair); -+ if (strchr(name, '@') == NULL) -+ error = SET_ERROR(EINVAL); -+ -+ if (error == 0) -+ error = nvpair_value_string(pair, &htag); -+ -+ if (error == 0) -+ error = dsl_dataset_hold(dp, name, FTAG, &ds); -+ -+ if (error == 0) { -+ error = dsl_dataset_user_hold_check_one(ds, htag, -+ dduha->dduha_minor != 0, tx); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ -+ if (error == 0) { -+ fnvlist_add_string(dduha->dduha_chkholds, name, htag); -+ } else { -+ /* -+ * We register ENOENT errors so they can be correctly -+ * reported if needed, such as when all holds fail. -+ */ -+ fnvlist_add_int32(dduha->dduha_errlist, name, error); -+ if (error != ENOENT) -+ return (error); -+ } -+ } -+ -+ return (0); -+} -+ -+ -+static void -+dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, -+ const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) -+{ -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ objset_t *mos = dp->dp_meta_objset; -+ uint64_t zapobj; -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ if (ds->ds_phys->ds_userrefs_obj == 0) { -+ /* -+ * This is the first user hold for this dataset. Create -+ * the userrefs zap object. 
-+ */ -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ zapobj = ds->ds_phys->ds_userrefs_obj = -+ zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); -+ } else { -+ zapobj = ds->ds_phys->ds_userrefs_obj; -+ } -+ ds->ds_userrefs++; -+ -+ VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); -+ -+ if (minor != 0) { -+ char name[MAXNAMELEN]; -+ nvlist_t *tags; -+ -+ VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, -+ htag, now, tx)); -+ (void) snprintf(name, sizeof (name), "%llx", -+ (u_longlong_t)ds->ds_object); -+ -+ if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) { -+ VERIFY0(nvlist_alloc(&tags, NV_UNIQUE_NAME, -+ KM_PUSHPAGE)); -+ fnvlist_add_boolean(tags, htag); -+ fnvlist_add_nvlist(tmpholds, name, tags); -+ fnvlist_free(tags); -+ } else { -+ fnvlist_add_boolean(tags, htag); -+ } -+ } -+ -+ spa_history_log_internal_ds(ds, "hold", tx, -+ "tag=%s temp=%d refs=%llu", -+ htag, minor != 0, ds->ds_userrefs); -+} -+ -+typedef struct zfs_hold_cleanup_arg { -+ char zhca_spaname[MAXNAMELEN]; -+ uint64_t zhca_spa_load_guid; -+ nvlist_t *zhca_holds; -+} zfs_hold_cleanup_arg_t; -+ -+static void -+dsl_dataset_user_release_onexit(void *arg) -+{ -+ zfs_hold_cleanup_arg_t *ca = arg; -+ spa_t *spa; -+ int error; -+ -+ error = spa_open(ca->zhca_spaname, &spa, FTAG); -+ if (error != 0) { -+ zfs_dbgmsg("couldn't release holds on pool=%s " -+ "because pool is no longer loaded", -+ ca->zhca_spaname); -+ return; -+ } -+ if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { -+ zfs_dbgmsg("couldn't release holds on pool=%s " -+ "because pool is no longer loaded (guid doesn't match)", -+ ca->zhca_spaname); -+ spa_close(spa, FTAG); -+ return; -+ } -+ -+ (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds); -+ fnvlist_free(ca->zhca_holds); -+ kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); -+ spa_close(spa, FTAG); -+} -+ -+static void -+dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor) -+{ -+ zfs_hold_cleanup_arg_t *ca; -+ -+ if (minor == 0 || nvlist_empty(holds)) { -+ fnvlist_free(holds); -+ return; -+ } -+ -+ ASSERT(spa != NULL); -+ ca = kmem_alloc(sizeof (*ca), KM_PUSHPAGE); -+ -+ (void) strlcpy(ca->zhca_spaname, spa_name(spa), -+ sizeof (ca->zhca_spaname)); -+ ca->zhca_spa_load_guid = spa_load_guid(spa); -+ ca->zhca_holds = holds; -+ VERIFY0(zfs_onexit_add_cb(minor, -+ dsl_dataset_user_release_onexit, ca, NULL)); -+} -+ -+void -+dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, -+ minor_t minor, uint64_t now, dmu_tx_t *tx) -+{ -+ nvlist_t *tmpholds; -+ -+ if (minor != 0) -+ VERIFY0(nvlist_alloc(&tmpholds, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ else -+ tmpholds = NULL; -+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx); -+ dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor); -+} -+ -+static void -+dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_user_hold_arg_t *dduha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvlist_t *tmpholds; -+ nvpair_t *pair; -+ uint64_t now = gethrestime_sec(); -+ -+ if (dduha->dduha_minor != 0) -+ VERIFY0(nvlist_alloc(&tmpholds, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ else -+ tmpholds = NULL; -+ for (pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL); -+ pair != NULL; -+ pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) { -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); -+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, -+ fnvpair_value_string(pair), dduha->dduha_minor, now, tx); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ 
dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor); -+} -+ -+/* -+ * The full semantics of this function are described in the comment above -+ * lzc_hold(). -+ * -+ * To summarize: -+ * holds is nvl of snapname -> holdname -+ * errlist will be filled in with snapname -> error -+ * -+ * The snaphosts must all be in the same pool. -+ * -+ * Holds for snapshots that don't exist will be skipped. -+ * -+ * If none of the snapshots for requested holds exist then ENOENT will be -+ * returned. -+ * -+ * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned -+ * up when the process exits. -+ * -+ * On success all the holds, for snapshots that existed, will be created and 0 -+ * will be returned. -+ * -+ * On failure no holds will be created, the errlist will be filled in, -+ * and an errno will returned. -+ * -+ * In all cases the errlist will contain entries for holds where the snapshot -+ * didn't exist. -+ */ -+int -+dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) -+{ -+ dsl_dataset_user_hold_arg_t dduha; -+ nvpair_t *pair; -+ int ret; -+ -+ pair = nvlist_next_nvpair(holds, NULL); -+ if (pair == NULL) -+ return (0); -+ -+ dduha.dduha_holds = holds; -+ VERIFY0(nvlist_alloc(&dduha.dduha_chkholds, NV_UNIQUE_NAME, -+ KM_PUSHPAGE)); -+ dduha.dduha_errlist = errlist; -+ dduha.dduha_minor = cleanup_minor; -+ -+ ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, -+ dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds)); -+ fnvlist_free(dduha.dduha_chkholds); -+ -+ return (ret); -+} -+ -+typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, -+ dsl_dataset_t **dsp); -+ -+typedef struct dsl_dataset_user_release_arg { -+ dsl_holdfunc_t *ddura_holdfunc; -+ nvlist_t *ddura_holds; -+ nvlist_t *ddura_todelete; -+ nvlist_t *ddura_errlist; -+ nvlist_t *ddura_chkholds; -+} dsl_dataset_user_release_arg_t; -+ -+/* Place a dataset hold on the snapshot identified by passed dsobj string */ -+static int -+dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, -+ dsl_dataset_t **dsp) -+{ -+ return (dsl_dataset_hold_obj(dp, strtonum(dsobj, NULL), tag, dsp)); -+} -+ -+static int -+dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, -+ dsl_dataset_t *ds, nvlist_t *holds, const char *snapname) -+{ -+ uint64_t zapobj; -+ nvlist_t *holds_found; -+ nvpair_t *pair; -+ objset_t *mos; -+ int numholds; -+ -+ if (!dsl_dataset_is_snapshot(ds)) -+ return (SET_ERROR(EINVAL)); -+ -+ if (nvlist_empty(holds)) -+ return (0); -+ -+ numholds = 0; -+ mos = ds->ds_dir->dd_pool->dp_meta_objset; -+ zapobj = ds->ds_phys->ds_userrefs_obj; -+ VERIFY0(nvlist_alloc(&holds_found, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ uint64_t tmp; -+ int error; -+ const char *holdname = nvpair_name(pair); -+ -+ if (zapobj != 0) -+ error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp); -+ else -+ error = SET_ERROR(ENOENT); -+ -+ /* -+ * Non-existent holds are put on the errlist, but don't -+ * cause an overall failure. 
-+ */ -+ if (error == ENOENT) { -+ if (ddura->ddura_errlist != NULL) { -+ char *errtag = kmem_asprintf("%s#%s", -+ snapname, holdname); -+ fnvlist_add_int32(ddura->ddura_errlist, errtag, -+ ENOENT); -+ strfree(errtag); -+ } -+ continue; -+ } -+ -+ if (error != 0) { -+ fnvlist_free(holds_found); -+ return (error); -+ } -+ -+ fnvlist_add_boolean(holds_found, holdname); -+ numholds++; -+ } -+ -+ if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 && -+ ds->ds_userrefs == numholds) { -+ /* we need to destroy the snapshot as well */ -+ if (dsl_dataset_long_held(ds)) { -+ fnvlist_free(holds_found); -+ return (SET_ERROR(EBUSY)); -+ } -+ fnvlist_add_boolean(ddura->ddura_todelete, snapname); -+ } -+ -+ if (numholds != 0) { -+ fnvlist_add_nvlist(ddura->ddura_chkholds, snapname, -+ holds_found); -+ } -+ fnvlist_free(holds_found); -+ -+ return (0); -+} -+ -+static int -+dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_user_release_arg_t *ddura; -+ dsl_holdfunc_t *holdfunc; -+ dsl_pool_t *dp; -+ nvpair_t *pair; -+ -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); -+ -+ dp = dmu_tx_pool(tx); -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ ddura = arg; -+ holdfunc = ddura->ddura_holdfunc; -+ -+ for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { -+ int error; -+ dsl_dataset_t *ds; -+ nvlist_t *holds; -+ const char *snapname = nvpair_name(pair); -+ -+ error = nvpair_value_nvlist(pair, &holds); -+ if (error != 0) -+ error = (SET_ERROR(EINVAL)); -+ else -+ error = holdfunc(dp, snapname, FTAG, &ds); -+ if (error == 0) { -+ error = dsl_dataset_user_release_check_one(ddura, ds, -+ holds, snapname); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ if (error != 0) { -+ if (ddura->ddura_errlist != NULL) { -+ fnvlist_add_int32(ddura->ddura_errlist, -+ snapname, error); -+ } -+ /* -+ * Non-existent snapshots are put on the errlist, -+ * but don't cause an overall failure. -+ */ -+ if (error != ENOENT) -+ return (error); -+ } -+ } -+ -+ return (0); -+} -+ -+static void -+dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, -+ dmu_tx_t *tx) -+{ -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ objset_t *mos = dp->dp_meta_objset; -+ nvpair_t *pair; -+ -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ int error; -+ const char *holdname = nvpair_name(pair); -+ -+ /* Remove temporary hold if one exists. 
*/ -+ error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx); -+ VERIFY(error == 0 || error == ENOENT); -+ -+ VERIFY0(zap_remove(mos, ds->ds_phys->ds_userrefs_obj, holdname, -+ tx)); -+ ds->ds_userrefs--; -+ -+ spa_history_log_internal_ds(ds, "release", tx, -+ "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs); -+ } -+} -+ -+static void -+dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_user_release_arg_t *ddura = arg; -+ dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ for (pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds, -+ pair)) { -+ dsl_dataset_t *ds; -+ const char *name = nvpair_name(pair); -+ -+ VERIFY0(holdfunc(dp, name, FTAG, &ds)); -+ -+ dsl_dataset_user_release_sync_one(ds, -+ fnvpair_value_nvlist(pair), tx); -+ if (nvlist_exists(ddura->ddura_todelete, name)) { -+ ASSERT(ds->ds_userrefs == 0 && -+ ds->ds_phys->ds_num_children == 1 && -+ DS_IS_DEFER_DESTROY(ds)); -+ dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); -+ } -+ dsl_dataset_rele(ds, FTAG); -+ } -+} -+ -+/* -+ * The full semantics of this function are described in the comment above -+ * lzc_release(). -+ * -+ * To summarize: -+ * Releases holds specified in the nvl holds. -+ * -+ * holds is nvl of snapname -> { holdname, ... } -+ * errlist will be filled in with snapname -> error -+ * -+ * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots, -+ * otherwise they should be the names of shapshots. -+ * -+ * As a release may cause snapshots to be destroyed this trys to ensure they -+ * aren't mounted. -+ * -+ * The release of non-existent holds are skipped. -+ * -+ * At least one hold must have been released for the this function to succeed -+ * and return 0. -+ */ -+static int -+dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, -+ dsl_pool_t *tmpdp) -+{ -+ dsl_dataset_user_release_arg_t ddura; -+ nvpair_t *pair; -+ char *pool; -+ int error; -+ -+ pair = nvlist_next_nvpair(holds, NULL); -+ if (pair == NULL) -+ return (0); -+ -+ /* -+ * The release may cause snapshots to be destroyed; make sure they -+ * are not mounted. -+ */ -+ if (tmpdp != NULL) { -+ /* Temporary holds are specified by dsobj string. */ -+ ddura.ddura_holdfunc = dsl_dataset_hold_obj_string; -+ pool = spa_name(tmpdp->dp_spa); -+#ifdef _KERNEL -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ dsl_dataset_t *ds; -+ -+ dsl_pool_config_enter(tmpdp, FTAG); -+ error = dsl_dataset_hold_obj_string(tmpdp, -+ nvpair_name(pair), FTAG, &ds); -+ if (error == 0) { -+ char name[MAXNAMELEN]; -+ dsl_dataset_name(ds, name); -+ dsl_pool_config_exit(tmpdp, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+ (void) zfs_unmount_snap(name); -+ } else { -+ dsl_pool_config_exit(tmpdp, FTAG); -+ } -+ } -+#endif -+ } else { -+ /* Non-temporary holds are specified by name. 
*/ -+ ddura.ddura_holdfunc = dsl_dataset_hold; -+ pool = nvpair_name(pair); -+#ifdef _KERNEL -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ (void) zfs_unmount_snap(nvpair_name(pair)); -+ } -+#endif -+ } -+ -+ ddura.ddura_holds = holds; -+ ddura.ddura_errlist = errlist; -+ VERIFY0(nvlist_alloc(&ddura.ddura_todelete, NV_UNIQUE_NAME, -+ KM_PUSHPAGE)); -+ VERIFY0(nvlist_alloc(&ddura.ddura_chkholds, NV_UNIQUE_NAME, -+ KM_PUSHPAGE)); -+ -+ error = dsl_sync_task(pool, dsl_dataset_user_release_check, -+ dsl_dataset_user_release_sync, &ddura, -+ fnvlist_num_pairs(holds)); -+ fnvlist_free(ddura.ddura_todelete); -+ fnvlist_free(ddura.ddura_chkholds); -+ -+ return (error); -+} -+ -+/* -+ * holds is nvl of snapname -> { holdname, ... } -+ * errlist will be filled in with snapname -> error -+ */ -+int -+dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) -+{ -+ return (dsl_dataset_user_release_impl(holds, errlist, NULL)); -+} -+ -+/* -+ * holds is nvl of snapdsobj -> { holdname, ... } -+ */ -+void -+dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds) -+{ -+ ASSERT(dp != NULL); -+ (void) dsl_dataset_user_release_impl(holds, NULL, dp); -+} -+ -+int -+dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) -+{ -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds; -+ int err; -+ -+ err = dsl_pool_hold(dsname, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (err); -+ } -+ -+ if (ds->ds_phys->ds_userrefs_obj != 0) { -+ zap_attribute_t *za; -+ zap_cursor_t zc; -+ -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); -+ for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, -+ ds->ds_phys->ds_userrefs_obj); -+ zap_cursor_retrieve(&zc, za) == 0; -+ zap_cursor_advance(&zc)) { -+ fnvlist_add_uint64(nvl, za->za_name, -+ za->za_first_integer); -+ } -+ zap_cursor_fini(&zc); -+ kmem_free(za, sizeof (zap_attribute_t)); -+ } -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (0); -+} -diff --git a/module/zfs/fm.c b/module/zfs/fm.c -index c004032..246b3d2 100644 ---- a/module/zfs/fm.c -+++ b/module/zfs/fm.c -@@ -86,2 +86,10 @@ static int zevent_flags = 0; - -+/* -+ * The EID (Event IDentifier) is used to uniquely tag a zevent when it is -+ * posted. The posted EIDs are monotonically increasing but not persistent. -+ * They will be reset to the initial value (1) each time the kernel module is -+ * loaded. 
-+ */ -+static uint64_t zevent_eid = 0; -+ - static kmutex_t zevent_lock; -@@ -278,4 +286,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -292,4 +300,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -306,4 +314,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -320,4 +328,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -334,4 +342,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -348,4 +356,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -362,4 +370,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -376,4 +384,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -420,7 +428,7 @@ zfs_zevent_alloc(void) - -- ev = kmem_zalloc(sizeof(zevent_t), KM_PUSHPAGE); -+ ev = kmem_zalloc(sizeof (zevent_t), KM_PUSHPAGE); - if (ev == NULL) -- return NULL; -+ return (NULL); - -- list_create(&ev->ev_ze_list, sizeof(zfs_zevent_t), -+ list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t), - offsetof(zfs_zevent_t, ze_node)); -@@ -428,3 +436,3 @@ zfs_zevent_alloc(void) - -- return ev; -+ return (ev); - } -@@ -438,3 +446,3 @@ zfs_zevent_free(zevent_t *ev) - list_destroy(&ev->ev_ze_list); -- kmem_free(ev, sizeof(zevent_t)); -+ kmem_free(ev, sizeof (zevent_t)); - } -@@ -500,2 +508,3 @@ zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) - timestruc_t tv; -+ uint64_t eid; - size_t nvl_size = 0; -@@ -511,2 +520,8 @@ zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) - -+ eid = atomic_inc_64_nv(&zevent_eid); -+ if (nvlist_add_uint64(nvl, FM_EREPORT_EID, eid)) { -+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); -+ return; -+ } -+ - (void) nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE); -@@ -526,5 +541,6 @@ zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) - -- ev->ev_nvl = nvl; -+ ev->ev_nvl = nvl; - ev->ev_detector = detector; - ev->ev_cb = cb; -+ ev->ev_eid = eid; - -@@ -552,8 +568,8 @@ zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze) - -- fp = getf(fd); -- if (fp == NULL) -- return (EBADF); -+ fp = getf(fd); -+ if (fp == NULL) -+ return (EBADF); - -- *minorp = zfsdev_getminor(fp->f_file); -- error = 
zfs_zevent_minor_to_state(*minorp, ze); -+ *minorp = zfsdev_getminor(fp->f_file); -+ error = zfs_zevent_minor_to_state(*minorp, ze); - -@@ -579,3 +595,3 @@ int - zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, -- uint64_t *dropped) -+ uint64_t *dropped) - { -@@ -594,4 +610,6 @@ zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, - } else { -- /* Existing stream continue with the next element and remove -- * ourselves from the wait queue for the previous element */ -+ /* -+ * Existing stream continue with the next element and remove -+ * ourselves from the wait queue for the previous element -+ */ - ev = list_prev(&zevent_list, ze->ze_zevent); -@@ -621,3 +639,3 @@ out: - -- return error; -+ return (error); - } -@@ -645,3 +663,64 @@ out: - -- return error; -+ return (error); -+} -+ -+/* -+ * The caller may seek to a specific EID by passing that EID. If the EID -+ * is still available in the posted list of events the cursor is positioned -+ * there. Otherwise ENOENT is returned and the cursor is not moved. -+ * -+ * There are two reserved EIDs which may be passed and will never fail. -+ * ZEVENT_SEEK_START positions the cursor at the start of the list, and -+ * ZEVENT_SEEK_END positions the cursor at the end of the list. -+ */ -+int -+zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid) -+{ -+ zevent_t *ev; -+ int error = 0; -+ -+ mutex_enter(&zevent_lock); -+ -+ if (eid == ZEVENT_SEEK_START) { -+ if (ze->ze_zevent) -+ list_remove(&ze->ze_zevent->ev_ze_list, ze); -+ -+ ze->ze_zevent = NULL; -+ goto out; -+ } -+ -+ if (eid == ZEVENT_SEEK_END) { -+ if (ze->ze_zevent) -+ list_remove(&ze->ze_zevent->ev_ze_list, ze); -+ -+ ev = list_head(&zevent_list); -+ if (ev) { -+ ze->ze_zevent = ev; -+ list_insert_head(&ev->ev_ze_list, ze); -+ } else { -+ ze->ze_zevent = NULL; -+ } -+ -+ goto out; -+ } -+ -+ for (ev = list_tail(&zevent_list); ev != NULL; -+ ev = list_prev(&zevent_list, ev)) { -+ if (ev->ev_eid == eid) { -+ if (ze->ze_zevent) -+ list_remove(&ze->ze_zevent->ev_ze_list, ze); -+ -+ ze->ze_zevent = ev; -+ list_insert_head(&ev->ev_ze_list, ze); -+ break; -+ } -+ } -+ -+ if (ev == NULL) -+ error = ENOENT; -+ -+out: -+ mutex_exit(&zevent_lock); -+ -+ return (error); - } -@@ -1514,3 +1593,4 @@ fm_init(void) - mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL); -- list_create(&zevent_list, sizeof(zevent_t), offsetof(zevent_t, ev_node)); -+ list_create(&zevent_list, sizeof (zevent_t), -+ offsetof(zevent_t, ev_node)); - cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); -diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c -index 155404e..011fb91 100644 ---- a/module/zfs/gzip.c -+++ b/module/zfs/gzip.c -@@ -37,4 +37,4 @@ - typedef size_t zlen_t; --#define compress_func z_compress_level --#define uncompress_func z_uncompress -+#define compress_func z_compress_level -+#define uncompress_func z_uncompress - -@@ -46,4 +46,4 @@ typedef size_t zlen_t; - typedef uLongf zlen_t; --#define compress_func compress2 --#define uncompress_func uncompress -+#define compress_func compress2 -+#define uncompress_func uncompress - -diff --git a/module/zfs/lz4.c b/module/zfs/lz4.c -index 8afaad1..df96373 100644 ---- a/module/zfs/lz4.c -+++ b/module/zfs/lz4.c -@@ -49,3 +49,4 @@ static kmem_cache_t *lz4_cache; - size_t --lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -+lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, -+ size_t d_len, int n) - { -@@ -76,3 +77,4 @@ lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - int 
--lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -+lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, -+ size_t d_len, int n) - { -@@ -145,4 +147,4 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated -- * by the caller (either on the stack or using kmem_cache_alloc). Passing NULL -- * isn't valid. -+ * by the caller (either on the stack or using kmem_cache_alloc). Passing -+ * NULL isn't valid. - * -@@ -153,4 +155,4 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated -- * by the caller (either on the stack or using kmem_cache_alloc). Passing NULL -- * isn't valid. -+ * by the caller (either on the stack or using kmem_cache_alloc). Passing -+ * NULL isn't valid. - */ -@@ -238,2 +240,5 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - #undef LZ4_FORCE_SW_BITCOUNT -+#if defined(__sparc) -+#define LZ4_FORCE_SW_BITCOUNT -+#endif - -@@ -269,3 +274,3 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - --#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ -+#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ - (((x) & 0xffu) << 8))) -@@ -1011,2 +1016 @@ lz4_fini(void) - } -- -diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c -index 43d0df0..83ff409 100644 ---- a/module/zfs/lzjb.c -+++ b/module/zfs/lzjb.c -@@ -52,3 +52,4 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - uchar_t *dst = d_start; -- uchar_t *cpy, *copymap = NULL; -+ uchar_t *cpy; -+ uchar_t *copymap = NULL; - int copymask = 1 << (NBBY - 1); -@@ -62,3 +63,4 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) { -- kmem_free(lempel, LEMPEL_SIZE*sizeof(uint16_t)); -+ kmem_free(lempel, -+ LEMPEL_SIZE*sizeof (uint16_t)); - return (s_len); -@@ -106,3 +108,4 @@ lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - uchar_t *d_end = (uchar_t *)d_start + d_len; -- uchar_t *cpy, copymap = 0; -+ uchar_t *cpy; -+ uchar_t copymap = 0; - int copymask = 1 << (NBBY - 1); -diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c -index cd1b6ce..6356f79 100644 ---- a/module/zfs/metaslab.c -+++ b/module/zfs/metaslab.c -@@ -22,3 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - */ -@@ -33,3 +34,3 @@ - --#define WITH_DF_BLOCK_ALLOCATOR -+#define WITH_DF_BLOCK_ALLOCATOR - -@@ -61,5 +62,21 @@ int zfs_condense_pct = 200; - * If a device reaches this threshold in a given txg then we consider skipping -- * allocations on that device. -+ * allocations on that device. The value of zfs_mg_alloc_failures is computed -+ * in zio_init() unless it has been overridden in /etc/system. - */ --int zfs_mg_alloc_failures; -+int zfs_mg_alloc_failures = 0; -+ -+/* -+ * The zfs_mg_noalloc_threshold defines which metaslab groups should -+ * be eligible for allocation. The value is defined as a percentage of -+ * a free space. Metaslab groups that have more free space than -+ * zfs_mg_noalloc_threshold are always eligible for allocations. 
Once -+ * a metaslab group's free space is less than or equal to the -+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that -+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. -+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all -+ * groups are allowed to accept allocations. Gang blocks are always -+ * eligible to allocate on any metaslab group. The default value of 0 means -+ * no metaslab group will be excluded based on this criterion. -+ */ -+int zfs_mg_noalloc_threshold = 0; - -@@ -103,2 +120,7 @@ int metaslab_smo_bonus_pct = 150; - /* -+ * Should we be willing to write data to degraded vdevs? -+ */ -+boolean_t zfs_write_to_degraded = B_FALSE; -+ -+/* - * ========================================================================== -@@ -224,2 +246,49 @@ metaslab_compare(const void *x1, const void *x2) - -+/* -+ * Update the allocatable flag and the metaslab group's capacity. -+ * The allocatable flag is set to true if the capacity is below -+ * the zfs_mg_noalloc_threshold. If a metaslab group transitions -+ * from allocatable to non-allocatable or vice versa then the metaslab -+ * group's class is updated to reflect the transition. -+ */ -+static void -+metaslab_group_alloc_update(metaslab_group_t *mg) -+{ -+ vdev_t *vd = mg->mg_vd; -+ metaslab_class_t *mc = mg->mg_class; -+ vdev_stat_t *vs = &vd->vdev_stat; -+ boolean_t was_allocatable; -+ -+ ASSERT(vd == vd->vdev_top); -+ -+ mutex_enter(&mg->mg_lock); -+ was_allocatable = mg->mg_allocatable; -+ -+ mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / -+ (vs->vs_space + 1); -+ -+ mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); -+ -+ /* -+ * The mc_alloc_groups maintains a count of the number of -+ * groups in this metaslab class that are still above the -+ * zfs_mg_noalloc_threshold. This is used by the allocating -+ * threads to determine if they should avoid allocations to -+ * a given group. The allocator will avoid allocations to a group -+ * if that group has reached or is below the zfs_mg_noalloc_threshold -+ * and there are still other groups that are above the threshold. -+ * When a group transitions from allocatable to non-allocatable or -+ * vice versa we update the metaslab class to reflect that change. -+ * When the mc_alloc_groups value drops to 0 that means that all -+ * groups have reached the zfs_mg_noalloc_threshold making all groups -+ * eligible for allocations. This effectively means that all devices -+ * are balanced again. -+ */ -+ if (was_allocatable && !mg->mg_allocatable) -+ mc->mc_alloc_groups--; -+ else if (!was_allocatable && mg->mg_allocatable) -+ mc->mc_alloc_groups++; -+ mutex_exit(&mg->mg_lock); -+} -+ - metaslab_group_t * -@@ -274,2 +343,3 @@ metaslab_group_activate(metaslab_group_t *mg) - mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); -+ metaslab_group_alloc_update(mg); - -@@ -359,2 +429,25 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) - /* -+ * Determine if a given metaslab group should skip allocations. A metaslab -+ * group should avoid allocations if its used capacity has crossed the -+ * zfs_mg_noalloc_threshold and there is at least one metaslab group -+ * that can still handle allocations. 
-+ */ -+static boolean_t -+metaslab_group_allocatable(metaslab_group_t *mg) -+{ -+ vdev_t *vd = mg->mg_vd; -+ spa_t *spa = vd->vdev_spa; -+ metaslab_class_t *mc = mg->mg_class; -+ -+ /* -+ * A metaslab group is considered allocatable if its free capacity -+ * is greater than the set value of zfs_mg_noalloc_threshold, it's -+ * associated with a slog, or there are no other metaslab groups -+ * with free capacity greater than zfs_mg_noalloc_threshold. -+ */ -+ return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || -+ mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); -+} -+ -+/* - * ========================================================================== -@@ -833,2 +926,12 @@ metaslab_weight(metaslab_t *msp) - /* -+ * This vdev is in the process of being removed so there is nothing -+ * for us to do here. -+ */ -+ if (vd->vdev_removing) { -+ ASSERT0(smo->smo_alloc); -+ ASSERT0(vd->vdev_ms_shift); -+ return (0); -+ } -+ -+ /* - * The baseline weight is the metaslab's free space. -@@ -1214,4 +1317,4 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - space_map_t *sm = msp->ms_map; -- space_map_t *freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; -- space_map_t *defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; -+ space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; -+ space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; - metaslab_group_t *mg = msp->ms_group; -@@ -1229,4 +1332,4 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - */ -- if (freed_map == NULL) { -- ASSERT(defer_map == NULL); -+ if (*freed_map == NULL) { -+ ASSERT(*defer_map == NULL); - for (t = 0; t < TXG_SIZE; t++) { -@@ -1249,4 +1352,4 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - -- freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; -- defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; -+ freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; -+ defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; - -@@ -1256,3 +1359,3 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - alloc_delta = smosync->smo_alloc - smo->smo_alloc; -- defer_delta = freed_map->sm_space - defer_map->sm_space; -+ defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space; - -@@ -1266,8 +1369,14 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - * so that we have a consistent view of the in-core space map. -- * Then, add defer_map (oldest deferred frees) to this map and -- * transfer freed_map (this txg's frees) to defer_map. - */ - space_map_load_wait(sm); -- space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); -- space_map_vacate(freed_map, space_map_add, defer_map); -+ -+ /* -+ * Move the frees from the defer_map to this map (if it's loaded). -+ * Swap the freed_map and the defer_map -- this is safe to do -+ * because we've just emptied out the defer_map. -+ */ -+ space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm); -+ ASSERT0((*defer_map)->sm_space); -+ ASSERT0(avl_numnodes(&(*defer_map)->sm_root)); -+ space_map_swap(freed_map, defer_map); - -@@ -1286,2 +1395,4 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - -+ metaslab_group_alloc_update(mg); -+ - /* -@@ -1390,2 +1501,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, - } -+ -+ /* -+ * If the selected metaslab is condensing, skip it. 
-+ */ -+ if (msp->ms_map->sm_condensing) -+ continue; -+ - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; -@@ -1408,2 +1526,4 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, - -+ mutex_enter(&msp->ms_lock); -+ - /* -@@ -1424,15 +1544,4 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, - mg->mg_alloc_failures); -- return (-1ULL); -- } -- -- mutex_enter(&msp->ms_lock); -- -- /* -- * If this metaslab is currently condensing then pick again as -- * we can't manipulate this metaslab until it's committed -- * to disk. -- */ -- if (msp->ms_map->sm_condensing) { - mutex_exit(&msp->ms_lock); -- continue; -+ return (-1ULL); - } -@@ -1465,2 +1574,12 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, - -+ /* -+ * If this metaslab is currently condensing then pick again as -+ * we can't manipulate this metaslab until it's committed -+ * to disk. -+ */ -+ if (msp->ms_map->sm_condensing) { -+ mutex_exit(&msp->ms_lock); -+ continue; -+ } -+ - if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL) -@@ -1508,3 +1627,3 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - -@@ -1593,2 +1712,17 @@ top: - } -+ -+ /* -+ * Determine if the selected metaslab group is eligible -+ * for allocations. If we're ganging or have requested -+ * an allocation for the smallest gang block size -+ * then we don't want to avoid allocating to this -+ * metaslab group. If we're in this condition we should -+ * try to allocate from any device possible so that we -+ * don't inadvertently return ENOSPC and suspend the pool -+ * even though space is still available. -+ */ -+ if (allocatable && CAN_FASTGANG(flags) && -+ psize > SPA_GANGBLOCKSIZE) -+ allocatable = metaslab_group_allocatable(mg); -+ - if (!allocatable) -@@ -1598,2 +1732,3 @@ top: - * Avoid writing single-copy data to a failing vdev -+ * unless the user instructs us that it is okay.
- */ -@@ -1601,3 +1736,5 @@ top: - vd->vdev_state < VDEV_STATE_HEALTHY) && -- d == 0 && dshift == 3) { -+ d == 0 && dshift == 3 && -+ !(zfs_write_to_degraded && vd->vdev_state == -+ VDEV_STATE_DEGRADED)) { - all_zero = B_FALSE; -@@ -1691,3 +1828,4 @@ next: - mutex_exit(&mc->mc_fastwrite_lock); -- return (ENOSPC); -+ -+ return (SET_ERROR(ENOSPC)); - } -@@ -1760,3 +1898,3 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) - (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - -@@ -1773,3 +1911,3 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) - if (error == 0 && !space_map_contains(msp->ms_map, offset, size)) -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - -@@ -1808,3 +1946,3 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - spa_config_exit(spa, SCL_ALLOC, FTAG); -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - } -@@ -1885,3 +2023,4 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) - --void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) -+void -+metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) - { -@@ -1907,3 +2046,4 @@ void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) - --void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) -+void -+metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) - { -@@ -1930,2 +2070,42 @@ void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) - -+static void -+checkmap(space_map_t *sm, uint64_t off, uint64_t size) -+{ -+ space_seg_t *ss; -+ avl_index_t where; -+ -+ mutex_enter(sm->sm_lock); -+ ss = space_map_find(sm, off, size, &where); -+ if (ss != NULL) -+ panic("freeing free block; ss=%p", (void *)ss); -+ mutex_exit(sm->sm_lock); -+} -+ -+void -+metaslab_check_free(spa_t *spa, const blkptr_t *bp) -+{ -+ int i, j; -+ -+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) -+ return; -+ -+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); -+ for (i = 0; i < BP_GET_NDVAS(bp); i++) { -+ uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]); -+ vdev_t *vd = vdev_lookup_top(spa, vdid); -+ uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]); -+ uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); -+ metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift]; -+ -+ if (ms->ms_map->sm_loaded) -+ checkmap(ms->ms_map, off, size); -+ -+ for (j = 0; j < TXG_SIZE; j++) -+ checkmap(ms->ms_freemap[j], off, size); -+ for (j = 0; j < TXG_DEFER_SIZE; j++) -+ checkmap(ms->ms_defermap[j], off, size); -+ } -+ spa_config_exit(spa, SCL_VDEV, FTAG); -+} -+ - #if defined(_KERNEL) && defined(HAVE_SPL) -diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c -index e43807c..49980ef 100644 ---- a/module/zfs/refcount.c -+++ b/module/zfs/refcount.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. 
- */ -@@ -34,3 +35,3 @@ int reference_tracking_enable = TRUE; - #endif --int reference_history = 4; /* tunable */ -+int reference_history = 3; /* tunable */ - -@@ -66,2 +67,10 @@ refcount_create(refcount_t *rc) - rc->rc_removed_count = 0; -+ rc->rc_tracked = reference_tracking_enable; -+} -+ -+void -+refcount_create_untracked(refcount_t *rc) -+{ -+ refcount_create(rc); -+ rc->rc_tracked = B_FALSE; - } -@@ -98,3 +107,2 @@ refcount_is_zero(refcount_t *rc) - { -- ASSERT(rc->rc_count >= 0); - return (rc->rc_count == 0); -@@ -105,3 +113,2 @@ refcount_count(refcount_t *rc) - { -- ASSERT(rc->rc_count >= 0); - return (rc->rc_count); -@@ -115,3 +122,3 @@ refcount_add_many(refcount_t *rc, uint64_t number, void *holder) - -- if (reference_tracking_enable) { -+ if (rc->rc_tracked) { - ref = kmem_cache_alloc(reference_cache, KM_PUSHPAGE); -@@ -122,3 +129,3 @@ refcount_add_many(refcount_t *rc, uint64_t number, void *holder) - ASSERT(rc->rc_count >= 0); -- if (reference_tracking_enable) -+ if (rc->rc_tracked) - list_insert_head(&rc->rc_list, ref); -@@ -146,3 +153,3 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) - -- if (!reference_tracking_enable) { -+ if (!rc->rc_tracked) { - rc->rc_count -= number; -@@ -163,3 +170,3 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) - rc->rc_removed_count++; -- if (rc->rc_removed_count >= reference_history) { -+ if (rc->rc_removed_count > reference_history) { - ref = list_tail(&rc->rc_removed); -diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c -index 4cef53f..357afbf 100644 ---- a/module/zfs/rrwlock.c -+++ b/module/zfs/rrwlock.c -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ */ - -@@ -74,4 +77,5 @@ uint_t rrw_tsd_key; - typedef struct rrw_node { -- struct rrw_node *rn_next; -- rrwlock_t *rn_rrl; -+ struct rrw_node *rn_next; -+ rrwlock_t *rn_rrl; -+ void *rn_tag; - } rrw_node_t; -@@ -97,3 +101,3 @@ rrn_find(rrwlock_t *rrl) - static void --rrn_add(rrwlock_t *rrl) -+rrn_add(rrwlock_t *rrl, void *tag) - { -@@ -101,5 +105,6 @@ rrn_add(rrwlock_t *rrl) - -- rn = kmem_alloc(sizeof (*rn), KM_SLEEP); -+ rn = kmem_alloc(sizeof (*rn), KM_PUSHPAGE); - rn->rn_rrl = rrl; - rn->rn_next = tsd_get(rrw_tsd_key); -+ rn->rn_tag = tag; - VERIFY(tsd_set(rrw_tsd_key, rn) == 0); -@@ -112,3 +117,3 @@ rrn_add(rrwlock_t *rrl) - static boolean_t --rrn_find_and_remove(rrwlock_t *rrl) -+rrn_find_and_remove(rrwlock_t *rrl, void *tag) - { -@@ -121,3 +126,3 @@ rrn_find_and_remove(rrwlock_t *rrl) - for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { -- if (rn->rn_rrl == rrl) { -+ if (rn->rn_rrl == rrl && rn->rn_tag == tag) { - if (prev) -@@ -135,3 +140,3 @@ rrn_find_and_remove(rrwlock_t *rrl) - void --rrw_init(rrwlock_t *rrl) -+rrw_init(rrwlock_t *rrl, boolean_t track_all) - { -@@ -143,2 +148,3 @@ rrw_init(rrwlock_t *rrl) - rrl->rr_writer_wanted = B_FALSE; -+ rrl->rr_track_all = track_all; - } -@@ -155,3 +161,3 @@ rrw_destroy(rrwlock_t *rrl) - --static void -+void - rrw_enter_read(rrwlock_t *rrl, void *tag) -@@ -160,3 +166,4 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) - #if !defined(DEBUG) && defined(_KERNEL) -- if (!rrl->rr_writer && !rrl->rr_writer_wanted) { -+ if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && -+ !rrl->rr_track_all) { - rrl->rr_anon_rcount.rc_count++; -@@ -170,3 +177,3 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) - -- while (rrl->rr_writer || (rrl->rr_writer_wanted && -+ while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && - refcount_is_zero(&rrl->rr_anon_rcount) && 
-@@ -175,5 +182,5 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) - -- if (rrl->rr_writer_wanted) { -+ if (rrl->rr_writer_wanted || rrl->rr_track_all) { - /* may or may not be a re-entrant enter */ -- rrn_add(rrl); -+ rrn_add(rrl, tag); - (void) refcount_add(&rrl->rr_linked_rcount, tag); -@@ -186,3 +193,3 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) - --static void -+void - rrw_enter_write(rrwlock_t *rrl) -@@ -232,6 +239,8 @@ rrw_exit(rrwlock_t *rrl, void *tag) - int64_t count; -- if (rrn_find_and_remove(rrl)) -+ if (rrn_find_and_remove(rrl, tag)) { - count = refcount_remove(&rrl->rr_linked_rcount, tag); -- else -+ } else { -+ ASSERT(!rrl->rr_track_all); - count = refcount_remove(&rrl->rr_anon_rcount, tag); -+ } - if (count == 0) -@@ -248,2 +257,7 @@ rrw_exit(rrwlock_t *rrl, void *tag) - -+/* -+ * If the lock was created with track_all, rrw_held(RW_READER) will return -+ * B_TRUE iff the current thread has the lock for reader. Otherwise it may -+ * return B_TRUE if any thread has the lock for reader. -+ */ - boolean_t -@@ -258,3 +272,3 @@ rrw_held(rrwlock_t *rrl, krw_t rw) - held = (!refcount_is_zero(&rrl->rr_anon_rcount) || -- !refcount_is_zero(&rrl->rr_linked_rcount)); -+ rrn_find(rrl) != NULL); - } -@@ -264 +278,11 @@ rrw_held(rrwlock_t *rrl, krw_t rw) - } -+ -+void -+rrw_tsd_destroy(void *arg) -+{ -+ rrw_node_t *rn = arg; -+ if (rn != NULL) { -+ panic("thread %p terminating with rrw lock %p held", -+ (void *)curthread, (void *)rn->rn_rrl); -+ } -+} -diff --git a/module/zfs/sa.c b/module/zfs/sa.c -index 581cf4b..fcc5f3b 100644 ---- a/module/zfs/sa.c -+++ b/module/zfs/sa.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -112,2 +112,3 @@ - * Byteswap implications: -+ * - * Since the SA attributes are not entirely self describing we can't do -@@ -190,3 +191,2 @@ sa_attr_reg_t sa_legacy_attrs[] = { - /* -- * ZPL legacy layout - * This is only used for objects of type DMU_OT_ZNODE -@@ -200,3 +200,2 @@ sa_attr_type_t sa_legacy_zpl_layout[] = { - */ -- - sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; -@@ -254,3 +253,3 @@ sa_spill_alloc(int flags) - { -- return kmem_cache_alloc(spill_cache, flags); -+ return (kmem_cache_alloc(spill_cache, flags)); - } -@@ -392,3 +391,3 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, - if (bulk[i].sa_addr == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - if (bulk[i].sa_data) { -@@ -524,3 +523,3 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) - ASSERT(0); -- return (EFBIG); -+ return (SET_ERROR(EFBIG)); - } else { -@@ -574,6 +573,5 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - int i; -- int j = -1; - int full_space; - int hdrsize; -- boolean_t done = B_FALSE; -+ int extra_hdrsize; - -@@ -588,6 +586,5 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - *total = 0; -+ *will_spill = B_FALSE; - -- if (buftype == SA_BONUS) -- *will_spill = B_FALSE; -- -+ extra_hdrsize = 0; - hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 
0 : -@@ -603,4 +600,4 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - *total += attr_desc[i].sa_length; -- if (done) -- goto next; -+ if (*will_spill) -+ continue; - -@@ -612,3 +609,8 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - if (is_var_sz && var_size > 1) { -- if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + -+ /* -+ * Don't worry that the spill block might overflow. -+ * It will be resized if needed in sa_build_layouts(). -+ */ -+ if (buftype == SA_SPILL || -+ P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + - *total < full_space) { -@@ -617,12 +619,14 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - * optional sizes of variable-length attributes. -- * Record the index in case this increase needs -- * to be reversed due to spill-over. -+ * Record the extra header size in case this -+ * increase needs to be reversed due to -+ * spill-over. - */ - hdrsize += sizeof (uint16_t); -- j = i; -+ if (*index != -1) -+ extra_hdrsize += sizeof (uint16_t); - } else { -- done = B_TRUE; -- *index = i; -- if (buftype == SA_BONUS) -- *will_spill = B_TRUE; -+ ASSERT(buftype == SA_BONUS); -+ if (*index == -1) -+ *index = i; -+ *will_spill = B_TRUE; - continue; -@@ -641,6 +645,4 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - *index = i; -- done = B_TRUE; - } - --next: - if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && -@@ -650,9 +652,4 @@ next: - -- /* -- * j holds the index of the last variable-sized attribute for -- * which hdrsize was increased. Reverse the increase if that -- * attribute will be relocated to the spill block. -- */ -- if (*will_spill && j == *index) -- hdrsize -= sizeof (uint16_t); -+ if (*will_spill) -+ hdrsize -= extra_hdrsize; - -@@ -681,3 +678,4 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, - int i, lot_count; -- int hdrsize, spillhdrsize = 0; -+ int hdrsize; -+ int spillhdrsize = 0; - int used; -@@ -697,3 +695,3 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, - if (used > SPA_MAXBLOCKSIZE) -- return (EFBIG); -+ return (SET_ERROR(EFBIG)); - -@@ -721,3 +719,3 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, - if (spill_used > SPA_MAXBLOCKSIZE) -- return (EFBIG); -+ return (SET_ERROR(EFBIG)); - -@@ -878,3 +876,3 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) - if (error == 0) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto bail; -@@ -909,3 +907,3 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) - else -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - switch (error) { -@@ -1021,6 +1019,6 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - -- mutex_enter(&os->os_lock); -+ mutex_enter(&os->os_user_ptr_lock); - if (os->os_sa) { - mutex_enter(&os->os_sa->sa_lock); -- mutex_exit(&os->os_lock); -+ mutex_exit(&os->os_user_ptr_lock); - tb = os->os_sa->sa_user_table; -@@ -1037,3 +1035,3 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - mutex_enter(&sa->sa_lock); -- mutex_exit(&os->os_lock); -+ mutex_exit(&os->os_user_ptr_lock); - avl_create(&sa->sa_layout_num_tree, layout_num_compare, -@@ -1068,3 +1066,3 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - if (error == 0) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto fail; -@@ -1147,3 +1145,4 @@ sa_tear_down(objset_t *os) - cookie = NULL; -- while ((layout = 
avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))){ -+ while ((layout = -+ avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) { - sa_idx_tab_t *tab; -@@ -1156,3 +1155,3 @@ sa_tear_down(objset_t *os) - cookie = NULL; -- while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))){ -+ while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) { - kmem_free(layout->lot_attrs, -@@ -1735,3 +1734,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - for (; k != 2; k++) { -- /* iterate over each attribute in layout */ -+ /* -+ * Iterate over each attribute in layout. Fetch the -+ * size of variable-length attributes needing rewrite -+ * from sa_lengths[]. -+ */ - for (i = 0, length_idx = 0; i != count; i++) { -@@ -1740,3 +1743,6 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - attr = idx_tab->sa_layout->lot_attrs[i]; -+ length = SA_REGISTERED_LEN(sa, attr); - if (attr == newattr) { -+ if (length == 0) -+ ++length_idx; - if (action == SA_REMOVE) { -@@ -1745,3 +1751,3 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - } -- ASSERT(SA_REGISTERED_LEN(sa, attr) == 0); -+ ASSERT(length == 0); - ASSERT(action == SA_REPLACE); -@@ -1750,6 +1756,4 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - } else { -- length = SA_REGISTERED_LEN(sa, attr); -- if (length == 0) { -+ if (length == 0) - length = hdr->sa_lengths[length_idx++]; -- } - -@@ -1775,3 +1779,3 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, -- datastart, buflen); -+ datastart, length); - } -diff --git a/module/zfs/spa.c b/module/zfs/spa.c -index 65f78b7..af93b7c 100644 ---- a/module/zfs/spa.c -+++ b/module/zfs/spa.c -@@ -23,4 +23,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -28,2 +28,4 @@ - /* -+ * SPA: Storage Pool Allocator -+ * - * This file contains all the routines used when modifying on-disk SPA state. 
-@@ -66,2 +68,3 @@ - #include -+#include - #include -@@ -82,3 +85,2 @@ typedef enum zti_modes { - ZTI_MODE_FIXED, /* value is # of threads (min 1) */ -- ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ -@@ -133,6 +135,4 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { - --static dsl_syncfunc_t spa_sync_version; --static dsl_syncfunc_t spa_sync_props; --static dsl_checkfunc_t spa_change_guid_check; --static dsl_syncfunc_t spa_change_guid_sync; -+static void spa_sync_version(void *arg, dmu_tx_t *tx); -+static void spa_sync_props(void *arg, dmu_tx_t *tx); - static boolean_t spa_has_active_shared_spare(spa_t *spa); -@@ -143,3 +143,3 @@ static void spa_vdev_resilver_done(spa_t *spa); - --uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ -+uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ - id_t zio_taskq_psrset_bind = PS_NONE; -@@ -290,3 +290,3 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) - if (err) -- return err; -+ return (err); - -@@ -331,6 +331,6 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) - dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ dsl_pool_config_enter(dp, FTAG); - if ((err = dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &ds))) { -- rw_exit(&dp->dp_config_rwlock); -+ dsl_pool_config_exit(dp, FTAG); - break; -@@ -343,3 +343,3 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) - dsl_dataset_rele(ds, FTAG); -- rw_exit(&dp->dp_config_rwlock); -+ dsl_pool_config_exit(dp, FTAG); - } else { -@@ -408,3 +408,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (!zpool_prop_feature(propname)) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -416,3 +416,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (nvpair_type(elem) != DATA_TYPE_UINT64) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -421,3 +421,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (nvpair_value_uint64(elem, &intval) != 0) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -426,3 +426,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (intval != 0) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -432,3 +432,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (zfeature_lookup_name(fname, NULL) != 0) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -445,3 +445,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - has_feature)) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -454,3 +454,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (!error && intval > 1) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -464,3 +464,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (spa_version(spa) < SPA_VERSION_BOOTFS) { -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - break; -@@ -472,3 +472,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (!vdev_is_bootable(spa->spa_root_vdev)) { -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - break; -@@ -490,3 +490,4 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - -- if ((error = dmu_objset_hold(strval,FTAG,&os))) -+ error = dmu_objset_hold(strval, FTAG, &os); -+ if (error) - break; -@@ -496,8 +497,9 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (dmu_objset_type(os) != DMU_OST_ZFS) { -- error = ENOTSUP; -- } else if ((error = dsl_prop_get_integer(strval, -+ error = SET_ERROR(ENOTSUP); -+ } else if ((error = -+ dsl_prop_get_int_ds(dmu_objset_ds(os), - zfs_prop_to_name(ZFS_PROP_COMPRESSION), -- 
&compress, NULL)) == 0 && -+ &compress)) == 0 && - !BOOTFS_COMPRESS_VALID(compress)) { -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } else { -@@ -513,3 +515,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - intval > ZIO_FAILURE_MODE_PANIC)) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - -@@ -527,3 +529,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - spa->spa_failmode = intval; -- error = EIO; -+ error = SET_ERROR(EIO); - } -@@ -542,3 +544,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (strval[0] != '/') { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -551,3 +553,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - strcmp(slash, "/..") == 0) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -559,3 +561,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (!isprint(*check)) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -565,3 +567,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (strlen(strval) > ZPROP_MAX_COMMENT) -- error = E2BIG; -+ error = SET_ERROR(E2BIG); - break; -@@ -570,3 +572,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (spa_version(spa) < SPA_VERSION_DEDUP) -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - else -@@ -575,3 +577,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - intval != 0 && intval < ZIO_DEDUPDITTO_MIN) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -663,4 +665,4 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) - */ -- error = dsl_sync_task_do(spa_get_dsl(spa), NULL, -- spa_sync_version, spa, &ver, 6); -+ error = dsl_sync_task(spa->spa_name, NULL, -+ spa_sync_version, &ver, 6); - if (error) -@@ -675,4 +677,4 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) - if (need_sync) { -- return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, -- spa, nvp, 6)); -+ return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, -+ nvp, 6)); - } -@@ -698,8 +700,8 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) - static int --spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_change_guid_check(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *rvd = spa->spa_root_vdev; - uint64_t vdev_state; -- ASSERTV(uint64_t *newguid = arg2); -+ ASSERTV(uint64_t *newguid = arg); - -@@ -710,3 +712,3 @@ spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) - if (vdev_state != VDEV_STATE_HEALTHY) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - -@@ -718,6 +720,6 @@ spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_change_guid_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- uint64_t *newguid = arg2; -+ uint64_t *newguid = arg; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - uint64_t oldguid; -@@ -733,4 +735,4 @@ spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- spa_history_log_internal(LOG_POOL_GUID_CHANGE, spa, tx, -- "old=%lld new=%lld", oldguid, *newguid); -+ spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", -+ oldguid, *newguid); - } -@@ -752,2 +754,3 @@ spa_change_guid(spa_t *spa) - -+ mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); -@@ -755,4 +758,4 @@ spa_change_guid(spa_t *spa) - -- error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, -- spa_change_guid_sync, spa, &guid, 5); -+ error = dsl_sync_task(spa->spa_name, spa_change_guid_check, -+ spa_change_guid_sync, &guid, 5); - -@@ -764,2 +767,3 @@ spa_change_guid(spa_t 
*spa) - mutex_exit(&spa_namespace_lock); -+ mutex_exit(&spa->spa_vdev_top_lock); - -@@ -835,27 +839,23 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) - -- for (i = 0; i < count; i++) { -- taskq_t *tq; -- -- switch (mode) { -- case ZTI_MODE_FIXED: -- ASSERT3U(value, >=, 1); -- value = MAX(value, 1); -- break; -+ switch (mode) { -+ case ZTI_MODE_FIXED: -+ ASSERT3U(value, >=, 1); -+ value = MAX(value, 1); -+ break; - -- case ZTI_MODE_BATCH: -- batch = B_TRUE; -- flags |= TASKQ_THREADS_CPU_PCT; -- value = zio_taskq_batch_pct; -- break; -+ case ZTI_MODE_BATCH: -+ batch = B_TRUE; -+ flags |= TASKQ_THREADS_CPU_PCT; -+ value = zio_taskq_batch_pct; -+ break; - -- case ZTI_MODE_ONLINE_PERCENT: -- flags |= TASKQ_THREADS_CPU_PCT; -- break; -+ default: -+ panic("unrecognized mode for %s_%s taskq (%u:%u) in " -+ "spa_activate()", -+ zio_type_name[t], zio_taskq_types[q], mode, value); -+ break; -+ } - -- default: -- panic("unrecognized mode for %s_%s taskq (%u:%u) in " -- "spa_activate()", -- zio_type_name[t], zio_taskq_types[q], mode, value); -- break; -- } -+ for (i = 0; i < count; i++) { -+ taskq_t *tq; - -@@ -876,3 +876,12 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) - } else { -- tq = taskq_create_proc(name, value, maxclsyspri, 50, -+ pri_t pri = maxclsyspri; -+ /* -+ * The write issue taskq can be extremely CPU -+ * intensive. Run it at slightly lower priority -+ * than the other taskqs. -+ */ -+ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) -+ pri--; -+ -+ tq = taskq_create_proc(name, value, pri, 50, - INT_MAX, spa->spa_proc, flags); -@@ -1201,3 +1210,3 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, - *vdp = NULL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1370,3 +1379,3 @@ spa_load_spares(spa_t *spa) - */ -- spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), -+ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), - KM_PUSHPAGE); -@@ -1445,3 +1454,3 @@ spa_load_l2cache(spa_t *spa) - uint64_t guid; -- vdev_t *vd, **oldvdevs, **newvdevs = NULL; -+ vdev_t *vd, **oldvdevs, **newvdevs; - spa_aux_vdev_t *sav = &spa->spa_l2cache; -@@ -1456,2 +1465,3 @@ spa_load_l2cache(spa_t *spa) - nl2cache = 0; -+ newvdevs = NULL; - } -@@ -1598,3 +1608,4 @@ spa_check_removed(vdev_t *vd) - -- if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { -+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && -+ !vd->vdev_ishole) { - zfs_ereport_post(FM_EREPORT_RESOURCE_AUTOREPLACE, -@@ -1731,5 +1742,7 @@ spa_config_valid(spa_t *spa, nvlist_t *config) - */ --static int -+static boolean_t - spa_check_logs(spa_t *spa) - { -+ boolean_t rv = B_FALSE; -+ - switch (spa->spa_log_state) { -@@ -1740,10 +1753,9 @@ spa_check_logs(spa_t *spa) - case SPA_LOG_UNKNOWN: -- if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, -- DS_FIND_CHILDREN)) { -+ rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, -+ NULL, DS_FIND_CHILDREN) != 0); -+ if (rv) - spa_set_log_state(spa, SPA_LOG_MISSING); -- return (1); -- } - break; - } -- return (0); -+ return (rv); - } -@@ -1795,7 +1807,7 @@ spa_offline_log(spa_t *spa) - { -- int error = 0; -- -- if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, -- NULL, DS_FIND_CHILDREN)) == 0) { -+ int error; - -+ error = dmu_objset_find(spa_name(spa), zil_vdev_offline, -+ NULL, DS_FIND_CHILDREN); -+ if (error == 0) { - /* -@@ -1920,3 +1932,3 @@ spa_load_verify(spa_t *spa) - if (error != ENXIO && error != EIO) -- error = EIO; -+ error = SET_ERROR(EIO); - return (error); -@@ -2048,3 
+2060,3 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2067,3 +2079,3 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, - spa_guid_exists(pool_guid, 0)) { -- error = EEXIST; -+ error = SET_ERROR(EEXIST); - } else { -@@ -2134,3 +2146,3 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2194,3 +2206,3 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -2425,8 +2437,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - cmn_err(CE_WARN, "pool '%s' could not be " -- "loaded as it was last accessed by " -- "another system (host: %s hostid: 0x%lx). " -- "See: http://zfsonlinux.org/msg/ZFS-8000-EY", -+ "loaded as it was last accessed by another " -+ "system (host: %s hostid: 0x%lx). See: " -+ "http://zfsonlinux.org/msg/ZFS-8000-EY", - spa_name(spa), hostname, - (unsigned long)hostid); -- return (EBADF); -+ return (SET_ERROR(EBADF)); - } -@@ -2619,3 +2631,3 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - -@@ -2718,2 +2730,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - /* -+ * Log the fact that we booted up (so that we can detect if -+ * we rebooted in the middle of an operation). -+ */ -+ spa_history_log_version(spa, "open"); -+ -+ /* - * Delete any inconsistent datasets. 
-@@ -2877,3 +2895,3 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, - mutex_exit(&spa_namespace_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -2912,3 +2930,3 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, - mutex_exit(&spa_namespace_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -3248,3 +3266,3 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - if (ndev == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -3255,3 +3273,3 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - if (spa_version(spa) < version) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -3271,3 +3289,3 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - vdev_free(vd); -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -3282,3 +3300,3 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { -- error = ENOTBLK; -+ error = SET_ERROR(ENOTBLK); - vdev_free(vd); -@@ -3401,3 +3419,3 @@ int - spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, -- const char *history_str, nvlist_t *zplprops) -+ nvlist_t *zplprops) - { -@@ -3423,3 +3441,3 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - mutex_exit(&spa_namespace_lock); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -3476,3 +3494,3 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - if (error == 0 && !zfs_allocatable_devs(nvroot)) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - -@@ -3606,3 +3624,3 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - spa_configfile_set(spa, props, B_FALSE); -- spa_sync_props(spa, props, tx); -+ spa_sync_props(props, tx); - } -@@ -3622,5 +3640,3 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - -- if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) -- (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); -- spa_history_log_version(spa, LOG_POOL_CREATE); -+ spa_history_log_version(spa, "create"); - -@@ -3752,3 +3768,3 @@ spa_import_rootpool(char *devpath, char *devid) - devpath); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -3795,3 +3811,3 @@ spa_import_rootpool(char *devpath, char *devid) - (u_longlong_t)guid); -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - goto out; -@@ -3807,3 +3823,3 @@ spa_import_rootpool(char *devpath, char *devid) - "try booting from '%s'", avd->vdev_path); -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -3821,3 +3837,3 @@ spa_import_rootpool(char *devpath, char *devid) - vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -3826,3 +3842,2 @@ spa_import_rootpool(char *devpath, char *devid) - error = 0; -- spa_history_log_version(spa, LOG_POOL_IMPORT); - out: -@@ -3843,3 +3858,3 @@ out: - int --spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) -+spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) - { -@@ -3862,3 +3877,3 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) - mutex_exit(&spa_namespace_lock); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -3888,3 +3903,3 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) - mutex_exit(&spa_namespace_lock); -- 
spa_history_log_version(spa, LOG_POOL_IMPORT); -+ spa_history_log_version(spa, "import"); - -@@ -4019,3 +4034,3 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) - mutex_exit(&spa_namespace_lock); -- spa_history_log_version(spa, LOG_POOL_IMPORT); -+ spa_history_log_version(spa, "import"); - -@@ -4070,2 +4085,4 @@ spa_tryimport(nvlist_t *tryconfig) - spa->spa_load_info) == 0); -+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, -+ spa->spa_errata) == 0); - -@@ -4086,3 +4103,5 @@ spa_tryimport(nvlist_t *tryconfig) - char *cp; -- char *dsname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); -+ char *dsname; -+ -+ dsname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); - -@@ -4139,3 +4158,3 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - if (!(spa_mode_global & FWRITE)) -- return (EROFS); -+ return (SET_ERROR(EROFS)); - -@@ -4144,3 +4163,3 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - mutex_exit(&spa_namespace_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -4178,3 +4197,3 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - mutex_exit(&spa_namespace_lock); -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - } -@@ -4191,3 +4210,3 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - mutex_exit(&spa_namespace_lock); -- return (EXDEV); -+ return (SET_ERROR(EXDEV)); - } -@@ -4387,3 +4406,2 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - uint64_t txg, dtl_max_txg; -- ASSERTV(vdev_t *rvd = spa->spa_root_vdev;) - vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; -@@ -4393,2 +4411,3 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - int error; -+ ASSERTV(vdev_t *rvd = spa->spa_root_vdev); - -@@ -4501,3 +4520,3 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - /* mark the device being resilvered */ -- newvd->vdev_resilvering = B_TRUE; -+ newvd->vdev_resilver_txg = txg; - -@@ -4562,3 +4581,3 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - -- spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, -+ spa_history_log_internal(spa, "vdev attach", NULL, - "%s vdev=%s %s vdev=%s", -@@ -4579,2 +4598,3 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - * Detach a device from a mirror or replacing vdev. 
-+ * - * If 'replace_done' is specified, only detach if the parent -@@ -4587,3 +4607,2 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) - int error; -- ASSERTV(vdev_t *rvd = spa->spa_root_vdev;) - vdev_t *vd, *pvd, *cvd, *tvd; -@@ -4593,3 +4612,3 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) - int c, t; -- -+ ASSERTV(vdev_t *rvd = spa->spa_root_vdev); - ASSERT(spa_writeable(spa)); -@@ -4731,3 +4750,2 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) - vdev_remove_parent(cvd); -- cvd->vdev_resilvering = B_FALSE; - } -@@ -4780,3 +4798,3 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) - -- spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, -+ spa_history_log_internal(spa, "detach", NULL, - "vdev=%s", vdpath); -@@ -4901,3 +4919,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - } else { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -4909,3 +4927,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - &glist[c]) != 0) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -4916,3 +4934,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - if (vml[c] == NULL) { -- error = ENODEV; -+ error = SET_ERROR(ENODEV); - break; -@@ -4930,3 +4948,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -4935,3 +4953,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - if (vdev_dtl_required(vml[c])) { -- error = EBUSY; -+ error = SET_ERROR(EBUSY); - break; -@@ -5049,5 +5067,4 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - if (error == 0) -- spa_history_log_internal(LOG_POOL_VDEV_DETACH, -- spa, tx, "vdev=%s", -- vml[c]->vdev_path); -+ spa_history_log_internal(spa, "detach", tx, -+ "vdev=%s", vml[c]->vdev_path); - vdev_free(vml[c]); -@@ -5066,4 +5083,4 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - /* split is complete; log a history record */ -- spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, -- "split new pool %s from pool %s", newname, spa_name(spa)); -+ spa_history_log_internal(newspa, "split", NULL, -+ "from pool %s", spa_name(spa)); - -@@ -5167,3 +5184,3 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) - } else { -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } -@@ -5238,7 +5255,5 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) - * lock. During each step the configuration is synced out. -- */ -- --/* -- * Remove a device from the pool. Currently, this supports removing only hot -- * spares, slogs, and level 2 ARC devices. -+ * -+ * Currently, this supports removing only hot spares, slogs, and level 2 ARC -+ * devices. - */ -@@ -5276,3 +5291,3 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) - } else { -- error = EBUSY; -+ error = SET_ERROR(EBUSY); - } -@@ -5336,3 +5351,3 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) - */ -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } else { -@@ -5341,3 +5356,3 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) - */ -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } -@@ -5352,3 +5367,3 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) - * Find any device that's done replacing, or a vdev marked 'unspare' that's -- * current spared, so we can detach it. 
-+ * currently spared, so we can detach it. - */ -@@ -5455,2 +5470,4 @@ spa_vdev_resilver_done(spa_t *spa) - } -+ ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); -+ - spa_config_exit(spa, SCL_ALL, FTAG); -@@ -5529,3 +5546,3 @@ spa_scan_stop(spa_t *spa) - if (dsl_scan_resilvering(spa->spa_dsl_pool)) -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - return (dsl_scan_cancel(spa->spa_dsl_pool)); -@@ -5539,3 +5556,3 @@ spa_scan(spa_t *spa, pool_scan_func_t func) - if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -5649,4 +5666,3 @@ spa_async_thread(spa_t *spa) - if (new_space != old_space) { -- spa_history_log_internal(LOG_POOL_VDEV_ONLINE, -- spa, NULL, -+ spa_history_log_internal(spa, "vdev online", NULL, - "pool '%s' size: %llu(+%llu)", -@@ -5770,2 +5786,27 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - -+/* -+ * Note: this simple function is not inlined to make it easier to dtrace the -+ * amount of time spent syncing frees. -+ */ -+static void -+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) -+{ -+ zio_t *zio = zio_root(spa, NULL, NULL, 0); -+ bplist_iterate(bpl, spa_free_sync_cb, zio, tx); -+ VERIFY(zio_wait(zio) == 0); -+} -+ -+/* -+ * Note: this simple function is not inlined to make it easier to dtrace the -+ * amount of time spent syncing deferred frees. -+ */ -+static void -+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) -+{ -+ zio_t *zio = zio_root(spa, NULL, NULL, 0); -+ VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, -+ spa_free_sync_cb, zio, tx), ==, 0); -+ VERIFY0(zio_wait(zio)); -+} -+ - static void -@@ -5831,3 +5872,3 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, - } else { -- list = kmem_alloc(sav->sav_count * sizeof (void *), KM_PUSHPAGE); -+ list = kmem_alloc(sav->sav_count*sizeof (void *), KM_PUSHPAGE); - for (i = 0; i < sav->sav_count; i++) -@@ -5879,6 +5920,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) - static void --spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_sync_version(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- uint64_t version = *(uint64_t *)arg2; -+ uint64_t *versionp = arg; -+ uint64_t version = *versionp; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - -@@ -5894,2 +5936,3 @@ spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) - vdev_config_dirty(spa->spa_root_vdev); -+ spa_history_log_internal(spa, "set", tx, "version=%lld", version); - } -@@ -5900,7 +5943,7 @@ spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) - static void --spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_sync_props(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -+ nvlist_t *nvp = arg; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = spa->spa_meta_objset; -- nvlist_t *nvp = arg2; - nvpair_t *elem = NULL; -@@ -5929,2 +5972,4 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - spa_feature_enable(spa, feature, tx); -+ spa_history_log_internal(spa, "set", tx, -+ "%s=enabled", nvpair_name(elem)); - break; -@@ -5968,2 +6013,4 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - vdev_config_dirty(spa->spa_root_vdev); -+ spa_history_log_internal(spa, "set", tx, -+ "%s=%s", nvpair_name(elem), strval); - break; -@@ -5990,3 +6037,4 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - 1, strlen(strval) + 1, strval, tx) == 0); -- -+ spa_history_log_internal(spa, "set", tx, -+ "%s=%s", nvpair_name(elem), strval); - } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { -@@ -6002,2 +6050,4 @@ 
spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - 8, 1, &intval, tx) == 0); -+ spa_history_log_internal(spa, "set", tx, -+ "%s=%lld", nvpair_name(elem), intval); - } else { -@@ -6030,9 +6080,2 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - -- /* log internal history if this is not a zpool create */ -- if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && -- tx->tx_txg != TXG_INITIAL) { -- spa_history_log_internal(LOG_POOL_PROPSET, -- spa, tx, "%s %lld %s", -- nvpair_name(elem), intval, spa_name(spa)); -- } - } -@@ -6056,2 +6099,4 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) - -+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); -+ - if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && -@@ -6081,2 +6126,3 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) - } -+ rrw_exit(&dp->dp_config_rwlock, FTAG); - } -@@ -6092,3 +6138,2 @@ spa_sync(spa_t *spa, uint64_t txg) - objset_t *mos = spa->spa_meta_objset; -- bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; - bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; -@@ -6139,3 +6184,3 @@ spa_sync(spa_t *spa, uint64_t txg) - spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, -- spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + -+ spa_deadman, spa, TQ_PUSHPAGE, ddi_get_lbolt() + - NSEC_TO_TICK(spa->spa_deadman_synctime)); -@@ -6175,6 +6220,3 @@ spa_sync(spa_t *spa, uint64_t txg) - txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { -- zio_t *zio = zio_root(spa, NULL, NULL, 0); -- VERIFY3U(bpobj_iterate(defer_bpo, -- spa_free_sync_cb, zio, tx), ==, 0); -- VERIFY0(zio_wait(zio)); -+ spa_sync_deferred_frees(spa, tx); - } -@@ -6196,9 +6238,6 @@ spa_sync(spa_t *spa, uint64_t txg) - if (pass < zfs_sync_pass_deferred_free) { -- zio_t *zio = zio_root(spa, NULL, NULL, 0); -- bplist_iterate(free_bpl, spa_free_sync_cb, -- zio, tx); -- VERIFY(zio_wait(zio) == 0); -+ spa_sync_frees(spa, free_bpl, tx); - } else { - bplist_iterate(free_bpl, bpobj_enqueue_cb, -- defer_bpo, tx); -+ &spa->spa_deferred_bpobj, tx); - } -diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c -index 5e5b405..c8fe79e 100644 ---- a/module/zfs/spa_config.c -+++ b/module/zfs/spa_config.c -@@ -24,3 +24,3 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -198,3 +198,8 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) - * Synchronize pool configuration to disk. This must be called with the -- * namespace lock held. -+ * namespace lock held. Synchronizing the pool cache is typically done after -+ * the configuration has been synced to the MOS. This exposes a window where -+ * the MOS config will have been updated but the cache file has not. If -+ * the system were to crash at that instant then the cached config may not -+ * contain the correct information to open the pool and an explicity import -+ * would be required. 
- */ -@@ -205,2 +210,3 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) - nvlist_t *nvl; -+ char *pool_name; - -@@ -251,3 +257,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) - -- VERIFY(nvlist_add_nvlist(nvl, spa->spa_name, -+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { -+ VERIFY0(nvlist_lookup_string(spa->spa_config, -+ ZPOOL_CONFIG_POOL_NAME, &pool_name)); -+ } else -+ pool_name = spa_name(spa); -+ -+ VERIFY(nvlist_add_nvlist(nvl, pool_name, - spa->spa_config) == 0); -@@ -322,2 +334,3 @@ spa_config_set(spa_t *spa, nvlist_t *config) - * Generate the pool's configuration based on the current in-core state. -+ * - * We infer whether to generate a complete config or just one top-level config -@@ -333,2 +346,3 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) - uint64_t split_guid; -+ char *pool_name; - -@@ -349,2 +363,18 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) - -+ /* -+ * Originally, users had to handle spa namespace collisions by either -+ * exporting the already imported pool or by specifying a new name for -+ * the pool with a conflicting name. In the case of root pools from -+ * virtual guests, neither approach to collision resolution is -+ * reasonable. This is addressed by extending the new name syntax with -+ * an option to specify that the new name is temporary. When specified, -+ * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us -+ * to use the previous name, which we do below. -+ */ -+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { -+ VERIFY0(nvlist_lookup_string(spa->spa_config, -+ ZPOOL_CONFIG_POOL_NAME, &pool_name)); -+ } else -+ pool_name = spa_name(spa); -+ - VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); -@@ -354,3 +384,3 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, -- spa_name(spa)) == 0); -+ pool_name) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, -@@ -361,2 +391,4 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) - spa_guid(spa)) == 0); -+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, -+ spa->spa_errata) == 0); - VERIFY(spa->spa_comment == NULL || nvlist_add_string(config, -diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c -index 3fab192..35853e2 100644 ---- a/module/zfs/spa_errlog.c -+++ b/module/zfs/spa_errlog.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -177,3 +178,3 @@ process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) - zap_cursor_fini(&zc); -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - } -@@ -184,4 +185,6 @@ process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) - (*count - 1) * sizeof (zbookmark_t), -- sizeof (zbookmark_t)) != 0) -- return (EFAULT); -+ sizeof (zbookmark_t)) != 0) { -+ zap_cursor_fini(&zc); -+ return (SET_ERROR(EFAULT)); -+ } - -@@ -203,3 +206,3 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) - if (*count == 0) -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - -@@ -208,3 +211,3 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) - sizeof (zbookmark_t)) != 0) -- return (EFAULT); -+ return (SET_ERROR(EFAULT)); - -diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c -index 9fb75f3..5b82238 100644 ---- a/module/zfs/spa_history.c -+++ b/module/zfs/spa_history.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -32,2 +32,4 @@ - #include -+#include -+#include - #include -@@ -35,2 +37,3 @@ - #include -+#include - #include "zfs_comutil.h" -@@ -187,3 +190,3 @@ spa_history_zone(void) - #else -- return ("global"); -+ return (NULL); - #endif -@@ -196,7 +199,6 @@ spa_history_zone(void) - static void --spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_history_log_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- history_arg_t *hap = arg2; -- const char *history_str = hap->ha_history_str; -+ nvlist_t *nvl = arg; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = spa->spa_meta_objset; -@@ -206,3 +208,2 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - uint64_t le_len; -- nvlist_t *nvrecord; - char *record_packed = NULL; -@@ -223,3 +224,3 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); -+ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - shpp = dbp->db_data; -@@ -236,42 +237,32 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); -- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, -- gethrestime_sec()) == 0); -- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0); -- if (hap->ha_zone != NULL) -- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, -- hap->ha_zone) == 0); -+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); - #ifdef _KERNEL -- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST, -- utsname.nodename) == 0); -+ fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename); - #endif -- if (hap->ha_log_type == LOG_CMD_POOL_CREATE || -- hap->ha_log_type == LOG_CMD_NORMAL) { -- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, -- history_str) == 0); -- -- zfs_dbgmsg("command: %s", history_str); -- } else { -- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, -- hap->ha_event) == 0); -- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG, -- tx->tx_txg) == 0); -- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, -- history_str) == 0); -- -- zfs_dbgmsg("internal %s pool:%s txg:%llu %s", -- zfs_history_event_names[hap->ha_event], spa_name(spa), -- (longlong_t)tx->tx_txg, history_str); -- -+ if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { -+ zfs_dbgmsg("command: %s", -+ fnvlist_lookup_string(nvl, 
ZPOOL_HIST_CMD)); -+ } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { -+ if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { -+ zfs_dbgmsg("txg %lld %s %s (id %llu) %s", -+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), -+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); -+ } else { -+ zfs_dbgmsg("txg %lld %s %s", -+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); -+ } -+ } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { -+ zfs_dbgmsg("ioctl %s", -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); - } - -- VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); -- record_packed = kmem_alloc(reclen, KM_PUSHPAGE); -- -- VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, -- NV_ENCODE_XDR, KM_PUSHPAGE) == 0); -+ VERIFY3U(nvlist_pack(nvl, &record_packed, &reclen, NV_ENCODE_NATIVE, -+ KM_PUSHPAGE), ==, 0); - - mutex_enter(&spa->spa_history_lock); -- if (hap->ha_log_type == LOG_CMD_POOL_CREATE) -- VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); - -@@ -283,5 +274,6 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) { -- shpp->sh_pool_create_len += sizeof (le_len) + reclen; -- shpp->sh_bof = shpp->sh_pool_create_len; -+ /* The first command is the create, which we keep forever */ -+ if (ret == 0 && shpp->sh_pool_create_len == 0 && -+ nvlist_exists(nvl, ZPOOL_HIST_CMD)) { -+ shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; - } -@@ -289,10 +281,5 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - mutex_exit(&spa->spa_history_lock); -- nvlist_free(nvrecord); -- kmem_free(record_packed, reclen); -+ fnvlist_pack_free(record_packed, reclen); - dmu_buf_rele(dbp, FTAG); -- -- strfree(hap->ha_history_str); -- if (hap->ha_zone != NULL) -- strfree(hap->ha_zone); -- kmem_free(hap, sizeof (history_arg_t)); -+ fnvlist_free(nvl); - } -@@ -303,9 +290,24 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - int --spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) -+spa_history_log(spa_t *spa, const char *msg) -+{ -+ int err; -+ nvlist_t *nvl; -+ -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ -+ fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); -+ err = spa_history_log_nvl(spa, nvl); -+ fnvlist_free(nvl); -+ return (err); -+} -+ -+int -+spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) - { -- history_arg_t *ha; - int err = 0; - dmu_tx_t *tx; -+ nvlist_t *nvarg; - -- ASSERT(what != LOG_INTERNAL); -+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) -+ return (SET_ERROR(EINVAL)); - -@@ -318,15 +320,17 @@ spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) - -- ha = kmem_alloc(sizeof (history_arg_t), KM_PUSHPAGE); -- ha->ha_history_str = strdup(history_str); -- ha->ha_zone = strdup(spa_history_zone()); -- ha->ha_log_type = what; -- ha->ha_uid = crgetuid(CRED()); -+ VERIFY0(nvlist_dup(nvl, &nvarg, KM_PUSHPAGE)); -+ if (spa_history_zone() != NULL) { -+ fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, -+ spa_history_zone()); -+ } -+ fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); - - /* Kick this off asynchronously; errors are ignored. 
*/ -- dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, -- spa_history_log_sync, spa, ha, 0, tx); -+ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, -+ nvarg, 0, tx); - dmu_tx_commit(tx); - -- /* spa_history_log_sync will free ha and strings */ -+ /* spa_history_log_sync will free nvl */ - return (err); -+ - } -@@ -347,3 +351,3 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) - /* -- * If the command history doesn't exist (older pool), -+ * If the command history doesn't exist (older pool), - * that's ok, just return ENOENT. -@@ -351,3 +355,3 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) - if (!spa->spa_history) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -430,8 +434,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) - -+/* -+ * The nvlist will be consumed by this call. -+ */ - static void --log_internal(history_internal_events_t event, spa_t *spa, -+log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, - dmu_tx_t *tx, const char *fmt, va_list adx) - { -- history_arg_t *ha; -- va_list adx_copy; -+ char *msg; -+ va_list adx1; -+ int size; - -@@ -440,22 +448,29 @@ log_internal(history_internal_events_t event, spa_t *spa, - * initialized yet, so don't bother logging the internal events. -+ * Likewise if the pool is not writeable. - */ -- if (tx->tx_txg == TXG_INITIAL) -+ if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { -+ fnvlist_free(nvl); - return; -+ } -+ -+ va_copy(adx1, adx); -+ size = vsnprintf(NULL, 0, fmt, adx1) + 1; -+ msg = kmem_alloc(size, KM_PUSHPAGE); -+ va_end(adx1); -+ va_copy(adx1, adx); -+ (void) vsprintf(msg, fmt, adx1); -+ va_end(adx1); -+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); -+ kmem_free(msg, size); - -- ha = kmem_alloc(sizeof (history_arg_t), KM_PUSHPAGE); -- va_copy(adx_copy, adx); -- ha->ha_history_str = kmem_vasprintf(fmt, adx_copy); -- va_end(adx_copy); -- ha->ha_log_type = LOG_INTERNAL; -- ha->ha_event = event; -- ha->ha_zone = NULL; -- ha->ha_uid = 0; -+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); -+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); - - if (dmu_tx_is_syncing(tx)) { -- spa_history_log_sync(spa, ha, tx); -+ spa_history_log_sync(nvl, tx); - } else { -- dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, -- spa_history_log_sync, spa, ha, 0, tx); -+ dsl_sync_task_nowait(spa_get_dsl(spa), -+ spa_history_log_sync, nvl, 0, tx); - } -- /* spa_history_log_sync() will free ha and strings */ -+ /* spa_history_log_sync() will free nvl */ - } -@@ -463,3 +478,3 @@ log_internal(history_internal_events_t event, spa_t *spa, - void --spa_history_log_internal(history_internal_events_t event, spa_t *spa, -+spa_history_log_internal(spa_t *spa, const char *operation, - dmu_tx_t *tx, const char *fmt, ...) -@@ -468,2 +483,3 @@ spa_history_log_internal(history_internal_events_t event, spa_t *spa, - va_list adx; -+ nvlist_t *nvl; - -@@ -479,3 +495,4 @@ spa_history_log_internal(history_internal_events_t event, spa_t *spa, - va_start(adx, fmt); -- log_internal(event, spa, htx, fmt, adx); -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ log_internal(nvl, operation, spa, htx, fmt, adx); - va_end(adx); -@@ -488,19 +505,50 @@ spa_history_log_internal(history_internal_events_t event, spa_t *spa, - void --spa_history_log_version(spa_t *spa, history_internal_events_t event) -+spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, -+ dmu_tx_t *tx, const char *fmt, ...) 
-+{ -+ va_list adx; -+ char namebuf[MAXNAMELEN]; -+ nvlist_t *nvl; -+ -+ ASSERT(tx != NULL); -+ -+ dsl_dataset_name(ds, namebuf); -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); -+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); -+ -+ va_start(adx, fmt); -+ log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); -+ va_end(adx); -+} -+ -+void -+spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, -+ dmu_tx_t *tx, const char *fmt, ...) - { --#ifdef _KERNEL -- uint64_t current_vers = spa_version(spa); -- -- if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { -- spa_history_log_internal(event, spa, NULL, -- "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", -- (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, -- utsname.nodename, utsname.release, utsname.version, -- utsname.machine); -- } -- cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", -- event == LOG_POOL_IMPORT ? "imported" : -- event == LOG_POOL_CREATE ? "created" : "accessed", -- (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); --#endif -+ va_list adx; -+ char namebuf[MAXNAMELEN]; -+ nvlist_t *nvl; -+ -+ ASSERT(tx != NULL); -+ -+ dsl_dir_name(dd, namebuf); -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); -+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, -+ dd->dd_phys->dd_head_dataset_obj); -+ -+ va_start(adx, fmt); -+ log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); -+ va_end(adx); -+} -+ -+void -+spa_history_log_version(spa_t *spa, const char *operation) -+{ -+ spa_history_log_internal(spa, operation, NULL, -+ "pool version %llu; software version %llu/%d; uts %s %s %s %s", -+ (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, -+ utsname.nodename, utsname.release, utsname.version, -+ utsname.machine); - } -diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c -index 0ca9f3a..f1e1a72 100644 ---- a/module/zfs/spa_misc.c -+++ b/module/zfs/spa_misc.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -@@ -50,2 +50,3 @@ - #include -+#include - #include "zfs_prop.h" -@@ -239,12 +240,10 @@ int spa_mode_global; - /* -- * Expiration time in units of zfs_txg_synctime_ms. This value has two -- * meanings. First it is used to determine when the spa_deadman logic -- * should fire. By default the spa_deadman will fire if spa_sync has -- * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds). -- * Secondly, the value determines if an I/O is considered "hung". -- * Any I/O that has not completed in zfs_deadman_synctime is considered -- * "hung" resulting in a zevent being posted. -- * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds). -+ * Expiration time in milliseconds. This value has two meanings. First it is -+ * used to determine when the spa_deadman() logic should fire. By default the -+ * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. -+ * Secondly, the value determines if an I/O is considered "hung". Any I/O that -+ * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting -+ * in a system panic. 
- */ --unsigned long zfs_deadman_synctime = 1000ULL; -+unsigned long zfs_deadman_synctime_ms = 1000000ULL; - -@@ -255,2 +254,12 @@ int zfs_deadman_enabled = 1; - -+/* -+ * The worst case is single-sector max-parity RAID-Z blocks, in which -+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) -+ * times the size; so just assume that. Add to this the fact that -+ * we can have up to 3 DVAs per bp, and one more factor of 2 because -+ * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, -+ * the worst case is: -+ * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 -+ */ -+int spa_asize_inflation = 24; - -@@ -270,3 +279,3 @@ spa_config_lock_init(spa_t *spa) - cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); -- refcount_create(&scl->scl_count); -+ refcount_create_untracked(&scl->scl_count); - scl->scl_writer = NULL; -@@ -328,2 +337,4 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) - -+ ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); -+ - for (i = 0; i < SCL_LOCKS; i++) { -@@ -408,3 +419,2 @@ spa_lookup(const char *name) - avl_index_t where; -- char c = 0; - char *cp; -@@ -413,2 +423,4 @@ spa_lookup(const char *name) - -+ (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); -+ - /* -@@ -417,14 +429,8 @@ spa_lookup(const char *name) - */ -- cp = strpbrk(name, "/@"); -- if (cp) { -- c = *cp; -+ cp = strpbrk(search.spa_name, "/@"); -+ if (cp != NULL) - *cp = '\0'; -- } - -- (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); - spa = avl_find(&spa_namespace_avl, &search, &where); - -- if (cp) -- *cp = c; -- - return (spa); -@@ -449,3 +455,3 @@ spa_deadman(void *arg) - spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, -- spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + -+ spa_deadman, spa, TQ_PUSHPAGE, ddi_get_lbolt() + - NSEC_TO_TICK(spa->spa_deadman_synctime)); -@@ -495,4 +501,3 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) - -- spa->spa_deadman_synctime = zfs_deadman_synctime * -- zfs_txg_synctime_ms * MICROSEC; -+ spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); - -@@ -500,2 +505,3 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) - spa_config_lock_init(spa); -+ spa_stats_init(spa); - -@@ -538,5 +544,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) - VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, -- KM_SLEEP) == 0); -+ KM_PUSHPAGE) == 0); - } - -+ spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); -+ - return (spa); -@@ -583,2 +591,3 @@ spa_remove(spa_t *spa) - -+ spa_stats_destroy(spa); - spa_config_lock_destroy(spa); -@@ -1289,3 +1298,3 @@ spa_freeze(spa_t *spa) - * This is a stripped-down version of strtoull, suitable only for converting -- * lowercase hexidecimal numbers that don't overflow. -+ * lowercase hexadecimal numbers that don't overflow. - */ -@@ -1454,10 +1463,3 @@ spa_get_asize(spa_t *spa, uint64_t lsize) - { -- /* -- * The worst case is single-sector max-parity RAID-Z blocks, in which -- * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) -- * times the size; so just assume that. Add to this the fact that -- * we can have up to 3 DVAs per bp, and one more factor of 2 because -- * the block may be dittoed with up to 3 DVAs by ddt_sync(). 
-- */ -- return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2); -+ return (lsize * spa_asize_inflation); - } -@@ -1632,2 +1634,19 @@ spa_init(int mode) - -+#ifndef _KERNEL -+ if (spa_mode_global != FREAD && dprintf_find_string("watch")) { -+ struct sigaction sa; -+ -+ sa.sa_flags = SA_SIGINFO; -+ sigemptyset(&sa.sa_mask); -+ sa.sa_sigaction = arc_buf_sigsegv; -+ -+ if (sigaction(SIGSEGV, &sa, NULL) == -1) { -+ perror("could not enable watchpoints: " -+ "sigaction(SIGSEGV, ...) = "); -+ } else { -+ arc_watch = B_TRUE; -+ } -+ } -+#endif -+ - fm_init(); -@@ -1636,2 +1655,3 @@ spa_init(int mode) - space_map_init(); -+ ddt_init(); - zio_init(); -@@ -1658,2 +1678,3 @@ spa_fini(void) - zio_fini(); -+ ddt_fini(); - space_map_fini(); -@@ -1759,3 +1780,3 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) - if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - bzero(ps, sizeof (pool_scan_stat_t)); -@@ -1865,4 +1886,4 @@ EXPORT_SYMBOL(spa_namespace_lock); - --module_param(zfs_deadman_synctime, ulong, 0644); --MODULE_PARM_DESC(zfs_deadman_synctime,"Expire in units of zfs_txg_synctime_ms"); -+module_param(zfs_deadman_synctime_ms, ulong, 0644); -+MODULE_PARM_DESC(zfs_deadman_synctime_ms, "Expiration time in milliseconds"); - -@@ -1870,2 +1891,6 @@ module_param(zfs_deadman_enabled, int, 0644); - MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer"); -+ -+module_param(spa_asize_inflation, int, 0644); -+MODULE_PARM_DESC(spa_asize_inflation, -+ "SPA size estimate multiplication factor"); - #endif -diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c -new file mode 100644 -index 0000000..dbc761e ---- /dev/null -+++ b/module/zfs/spa_stats.c -@@ -0,0 +1,691 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+#include -+#include -+ -+/* -+ * Keeps stats on last N reads per spa_t, disabled by default. -+ */ -+int zfs_read_history = 0; -+ -+/* -+ * Include cache hits in history, disabled by default. -+ */ -+int zfs_read_history_hits = 0; -+ -+/* -+ * Keeps stats on the last N txgs, disabled by default. 
-+ */ -+int zfs_txg_history = 0; -+ -+/* -+ * ========================================================================== -+ * SPA Read History Routines -+ * ========================================================================== -+ */ -+ -+/* -+ * Read statistics - Information exported regarding each arc_read call -+ */ -+typedef struct spa_read_history { -+ uint64_t uid; /* unique identifier */ -+ hrtime_t start; /* time read completed */ -+ uint64_t objset; /* read from this objset */ -+ uint64_t object; /* read of this object number */ -+ uint64_t level; /* block's indirection level */ -+ uint64_t blkid; /* read of this block id */ -+ char origin[24]; /* read originated from here */ -+ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ -+ pid_t pid; /* PID of task doing read */ -+ char comm[16]; /* process name of task doing read */ -+ list_node_t srh_link; -+} spa_read_history_t; -+ -+static int -+spa_read_history_headers(char *buf, size_t size) -+{ -+ size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " -+ "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", -+ "level", "blkid", "aflags", "origin", "pid", "process"); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+static int -+spa_read_history_data(char *buf, size_t size, void *data) -+{ -+ spa_read_history_t *srh = (spa_read_history_t *)data; -+ -+ size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx " -+ "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", -+ (u_longlong_t)srh->uid, srh->start, -+ (longlong_t)srh->objset, (longlong_t)srh->object, -+ (longlong_t)srh->level, (longlong_t)srh->blkid, -+ srh->aflags, srh->origin, srh->pid, srh->comm); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+/* -+ * Calculate the address for the next spa_stats_history_t entry. The -+ * ssh->lock will be held until ksp->ks_ndata entries are processed. -+ */ -+static void * -+spa_read_history_addr(kstat_t *ksp, loff_t n) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ -+ ASSERT(MUTEX_HELD(&ssh->lock)); -+ -+ if (n == 0) -+ ssh->private = list_tail(&ssh->list); -+ else if (ssh->private) -+ ssh->private = list_prev(&ssh->list, ssh->private); -+ -+ return (ssh->private); -+} -+ -+/* -+ * When the kstat is written discard all spa_read_history_t entires. The -+ * ssh->lock will be held until ksp->ks_ndata entries are processed. 
-+ */ -+static int -+spa_read_history_update(kstat_t *ksp, int rw) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ -+ if (rw == KSTAT_WRITE) { -+ spa_read_history_t *srh; -+ -+ while ((srh = list_remove_head(&ssh->list))) { -+ ssh->size--; -+ kmem_free(srh, sizeof (spa_read_history_t)); -+ } -+ -+ ASSERT3U(ssh->size, ==, 0); -+ } -+ -+ ksp->ks_ndata = ssh->size; -+ ksp->ks_data_size = ssh->size * sizeof (spa_read_history_t); -+ -+ return (0); -+} -+ -+static void -+spa_read_history_init(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ char name[KSTAT_STRLEN]; -+ kstat_t *ksp; -+ -+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); -+ list_create(&ssh->list, sizeof (spa_read_history_t), -+ offsetof(spa_read_history_t, srh_link)); -+ -+ ssh->count = 0; -+ ssh->size = 0; -+ ssh->private = NULL; -+ -+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); -+ name[KSTAT_STRLEN-1] = '\0'; -+ -+ ksp = kstat_create(name, 0, "reads", "misc", -+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); -+ ssh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &ssh->lock; -+ ksp->ks_data = NULL; -+ ksp->ks_private = spa; -+ ksp->ks_update = spa_read_history_update; -+ kstat_set_raw_ops(ksp, spa_read_history_headers, -+ spa_read_history_data, spa_read_history_addr); -+ kstat_install(ksp); -+ } -+} -+ -+static void -+spa_read_history_destroy(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ spa_read_history_t *srh; -+ kstat_t *ksp; -+ -+ ksp = ssh->kstat; -+ if (ksp) -+ kstat_delete(ksp); -+ -+ mutex_enter(&ssh->lock); -+ while ((srh = list_remove_head(&ssh->list))) { -+ ssh->size--; -+ kmem_free(srh, sizeof (spa_read_history_t)); -+ } -+ -+ ASSERT3U(ssh->size, ==, 0); -+ list_destroy(&ssh->list); -+ mutex_exit(&ssh->lock); -+ -+ mutex_destroy(&ssh->lock); -+} -+ -+void -+spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ spa_read_history_t *srh, *rm; -+ -+ ASSERT3P(spa, !=, NULL); -+ ASSERT3P(zb, !=, NULL); -+ -+ if (zfs_read_history == 0 && ssh->size == 0) -+ return; -+ -+ if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) -+ return; -+ -+ srh = kmem_zalloc(sizeof (spa_read_history_t), KM_PUSHPAGE); -+ strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); -+ srh->start = gethrtime(); -+ srh->objset = zb->zb_objset; -+ srh->object = zb->zb_object; -+ srh->level = zb->zb_level; -+ srh->blkid = zb->zb_blkid; -+ srh->aflags = aflags; -+ srh->pid = getpid(); -+ -+ mutex_enter(&ssh->lock); -+ -+ srh->uid = ssh->count++; -+ list_insert_head(&ssh->list, srh); -+ ssh->size++; -+ -+ while (ssh->size > zfs_read_history) { -+ ssh->size--; -+ rm = list_remove_tail(&ssh->list); -+ kmem_free(rm, sizeof (spa_read_history_t)); -+ } -+ -+ mutex_exit(&ssh->lock); -+} -+ -+/* -+ * ========================================================================== -+ * SPA TXG History Routines -+ * ========================================================================== -+ */ -+ -+/* -+ * Txg statistics - Information exported regarding each txg sync -+ */ -+ -+typedef struct spa_txg_history { -+ uint64_t txg; /* txg id */ -+ txg_state_t state; /* active txg state */ -+ uint64_t nread; /* number of bytes read */ -+ uint64_t nwritten; /* number of bytes written */ -+ uint64_t reads; /* number of read operations */ -+ uint64_t writes; /* number of write operations */ -+ uint64_t ndirty; /* number of dirty bytes */ -+ hrtime_t 
times[TXG_STATE_COMMITTED]; /* completion times */ -+ list_node_t sth_link; -+} spa_txg_history_t; -+ -+static int -+spa_txg_history_headers(char *buf, size_t size) -+{ -+ size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s " -+ "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", -+ "ndirty", "nread", "nwritten", "reads", "writes", -+ "otime", "qtime", "wtime", "stime"); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+static int -+spa_txg_history_data(char *buf, size_t size, void *data) -+{ -+ spa_txg_history_t *sth = (spa_txg_history_t *)data; -+ uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; -+ char state; -+ -+ switch (sth->state) { -+ case TXG_STATE_BIRTH: state = 'B'; break; -+ case TXG_STATE_OPEN: state = 'O'; break; -+ case TXG_STATE_QUIESCED: state = 'Q'; break; -+ case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; -+ case TXG_STATE_SYNCED: state = 'S'; break; -+ case TXG_STATE_COMMITTED: state = 'C'; break; -+ default: state = '?'; break; -+ } -+ -+ if (sth->times[TXG_STATE_OPEN]) -+ open = sth->times[TXG_STATE_OPEN] - -+ sth->times[TXG_STATE_BIRTH]; -+ -+ if (sth->times[TXG_STATE_QUIESCED]) -+ quiesce = sth->times[TXG_STATE_QUIESCED] - -+ sth->times[TXG_STATE_OPEN]; -+ -+ if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) -+ wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - -+ sth->times[TXG_STATE_QUIESCED]; -+ -+ if (sth->times[TXG_STATE_SYNCED]) -+ sync = sth->times[TXG_STATE_SYNCED] - -+ sth->times[TXG_STATE_WAIT_FOR_SYNC]; -+ -+ size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu " -+ "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", -+ (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, -+ (u_longlong_t)sth->ndirty, -+ (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, -+ (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, -+ (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, -+ (u_longlong_t)sync); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+/* -+ * Calculate the address for the next spa_stats_history_t entry. The -+ * ssh->lock will be held until ksp->ks_ndata entries are processed. -+ */ -+static void * -+spa_txg_history_addr(kstat_t *ksp, loff_t n) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ -+ ASSERT(MUTEX_HELD(&ssh->lock)); -+ -+ if (n == 0) -+ ssh->private = list_tail(&ssh->list); -+ else if (ssh->private) -+ ssh->private = list_prev(&ssh->list, ssh->private); -+ -+ return (ssh->private); -+} -+ -+/* -+ * When the kstat is written discard all spa_txg_history_t entires. The -+ * ssh->lock will be held until ksp->ks_ndata entries are processed. 
-+ */ -+static int -+spa_txg_history_update(kstat_t *ksp, int rw) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ -+ ASSERT(MUTEX_HELD(&ssh->lock)); -+ -+ if (rw == KSTAT_WRITE) { -+ spa_txg_history_t *sth; -+ -+ while ((sth = list_remove_head(&ssh->list))) { -+ ssh->size--; -+ kmem_free(sth, sizeof (spa_txg_history_t)); -+ } -+ -+ ASSERT3U(ssh->size, ==, 0); -+ } -+ -+ ksp->ks_ndata = ssh->size; -+ ksp->ks_data_size = ssh->size * sizeof (spa_txg_history_t); -+ -+ return (0); -+} -+ -+static void -+spa_txg_history_init(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ char name[KSTAT_STRLEN]; -+ kstat_t *ksp; -+ -+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); -+ list_create(&ssh->list, sizeof (spa_txg_history_t), -+ offsetof(spa_txg_history_t, sth_link)); -+ -+ ssh->count = 0; -+ ssh->size = 0; -+ ssh->private = NULL; -+ -+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); -+ name[KSTAT_STRLEN-1] = '\0'; -+ -+ ksp = kstat_create(name, 0, "txgs", "misc", -+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); -+ ssh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &ssh->lock; -+ ksp->ks_data = NULL; -+ ksp->ks_private = spa; -+ ksp->ks_update = spa_txg_history_update; -+ kstat_set_raw_ops(ksp, spa_txg_history_headers, -+ spa_txg_history_data, spa_txg_history_addr); -+ kstat_install(ksp); -+ } -+} -+ -+static void -+spa_txg_history_destroy(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ spa_txg_history_t *sth; -+ kstat_t *ksp; -+ -+ ksp = ssh->kstat; -+ if (ksp) -+ kstat_delete(ksp); -+ -+ mutex_enter(&ssh->lock); -+ while ((sth = list_remove_head(&ssh->list))) { -+ ssh->size--; -+ kmem_free(sth, sizeof (spa_txg_history_t)); -+ } -+ -+ ASSERT3U(ssh->size, ==, 0); -+ list_destroy(&ssh->list); -+ mutex_exit(&ssh->lock); -+ -+ mutex_destroy(&ssh->lock); -+} -+ -+/* -+ * Add a new txg to historical record. -+ */ -+void -+spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ spa_txg_history_t *sth, *rm; -+ -+ if (zfs_txg_history == 0 && ssh->size == 0) -+ return; -+ -+ sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_PUSHPAGE); -+ sth->txg = txg; -+ sth->state = TXG_STATE_OPEN; -+ sth->times[TXG_STATE_BIRTH] = birth_time; -+ -+ mutex_enter(&ssh->lock); -+ -+ list_insert_head(&ssh->list, sth); -+ ssh->size++; -+ -+ while (ssh->size > zfs_txg_history) { -+ ssh->size--; -+ rm = list_remove_tail(&ssh->list); -+ kmem_free(rm, sizeof (spa_txg_history_t)); -+ } -+ -+ mutex_exit(&ssh->lock); -+} -+ -+/* -+ * Set txg state completion time and increment current state. -+ */ -+int -+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, -+ hrtime_t completed_time) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ spa_txg_history_t *sth; -+ int error = ENOENT; -+ -+ if (zfs_txg_history == 0) -+ return (0); -+ -+ mutex_enter(&ssh->lock); -+ for (sth = list_head(&ssh->list); sth != NULL; -+ sth = list_next(&ssh->list, sth)) { -+ if (sth->txg == txg) { -+ sth->times[completed_state] = completed_time; -+ sth->state++; -+ error = 0; -+ break; -+ } -+ } -+ mutex_exit(&ssh->lock); -+ -+ return (error); -+} -+ -+/* -+ * Set txg IO stats. 
-+ */ -+int -+spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, -+ uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ spa_txg_history_t *sth; -+ int error = ENOENT; -+ -+ if (zfs_txg_history == 0) -+ return (0); -+ -+ mutex_enter(&ssh->lock); -+ for (sth = list_head(&ssh->list); sth != NULL; -+ sth = list_next(&ssh->list, sth)) { -+ if (sth->txg == txg) { -+ sth->nread = nread; -+ sth->nwritten = nwritten; -+ sth->reads = reads; -+ sth->writes = writes; -+ sth->ndirty = ndirty; -+ error = 0; -+ break; -+ } -+ } -+ mutex_exit(&ssh->lock); -+ -+ return (error); -+} -+ -+/* -+ * ========================================================================== -+ * SPA TX Assign Histogram Routines -+ * ========================================================================== -+ */ -+ -+/* -+ * Tx statistics - Information exported regarding dmu_tx_assign time. -+ */ -+ -+/* -+ * When the kstat is written zero all buckets. When the kstat is read -+ * count the number of trailing buckets set to zero and update ks_ndata -+ * such that they are not output. -+ */ -+static int -+spa_tx_assign_update(kstat_t *ksp, int rw) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; -+ int i; -+ -+ if (rw == KSTAT_WRITE) { -+ for (i = 0; i < ssh->count; i++) -+ ((kstat_named_t *)ssh->private)[i].value.ui64 = 0; -+ } -+ -+ for (i = ssh->count; i > 0; i--) -+ if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0) -+ break; -+ -+ ksp->ks_ndata = i; -+ ksp->ks_data_size = i * sizeof (kstat_named_t); -+ -+ return (0); -+} -+ -+static void -+spa_tx_assign_init(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; -+ char name[KSTAT_STRLEN]; -+ kstat_named_t *ks; -+ kstat_t *ksp; -+ int i; -+ -+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); -+ -+ ssh->count = 42; /* power of two buckets for 1ns to 2,199s */ -+ ssh->size = ssh->count * sizeof (kstat_named_t); -+ ssh->private = kmem_alloc(ssh->size, KM_SLEEP); -+ -+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); -+ name[KSTAT_STRLEN-1] = '\0'; -+ -+ for (i = 0; i < ssh->count; i++) { -+ ks = &((kstat_named_t *)ssh->private)[i]; -+ ks->data_type = KSTAT_DATA_UINT64; -+ ks->value.ui64 = 0; -+ (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", -+ (u_longlong_t)1 << i); -+ } -+ -+ ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", -+ KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); -+ ssh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &ssh->lock; -+ ksp->ks_data = ssh->private; -+ ksp->ks_ndata = ssh->count; -+ ksp->ks_data_size = ssh->size; -+ ksp->ks_private = spa; -+ ksp->ks_update = spa_tx_assign_update; -+ kstat_install(ksp); -+ } -+} -+ -+static void -+spa_tx_assign_destroy(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; -+ kstat_t *ksp; -+ -+ ksp = ssh->kstat; -+ if (ksp) -+ kstat_delete(ksp); -+ -+ kmem_free(ssh->private, ssh->size); -+ mutex_destroy(&ssh->lock); -+} -+ -+void -+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; -+ uint64_t idx = 0; -+ -+ while (((1 << idx) < nsecs) && (idx < ssh->size - 1)) -+ idx++; -+ -+ atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64); -+} -+ -+/* -+ * ========================================================================== -+ * SPA IO History Routines -+ * 
========================================================================== -+ */ -+static int -+spa_io_history_update(kstat_t *ksp, int rw) -+{ -+ if (rw == KSTAT_WRITE) -+ memset(ksp->ks_data, 0, ksp->ks_data_size); -+ -+ return (0); -+} -+ -+static void -+spa_io_history_init(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ char name[KSTAT_STRLEN]; -+ kstat_t *ksp; -+ -+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); -+ -+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); -+ name[KSTAT_STRLEN-1] = '\0'; -+ -+ ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); -+ ssh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &ssh->lock; -+ ksp->ks_private = spa; -+ ksp->ks_update = spa_io_history_update; -+ kstat_install(ksp); -+ } -+} -+ -+static void -+spa_io_history_destroy(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ if (ssh->kstat) -+ kstat_delete(ssh->kstat); -+ -+ mutex_destroy(&ssh->lock); -+} -+ -+void -+spa_stats_init(spa_t *spa) -+{ -+ spa_read_history_init(spa); -+ spa_txg_history_init(spa); -+ spa_tx_assign_init(spa); -+ spa_io_history_init(spa); -+} -+ -+void -+spa_stats_destroy(spa_t *spa) -+{ -+ spa_tx_assign_destroy(spa); -+ spa_txg_history_destroy(spa); -+ spa_read_history_destroy(spa); -+ spa_io_history_destroy(spa); -+} -+ -+#if defined(_KERNEL) && defined(HAVE_SPL) -+module_param(zfs_read_history, int, 0644); -+MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); -+ -+module_param(zfs_read_history_hits, int, 0644); -+MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); -+ -+module_param(zfs_txg_history, int, 0644); -+MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); -+#endif -diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c -index a031f3a..2cf1d2a 100644 ---- a/module/zfs/space_map.c -+++ b/module/zfs/space_map.c -@@ -104,3 +104,3 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) - avl_index_t where; -- space_seg_t ssearch, *ss_before, *ss_after, *ss; -+ space_seg_t *ss_before, *ss_after, *ss; - uint64_t end = start + size; -@@ -117,7 +117,4 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) - -- ssearch.ss_start = start; -- ssearch.ss_end = end; -- ss = avl_find(&sm->sm_root, &ssearch, &where); -- -- if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) { -+ ss = space_map_find(sm, start, size, &where); -+ if (ss != NULL) { - zfs_panic_recover("zfs: allocating allocated segment" -@@ -173,3 +170,3 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) - avl_index_t where; -- space_seg_t ssearch, *ss, *newseg; -+ space_seg_t *ss, *newseg; - uint64_t end = start + size; -@@ -177,11 +174,4 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) - -- ASSERT(MUTEX_HELD(sm->sm_lock)); - VERIFY(!sm->sm_condensing); -- VERIFY(size != 0); -- VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); -- VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); -- -- ssearch.ss_start = start; -- ssearch.ss_end = end; -- ss = avl_find(&sm->sm_root, &ssearch, &where); -+ ss = space_map_find(sm, start, size, &where); - -@@ -228,8 +218,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) - --boolean_t --space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) -+space_seg_t * -+space_map_find(space_map_t *sm, uint64_t start, uint64_t size, -+ avl_index_t *wherep) - { -- avl_index_t where; - space_seg_t ssearch, *ss; -- uint64_t end = start + 
size; - -@@ -241,6 +230,16 @@ space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) - ssearch.ss_start = start; -- ssearch.ss_end = end; -- ss = avl_find(&sm->sm_root, &ssearch, &where); -+ ssearch.ss_end = start + size; -+ ss = avl_find(&sm->sm_root, &ssearch, wherep); -+ -+ if (ss != NULL && ss->ss_start <= start && ss->ss_end >= start + size) -+ return (ss); -+ return (NULL); -+} -+ -+boolean_t -+space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) -+{ -+ avl_index_t where; - -- return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); -+ return (space_map_find(sm, start, size, &where) != 0); - } -diff --git a/module/zfs/txg.c b/module/zfs/txg.c -index 7c820af..524fe8e 100644 ---- a/module/zfs/txg.c -+++ b/module/zfs/txg.c -@@ -29,2 +29,3 @@ - #include -+#include - #include -@@ -33,3 +34,2 @@ - #include --#include - -@@ -48,3 +48,3 @@ - * (though it may be blocked waiting to enter the quiescing state). In broad -- * strokes, transactions — operations that change in-memory structures — are -+ * strokes, transactions -- operations that change in-memory structures -- are - * accepted into the txg in the open state, and are completed while the txg is -@@ -56,3 +56,3 @@ - * When a new txg becomes active, it first enters the open state. New -- * transactions — updates to in-memory structures — are assigned to the -+ * transactions -- updates to in-memory structures -- are assigned to the - * currently open txg. There is always a txg in the open state so that ZFS can -@@ -129,2 +129,4 @@ txg_init(dsl_pool_t *dp, uint64_t txg) - mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); -+ mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT, -+ NULL); - for (i = 0; i < TXG_SIZE; i++) { -@@ -171,2 +173,3 @@ txg_fini(dsl_pool_t *dp) - -+ mutex_destroy(&tx->tx_cpu[c].tc_open_lock); - mutex_destroy(&tx->tx_cpu[c].tc_lock); -@@ -235,3 +238,3 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) - static void --txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) -+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) - { -@@ -305,6 +308,8 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) - -- mutex_enter(&tc->tc_lock); -- -+ mutex_enter(&tc->tc_open_lock); - txg = tx->tx_open_txg; -+ -+ mutex_enter(&tc->tc_lock); - tc->tc_count[txg & TXG_MASK]++; -+ mutex_exit(&tc->tc_lock); - -@@ -321,3 +326,4 @@ txg_rele_to_quiesce(txg_handle_t *th) - -- mutex_exit(&tc->tc_lock); -+ ASSERT(!MUTEX_HELD(&tc->tc_lock)); -+ mutex_exit(&tc->tc_open_lock); - } -@@ -350,2 +356,8 @@ txg_rele_to_sync(txg_handle_t *th) - -+/* -+ * Blocks until all transactions in the group are committed. -+ * -+ * On return, the transaction group has reached a stable state in which it can -+ * then be passed off to the syncing context. -+ */ - static void -@@ -353,4 +365,2 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - { -- hrtime_t start; -- txg_history_t *th; - tx_state_t *tx = &dp->dp_tx; -@@ -360,6 +370,6 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - /* -- * Grab all tx_cpu locks so nobody else can get into this txg. -+ * Grab all tc_open_locks so nobody else can get into this txg. 
- */ - for (c = 0; c < max_ncpus; c++) -- mutex_enter(&tx->tx_cpu[c].tc_lock); -+ mutex_enter(&tx->tx_cpu[c].tc_open_lock); - -@@ -367,2 +377,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - tx->tx_open_txg++; -+ tx->tx_open_time = gethrtime(); -+ -+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx->tx_open_time); -+ spa_txg_history_add(dp->dp_spa, tx->tx_open_txg, tx->tx_open_time); -+ -+ DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); -+ DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); - -@@ -373,12 +390,3 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - for (c = 0; c < max_ncpus; c++) -- mutex_exit(&tx->tx_cpu[c].tc_lock); -- -- /* -- * Measure how long the txg was open and replace the kstat. -- */ -- th = dsl_pool_txg_history_get(dp, txg); -- th->th_kstat.open_time = gethrtime() - th->th_kstat.birth; -- th->th_kstat.state = TXG_STATE_QUIESCING; -- dsl_pool_txg_history_put(th); -- dsl_pool_txg_history_add(dp, tx->tx_open_txg); -+ mutex_exit(&tx->tx_cpu[c].tc_open_lock); - -@@ -387,4 +395,2 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - */ -- start = gethrtime(); -- - for (c = 0; c < max_ncpus; c++) { -@@ -397,8 +403,3 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - -- /* -- * Measure how long the txg took to quiesce. -- */ -- th = dsl_pool_txg_history_get(dp, txg); -- th->th_kstat.quiesce_time = gethrtime() - start; -- dsl_pool_txg_history_put(th); -+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime()); - } -@@ -417,2 +418,5 @@ txg_do_callbacks(list_t *cb_list) - * Dispatch the commit callbacks registered on this txg to worker threads. -+ * -+ * If no callbacks are registered for a given TXG, nothing happens. -+ * This function creates a taskq for the associated pool, if needed. - */ -@@ -427,3 +431,6 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) - tx_cpu_t *tc = &tx->tx_cpu[c]; -- /* No need to lock tx_cpu_t at this point */ -+ /* -+ * No need to lock tx_cpu_t at this point, since this can -+ * only be called once a txg has been synced. 
-+ */ - -@@ -474,2 +481,3 @@ txg_sync_thread(dsl_pool_t *dp) - callb_cpr_t cpr; -+ vdev_stat_t *vs1, *vs2; - uint64_t start, delta; -@@ -487,8 +495,10 @@ txg_sync_thread(dsl_pool_t *dp) - -+ vs1 = kmem_alloc(sizeof (vdev_stat_t), KM_PUSHPAGE); -+ vs2 = kmem_alloc(sizeof (vdev_stat_t), KM_PUSHPAGE); -+ - start = delta = 0; - for (;;) { -- hrtime_t hrstart; -- txg_history_t *th; - uint64_t timer, timeout; - uint64_t txg; -+ uint64_t ndirty; - -@@ -505,3 +515,4 @@ txg_sync_thread(dsl_pool_t *dp) - tx->tx_synced_txg >= tx->tx_sync_txg_waiting && -- tx->tx_quiesced_txg == 0) { -+ tx->tx_quiesced_txg == 0 && -+ dp->dp_dirty_total < zfs_dirty_data_sync) { - dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", -@@ -524,4 +535,9 @@ txg_sync_thread(dsl_pool_t *dp) - -- if (tx->tx_exiting) -+ if (tx->tx_exiting) { -+ kmem_free(vs2, sizeof (vdev_stat_t)); -+ kmem_free(vs1, sizeof (vdev_stat_t)); - txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); -+ } -+ -+ vdev_get_stats(spa->spa_root_vdev, vs1); - -@@ -535,9 +551,5 @@ txg_sync_thread(dsl_pool_t *dp) - tx->tx_syncing_txg = txg; -+ DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_quiesce_more_cv); - -- th = dsl_pool_txg_history_get(dp, txg); -- th->th_kstat.state = TXG_STATE_SYNCING; -- vdev_get_stats(spa->spa_root_vdev, &th->th_vs1); -- dsl_pool_txg_history_put(th); -- - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", -@@ -546,4 +558,7 @@ txg_sync_thread(dsl_pool_t *dp) - -+ spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, -+ gethrtime()); -+ ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; -+ - start = ddi_get_lbolt(); -- hrstart = gethrtime(); - spa_sync(spa, txg); -@@ -554,2 +569,3 @@ txg_sync_thread(dsl_pool_t *dp) - tx->tx_syncing_txg = 0; -+ DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_done_cv); -@@ -561,18 +577,10 @@ txg_sync_thread(dsl_pool_t *dp) - -- /* -- * Measure the txg sync time determine the amount of I/O done. -- */ -- th = dsl_pool_txg_history_get(dp, txg); -- vdev_get_stats(spa->spa_root_vdev, &th->th_vs2); -- th->th_kstat.sync_time = gethrtime() - hrstart; -- th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] - -- th->th_vs1.vs_bytes[ZIO_TYPE_READ]; -- th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] - -- th->th_vs1.vs_bytes[ZIO_TYPE_WRITE]; -- th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] - -- th->th_vs1.vs_ops[ZIO_TYPE_READ]; -- th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] - -- th->th_vs1.vs_ops[ZIO_TYPE_WRITE]; -- th->th_kstat.state = TXG_STATE_COMMITTED; -- dsl_pool_txg_history_put(th); -+ vdev_get_stats(spa->spa_root_vdev, vs2); -+ spa_txg_history_set_io(spa, txg, -+ vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ], -+ vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE], -+ vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ], -+ vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE], -+ ndirty); -+ spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime()); - } -@@ -619,2 +627,3 @@ txg_quiesce_thread(dsl_pool_t *dp) - tx->tx_quiesced_txg = txg; -+ DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_more_cv); -@@ -625,13 +634,13 @@ txg_quiesce_thread(dsl_pool_t *dp) - /* -- * Delay this thread by 'ticks' if we are still in the open transaction -- * group and there is already a waiting txg quiesing or quiesced. Abort -- * the delay if this txg stalls or enters the quiesing state. 
-+ * Delay this thread by delay nanoseconds if we are still in the open -+ * transaction group and there is already a waiting txg quiesing or quiesced. -+ * Abort the delay if this txg stalls or enters the quiesing state. - */ - void --txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) -+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) - { - tx_state_t *tx = &dp->dp_tx; -- clock_t timeout = ddi_get_lbolt() + ticks; -+ hrtime_t start = gethrtime(); - -- /* don't delay if this txg could transition to quiesing immediately */ -+ /* don't delay if this txg could transition to quiescing immediately */ - if (tx->tx_open_txg > txg || -@@ -646,6 +655,7 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) - -- while (ddi_get_lbolt() < timeout && -- tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) -- (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, -- timeout); -+ while (gethrtime() - start < delay && -+ tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { -+ (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, -+ &tx->tx_sync_lock, delay, resolution, 0); -+ } - -@@ -661,2 +671,4 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg) - -+ ASSERT(!dsl_pool_config_held(dp)); -+ - mutex_enter(&tx->tx_sync_lock); -@@ -684,2 +696,4 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) - -+ ASSERT(!dsl_pool_config_held(dp)); -+ - mutex_enter(&tx->tx_sync_lock); -@@ -699,2 +713,24 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) - -+/* -+ * If there isn't a txg syncing or in the pipeline, push another txg through -+ * the pipeline by queiscing the open txg. -+ */ -+void -+txg_kick(dsl_pool_t *dp) -+{ -+ tx_state_t *tx = &dp->dp_tx; -+ -+ ASSERT(!dsl_pool_config_held(dp)); -+ -+ mutex_enter(&tx->tx_sync_lock); -+ if (tx->tx_syncing_txg == 0 && -+ tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && -+ tx->tx_sync_txg_waiting <= tx->tx_synced_txg && -+ tx->tx_quiesced_txg <= tx->tx_synced_txg) { -+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; -+ cv_broadcast(&tx->tx_quiesce_more_cv); -+ } -+ mutex_exit(&tx->tx_sync_lock); -+} -+ - boolean_t -@@ -749,6 +785,6 @@ txg_list_empty(txg_list_t *tl, uint64_t txg) - /* -- * Add an entry to the list. -- * Returns 0 if it's a new entry, 1 if it's already there. -+ * Add an entry to the list (unless it's already on the list). -+ * Returns B_TRUE if it was actually added. - */ --int -+boolean_t - txg_list_add(txg_list_t *tl, void *p, uint64_t txg) -@@ -757,7 +793,7 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg) - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); -- int already_on_list; -+ boolean_t add; - - mutex_enter(&tl->tl_lock); -- already_on_list = tn->tn_member[t]; -- if (!already_on_list) { -+ add = (tn->tn_member[t] == 0); -+ if (add) { - tn->tn_member[t] = 1; -@@ -768,3 +804,3 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg) - -- return (already_on_list); -+ return (add); - } -@@ -772,6 +808,7 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg) - /* -- * Add an entry to the end of the list (walks list to find end). -- * Returns 0 if it's a new entry, 1 if it's already there. -+ * Add an entry to the end of the list, unless it's already on the list. -+ * (walks list to find end) -+ * Returns B_TRUE if it was actually added. 
- */ --int -+boolean_t - txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) -@@ -780,7 +817,7 @@ txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); -- int already_on_list; -+ boolean_t add; - - mutex_enter(&tl->tl_lock); -- already_on_list = tn->tn_member[t]; -- if (!already_on_list) { -+ add = (tn->tn_member[t] == 0); -+ if (add) { - txg_node_t **tp; -@@ -796,3 +833,3 @@ txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) - -- return (already_on_list); -+ return (add); - } -@@ -847,3 +884,3 @@ txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) - --int -+boolean_t - txg_list_member(txg_list_t *tl, void *p, uint64_t txg) -@@ -853,3 +890,3 @@ txg_list_member(txg_list_t *tl, void *p, uint64_t txg) - -- return (tn->tn_member[t]); -+ return (tn->tn_member[t] != 0); - } -diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c -index 692cda1..a07dc00 100644 ---- a/module/zfs/uberblock.c -+++ b/module/zfs/uberblock.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -35,3 +36,3 @@ uberblock_verify(uberblock_t *ub) - if (ub->ub_magic != UBERBLOCK_MAGIC) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c -index 662a877..7751683 100644 ---- a/module/zfs/vdev.c -+++ b/module/zfs/vdev.c -@@ -24,3 +24,3 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -357,6 +357,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if ((ops = vdev_getops(type)) == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -371,15 +371,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - label_id != id) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_SPARE) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_L2CACHE) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -390,3 +390,3 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -398,6 +398,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (islog && spa_version(spa) < SPA_VERSION_SLOGS) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - - if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -411,3 +411,3 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - /* -@@ -418,6 +418,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, 
nvlist_t *nv, vdev_t *parent, uint_t id, - spa_version(spa) < SPA_VERSION_RAIDZ2) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } else { -@@ -428,3 +428,3 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - /* -@@ -528,4 +528,4 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - -- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING, -- &vd->vdev_resilvering); -+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, -+ &vd->vdev_resilver_txg); - -@@ -951,3 +951,3 @@ vdev_probe_done(zio_t *zio) - spa, vd, NULL, 0, 0); -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - } -@@ -961,3 +961,3 @@ vdev_probe_done(zio_t *zio) - if (!vdev_accessible(vd, pio)) -- pio->io_error = ENXIO; -+ pio->io_error = SET_ERROR(ENXIO); - -@@ -968,5 +968,7 @@ vdev_probe_done(zio_t *zio) - /* -- * Determine whether this device is accessible by reading and writing -- * to several known locations: the pad regions of each vdev label -- * but the first (which we leave alone in case it contains a VTOC). -+ * Determine whether this device is accessible. -+ * -+ * Read and write to several known locations: the pad regions of each -+ * vdev label but the first, which we leave alone in case it contains -+ * a VTOC. - */ -@@ -1154,3 +1156,3 @@ vdev_open(vdev_t *vd) - vd->vdev_label_aux); -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } else if (vd->vdev_offline) { -@@ -1158,3 +1160,3 @@ vdev_open(vdev_t *vd) - vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -1193,3 +1195,3 @@ vdev_open(vdev_t *vd) - vd->vdev_label_aux); -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -1225,3 +1227,3 @@ vdev_open(vdev_t *vd) - VDEV_AUX_TOO_SMALL); -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - } -@@ -1236,3 +1238,3 @@ vdev_open(vdev_t *vd) - VDEV_AUX_TOO_SMALL); -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - } -@@ -1251,3 +1253,3 @@ vdev_open(vdev_t *vd) - VDEV_AUX_BAD_LABEL); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1338,3 +1340,3 @@ vdev_validate(vdev_t *vd, boolean_t strict) - if (vdev_validate(vd->vdev_child[c], strict) != 0) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -1424,3 +1426,3 @@ vdev_validate(vdev_t *vd, boolean_t strict) - state != POOL_STATE_ACTIVE) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -1683,2 +1685,71 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) - /* -+ * Returns the lowest txg in the DTL range. -+ */ -+static uint64_t -+vdev_dtl_min(vdev_t *vd) -+{ -+ space_seg_t *ss; -+ -+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); -+ ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0); -+ ASSERT0(vd->vdev_children); -+ -+ ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); -+ return (ss->ss_start - 1); -+} -+ -+/* -+ * Returns the highest txg in the DTL. -+ */ -+static uint64_t -+vdev_dtl_max(vdev_t *vd) -+{ -+ space_seg_t *ss; -+ -+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); -+ ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0); -+ ASSERT0(vd->vdev_children); -+ -+ ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); -+ return (ss->ss_end); -+} -+ -+/* -+ * Determine if a resilvering vdev should remove any DTL entries from -+ * its range. 
If the vdev was resilvering for the entire duration of the -+ * scan then it should excise that range from its DTLs. Otherwise, this -+ * vdev is considered partially resilvered and should leave its DTL -+ * entries intact. The comment in vdev_dtl_reassess() describes how we -+ * excise the DTLs. -+ */ -+static boolean_t -+vdev_dtl_should_excise(vdev_t *vd) -+{ -+ spa_t *spa = vd->vdev_spa; -+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; -+ -+ ASSERT0(scn->scn_phys.scn_errors); -+ ASSERT0(vd->vdev_children); -+ -+ if (vd->vdev_resilver_txg == 0 || -+ vd->vdev_dtl[DTL_MISSING].sm_space == 0) -+ return (B_TRUE); -+ -+ /* -+ * When a resilver is initiated the scan will assign the scn_max_txg -+ * value to the highest txg value that exists in all DTLs. If this -+ * device's max DTL is not part of this scan (i.e. it is not in -+ * the range (scn_min_txg, scn_max_txg] then it is not eligible -+ * for excision. -+ */ -+ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { -+ ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); -+ ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); -+ ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); -+ return (B_TRUE); -+ } -+ return (B_FALSE); -+} -+ -+/* - * Reassess DTLs after a config change or scrub completion. -@@ -1705,5 +1776,13 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) - mutex_enter(&vd->vdev_dtl_lock); -+ -+ /* -+ * If we've completed a scan cleanly then determine -+ * if this vdev should remove any DTLs. We only want to -+ * excise regions on vdevs that were available during -+ * the entire duration of this scan. -+ */ - if (scrub_txg != 0 && - (spa->spa_scrub_started || -- (scn && scn->scn_phys.scn_errors == 0))) { -+ (scn != NULL && scn->scn_phys.scn_errors == 0)) && -+ vdev_dtl_should_excise(vd)) { - /* -@@ -1746,2 +1825,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) - space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); -+ -+ /* -+ * If the vdev was resilvering and no longer has any -+ * DTLs then reset its resilvering flag. -+ */ -+ if (vd->vdev_resilver_txg != 0 && -+ vd->vdev_dtl[DTL_MISSING].sm_space == 0 && -+ vd->vdev_dtl[DTL_OUTAGE].sm_space == 0) -+ vd->vdev_resilver_txg = 0; -+ - mutex_exit(&vd->vdev_dtl_lock); -@@ -1922,8 +2011,5 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) - vdev_writeable(vd)) { -- space_seg_t *ss; - -- ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); -- thismin = ss->ss_start - 1; -- ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); -- thismax = ss->ss_end; -+ thismin = vdev_dtl_min(vd); -+ thismax = vdev_dtl_max(vd); - needed = B_TRUE; -@@ -2204,6 +2290,8 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) - /* -- * Online the given vdev. If 'unspare' is set, it implies two things. First, -- * any attached spare device should be detached when the device finishes -- * resilvering. Second, the online should be treated like a 'test' online case, -- * so no FMA events are generated if the device fails to open. -+ * Online the given vdev. -+ * -+ * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached -+ * spare device should be detached when the device finishes resilvering. -+ * Second, the online should be treated like a 'test' online case, so no FMA -+ * events are generated if the device fails to open. 
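At its core, vdev_dtl_should_excise() above is a range check: the missing range recorded in DTL_MISSING may only be excised when the completed scan reached at least as far as the DTL's highest txg. A minimal sketch of just that check with made-up txg numbers; the real function also returns B_TRUE when vdev_resilver_txg is zero or the DTL is empty, and asserts the scan bounds:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool
    should_excise(uint64_t dtl_max, uint64_t scn_max_txg)
    {
        /* Excise only if the scan covered everything the vdev missed. */
        return (dtl_max <= scn_max_txg);
    }

    int
    main(void)
    {
        /* Scan covered txgs (100, 500]; the device missed txgs 200-400. */
        printf("fully covered: %d\n", should_excise(400, 500));

        /* The device also missed txg 600, after the scan ended: keep DTLs. */
        printf("partially covered: %d\n", should_excise(600, 500));
        return (0);
    }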
- */ -@@ -3210,3 +3298,3 @@ vdev_deadman(vdev_t *vd) - mutex_enter(&vq->vq_lock); -- if (avl_numnodes(&vq->vq_pending_tree) > 0) { -+ if (avl_numnodes(&vq->vq_active_tree) > 0) { - spa_t *spa = vd->vdev_spa; -@@ -3220,3 +3308,3 @@ vdev_deadman(vdev_t *vd) - */ -- fio = avl_first(&vq->vq_pending_tree); -+ fio = avl_first(&vq->vq_active_tree); - delta = gethrtime() - fio->io_timestamp; -diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c -index bf4ae7b..ffd50ec 100644 ---- a/module/zfs/vdev_cache.c -+++ b/module/zfs/vdev_cache.c -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ */ - -@@ -255,4 +258,4 @@ vdev_cache_read(zio_t *zio) - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); -- ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);) - zio_t *fio; -+ ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS)); - -@@ -261,6 +264,6 @@ vdev_cache_read(zio_t *zio) - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (zio->io_size > zfs_vdev_cache_max) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - -@@ -270,3 +273,3 @@ vdev_cache_read(zio_t *zio) - if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) -- return (EXDEV); -+ return (SET_ERROR(EXDEV)); - -@@ -276,6 +279,6 @@ vdev_cache_read(zio_t *zio) - -- ve_search = kmem_alloc(sizeof(vdev_cache_entry_t), KM_PUSHPAGE); -+ ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_PUSHPAGE); - ve_search->ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, ve_search, NULL); -- kmem_free(ve_search, sizeof(vdev_cache_entry_t)); -+ kmem_free(ve_search, sizeof (vdev_cache_entry_t)); - -@@ -284,3 +287,3 @@ vdev_cache_read(zio_t *zio) - mutex_exit(&vc->vc_lock); -- return (ESTALE); -+ return (SET_ERROR(ESTALE)); - } -@@ -307,3 +310,3 @@ vdev_cache_read(zio_t *zio) - mutex_exit(&vc->vc_lock); -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - } -@@ -311,3 +314,3 @@ vdev_cache_read(zio_t *zio) - fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, -- ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, -+ ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); -diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c -index 2869716..cb0cdd7 100644 ---- a/module/zfs/vdev_disk.c -+++ b/module/zfs/vdev_disk.c -@@ -25,2 +25,3 @@ - * LLNL-CODE-403049. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -48,3 +49,3 @@ typedef struct dio_request { - int dr_bio_count; /* Count of bio's */ -- struct bio *dr_bio[0]; /* Attached bio's */ -+ struct bio *dr_bio[0]; /* Attached bio's */ - } dio_request_t; -@@ -66,3 +67,3 @@ vdev_bdev_mode(int smode) - -- return mode; -+ return (mode); - } -@@ -79,3 +80,3 @@ vdev_bdev_mode(int smode) - -- return mode; -+ return (mode); - } -@@ -140,3 +141,3 @@ vdev_elevator_switch(vdev_t *v, char *elevator) - /* Leave existing scheduler when set to "none" */ -- if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4)) -+ if (strncmp(elevator, "none", 4) && (strlen(elevator) == 4) == 0) - return (0); -@@ -146,3 +147,4 @@ vdev_elevator_switch(vdev_t *v, char *elevator) - #else -- /* For pre-2.6.36 kernels elevator_change() is not available. -+ /* -+ * For pre-2.6.36 kernels elevator_change() is not available. 
- * Therefore we fall back to using a usermodehelper to echo the -@@ -151,3 +153,3 @@ vdev_elevator_switch(vdev_t *v, char *elevator) - */ --# define SET_SCHEDULER_CMD \ -+#define SET_SCHEDULER_CMD \ - "exec 0vdev_path, device, error); -+ elevator, v->vdev_path, device, error); - -@@ -208,3 +210,3 @@ vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd) - if (IS_ERR(bdev)) -- return bdev; -+ return (bdev); - -@@ -232,5 +234,5 @@ vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd) - -- return result; -+ return (result); - #else -- return ERR_PTR(-EOPNOTSUPP); -+ return (ERR_PTR(-EOPNOTSUPP)); - #endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */ -@@ -249,3 +251,3 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, - v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return EINVAL; -+ return (EINVAL); - } -@@ -262,5 +264,5 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, - -- vd = kmem_zalloc(sizeof(vdev_disk_t), KM_PUSHPAGE); -+ vd = kmem_zalloc(sizeof (vdev_disk_t), KM_PUSHPAGE); - if (vd == NULL) -- return ENOMEM; -+ return (ENOMEM); - -@@ -287,4 +289,4 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, - if (IS_ERR(bdev)) { -- kmem_free(vd, sizeof(vdev_disk_t)); -- return -PTR_ERR(bdev); -+ kmem_free(vd, sizeof (vdev_disk_t)); -+ return (-PTR_ERR(bdev)); - } -@@ -313,3 +315,3 @@ skip_open: - -- return 0; -+ return (0); - } -@@ -326,5 +328,5 @@ vdev_disk_close(vdev_t *v) - vdev_bdev_close(vd->vd_bdev, -- vdev_bdev_mode(spa_mode(v->vdev_spa))); -+ vdev_bdev_mode(spa_mode(v->vdev_spa))); - -- kmem_free(vd, sizeof(vdev_disk_t)); -+ kmem_free(vd, sizeof (vdev_disk_t)); - v->vdev_tsd = NULL; -@@ -338,4 +340,4 @@ vdev_disk_dio_alloc(int bio_count) - -- dr = kmem_zalloc(sizeof(dio_request_t) + -- sizeof(struct bio *) * bio_count, KM_PUSHPAGE); -+ dr = kmem_zalloc(sizeof (dio_request_t) + -+ sizeof (struct bio *) * bio_count, KM_PUSHPAGE); - if (dr) { -@@ -350,3 +352,3 @@ vdev_disk_dio_alloc(int bio_count) - -- return dr; -+ return (dr); - } -@@ -362,4 +364,4 @@ vdev_disk_dio_free(dio_request_t *dr) - -- kmem_free(dr, sizeof(dio_request_t) + -- sizeof(struct bio *) * dr->dr_bio_count); -+ kmem_free(dr, sizeof (dio_request_t) + -+ sizeof (struct bio *) * dr->dr_bio_count); - } -@@ -371,15 +373,15 @@ vdev_disk_dio_is_sync(dio_request_t *dr) - /* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */ -- return (dr->dr_rw & (1 << BIO_RW_SYNC)); -+ return (dr->dr_rw & (1 << BIO_RW_SYNC)); - #else --# ifdef HAVE_BIO_RW_SYNCIO -+#ifdef HAVE_BIO_RW_SYNCIO - /* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */ -- return (dr->dr_rw & (1 << BIO_RW_SYNCIO)); --# else --# ifdef HAVE_REQ_SYNC -+ return (dr->dr_rw & (1 << BIO_RW_SYNCIO)); -+#else -+#ifdef HAVE_REQ_SYNC - /* REQ_SYNC preferred interface from 2.6.36-2.6.xx */ -- return (dr->dr_rw & REQ_SYNC); --# else --# error "Unable to determine bio sync flag" --# endif /* HAVE_REQ_SYNC */ --# endif /* HAVE_BIO_RW_SYNC */ -+ return (dr->dr_rw & REQ_SYNC); -+#else -+#error "Unable to determine bio sync flag" -+#endif /* HAVE_REQ_SYNC */ -+#endif /* HAVE_BIO_RW_SYNC */ - #endif /* HAVE_BIO_RW_SYNCIO */ -@@ -418,3 +420,3 @@ vdev_disk_dio_put(dio_request_t *dr) - -- return rc; -+ return (rc); - } -@@ -432,3 +434,3 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error) - bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt, -- bio->bi_idx, bio->bi_size, bio->bi_end_io, -+ BIO_BI_IDX(bio), BIO_BI_SIZE(bio), bio->bi_end_io, - atomic_read(&bio->bi_cnt)); -@@ -436,4 
+438,4 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error) - #ifndef HAVE_2ARGS_BIO_END_IO_T -- if (bio->bi_size) -- return 1; -+ if (BIO_BI_SIZE(bio)) -+ return (1); - #endif /* HAVE_2ARGS_BIO_END_IO_T */ -@@ -441,3 +443,3 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error) - if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags)) -- error = -EIO; -+ error = (-EIO); - -@@ -460,3 +462,3 @@ bio_nr_pages(void *bio_ptr, unsigned int bio_size) - return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >> -- PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT)); -+ PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT)); - } -@@ -492,3 +494,3 @@ bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) - -- return bio_size; -+ return (bio_size); - } -@@ -497,5 +499,5 @@ static int - __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, -- size_t kbuf_size, uint64_t kbuf_offset, int flags) -+ size_t kbuf_size, uint64_t kbuf_offset, int flags) - { -- dio_request_t *dr; -+ dio_request_t *dr; - caddr_t bio_ptr; -@@ -510,3 +512,3 @@ retry: - if (dr == NULL) -- return ENOMEM; -+ return (ENOMEM); - -@@ -546,6 +548,6 @@ retry: - dr->dr_bio[i] = bio_alloc(GFP_NOIO, -- bio_nr_pages(bio_ptr, bio_size)); -+ bio_nr_pages(bio_ptr, bio_size)); - if (dr->dr_bio[i] == NULL) { - vdev_disk_dio_free(dr); -- return ENOMEM; -+ return (ENOMEM); - } -@@ -556,3 +558,3 @@ retry: - dr->dr_bio[i]->bi_bdev = bdev; -- dr->dr_bio[i]->bi_sector = bio_offset >> 9; -+ BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_rw = dr->dr_rw; -@@ -565,4 +567,4 @@ retry: - /* Advance in buffer and construct another bio if needed */ -- bio_ptr += dr->dr_bio[i]->bi_size; -- bio_offset += dr->dr_bio[i]->bi_size; -+ bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]); -+ bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); - } -@@ -593,5 +595,5 @@ retry: - -- (void)vdev_disk_dio_put(dr); -+ (void) vdev_disk_dio_put(dr); - -- return error; -+ return (error); - } -@@ -600,6 +602,6 @@ int - vdev_disk_physio(struct block_device *bdev, caddr_t kbuf, -- size_t size, uint64_t offset, int flags) -+ size_t size, uint64_t offset, int flags) - { - bio_set_flags_failfast(bdev, &flags); -- return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags); -+ return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags)); - } -@@ -632,3 +634,3 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) - if (!q) -- return ENXIO; -+ return (ENXIO); - -@@ -636,3 +638,3 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) - if (!bio) -- return ENOMEM; -+ return (ENOMEM); - -@@ -643,4 +645,5 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) - submit_bio(VDEV_WRITE_FLUSH_FUA, bio); -+ invalidate_bdev(bdev); - -- return 0; -+ return (0); - } -@@ -658,4 +661,4 @@ vdev_disk_io_start(zio_t *zio) - if (!vdev_readable(v)) { -- zio->io_error = ENXIO; -- return ZIO_PIPELINE_CONTINUE; -+ zio->io_error = SET_ERROR(ENXIO); -+ return (ZIO_PIPELINE_CONTINUE); - } -@@ -669,3 +672,3 @@ vdev_disk_io_start(zio_t *zio) - if (v->vdev_nowritecache) { -- zio->io_error = ENOTSUP; -+ zio->io_error = SET_ERROR(ENOTSUP); - break; -@@ -675,3 +678,3 @@ vdev_disk_io_start(zio_t *zio) - if (error == 0) -- return ZIO_PIPELINE_STOP; -+ return (ZIO_PIPELINE_STOP); - -@@ -684,6 +687,6 @@ vdev_disk_io_start(zio_t *zio) - default: -- zio->io_error = ENOTSUP; -+ zio->io_error = SET_ERROR(ENOTSUP); - } - -- return ZIO_PIPELINE_CONTINUE; -+ return (ZIO_PIPELINE_CONTINUE); - -@@ -698,4 +701,4 @@ vdev_disk_io_start(zio_t 
*zio) - default: -- zio->io_error = ENOTSUP; -- return ZIO_PIPELINE_CONTINUE; -+ zio->io_error = SET_ERROR(ENOTSUP); -+ return (ZIO_PIPELINE_CONTINUE); - } -@@ -703,9 +706,9 @@ vdev_disk_io_start(zio_t *zio) - error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, -- zio->io_size, zio->io_offset, flags); -+ zio->io_size, zio->io_offset, flags); - if (error) { - zio->io_error = error; -- return ZIO_PIPELINE_CONTINUE; -+ return (ZIO_PIPELINE_CONTINUE); - } - -- return ZIO_PIPELINE_STOP; -+ return (ZIO_PIPELINE_STOP); - } -@@ -721,3 +724,3 @@ vdev_disk_io_done(zio_t *zio) - if (zio->io_error == EIO) { -- vdev_t *v = zio->io_vd; -+ vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; -@@ -788,3 +791,3 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) - if (IS_ERR(bdev)) -- return -PTR_ERR(bdev); -+ return (-PTR_ERR(bdev)); - -@@ -793,10 +796,10 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) - vdev_bdev_close(bdev, vdev_bdev_mode(FREAD)); -- return EIO; -+ return (EIO); - } - -- size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t); -- label = vmem_alloc(sizeof(vdev_label_t), KM_PUSHPAGE); -+ size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); -+ label = vmem_alloc(sizeof (vdev_label_t), KM_PUSHPAGE); - - for (i = 0; i < VDEV_LABELS; i++) { -- uint64_t offset, state, txg = 0; -+ uint64_t offset, state, txg = 0; - -@@ -831,6 +834,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) - -- vmem_free(label, sizeof(vdev_label_t)); -+ vmem_free(label, sizeof (vdev_label_t)); - vdev_bdev_close(bdev, vdev_bdev_mode(FREAD)); - -- return 0; -+ return (0); - } -diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c -index 06999a8..858582a 100644 ---- a/module/zfs/vdev_file.c -+++ b/module/zfs/vdev_file.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -64,3 +64,3 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -102,3 +102,3 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; -- return (ENODEV); -+ return (SET_ERROR(ENODEV)); - } -@@ -156,3 +156,3 @@ vdev_file_io_strategy(void *arg) - if (resid != 0 && zio->io_error == 0) -- zio->io_error = ENOSPC; -+ zio->io_error = SET_ERROR(ENOSPC); - -@@ -164,3 +164,2 @@ vdev_file_io_start(zio_t *zio) - { -- spa_t *spa = zio->io_spa; - vdev_t *vd = zio->io_vd; -@@ -171,3 +170,3 @@ vdev_file_io_start(zio_t *zio) - if (!vdev_readable(vd)) { -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - return (ZIO_PIPELINE_CONTINUE); -@@ -181,3 +180,3 @@ vdev_file_io_start(zio_t *zio) - default: -- zio->io_error = ENOTSUP; -+ zio->io_error = SET_ERROR(ENOTSUP); - } -@@ -187,4 +186,4 @@ vdev_file_io_start(zio_t *zio) - -- spa_taskq_dispatch_ent(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, -- vdev_file_io_strategy, zio, 0, &zio->io_tqent); -+ VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio, -+ TQ_PUSHPAGE), !=, 0); - -diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c -index 1fe36fe..d5af110 100644 ---- a/module/zfs/vdev_label.c -+++ b/module/zfs/vdev_label.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. 
All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -220,24 +220,19 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, -- vd->vdev_ops->vdev_op_type) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); - if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) -- == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); - - if (vd->vdev_path != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, -- vd->vdev_path) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); - - if (vd->vdev_devid != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, -- vd->vdev_devid) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); - - if (vd->vdev_physpath != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, -- vd->vdev_physpath) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, -+ vd->vdev_physpath); - - if (vd->vdev_fru != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU, -- vd->vdev_fru) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - -@@ -262,4 +257,3 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - */ -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, -- vd->vdev_nparity) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } -@@ -267,10 +261,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (vd->vdev_wholedisk != -1ULL) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, -- vd->vdev_wholedisk) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, -+ vd->vdev_wholedisk); - - if (vd->vdev_not_present) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); - - if (vd->vdev_isspare) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); - -@@ -278,15 +272,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - vd == vd->vdev_top) { -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, -- vd->vdev_ms_array) == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, -- vd->vdev_ms_shift) == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, -- vd->vdev_ashift) == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, -- vd->vdev_asize) == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, -- vd->vdev_islog) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, -+ vd->vdev_ms_array); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, -+ vd->vdev_ms_shift); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, -+ vd->vdev_asize); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); - if (vd->vdev_removing) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, -- vd->vdev_removing) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, -+ vd->vdev_removing); - } -@@ -294,8 +286,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (vd->vdev_dtl_smo.smo_object != 0) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, -- vd->vdev_dtl_smo.smo_object) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, -+ vd->vdev_dtl_smo.smo_object); - - if 
(vd->vdev_crtxg) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, -- vd->vdev_crtxg) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); - -@@ -306,4 +297,4 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - vdev_get_stats(vd, &vs); -- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, -- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); -+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, -+ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); - -@@ -311,6 +302,5 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (spa_scan_get_stats(spa, &ps) == 0) { -- VERIFY(nvlist_add_uint64_array(nv, -+ fnvlist_add_uint64_array(nv, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, -- sizeof (pool_scan_stat_t) / sizeof (uint64_t)) -- == 0); -+ sizeof (pool_scan_stat_t) / sizeof (uint64_t)); - } -@@ -344,4 +334,4 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (idx) { -- VERIFY(nvlist_add_nvlist_array(nv, -- ZPOOL_CONFIG_CHILDREN, child, idx) == 0); -+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, -+ child, idx); - } -@@ -357,22 +347,16 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (vd->vdev_offline && !vd->vdev_tmpoffline) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, -- B_TRUE) == 0); -- if (vd->vdev_resilvering) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); -+ if (vd->vdev_resilver_txg != 0) -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, -+ vd->vdev_resilver_txg); - if (vd->vdev_faulted) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); - if (vd->vdev_degraded) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); - if (vd->vdev_removed) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); - if (vd->vdev_unspare) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); - if (vd->vdev_ishole) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); - -@@ -389,8 +373,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (aux != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, -- aux) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); - - if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, -- vd->vdev_orig_guid) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, -+ vd->vdev_orig_guid); - } -@@ -663,3 +646,3 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) - -- if (!vd->vdev_ops->vdev_op_leaf) -+ if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa)) - return (0); -@@ -670,3 +653,3 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) - if (vdev_is_dead(vd)) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -677,3 +660,3 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) - vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - -@@ -1037,2 +1020,3 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) - -+/* Sync the uberblocks to all vdevs in svd[] */ 
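The vdev_config_generate() hunk above trades the VERIFY(nvlist_add_*() == 0) boilerplate for the fnvlist_*() helpers, which check for success internally and return void, so the call sites shrink without changing what gets verified. A toy sketch of the same pattern with stand-in functions; the toy_* names are illustrative and are not the libnvpair API:

    #include <assert.h>
    #include <stdio.h>

    /* Stand-in for a fallible nvlist_add_uint64(): returns 0 on success. */
    static int
    toy_add_uint64(const char *name, unsigned long long val)
    {
        printf("%s=%llu\n", name, val);
        return (0);
    }

    /* Stand-in for fnvlist_add_uint64(): the check moves into the helper. */
    static void
    toy_fadd_uint64(const char *name, unsigned long long val)
    {
        int err = toy_add_uint64(name, val);

        assert(err == 0);
    }

    int
    main(void)
    {
        int err;

        /* Old style: every call wrapped in its own verification. */
        err = toy_add_uint64("guid", 42ULL);
        assert(err == 0);

        /* New style: the wrapper owns the verification. */
        toy_fadd_uint64("guid", 42ULL);
        return (0);
    }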
- int -@@ -1088,3 +1072,3 @@ vdev_label_sync_top_done(zio_t *zio) - if (*good_writes == 0) -- zio->io_error = EIO; -+ zio->io_error = SET_ERROR(EIO); - -@@ -1134,3 +1118,3 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) - -- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE) == 0) { -+ if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE)) { - for (; l < VDEV_LABELS; l += 2) { -diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c -index e0884dc..99b35f0 100644 ---- a/module/zfs/vdev_mirror.c -+++ b/module/zfs/vdev_mirror.c -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -91,13 +91,10 @@ vdev_mirror_pending(vdev_t *vd) - { -- vdev_queue_t *vq = &vd->vdev_queue; -- int pending; -- -- mutex_enter(&vq->vq_lock); -- pending = avl_numnodes(&vq->vq_pending_tree); -- mutex_exit(&vq->vq_lock); -- -- return (pending); -+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); - } - --static mirror_map_t * -+/* -+ * Avoid inlining the function to keep vdev_mirror_io_start(), which -+ * is this functions only caller, as small as possible on the stack. -+ */ -+noinline static mirror_map_t * - vdev_mirror_map_alloc(zio_t *zio) -@@ -115,3 +112,4 @@ vdev_mirror_map_alloc(zio_t *zio) - -- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); -+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), -+ KM_PUSHPAGE); - mm->mm_children = c; -@@ -145,3 +143,4 @@ vdev_mirror_map_alloc(zio_t *zio) - -- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); -+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), -+ KM_PUSHPAGE); - mm->mm_children = c; -@@ -161,3 +160,3 @@ vdev_mirror_map_alloc(zio_t *zio) - if (!vdev_readable(mc->mc_vd)) { -- mc->mc_error = ENXIO; -+ mc->mc_error = SET_ERROR(ENXIO); - mc->mc_tried = 1; -@@ -207,3 +206,3 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -304,3 +303,3 @@ vdev_mirror_child_select(zio_t *zio) - if (!vdev_readable(mc->mc_vd)) { -- mc->mc_error = ENXIO; -+ mc->mc_error = SET_ERROR(ENXIO); - mc->mc_tried = 1; /* don't even try */ -@@ -311,3 +310,3 @@ vdev_mirror_child_select(zio_t *zio) - return (c); -- mc->mc_error = ESTALE; -+ mc->mc_error = SET_ERROR(ESTALE); - mc->mc_skipped = 1; -@@ -496,3 +495,3 @@ vdev_mirror_io_done(zio_t *zio) - continue; -- mc->mc_error = ESTALE; -+ mc->mc_error = SET_ERROR(ESTALE); - } -@@ -502,3 +501,3 @@ vdev_mirror_io_done(zio_t *zio) - zio->io_data, zio->io_size, -- ZIO_TYPE_WRITE, zio->io_priority, -+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | (unexpected_errors ? -diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c -index 3bd8c90..b9eb99d 100644 ---- a/module/zfs/vdev_missing.c -+++ b/module/zfs/vdev_missing.c -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -71,3 +71,3 @@ vdev_missing_io_start(zio_t *zio) - { -- zio->io_error = ENOTSUP; -+ zio->io_error = SET_ERROR(ENOTSUP); - return (ZIO_PIPELINE_CONTINUE); -diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c -index b2cc6b8..0dc733e 100644 ---- a/module/zfs/vdev_queue.c -+++ b/module/zfs/vdev_queue.c -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. 
All rights reserved. - */ -@@ -31,25 +31,133 @@ - #include -+#include - #include - #include -+#include -+#include -+#include -+#include - - /* -- * These tunables are for performance analysis. -+ * ZFS I/O Scheduler -+ * --------------- -+ * -+ * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The -+ * I/O scheduler determines when and in what order those operations are -+ * issued. The I/O scheduler divides operations into five I/O classes -+ * prioritized in the following order: sync read, sync write, async read, -+ * async write, and scrub/resilver. Each queue defines the minimum and -+ * maximum number of concurrent operations that may be issued to the device. -+ * In addition, the device has an aggregate maximum. Note that the sum of the -+ * per-queue minimums must not exceed the aggregate maximum. If the -+ * sum of the per-queue maximums exceeds the aggregate maximum, then the -+ * number of active i/os may reach zfs_vdev_max_active, in which case no -+ * further i/os will be issued regardless of whether all per-queue -+ * minimums have been met. -+ * -+ * For many physical devices, throughput increases with the number of -+ * concurrent operations, but latency typically suffers. Further, physical -+ * devices typically have a limit at which more concurrent operations have no -+ * effect on throughput or can actually cause it to decrease. -+ * -+ * The scheduler selects the next operation to issue by first looking for an -+ * I/O class whose minimum has not been satisfied. Once all are satisfied and -+ * the aggregate maximum has not been hit, the scheduler looks for classes -+ * whose maximum has not been satisfied. Iteration through the I/O classes is -+ * done in the order specified above. No further operations are issued if the -+ * aggregate maximum number of concurrent operations has been hit or if there -+ * are no operations queued for an I/O class that has not hit its maximum. -+ * Every time an i/o is queued or an operation completes, the I/O scheduler -+ * looks for new operations to issue. -+ * -+ * All I/O classes have a fixed maximum number of outstanding operations -+ * except for the async write class. Asynchronous writes represent the data -+ * that is committed to stable storage during the syncing stage for -+ * transaction groups (see txg.c). Transaction groups enter the syncing state -+ * periodically so the number of queued async writes will quickly burst up and -+ * then bleed down to zero. Rather than servicing them as quickly as possible, -+ * the I/O scheduler changes the maximum number of active async write i/os -+ * according to the amount of dirty data in the pool (see dsl_pool.c). Since -+ * both throughput and latency typically increase with the number of -+ * concurrent operations issued to physical devices, reducing the burstiness -+ * in the number of concurrent operations also stabilizes the response time of -+ * operations from other -- and in particular synchronous -- queues. In broad -+ * strokes, the I/O scheduler will issue more concurrent operations from the -+ * async write queue as there's more dirty data in the pool. -+ * -+ * Async Writes -+ * -+ * The number of concurrent operations issued for the async write I/O class -+ * follows a piece-wise linear function defined by a few adjustable points. 
-+ * -+ * | o---------| <-- zfs_vdev_async_write_max_active -+ * ^ | /^ | -+ * | | / | | -+ * active | / | | -+ * I/O | / | | -+ * count | / | | -+ * | / | | -+ * |------------o | | <-- zfs_vdev_async_write_min_active -+ * 0|____________^______|_________| -+ * 0% | | 100% of zfs_dirty_data_max -+ * | | -+ * | `-- zfs_vdev_async_write_active_max_dirty_percent -+ * `--------- zfs_vdev_async_write_active_min_dirty_percent -+ * -+ * Until the amount of dirty data exceeds a minimum percentage of the dirty -+ * data allowed in the pool, the I/O scheduler will limit the number of -+ * concurrent operations to the minimum. As that threshold is crossed, the -+ * number of concurrent operations issued increases linearly to the maximum at -+ * the specified maximum percentage of the dirty data allowed in the pool. -+ * -+ * Ideally, the amount of dirty data on a busy pool will stay in the sloped -+ * part of the function between zfs_vdev_async_write_active_min_dirty_percent -+ * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the -+ * maximum percentage, this indicates that the rate of incoming data is -+ * greater than the rate that the backend storage can handle. In this case, we -+ * must further throttle incoming writes (see dmu_tx_delay() for details). - */ -+ - /* -- * zfs_vdev_max_pending is the maximum number of i/os concurrently -- * pending to each device. zfs_vdev_min_pending is the initial number -- * of i/os pending to each device (before it starts ramping up to -- * max_pending). -+ * The maximum number of i/os active to each device. Ideally, this will be >= -+ * the sum of each queue's max_active. It must be at least the sum of each -+ * queue's min_active. - */ --int zfs_vdev_max_pending = 10; --int zfs_vdev_min_pending = 4; -+uint32_t zfs_vdev_max_active = 1000; - - /* -- * The deadlines are grouped into buckets based on zfs_vdev_time_shift: -- * deadline = pri + gethrtime() >> time_shift) -+ * Per-queue limits on the number of i/os active to each device. If the -+ * number of active i/os is < zfs_vdev_max_active, then the min_active comes -+ * into play. We will send min_active from each queue, and then select from -+ * queues in the order defined by zio_priority_t. -+ * -+ * In general, smaller max_active's will lead to lower latency of synchronous -+ * operations. Larger max_active's may lead to higher overall throughput, -+ * depending on underlying storage. -+ * -+ * The ratio of the queues' max_actives determines the balance of performance -+ * between reads, writes, and scrubs. E.g., increasing -+ * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete -+ * more quickly, but reads and writes to have higher latency and lower -+ * throughput. - */ --int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */ -+uint32_t zfs_vdev_sync_read_min_active = 10; -+uint32_t zfs_vdev_sync_read_max_active = 10; -+uint32_t zfs_vdev_sync_write_min_active = 10; -+uint32_t zfs_vdev_sync_write_max_active = 10; -+uint32_t zfs_vdev_async_read_min_active = 1; -+uint32_t zfs_vdev_async_read_max_active = 3; -+uint32_t zfs_vdev_async_write_min_active = 1; -+uint32_t zfs_vdev_async_write_max_active = 10; -+uint32_t zfs_vdev_scrub_min_active = 1; -+uint32_t zfs_vdev_scrub_max_active = 2; - --/* exponential I/O issue ramp-up rate */ --int zfs_vdev_ramp_rate = 2; -+/* -+ * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent -+ * dirty data, use zfs_vdev_async_write_min_active. 
When it has more than -+ * zfs_vdev_async_write_active_max_dirty_percent, use -+ * zfs_vdev_async_write_max_active. The value is linearly interpolated -+ * between min and max. -+ */ -+int zfs_vdev_async_write_active_min_dirty_percent = 30; -+int zfs_vdev_async_write_active_max_dirty_percent = 60; - -@@ -65,7 +173,4 @@ int zfs_vdev_write_gap_limit = 4 << 10; - --/* -- * Virtual device vector for disk I/O scheduling. -- */ - int --vdev_queue_deadline_compare(const void *x1, const void *x2) -+vdev_queue_offset_compare(const void *x1, const void *x2) - { -@@ -74,7 +179,2 @@ vdev_queue_deadline_compare(const void *x1, const void *x2) - -- if (z1->io_deadline < z2->io_deadline) -- return (-1); -- if (z1->io_deadline > z2->io_deadline) -- return (1); -- - if (z1->io_offset < z2->io_offset) -@@ -93,3 +193,3 @@ vdev_queue_deadline_compare(const void *x1, const void *x2) - int --vdev_queue_offset_compare(const void *x1, const void *x2) -+vdev_queue_timestamp_compare(const void *x1, const void *x2) - { -@@ -98,5 +198,5 @@ vdev_queue_offset_compare(const void *x1, const void *x2) - -- if (z1->io_offset < z2->io_offset) -+ if (z1->io_timestamp < z2->io_timestamp) - return (-1); -- if (z1->io_offset > z2->io_offset) -+ if (z1->io_timestamp > z2->io_timestamp) - return (1); -@@ -111,2 +211,109 @@ vdev_queue_offset_compare(const void *x1, const void *x2) - -+static int -+vdev_queue_class_min_active(zio_priority_t p) -+{ -+ switch (p) { -+ case ZIO_PRIORITY_SYNC_READ: -+ return (zfs_vdev_sync_read_min_active); -+ case ZIO_PRIORITY_SYNC_WRITE: -+ return (zfs_vdev_sync_write_min_active); -+ case ZIO_PRIORITY_ASYNC_READ: -+ return (zfs_vdev_async_read_min_active); -+ case ZIO_PRIORITY_ASYNC_WRITE: -+ return (zfs_vdev_async_write_min_active); -+ case ZIO_PRIORITY_SCRUB: -+ return (zfs_vdev_scrub_min_active); -+ default: -+ panic("invalid priority %u", p); -+ return (0); -+ } -+} -+ -+static int -+vdev_queue_max_async_writes(uint64_t dirty) -+{ -+ int writes; -+ uint64_t min_bytes = zfs_dirty_data_max * -+ zfs_vdev_async_write_active_min_dirty_percent / 100; -+ uint64_t max_bytes = zfs_dirty_data_max * -+ zfs_vdev_async_write_active_max_dirty_percent / 100; -+ -+ if (dirty < min_bytes) -+ return (zfs_vdev_async_write_min_active); -+ if (dirty > max_bytes) -+ return (zfs_vdev_async_write_max_active); -+ -+ /* -+ * linear interpolation: -+ * slope = (max_writes - min_writes) / (max_bytes - min_bytes) -+ * move right by min_bytes -+ * move up by min_writes -+ */ -+ writes = (dirty - min_bytes) * -+ (zfs_vdev_async_write_max_active - -+ zfs_vdev_async_write_min_active) / -+ (max_bytes - min_bytes) + -+ zfs_vdev_async_write_min_active; -+ ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); -+ ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); -+ return (writes); -+} -+ -+static int -+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) -+{ -+ switch (p) { -+ case ZIO_PRIORITY_SYNC_READ: -+ return (zfs_vdev_sync_read_max_active); -+ case ZIO_PRIORITY_SYNC_WRITE: -+ return (zfs_vdev_sync_write_max_active); -+ case ZIO_PRIORITY_ASYNC_READ: -+ return (zfs_vdev_async_read_max_active); -+ case ZIO_PRIORITY_ASYNC_WRITE: -+ return (vdev_queue_max_async_writes( -+ spa->spa_dsl_pool->dp_dirty_total)); -+ case ZIO_PRIORITY_SCRUB: -+ return (zfs_vdev_scrub_max_active); -+ default: -+ panic("invalid priority %u", p); -+ return (0); -+ } -+} -+ -+/* -+ * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if -+ * there is no eligible class. 
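To make the piece-wise linear function described above concrete, here is a user-space transcription of the vdev_queue_max_async_writes() interpolation using the default tunables from this patch (1 to 10 active async writes between 30% and 60% of the dirty limit); the 1 GiB zfs_dirty_data_max stand-in is an arbitrary figure chosen only so the loop has something to print:

    #include <stdint.h>
    #include <stdio.h>

    static const uint64_t dirty_data_max = 1ULL << 30;    /* arbitrary 1 GiB */
    static const int min_active = 1, max_active = 10;
    static const int min_dirty_pct = 30, max_dirty_pct = 60;

    static int
    max_async_writes(uint64_t dirty)
    {
        uint64_t min_bytes = dirty_data_max * min_dirty_pct / 100;
        uint64_t max_bytes = dirty_data_max * max_dirty_pct / 100;
        uint64_t writes;

        if (dirty < min_bytes)
            return (min_active);
        if (dirty > max_bytes)
            return (max_active);

        /* Linear interpolation between the two break points. */
        writes = (dirty - min_bytes) * (max_active - min_active) /
            (max_bytes - min_bytes) + min_active;
        return ((int)writes);
    }

    int
    main(void)
    {
        int pct;

        for (pct = 0; pct <= 100; pct += 15)
            printf("%3d%% dirty -> %2d async writes\n", pct,
                max_async_writes(dirty_data_max * pct / 100));
        return (0);
    }

With these defaults a pool sitting at 45% of its dirty limit is allowed 5 concurrent async writes, while anything at or below 30% stays at the floor of 1.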
-+ */ -+static zio_priority_t -+vdev_queue_class_to_issue(vdev_queue_t *vq) -+{ -+ spa_t *spa = vq->vq_vdev->vdev_spa; -+ zio_priority_t p; -+ -+ if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) -+ return (ZIO_PRIORITY_NUM_QUEUEABLE); -+ -+ /* find a queue that has not reached its minimum # outstanding i/os */ -+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { -+ if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 && -+ vq->vq_class[p].vqc_active < -+ vdev_queue_class_min_active(p)) -+ return (p); -+ } -+ -+ /* -+ * If we haven't found a queue, look for one that hasn't reached its -+ * maximum # outstanding i/os. -+ */ -+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { -+ if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 && -+ vq->vq_class[p].vqc_active < -+ vdev_queue_class_max_active(spa, p)) -+ return (p); -+ } -+ -+ /* No eligible queued i/os */ -+ return (ZIO_PRIORITY_NUM_QUEUEABLE); -+} -+ - void -@@ -115,2 +322,4 @@ vdev_queue_init(vdev_t *vd) - vdev_queue_t *vq = &vd->vdev_queue; -+ int max_active_sum; -+ zio_priority_t p; - int i; -@@ -118,14 +327,21 @@ vdev_queue_init(vdev_t *vd) - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); -+ vq->vq_vdev = vd; - -- avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, -- sizeof (zio_t), offsetof(struct zio, io_deadline_node)); -- -- avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, -- sizeof (zio_t), offsetof(struct zio, io_offset_node)); -- -- avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, -- sizeof (zio_t), offsetof(struct zio, io_offset_node)); -+ avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, -+ sizeof (zio_t), offsetof(struct zio, io_queue_node)); - -- avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, -- sizeof (zio_t), offsetof(struct zio, io_offset_node)); -+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { -+ /* -+ * The synchronous i/o queues are FIFO rather than LBA ordered. -+ * This provides more consistent latency for these i/os, and -+ * they tend to not be tightly clustered anyway so there is -+ * little to no throughput loss. -+ */ -+ boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ || -+ p == ZIO_PRIORITY_SYNC_WRITE); -+ avl_create(&vq->vq_class[p].vqc_queued_tree, -+ fifo ? 
vdev_queue_timestamp_compare : -+ vdev_queue_offset_compare, -+ sizeof (zio_t), offsetof(struct zio, io_queue_node)); -+ } - -@@ -138,3 +354,6 @@ vdev_queue_init(vdev_t *vd) - -- for (i = 0; i < zfs_vdev_max_pending; i++) -+ max_active_sum = zfs_vdev_sync_read_max_active + -+ zfs_vdev_sync_write_max_active + zfs_vdev_async_read_max_active + -+ zfs_vdev_async_write_max_active + zfs_vdev_scrub_max_active; -+ for (i = 0; i < max_active_sum; i++) - list_insert_tail(&vq->vq_io_list, zio_vdev_alloc()); -@@ -147,7 +366,7 @@ vdev_queue_fini(vdev_t *vd) - vdev_io_t *vi; -+ zio_priority_t p; - -- avl_destroy(&vq->vq_deadline_tree); -- avl_destroy(&vq->vq_read_tree); -- avl_destroy(&vq->vq_write_tree); -- avl_destroy(&vq->vq_pending_tree); -+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) -+ avl_destroy(&vq->vq_class[p].vqc_queued_tree); -+ avl_destroy(&vq->vq_active_tree); - -@@ -166,4 +385,13 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) - { -- avl_add(&vq->vq_deadline_tree, zio); -- avl_add(zio->io_vdev_tree, zio); -+ spa_t *spa = zio->io_spa; -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); -+ avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); -+ -+ if (ssh->kstat != NULL) { -+ mutex_enter(&ssh->lock); -+ kstat_waitq_enter(ssh->kstat->ks_data); -+ mutex_exit(&ssh->lock); -+ } - } -@@ -173,4 +401,58 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) - { -- avl_remove(&vq->vq_deadline_tree, zio); -- avl_remove(zio->io_vdev_tree, zio); -+ spa_t *spa = zio->io_spa; -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); -+ avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); -+ -+ if (ssh->kstat != NULL) { -+ mutex_enter(&ssh->lock); -+ kstat_waitq_exit(ssh->kstat->ks_data); -+ mutex_exit(&ssh->lock); -+ } -+} -+ -+static void -+vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) -+{ -+ spa_t *spa = zio->io_spa; -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ ASSERT(MUTEX_HELD(&vq->vq_lock)); -+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); -+ vq->vq_class[zio->io_priority].vqc_active++; -+ avl_add(&vq->vq_active_tree, zio); -+ -+ if (ssh->kstat != NULL) { -+ mutex_enter(&ssh->lock); -+ kstat_runq_enter(ssh->kstat->ks_data); -+ mutex_exit(&ssh->lock); -+ } -+} -+ -+static void -+vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) -+{ -+ spa_t *spa = zio->io_spa; -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ ASSERT(MUTEX_HELD(&vq->vq_lock)); -+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); -+ vq->vq_class[zio->io_priority].vqc_active--; -+ avl_remove(&vq->vq_active_tree, zio); -+ -+ if (ssh->kstat != NULL) { -+ kstat_io_t *ksio = ssh->kstat->ks_data; -+ -+ mutex_enter(&ssh->lock); -+ kstat_runq_exit(ksio); -+ if (zio->io_type == ZIO_TYPE_READ) { -+ ksio->reads++; -+ ksio->nread += zio->io_size; -+ } else if (zio->io_type == ZIO_TYPE_WRITE) { -+ ksio->writes++; -+ ksio->nwritten += zio->io_size; -+ } -+ mutex_exit(&ssh->lock); -+ } - } -@@ -182,8 +464,10 @@ vdev_queue_agg_io_done(zio_t *aio) - vdev_io_t *vi = aio->io_data; -- zio_t *pio; - -- while ((pio = zio_walk_parents(aio)) != NULL) -- if (aio->io_type == ZIO_TYPE_READ) -+ if (aio->io_type == ZIO_TYPE_READ) { -+ zio_t *pio; -+ while ((pio = zio_walk_parents(aio)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); -+ } -+ } - -@@ -204,24 +488,36 @@ 
vdev_queue_agg_io_done(zio_t *aio) - static zio_t * --vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) -+vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) - { -- zio_t *fio, *lio, *aio, *dio, *nio, *mio; -- avl_tree_t *t; - vdev_io_t *vi; -- int flags; -- uint64_t maxspan = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE); -- uint64_t maxgap; -- int stretch; -+ zio_t *first, *last, *aio, *dio, *mandatory, *nio; -+ uint64_t maxgap = 0; -+ uint64_t size; -+ boolean_t stretch = B_FALSE; -+ vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority]; -+ avl_tree_t *t = &vqc->vqc_queued_tree; -+ enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; -+ -+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) -+ return (NULL); - --again: -- ASSERT(MUTEX_HELD(&vq->vq_lock)); -+ /* -+ * Prevent users from setting the zfs_vdev_aggregation_limit -+ * tuning larger than SPA_MAXBLOCKSIZE. -+ */ -+ zfs_vdev_aggregation_limit = -+ MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE); - -- if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || -- avl_numnodes(&vq->vq_deadline_tree) == 0) -+ /* -+ * The synchronous i/o queues are not sorted by LBA, so we can't -+ * find adjacent i/os. These i/os tend to not be tightly clustered, -+ * or too large to aggregate, so this has little impact on performance. -+ */ -+ if (zio->io_priority == ZIO_PRIORITY_SYNC_READ || -+ zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) - return (NULL); - -- fio = lio = avl_first(&vq->vq_deadline_tree); -+ first = last = zio; - -- t = fio->io_vdev_tree; -- flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; -- maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; -+ if (zio->io_type == ZIO_TYPE_READ) -+ maxgap = zfs_vdev_read_gap_limit; - -@@ -233,85 +529,83 @@ again: - -- if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { -- /* -- * We can aggregate I/Os that are sufficiently adjacent and of -- * the same flavor, as expressed by the AGG_INHERIT flags. -- * The latter requirement is necessary so that certain -- * attributes of the I/O, such as whether it's a normal I/O -- * or a scrub/resilver, can be preserved in the aggregate. -- * We can include optional I/Os, but don't allow them -- * to begin a range as they add no benefit in that situation. -- */ -+ /* -+ * We can aggregate I/Os that are sufficiently adjacent and of -+ * the same flavor, as expressed by the AGG_INHERIT flags. -+ * The latter requirement is necessary so that certain -+ * attributes of the I/O, such as whether it's a normal I/O -+ * or a scrub/resilver, can be preserved in the aggregate. -+ * We can include optional I/Os, but don't allow them -+ * to begin a range as they add no benefit in that situation. -+ */ - -- /* -- * We keep track of the last non-optional I/O. -- */ -- mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; -+ /* -+ * We keep track of the last non-optional I/O. -+ */ -+ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; - -- /* -- * Walk backwards through sufficiently contiguous I/Os -- * recording the last non-option I/O. -- */ -- while ((dio = AVL_PREV(t, fio)) != NULL && -- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && -- IO_SPAN(dio, lio) <= maxspan && -- IO_GAP(dio, fio) <= maxgap) { -- fio = dio; -- if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) -- mio = fio; -- } -+ /* -+ * Walk backwards through sufficiently contiguous I/Os -+ * recording the last non-option I/O. 
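The backward and forward walks above hinge on two limits: the total span of the candidate aggregate (zfs_vdev_aggregation_limit) and the gap between neighbouring I/Os (zfs_vdev_read_gap_limit or zfs_vdev_write_gap_limit). A toy model of that contiguity test over a sorted offset list, with arbitrary example numbers; the real code additionally matches the AGG_INHERIT flags and handles optional I/Os as described above:

    #include <stdint.h>
    #include <stdio.h>

    struct toy_io {
        uint64_t offset;
        uint64_t size;
    };

    /* Fold in LBA-adjacent I/Os while the span and gap limits both hold. */
    static int
    aggregate_span(const struct toy_io *ios, int n,
        uint64_t agg_limit, uint64_t gap_limit)
    {
        int last = 0;

        while (last + 1 < n) {
            const struct toy_io *next = &ios[last + 1];
            uint64_t span = next->offset + next->size - ios[0].offset;
            uint64_t gap = next->offset -
                (ios[last].offset + ios[last].size);

            if (span > agg_limit || gap > gap_limit)
                break;
            last++;
        }
        return (last);      /* index of the last I/O folded in */
    }

    int
    main(void)
    {
        struct toy_io ios[] = {
            { 0, 4096 }, { 4096, 4096 }, { 8192, 8192 }, { 65536, 4096 },
        };

        /* Stops at index 2: the 48 KiB gap to the last I/O is too large. */
        printf("aggregated through index %d\n",
            aggregate_span(ios, 4, 131072, 4096));
        return (0);
    }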
-+ */ -+ while ((dio = AVL_PREV(t, first)) != NULL && -+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && -+ IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && -+ IO_GAP(dio, first) <= maxgap) { -+ first = dio; -+ if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) -+ mandatory = first; -+ } - -- /* -- * Skip any initial optional I/Os. -- */ -- while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { -- fio = AVL_NEXT(t, fio); -- ASSERT(fio != NULL); -- } -+ /* -+ * Skip any initial optional I/Os. -+ */ -+ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { -+ first = AVL_NEXT(t, first); -+ ASSERT(first != NULL); -+ } - -- /* -- * Walk forward through sufficiently contiguous I/Os. -- */ -- while ((dio = AVL_NEXT(t, lio)) != NULL && -- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && -- IO_SPAN(fio, dio) <= maxspan && -- IO_GAP(lio, dio) <= maxgap) { -- lio = dio; -- if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) -- mio = lio; -- } - -- /* -- * Now that we've established the range of the I/O aggregation -- * we must decide what to do with trailing optional I/Os. -- * For reads, there's nothing to do. While we are unable to -- * aggregate further, it's possible that a trailing optional -- * I/O would allow the underlying device to aggregate with -- * subsequent I/Os. We must therefore determine if the next -- * non-optional I/O is close enough to make aggregation -- * worthwhile. -- */ -- stretch = B_FALSE; -- if (t != &vq->vq_read_tree && mio != NULL) { -- nio = lio; -- while ((dio = AVL_NEXT(t, nio)) != NULL && -- IO_GAP(nio, dio) == 0 && -- IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { -- nio = dio; -- if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { -- stretch = B_TRUE; -- break; -- } -+ /* -+ * Walk forward through sufficiently contiguous I/Os. -+ */ -+ while ((dio = AVL_NEXT(t, last)) != NULL && -+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && -+ IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit && -+ IO_GAP(last, dio) <= maxgap) { -+ last = dio; -+ if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) -+ mandatory = last; -+ } -+ -+ /* -+ * Now that we've established the range of the I/O aggregation -+ * we must decide what to do with trailing optional I/Os. -+ * For reads, there's nothing to do. While we are unable to -+ * aggregate further, it's possible that a trailing optional -+ * I/O would allow the underlying device to aggregate with -+ * subsequent I/Os. We must therefore determine if the next -+ * non-optional I/O is close enough to make aggregation -+ * worthwhile. -+ */ -+ if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { -+ zio_t *nio = last; -+ while ((dio = AVL_NEXT(t, nio)) != NULL && -+ IO_GAP(nio, dio) == 0 && -+ IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { -+ nio = dio; -+ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { -+ stretch = B_TRUE; -+ break; - } - } -+ } - -- if (stretch) { -- /* This may be a no-op. */ -- VERIFY((dio = AVL_NEXT(t, lio)) != NULL); -- dio->io_flags &= ~ZIO_FLAG_OPTIONAL; -- } else { -- while (lio != mio && lio != fio) { -- ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); -- lio = AVL_PREV(t, lio); -- ASSERT(lio != NULL); -- } -+ if (stretch) { -+ /* This may be a no-op. 
*/ -+ dio = AVL_NEXT(t, last); -+ dio->io_flags &= ~ZIO_FLAG_OPTIONAL; -+ } else { -+ while (last != mandatory && last != first) { -+ ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); -+ last = AVL_PREV(t, last); -+ ASSERT(last != NULL); - } -@@ -319,44 +613,84 @@ again: - -- if (fio != lio) { -- uint64_t size = IO_SPAN(fio, lio); -- ASSERT(size <= maxspan); -- ASSERT(vi != NULL); -- -- aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, -- vi, size, fio->io_type, ZIO_PRIORITY_AGG, -- flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, -- vdev_queue_agg_io_done, NULL); -- aio->io_timestamp = fio->io_timestamp; -- -- nio = fio; -- do { -- dio = nio; -- nio = AVL_NEXT(t, dio); -- ASSERT(dio->io_type == aio->io_type); -- ASSERT(dio->io_vdev_tree == t); -- -- if (dio->io_flags & ZIO_FLAG_NODATA) { -- ASSERT(dio->io_type == ZIO_TYPE_WRITE); -- bzero((char *)aio->io_data + (dio->io_offset - -- aio->io_offset), dio->io_size); -- } else if (dio->io_type == ZIO_TYPE_WRITE) { -- bcopy(dio->io_data, (char *)aio->io_data + -- (dio->io_offset - aio->io_offset), -- dio->io_size); -- } -+ if (first == last) -+ return (NULL); - -- zio_add_child(dio, aio); -- vdev_queue_io_remove(vq, dio); -- zio_vdev_io_bypass(dio); -- zio_execute(dio); -- } while (dio != lio); -+ ASSERT(vi != NULL); -+ -+ size = IO_SPAN(first, last); -+ ASSERT3U(size, <=, zfs_vdev_aggregation_limit); -+ -+ aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, -+ vi, size, first->io_type, zio->io_priority, -+ flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, -+ vdev_queue_agg_io_done, NULL); -+ aio->io_timestamp = first->io_timestamp; -+ -+ nio = first; -+ do { -+ dio = nio; -+ nio = AVL_NEXT(t, dio); -+ ASSERT3U(dio->io_type, ==, aio->io_type); -+ -+ if (dio->io_flags & ZIO_FLAG_NODATA) { -+ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); -+ bzero((char *)aio->io_data + (dio->io_offset - -+ aio->io_offset), dio->io_size); -+ } else if (dio->io_type == ZIO_TYPE_WRITE) { -+ bcopy(dio->io_data, (char *)aio->io_data + -+ (dio->io_offset - aio->io_offset), -+ dio->io_size); -+ } - -- avl_add(&vq->vq_pending_tree, aio); -- list_remove(&vq->vq_io_list, vi); -+ zio_add_child(dio, aio); -+ vdev_queue_io_remove(vq, dio); -+ zio_vdev_io_bypass(dio); -+ zio_execute(dio); -+ } while (dio != last); -+ -+ list_remove(&vq->vq_io_list, vi); -+ -+ return (aio); -+} - -- return (aio); -+static zio_t * -+vdev_queue_io_to_issue(vdev_queue_t *vq) -+{ -+ zio_t *zio, *aio; -+ zio_priority_t p; -+ avl_index_t idx; -+ vdev_queue_class_t *vqc; -+ zio_t *search; -+ -+again: -+ ASSERT(MUTEX_HELD(&vq->vq_lock)); -+ -+ p = vdev_queue_class_to_issue(vq); -+ -+ if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { -+ /* No eligible queued i/os */ -+ return (NULL); - } - -- ASSERT(fio->io_vdev_tree == t); -- vdev_queue_io_remove(vq, fio); -+ /* -+ * For LBA-ordered queues (async / scrub), issue the i/o which follows -+ * the most recently issued i/o in LBA (offset) order. -+ * -+ * For FIFO queues (sync), issue the i/o with the lowest timestamp. 
-+ */ -+ vqc = &vq->vq_class[p]; -+ search = zio_buf_alloc(sizeof (*search)); -+ search->io_timestamp = 0; -+ search->io_offset = vq->vq_last_offset + 1; -+ VERIFY3P(avl_find(&vqc->vqc_queued_tree, search, &idx), ==, NULL); -+ zio_buf_free(search, sizeof (*search)); -+ zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER); -+ if (zio == NULL) -+ zio = avl_first(&vqc->vqc_queued_tree); -+ ASSERT3U(zio->io_priority, ==, p); -+ -+ aio = vdev_queue_aggregate(vq, zio); -+ if (aio != NULL) -+ zio = aio; -+ else -+ vdev_queue_io_remove(vq, zio); - -@@ -368,6 +702,6 @@ again: - */ -- if (fio->io_flags & ZIO_FLAG_NODATA) { -+ if (zio->io_flags & ZIO_FLAG_NODATA) { - mutex_exit(&vq->vq_lock); -- zio_vdev_io_bypass(fio); -- zio_execute(fio); -+ zio_vdev_io_bypass(zio); -+ zio_execute(zio); - mutex_enter(&vq->vq_lock); -@@ -376,5 +710,6 @@ again: - -- avl_add(&vq->vq_pending_tree, fio); -+ vdev_queue_pending_add(vq, zio); -+ vq->vq_last_offset = zio->io_offset; - -- return (fio); -+ return (zio); - } -@@ -387,4 +722,2 @@ vdev_queue_io(zio_t *zio) - -- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); -- - if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) -@@ -392,19 +725,24 @@ vdev_queue_io(zio_t *zio) - -- zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; -+ /* -+ * Children i/os inherent their parent's priority, which might -+ * not match the child's i/o type. Fix it up here. -+ */ -+ if (zio->io_type == ZIO_TYPE_READ) { -+ if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && -+ zio->io_priority != ZIO_PRIORITY_ASYNC_READ && -+ zio->io_priority != ZIO_PRIORITY_SCRUB) -+ zio->io_priority = ZIO_PRIORITY_ASYNC_READ; -+ } else { -+ ASSERT(zio->io_type == ZIO_TYPE_WRITE); -+ if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && -+ zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE) -+ zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; -+ } - -- if (zio->io_type == ZIO_TYPE_READ) -- zio->io_vdev_tree = &vq->vq_read_tree; -- else -- zio->io_vdev_tree = &vq->vq_write_tree; -+ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; - - mutex_enter(&vq->vq_lock); -- - zio->io_timestamp = gethrtime(); -- zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + -- zio->io_priority; -- - vdev_queue_io_add(vq, zio); -- -- nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); -- -+ nio = vdev_queue_io_to_issue(vq); - mutex_exit(&vq->vq_lock); -@@ -426,3 +764,3 @@ vdev_queue_io_done(zio_t *zio) - vdev_queue_t *vq = &zio->io_vd->vdev_queue; -- int i; -+ zio_t *nio; - -@@ -433,3 +771,3 @@ vdev_queue_io_done(zio_t *zio) - -- avl_remove(&vq->vq_pending_tree, zio); -+ vdev_queue_pending_remove(vq, zio); - -@@ -439,6 +777,3 @@ vdev_queue_io_done(zio_t *zio) - -- for (i = 0; i < zfs_vdev_ramp_rate; i++) { -- zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); -- if (nio == NULL) -- break; -+ while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { - mutex_exit(&vq->vq_lock); -@@ -457,8 +792,2 @@ vdev_queue_io_done(zio_t *zio) - #if defined(_KERNEL) && defined(HAVE_SPL) --module_param(zfs_vdev_max_pending, int, 0644); --MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os"); -- --module_param(zfs_vdev_min_pending, int, 0644); --MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os"); -- - module_param(zfs_vdev_aggregation_limit, int, 0644); -@@ -466,8 +795,2 @@ MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size"); - --module_param(zfs_vdev_time_shift, int, 0644); --MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O"); -- 
--module_param(zfs_vdev_ramp_rate, int, 0644); --MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate"); -- - module_param(zfs_vdev_read_gap_limit, int, 0644); -@@ -477,2 +800,51 @@ module_param(zfs_vdev_write_gap_limit, int, 0644); - MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap"); -+ -+module_param(zfs_vdev_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_max_active, "Maximum number of active I/Os per vdev"); -+ -+module_param(zfs_vdev_async_write_active_max_dirty_percent, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_write_active_max_dirty_percent, -+ "Async write concurrency max threshold"); -+ -+module_param(zfs_vdev_async_write_active_min_dirty_percent, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_write_active_min_dirty_percent, -+ "Async write concurrency min threshold"); -+ -+module_param(zfs_vdev_async_read_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_read_max_active, -+ "Max active async read I/Os per vdev"); -+ -+module_param(zfs_vdev_async_read_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_read_min_active, -+ "Min active async read I/Os per vdev"); -+ -+module_param(zfs_vdev_async_write_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_write_max_active, -+ "Max active async write I/Os per vdev"); -+ -+module_param(zfs_vdev_async_write_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_write_min_active, -+ "Min active async write I/Os per vdev"); -+ -+module_param(zfs_vdev_scrub_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_scrub_max_active, "Max active scrub I/Os per vdev"); -+ -+module_param(zfs_vdev_scrub_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_scrub_min_active, "Min active scrub I/Os per vdev"); -+ -+module_param(zfs_vdev_sync_read_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_sync_read_max_active, -+ "Max active sync read I/Os per vdev"); -+ -+module_param(zfs_vdev_sync_read_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_sync_read_min_active, -+ "Min active sync read I/Os per vdev"); -+ -+module_param(zfs_vdev_sync_write_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_sync_write_max_active, -+ "Max active sync write I/Os per vdev"); -+ -+module_param(zfs_vdev_sync_write_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_sync_write_min_active, -+ "Min active sync write I/Osper vdev"); - #endif -diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c -index 3e1878d..4cd21df 100644 ---- a/module/zfs/vdev_raidz.c -+++ b/module/zfs/vdev_raidz.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -62,2 +62,3 @@ - * o multiplication of A by 2 is defined by the following bitwise expression: -+ * - * (A * 2)_7 = A_6 -@@ -120,3 +121,3 @@ typedef struct raidz_map { - uint64_t rm_nskip; /* Skipped sectors for padding */ -- uint64_t rm_skipstart; /* Column index of padding start */ -+ uint64_t rm_skipstart; /* Column index of padding start */ - void *rm_datacopy; /* rm_asize-buffer of copied data */ -@@ -160,6 +161,3 @@ int vdev_raidz_default_to_general; - --/* -- * These two tables represent powers and logs of 2 in the Galois field defined -- * above. These values were computed by repeatedly multiplying by 2 as above. -- */ -+/* Powers of 2 in the Galois field defined above. 
*/ - static const uint8_t vdev_raidz_pow2[256] = { -@@ -198,2 +196,3 @@ static const uint8_t vdev_raidz_pow2[256] = { - }; -+/* Logs of 2 in the Galois field defined above. */ - static const uint8_t vdev_raidz_log2[256] = { -@@ -433,3 +432,10 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { - --static raidz_map_t * -+/* -+ * Divides the IO evenly across all child vdevs; usually, dcols is -+ * the number of children in the target vdev. -+ * -+ * Avoid inlining the function to keep vdev_raidz_io_start(), which -+ * is this functions only caller, as small as possible on the stack. -+ */ -+noinline static raidz_map_t * - vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, -@@ -438,5 +444,9 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - raidz_map_t *rm; -+ /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = zio->io_offset >> unit_shift; -+ /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = zio->io_size >> unit_shift; -+ /* The first column for this stripe. */ - uint64_t f = b % dcols; -+ /* The starting byte offset on each child vdev. */ - uint64_t o = (b / dcols) << unit_shift; -@@ -444,8 +454,27 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - -+ /* -+ * "Quotient": The number of data sectors for this stripe on all but -+ * the "big column" child vdevs that also contain "remainder" data. -+ */ - q = s / (dcols - nparity); -+ -+ /* -+ * "Remainder": The number of partial stripe data sectors in this I/O. -+ * This will add a sector to some, but not all, child vdevs. -+ */ - r = s - q * (dcols - nparity); -+ -+ /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); -+ -+ /* -+ * The total number of data and parity sectors associated with -+ * this I/O. -+ */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); - -+ /* acols: The columns that will be accessed. */ -+ /* scols: The columns that will be accessed or skipped. */ - if (q == 0) { -+ /* Our I/O request doesn't span all child vdevs. */ - acols = bc; -@@ -1192,3 +1221,4 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, - uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; -- uint8_t log = 0, val; -+ uint8_t log = 0; -+ uint8_t val; - int ll; -@@ -1458,3 +1488,3 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1522,2 +1552,19 @@ vdev_raidz_child_done(zio_t *zio) - -+/* -+ * Start an IO operation on a RAIDZ VDev -+ * -+ * Outline: -+ * - For write operations: -+ * 1. Generate the parity data -+ * 2. Create child zio write operations to each column's vdev, for both -+ * data and parity. -+ * 3. If the column skips any sectors for padding, create optional dummy -+ * write zio children for those areas to improve aggregation continuity. -+ * - For read operations: -+ * 1. Create child zio read operations to each data column's vdev to read -+ * the range of data required for zio. -+ * 2. If this is a scrub or resilver operation, or if any of the data -+ * vdevs have had errors, then create zio read operations to the parity -+ * columns' VDevs as well. 
-+ */ - static int -@@ -1583,3 +1630,3 @@ vdev_raidz_io_start(zio_t *zio) - rm->rm_missingparity++; -- rc->rc_error = ENXIO; -+ rc->rc_error = SET_ERROR(ENXIO); - rc->rc_tried = 1; /* don't even try */ -@@ -1593,3 +1640,3 @@ vdev_raidz_io_start(zio_t *zio) - rm->rm_missingparity++; -- rc->rc_error = ESTALE; -+ rc->rc_error = SET_ERROR(ESTALE); - rc->rc_skipped = 1; -@@ -1684,3 +1731,3 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) - raidz_checksum_error(zio, rc, orig[c]); -- rc->rc_error = ECKSUM; -+ rc->rc_error = SET_ERROR(ECKSUM); - ret++; -@@ -1808,3 +1855,3 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) - orig[i]); -- rc->rc_error = ECKSUM; -+ rc->rc_error = SET_ERROR(ECKSUM); - } -@@ -1865,2 +1912,23 @@ done: - -+/* -+ * Complete an IO operation on a RAIDZ VDev -+ * -+ * Outline: -+ * - For write operations: -+ * 1. Check for errors on the child IOs. -+ * 2. Return, setting an error code if too few child VDevs were written -+ * to reconstruct the data later. Note that partial writes are -+ * considered successful if they can be reconstructed at all. -+ * - For read operations: -+ * 1. Check for errors on the child IOs. -+ * 2. If data errors occurred: -+ * a. Try to reassemble the data from the parity available. -+ * b. If we haven't yet read the parity drives, read them now. -+ * c. If all parity drives have been read but the data still doesn't -+ * reassemble with a correct checksum, then try combinatorial -+ * reconstruction. -+ * d. If that doesn't work, return an error. -+ * 3. If there were unexpected errors or this is a resilver operation, -+ * rewrite the vdevs that had errors. -+ */ - static void -@@ -2084,3 +2152,3 @@ vdev_raidz_io_done(zio_t *zio) - */ -- zio->io_error = ECKSUM; -+ zio->io_error = SET_ERROR(ECKSUM); - -@@ -2122,3 +2190,3 @@ done: - rc->rc_offset, rc->rc_data, rc->rc_size, -- ZIO_TYPE_WRITE, zio->io_priority, -+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | (unexpected_errors ? -diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c -index 5241b02..90250b0 100644 ---- a/module/zfs/vdev_root.c -+++ b/module/zfs/vdev_root.c -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -65,3 +65,3 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -diff --git a/module/zfs/zap.c b/module/zfs/zap.c -index a7bae5e..cfae26a 100644 ---- a/module/zfs/zap.c -+++ b/module/zfs/zap.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -297,3 +297,4 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) - DMU_READ_NO_PREFETCH); -- dmu_buf_rele(db, FTAG); -+ if (err == 0) -+ dmu_buf_rele(db, FTAG); - } -@@ -327,3 +328,3 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - -@@ -716,3 +717,3 @@ fzap_checkname(zap_name_t *zn) - if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - return (0); -@@ -731,3 +732,3 @@ fzap_checksize(uint64_t integer_size, uint64_t num_integers) - default: -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -807,3 +808,3 @@ retry: - if (err == 0) { -- err = EEXIST; -+ err = SET_ERROR(EEXIST); - goto out; -@@ -994,2 +995,3 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) - -+ err = 0; - for (zap_cursor_init(&zc, os, fromobj); -@@ -997,4 +999,6 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) - (void) zap_cursor_advance(&zc)) { -- if (za.za_integer_length != 8 || za.za_num_integers != 1) -- return (EINVAL); -+ if (za.za_integer_length != 8 || za.za_num_integers != 1) { -+ err = SET_ERROR(EINVAL); -+ break; -+ } - err = zap_add(os, intoobj, za.za_name, -@@ -1002,6 +1006,6 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) - if (err) -- return (err); -+ break; - } - zap_cursor_fini(&zc); -- return (0); -+ return (err); - } -@@ -1016,2 +1020,3 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - -+ err = 0; - for (zap_cursor_init(&zc, os, fromobj); -@@ -1019,4 +1024,6 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - (void) zap_cursor_advance(&zc)) { -- if (za.za_integer_length != 8 || za.za_num_integers != 1) -- return (EINVAL); -+ if (za.za_integer_length != 8 || za.za_num_integers != 1) { -+ err = SET_ERROR(EINVAL); -+ break; -+ } - err = zap_add(os, intoobj, za.za_name, -@@ -1024,6 +1031,6 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - if (err) -- return (err); -+ break; - } - zap_cursor_fini(&zc); -- return (0); -+ return (err); - } -@@ -1038,2 +1045,3 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - -+ err = 0; - for (zap_cursor_init(&zc, os, fromobj); -@@ -1043,4 +1051,6 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - -- if (za.za_integer_length != 8 || za.za_num_integers != 1) -- return (EINVAL); -+ if (za.za_integer_length != 8 || za.za_num_integers != 1) { -+ err = SET_ERROR(EINVAL); -+ break; -+ } - -@@ -1048,3 +1058,3 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - if (err != 0 && err != ENOENT) -- return (err); -+ break; - delta += za.za_first_integer; -@@ -1052,6 +1062,6 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - if (err) -- return (err); -+ break; - } - zap_cursor_fini(&zc); -- return (0); -+ return (err); - } -@@ -1252,3 +1262,3 @@ fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) - if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - -diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c -index ad21882..13bc879 100644 ---- a/module/zfs/zap_leaf.c -+++ b/module/zfs/zap_leaf.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. 
All rights reserved. - */ -@@ -436,3 +437,3 @@ again: - -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -494,3 +495,3 @@ zap_entry_read(const zap_entry_handle_t *zeh, - if (le->le_value_intlen > integer_size) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -501,3 +502,3 @@ zap_entry_read(const zap_entry_handle_t *zeh, - if (zeh->zeh_num_integers > num_integers) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - return (0); -@@ -522,3 +523,3 @@ zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, - if (le->le_name_numints > buflen) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - return (0); -@@ -538,3 +539,3 @@ zap_entry_update(zap_entry_handle_t *zeh, - if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - -@@ -628,3 +629,3 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, - if (l->l_phys->l_hdr.lh_nfree < numchunks) -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - -diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c -index 4da7836..555d52f 100644 ---- a/module/zfs/zap_micro.c -+++ b/module/zfs/zap_micro.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -782,3 +782,3 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -791,8 +791,8 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, - if (mze == NULL) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { - if (num_integers < 1) { -- err = EOVERFLOW; -+ err = SET_ERROR(EOVERFLOW); - } else if (integer_size != 8) { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else { -@@ -828,3 +828,3 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -851,3 +851,3 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -886,3 +886,3 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -893,3 +893,3 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, - if (mze == NULL) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { -@@ -920,3 +920,3 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -940,3 +940,4 @@ mzap_addent(zap_name_t *zn, uint64_t value) - for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { -- ASSERTV(mzap_ent_phys_t *mze=&zap->zap_m.zap_phys->mz_chunk[i]); -+ ASSERTV(mzap_ent_phys_t *mze); -+ ASSERT(mze = &zap->zap_m.zap_phys->mz_chunk[i]); - ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); -@@ -989,3 +990,3 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1003,3 +1004,3 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key, - if (mze != NULL) { -- err = EEXIST; -+ err = SET_ERROR(EEXIST); - } else { -@@ -1030,3 +1031,3 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1067,3 +1068,3 @@ 
zap_update(objset_t *os, uint64_t zapobj, const char *name, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1112,3 +1113,3 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1143,3 +1144,3 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1150,3 +1151,3 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, - if (mze == NULL) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { -@@ -1177,3 +1178,3 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1255,3 +1256,3 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) - if (zc->zc_hash == -1ULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -1281,4 +1282,2 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) - } else { -- err = ENOENT; -- - mze_tofind.mze_hash = zc->zc_hash; -@@ -1305,2 +1304,3 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) - zc->zc_hash = -1ULL; -+ err = SET_ERROR(ENOENT); - } -@@ -1338,3 +1338,3 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) - rw_exit(&zc->zc_zap->zap_rwlock); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1346,3 +1346,3 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) - if (mze == NULL) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - goto out; -diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c -index c09b32d..4f4785a 100644 ---- a/module/zfs/zfeature.c -+++ b/module/zfs/zfeature.c -@@ -22,3 +22,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -182,4 +182,4 @@ feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, - -- zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); -- za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); -+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); -@@ -206,3 +206,3 @@ feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, - if (zap_lookup(os, desc_obj, za->za_name, -- 1, sizeof (buf), buf) == 0) -+ 1, MAXPATHLEN, buf) == 0) - desc = buf; -@@ -217,4 +217,4 @@ feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, - kmem_free(buf, MAXPATHLEN); -- kmem_free(za, sizeof(zap_attribute_t)); -- kmem_free(zc, sizeof(zap_cursor_t)); -+ kmem_free(za, sizeof (zap_attribute_t)); -+ kmem_free(zc, sizeof (zap_cursor_t)); - -@@ -236,3 +236,3 @@ feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj, - if (zapobj == 0) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -242,3 +242,3 @@ feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj, - if (err == ENOENT) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - else -@@ -283,5 +283,5 @@ feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj, - if (error == ENOENT) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (refcount == UINT64_MAX) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - refcount++; -@@ -290,5 +290,5 @@ feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj, - if (error == ENOENT) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (refcount == 0) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - refcount--; -diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c -index 8ab5abe..89b6245 100644 ---- a/module/zfs/zfs_acl.c -+++ b/module/zfs/zfs_acl.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -678,3 +679,3 @@ zfs_copy_ace_2_fuid(zfs_sb_t *zsb, umode_t obj_mode, zfs_acl_t *aclp, - aceptr->z_hdr.z_flags) != B_TRUE) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -785,3 +786,3 @@ zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, - aceptr->z_flags) != B_TRUE) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1119,3 +1120,3 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, - if (error == ECKSUM) -- error = EIO; -+ error = SET_ERROR(EIO); - goto done; -@@ -1157,2 +1158,5 @@ zfs_acl_chown_setattr(znode_t *zp) - -+ if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL) -+ return (0); -+ - ASSERT(MUTEX_HELD(&zp->z_lock)); -@@ -1163,2 +1167,14 @@ zfs_acl_chown_setattr(znode_t *zp) - &zp->z_pflags, zp->z_uid, zp->z_gid); -+ -+ /* -+ * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL -+ * nor a DACL_ACES SA in which case ENOENT is returned from -+ * zfs_acl_node_read() when the SA can't be located. -+ * Allow chown/chgrp to succeed in these cases rather than -+ * returning an error that makes no sense in the context of -+ * the caller. 
-+ */ -+ if (error == ENOENT) -+ return (0); -+ - return (error); -@@ -1474,3 +1490,4 @@ zfs_acl_chmod(zfs_sb_t *zsb, uint64_t mode, zfs_acl_t *aclp) - new_bytes += abstract_size; -- } if (deny1) { -+ } -+ if (deny1) { - zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER); -@@ -1871,3 +1888,3 @@ zfs_acl_ids_overquota(zfs_sb_t *zsb, zfs_acl_ids_t *acl_ids) - /* -- * Retrieve a files ACL -+ * Retrieve a file's ACL - */ -@@ -1886,3 +1903,3 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) - if (mask == 0) -- return (ENOSYS); -+ return (SET_ERROR(ENOSYS)); - -@@ -1980,3 +1997,3 @@ zfs_vsec_2_aclp(zfs_sb_t *zsb, umode_t obj_mode, - if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2026,3 +2043,3 @@ zfs_vsec_2_aclp(zfs_sb_t *zsb, umode_t obj_mode, - /* -- * Set a files ACL -+ * Set a file's ACL - */ -@@ -2042,6 +2059,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) - if (mask == 0) -- return (ENOSYS); -+ return (SET_ERROR(ENOSYS)); - - if (zp->z_pflags & ZFS_IMMUTABLE) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -2141,3 +2158,3 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) - (S_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { -- return (EROFS); -+ return (SET_ERROR(EROFS)); - } -@@ -2152,3 +2169,3 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) - (zp->z_pflags & ZFS_IMMUTABLE)))) { -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -2157,3 +2174,3 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) - (zp->z_pflags & ZFS_NOUNLINK)) { -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -2162,3 +2179,3 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) - (zp->z_pflags & ZFS_AV_QUARANTINED))) { -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -2271,3 +2288,3 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - mutex_exit(&zp->z_acl_lock); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -2305,3 +2322,3 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - *working_mode |= deny_mask; -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } else if (*working_mode) { -@@ -2372,3 +2389,3 @@ zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, - if (*working_mode != ACE_WRITE_DATA) -- return (EACCES); -+ return (SET_ERROR(EACCES)); - -@@ -2388,3 +2405,3 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) - if (zdp->z_pflags & ZFS_AV_QUARANTINED) -- return (EACCES); -+ return (SET_ERROR(EACCES)); - -@@ -2447,2 +2464,3 @@ slow: - * Determine whether Access should be granted/denied. 
-+ * - * The least priv subsytem is always consulted as a basic privilege -@@ -2494,3 +2512,3 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) - if (error) -- return (error); -+ return (error); - -@@ -2600,3 +2618,3 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) - if (working_mode & ~(ZFS_CHECKED_MASKS)) { -- error = EACCES; -+ error = SET_ERROR(EACCES); - } -@@ -2654,3 +2672,2 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp, - * -- * - * The following chart is the recommended NFSv4 enforcement for -@@ -2710,3 +2727,3 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) - if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -2777,3 +2794,3 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - if (szp->z_pflags & ZFS_AV_QUARANTINED) -- return (EACCES); -+ return (SET_ERROR(EACCES)); - -diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c -index b35f27d..9652054 100644 ---- a/module/zfs/zfs_ctldir.c -+++ b/module/zfs/zfs_ctldir.c -@@ -29,2 +29,3 @@ - * Brian Behlendorf -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -82,2 +83,3 @@ - #include -+#include - #include -@@ -100,3 +102,3 @@ zfsctl_sep_alloc(void) - { -- return kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); -+ return (kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP)); - } -@@ -255,3 +257,2 @@ zfsctl_inode_destroy(struct inode *ip) - { -- return; - } -@@ -287,3 +288,3 @@ zfsctl_create(zfs_sb_t *zsb) - if (zsb->z_ctldir == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -291,3 +292,3 @@ zfsctl_create(zfs_sb_t *zsb) - #else -- return (EOPNOTSUPP); -+ return (SET_ERROR(EOPNOTSUPP)); - #endif /* CONFIG_64BIT */ -@@ -332,3 +333,3 @@ zfsctl_fid(struct inode *ip, fid_t *fidp) - ZFS_EXIT(zsb); -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - } -@@ -356,3 +357,3 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname) - if (snapshot_namecheck(name, NULL, NULL) != 0) -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - -@@ -360,3 +361,3 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname) - if ((strlen(zname) + 1 + strlen(name)) >= len) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - -@@ -368,2 +369,7 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname) - -+/* -+ * Gets the full dataset name that corresponds to the given snapshot name -+ * Example: -+ * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1" -+ */ - static int -@@ -384,3 +390,3 @@ zfsctl_snapshot_zpath(struct path *path, int len, char *zpath) - if (path_len > len) { -- error = EFAULT; -+ error = SET_ERROR(EFAULT); - goto out; -@@ -422,3 +428,3 @@ zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, - if (*ipp == NULL) -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - -@@ -458,3 +464,3 @@ zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, - } else { -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } -@@ -490,4 +496,4 @@ zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name) - int --zfsctl_snapdir_rename(struct inode *sdip, char *sname, -- struct inode *tdip, char *tname, cred_t *cr, int flags) -+zfsctl_snapdir_rename(struct inode *sdip, char *snm, -+ struct inode *tdip, char *tnm, cred_t *cr, int flags) - { -@@ -496,3 +502,3 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - avl_index_t where; -- char *to, *from, 
*real; -+ char *to, *from, *real, *fsname; - int error; -@@ -504,8 +510,9 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - real = kmem_alloc(MAXNAMELEN, KM_SLEEP); -+ fsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - - if (zsb->z_case == ZFS_CASE_INSENSITIVE) { -- error = dmu_snapshot_realname(zsb->z_os, sname, real, -+ error = dmu_snapshot_realname(zsb->z_os, snm, real, - MAXNAMELEN, NULL); - if (error == 0) { -- sname = real; -+ snm = real; - } else if (error != ENOTSUP) { -@@ -515,8 +522,10 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - -- error = zfsctl_snapshot_zname(sdip, sname, MAXNAMELEN, from); -- if (!error) -- error = zfsctl_snapshot_zname(tdip, tname, MAXNAMELEN, to); -- if (!error) -+ dmu_objset_name(zsb->z_os, fsname); -+ -+ error = zfsctl_snapshot_zname(sdip, snm, MAXNAMELEN, from); -+ if (error == 0) -+ error = zfsctl_snapshot_zname(tdip, tnm, MAXNAMELEN, to); -+ if (error == 0) - error = zfs_secpolicy_rename_perms(from, to, cr); -- if (error) -+ if (error != 0) - goto out; -@@ -527,3 +536,3 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - if (sdip != tdip) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -534,3 +543,3 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - */ -- if (strcmp(sname, tname) == 0) { -+ if (strcmp(snm, tnm) == 0) { - error = 0; -@@ -541,3 +550,3 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - -- error = dmu_objset_rename(from, to, B_FALSE); -+ error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); - if (error) -@@ -545,6 +554,6 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - -- search.se_name = (char *)sname; -+ search.se_name = (char *)snm; - sep = avl_find(&zsb->z_ctldir_snaps, &search, &where); - if (sep) -- zfsctl_rename_snap(zsb, sep, tname); -+ zfsctl_rename_snap(zsb, sep, tnm); - -@@ -556,2 +565,3 @@ out: - kmem_free(real, MAXNAMELEN); -+ kmem_free(fsname, MAXNAMELEN); - -@@ -590,5 +600,5 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) - error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname); -- if (!error) -+ if (error == 0) - error = zfs_secpolicy_destroy_perms(snapname, cr); -- if (error) -+ if (error != 0) - goto out; -@@ -597,3 +607,3 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) - if ((error == 0) || (error == ENOENT)) -- error = dmu_objset_destroy(snapname, B_FALSE); -+ error = dsl_destroy_snapshot(snapname, B_FALSE); - out: -@@ -623,3 +633,3 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - if (snapshot_namecheck(dirname, NULL, NULL) != 0) { -- error = EILSEQ; -+ error = SET_ERROR(EILSEQ); - goto out; -@@ -630,3 +640,3 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - error = zfs_secpolicy_snapshot_perms(dsname, cr); -- if (error) -+ if (error != 0) - goto out; -@@ -634,5 +644,4 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - if (error == 0) { -- error = dmu_objset_snapshot(dsname, dirname, -- NULL, NULL, B_FALSE, B_FALSE, -1); -- if (error) -+ error = dmu_objset_snapshot_one(dsname, dirname); -+ if (error != 0) - goto out; -@@ -684,3 +693,3 @@ zfsctl_snapdir_inactive(struct inode *ip) - */ --#define SET_UNMOUNT_CMD \ -+#define SET_UNMOUNT_CMD \ - "exec 0z_ctldir_lock); -@@ -946,3 +963,3 @@ zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp) - } else { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } -@@ -969,3 +986,3 @@ zfsctl_shares_lookup(struct inode *dip, char *name, 
struct inode **ipp, - ZFS_EXIT(zsb); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c -index ad611ac..4f612e1 100644 ---- a/module/zfs/zfs_debug.c -+++ b/module/zfs/zfs_debug.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -27,2 +27,9 @@ - -+#if !defined(_KERNEL) || !defined(__linux__) -+list_t zfs_dbgmsgs; -+int zfs_dbgmsg_size; -+kmutex_t zfs_dbgmsgs_lock; -+int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */ -+#endif -+ - /* -@@ -36,2 +43,4 @@ int zfs_flags = 0; - * set, calls to zfs_panic_recover() will turn into warning messages. -+ * This should only be used as a last resort, as it typically results -+ * in leaked space, or worse. - */ -@@ -59,2 +68,8 @@ zfs_dbgmsg_init(void) - { -+#if !defined(_KERNEL) || !defined(__linux__) -+ list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), -+ offsetof(zfs_dbgmsg_t, zdm_node)); -+ mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); -+#endif -+ - if (zfs_flags == 0) { -@@ -73,5 +88,72 @@ zfs_dbgmsg_fini(void) - { -- return; -+#if !defined(_KERNEL) || !defined(__linux__) -+ zfs_dbgmsg_t *zdm; -+ -+ while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { -+ int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); -+ kmem_free(zdm, size); -+ zfs_dbgmsg_size -= size; -+ } -+ mutex_destroy(&zfs_dbgmsgs_lock); -+ ASSERT0(zfs_dbgmsg_size); -+#endif -+} -+ -+#if !defined(_KERNEL) || !defined(__linux__) -+/* -+ * Print these messages by running: -+ * echo ::zfs_dbgmsg | mdb -k -+ * -+ * Monitor these messages by running: -+ * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' -+ */ -+void -+zfs_dbgmsg(const char *fmt, ...) -+{ -+ int size; -+ va_list adx; -+ zfs_dbgmsg_t *zdm; -+ -+ va_start(adx, fmt); -+ size = vsnprintf(NULL, 0, fmt, adx); -+ va_end(adx); -+ -+ /* -+ * There is one byte of string in sizeof (zfs_dbgmsg_t), used -+ * for the terminating null. -+ */ -+ zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); -+ zdm->zdm_timestamp = gethrestime_sec(); -+ -+ va_start(adx, fmt); -+ (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); -+ va_end(adx); -+ -+ DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); -+ -+ mutex_enter(&zfs_dbgmsgs_lock); -+ list_insert_tail(&zfs_dbgmsgs, zdm); -+ zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; -+ while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { -+ zdm = list_remove_head(&zfs_dbgmsgs); -+ size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); -+ kmem_free(zdm, size); -+ zfs_dbgmsg_size -= size; -+ } -+ mutex_exit(&zfs_dbgmsgs_lock); - } - -+void -+zfs_dbgmsg_print(const char *tag) -+{ -+ zfs_dbgmsg_t *zdm; -+ -+ (void) printf("ZFS_DBGMSG(%s):\n", tag); -+ mutex_enter(&zfs_dbgmsgs_lock); -+ for (zdm = list_head(&zfs_dbgmsgs); zdm; -+ zdm = list_next(&zfs_dbgmsgs, zdm)) -+ (void) printf("%s\n", zdm->zdm_msg); -+ mutex_exit(&zfs_dbgmsgs_lock); -+} -+#endif - -diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c -index 4a4969f..448a872 100644 ---- a/module/zfs/zfs_dir.c -+++ b/module/zfs/zfs_dir.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -173,3 +174,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - -@@ -244,3 +245,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - rw_exit(&dzp->z_name_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -255,3 +256,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - rw_exit(&dzp->z_name_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -309,3 +310,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - if (error == 0) -- error = (zoid == 0 ? ENOENT : 0); -+ error = (zoid == 0 ? SET_ERROR(ENOENT) : 0); - } else { -@@ -316,3 +317,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - iput(vp); -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } else if (vp) { -@@ -321,3 +322,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - iput(vp); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -343,3 +344,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - zfs_dirent_unlock(dl); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -764,3 +765,3 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) - mutex_exit(&zp->z_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -867,3 +868,3 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, - mutex_exit(&zp->z_lock); -- return (ENOTEMPTY); -+ return (SET_ERROR(ENOTEMPTY)); - } -@@ -971,6 +972,5 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) - zfs_acl_ids_free(&acl_ids); -- return (EDQUOT); -+ return (SET_ERROR(EDQUOT)); - } - --top: - tx = dmu_tx_create(zsb->z_os); -@@ -983,9 +983,4 @@ top: - zfs_fuid_txhold(zsb, tx); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { -- if (error == ERESTART) { -- dmu_tx_wait(tx); -- dmu_tx_abort(tx); -- goto top; -- } - zfs_acl_ids_free(&acl_ids); -@@ -1053,3 +1048,3 @@ top: - zfs_dirent_unlock(dl); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1058,3 +1053,3 @@ top: - zfs_dirent_unlock(dl); -- return (EROFS); -+ return (SET_ERROR(EROFS)); - } -diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c -index af2030a..05ee84c 100644 ---- a/module/zfs/zfs_fm.c -+++ b/module/zfs/zfs_fm.c -@@ -253,2 +253,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - vdev_queue_t *vq = &vd->vdev_queue; -+ vdev_stat_t *vs = &vd->vdev_stat; -+ vdev_t *spare_vd; -+ uint64_t *spare_guids; -+ char **spare_paths; -+ int i, spare_count; - -@@ -284,2 +289,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - -+ if (vs != NULL) { -+ fm_payload_set(ereport, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, -+ DATA_TYPE_UINT64, vs->vs_read_errors, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, -+ DATA_TYPE_UINT64, vs->vs_write_errors, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, -+ DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL); -+ } -+ - if (pvd != NULL) { -@@ -300,2 +315,24 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - } -+ -+ spare_count = spa->spa_spares.sav_count; -+ spare_paths = kmem_zalloc(sizeof (char *) * spare_count, -+ KM_PUSHPAGE); -+ spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, -+ KM_PUSHPAGE); -+ -+ for (i = 0; i < 
spare_count; i++) { -+ spare_vd = spa->spa_spares.sav_vdevs[i]; -+ if (spare_vd) { -+ spare_paths[i] = spare_vd->vdev_path; -+ spare_guids[i] = spare_vd->vdev_guid; -+ } -+ } -+ -+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, -+ DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, -+ DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); -+ -+ kmem_free(spare_guids, sizeof (uint64_t) * spare_count); -+ kmem_free(spare_paths, sizeof (char *) * spare_count); - } -@@ -318,4 +355,2 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - DATA_TYPE_UINT64, zio->io_timestamp, NULL); -- fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE, -- DATA_TYPE_UINT64, zio->io_deadline, NULL); - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, -@@ -838,11 +873,14 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) - ZFS_ERROR_CLASS, name); -- VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); -- VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); -- VERIFY(nvlist_add_uint64(resource, -- FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); -+ VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); -+ VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); -+ VERIFY0(nvlist_add_uint64(resource, -+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); -+ VERIFY0(nvlist_add_int32(resource, -+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); -+ - if (vd) { -- VERIFY(nvlist_add_uint64(resource, -- FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); -- VERIFY(nvlist_add_uint64(resource, -- FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state) == 0); -+ VERIFY0(nvlist_add_uint64(resource, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); -+ VERIFY0(nvlist_add_uint64(resource, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); - } -diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c -index debb5f8..6ca61b8 100644 ---- a/module/zfs/zfs_fuid.c -+++ b/module/zfs/zfs_fuid.c -@@ -567,5 +567,5 @@ zfs_fuid_create(zfs_sb_t *zsb, uint64_t id, cred_t *cr, - idmap_stat status; -- uint64_t idx; -+ uint64_t idx = 0; - zfs_fuid_t *zfuid = NULL; -- zfs_fuid_info_t *fuidp; -+ zfs_fuid_info_t *fuidp = NULL; - -@@ -594,2 +594,5 @@ zfs_fuid_create(zfs_sb_t *zsb, uint64_t id, cred_t *cr, - -+ VERIFY3U(type, >=, ZFS_OWNER); -+ VERIFY3U(type, <=, ZFS_ACE_GROUP); -+ - switch (type) { -@@ -610,3 +613,3 @@ zfs_fuid_create(zfs_sb_t *zsb, uint64_t id, cred_t *cr, - }; -- domain = fuidp->z_domain_table[idx -1]; -+ domain = fuidp->z_domain_table[idx - 1]; - } else { -diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c -index a9184a1..0dfda1a 100644 ---- a/module/zfs/zfs_ioctl.c -+++ b/module/zfs/zfs_ioctl.c -@@ -27,5 +27,107 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 201i3 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ */ -+ -+/* -+ * ZFS ioctls. -+ * -+ * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage -+ * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. 
-+ * -+ * There are two ways that we handle ioctls: the legacy way where almost -+ * all of the logic is in the ioctl callback, and the new way where most -+ * of the marshalling is handled in the common entry point, zfsdev_ioctl(). -+ * -+ * Non-legacy ioctls should be registered by calling -+ * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked -+ * from userland by lzc_ioctl(). -+ * -+ * The registration arguments are as follows: -+ * -+ * const char *name -+ * The name of the ioctl. This is used for history logging. If the -+ * ioctl returns successfully (the callback returns 0), and allow_log -+ * is true, then a history log entry will be recorded with the input & -+ * output nvlists. The log entry can be printed with "zpool history -i". -+ * -+ * zfs_ioc_t ioc -+ * The ioctl request number, which userland will pass to ioctl(2). -+ * The ioctl numbers can change from release to release, because -+ * the caller (libzfs) must be matched to the kernel. -+ * -+ * zfs_secpolicy_func_t *secpolicy -+ * This function will be called before the zfs_ioc_func_t, to -+ * determine if this operation is permitted. It should return EPERM -+ * on failure, and 0 on success. Checks include determining if the -+ * dataset is visible in this zone, and if the user has either all -+ * zfs privileges in the zone (SYS_MOUNT), or has been granted permission -+ * to do this operation on this dataset with "zfs allow". -+ * -+ * zfs_ioc_namecheck_t namecheck -+ * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool -+ * name, a dataset name, or nothing. If the name is not well-formed, -+ * the ioctl will fail and the callback will not be called. -+ * Therefore, the callback can assume that the name is well-formed -+ * (e.g. is null-terminated, doesn't have more than one '@' character, -+ * doesn't have invalid characters). -+ * -+ * zfs_ioc_poolcheck_t pool_check -+ * This specifies requirements on the pool state. If the pool does -+ * not meet them (is suspended or is readonly), the ioctl will fail -+ * and the callback will not be called. If any checks are specified -+ * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. -+ * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | -+ * POOL_CHECK_READONLY). -+ * -+ * boolean_t smush_outnvlist -+ * If smush_outnvlist is true, then the output is presumed to be a -+ * list of errors, and it will be "smushed" down to fit into the -+ * caller's buffer, by removing some entries and replacing them with a -+ * single "N_MORE_ERRORS" entry indicating how many were removed. See -+ * nvlist_smush() for details. If smush_outnvlist is false, and the -+ * outnvlist does not fit into the userland-provided buffer, then the -+ * ioctl will fail with ENOMEM. -+ * -+ * zfs_ioc_func_t *func -+ * The callback function that will perform the operation. -+ * -+ * The callback should return 0 on success, or an error number on -+ * failure. If the function fails, the userland ioctl will return -1, -+ * and errno will be set to the callback's return value. The callback -+ * will be called with the following arguments: -+ * -+ * const char *name -+ * The name of the pool or dataset to operate on, from -+ * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the -+ * expected type (pool, dataset, or none). -+ * -+ * nvlist_t *innvl -+ * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or -+ * NULL if no input nvlist was provided. Changes to this nvlist are -+ * ignored. 
If the input nvlist could not be deserialized, the -+ * ioctl will fail and the callback will not be called. -+ * -+ * nvlist_t *outnvl -+ * The output nvlist, initially empty. The callback can fill it in, -+ * and it will be returned to userland by serializing it into -+ * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization -+ * fails (e.g. because the caller didn't supply a large enough -+ * buffer), then the overall ioctl will fail. See the -+ * 'smush_nvlist' argument above for additional behaviors. -+ * -+ * There are two typical uses of the output nvlist: -+ * - To return state, e.g. property values. In this case, -+ * smush_outnvlist should be false. If the buffer was not large -+ * enough, the caller will reallocate a larger buffer and try -+ * the ioctl again. -+ * -+ * - To return multiple errors from an ioctl which makes on-disk -+ * changes. In this case, smush_outnvlist should be true. -+ * Ioctls which make on-disk modifications should generally not -+ * use the outnvl if they succeed, because the caller can not -+ * distinguish between the operation failing, and -+ * deserialization failing. - */ -@@ -59,2 +161,3 @@ - #include -+#include - #include -@@ -75,5 +178,7 @@ - #include --#include - #include - -+#include -+#include -+#include - #include -@@ -93,4 +198,9 @@ extern void zfs_fini(void); - --typedef int zfs_ioc_func_t(zfs_cmd_t *); --typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); -+uint_t zfs_fsyncer_key; -+extern uint_t rrw_tsd_key; -+static uint_t zfs_allow_log_key; -+ -+typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); -+typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); -+typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); - -@@ -105,3 +215,3 @@ typedef enum { - POOL_CHECK_SUSPENDED = 1 << 1, -- POOL_CHECK_READONLY = 1 << 2 -+ POOL_CHECK_READONLY = 1 << 2, - } zfs_ioc_poolcheck_t; -@@ -109,2 +219,3 @@ typedef enum { - typedef struct zfs_ioc_vec { -+ zfs_ioc_legacy_func_t *zvec_legacy_func; - zfs_ioc_func_t *zvec_func; -@@ -112,4 +223,6 @@ typedef struct zfs_ioc_vec { - zfs_ioc_namecheck_t zvec_namecheck; -- boolean_t zvec_his_log; -+ boolean_t zvec_allow_log; - zfs_ioc_poolcheck_t zvec_pool_check; -+ boolean_t zvec_smush_outnvlist; -+ const char *zvec_name; - } zfs_ioc_vec_t; -@@ -131,9 +244,6 @@ static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, - boolean_t *); --int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **); -+int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); -+static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); - --static int zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature); --static int zfs_prop_activate_feature_check(void *arg1, void *arg2, -- dmu_tx_t *tx); --static void zfs_prop_activate_feature_sync(void *arg1, void *arg2, -- dmu_tx_t *tx); -+static int zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature); - -@@ -183,5 +293,3 @@ zfs_is_bootfs(const char *name) - /* -- * zfs_earlier_version -- * -- * Return non-zero if the spa version is less than requested version. -+ * Return non-zero if the spa version is less than requested version. - */ -@@ -203,4 +311,2 @@ zfs_earlier_version(const char *name, int version) - /* -- * zpl_earlier_version -- * - * Return TRUE if the ZPL version is less than requested version. 
-@@ -239,3 +345,3 @@ zfs_log_history(zfs_cmd_t *zc) - if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) -- (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); -+ (void) spa_history_log(spa, buf); - spa_close(spa, FTAG); -@@ -251,3 +357,3 @@ zfs_log_history(zfs_cmd_t *zc) - static int --zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -262,3 +368,3 @@ zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -268,3 +374,3 @@ zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) - -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -282,3 +388,3 @@ zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) - !zone_dataset_visible(dataset, &writable)) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -290,3 +396,3 @@ zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) - if (secpolicy_zfs(cr) && zoned) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } else { -@@ -296,3 +402,3 @@ zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) - if (!zoned) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -300,3 +406,3 @@ zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) - if (!writable) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -311,3 +417,3 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) - if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -321,8 +427,4 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) - -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) { -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); -- return (ENOENT); -- } -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); -+ if (dsl_prop_get_int_ds(ds, "zoned", &zoned)) -+ return (SET_ERROR(ENOENT)); - -@@ -331,24 +433,7 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) - --/* -- * If name ends in a '@', then require recursive permissions. 
-- */ --int --zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) -+static int -+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, -+ const char *perm, cred_t *cr) - { - int error; -- boolean_t descendent = B_FALSE; -- dsl_dataset_t *ds; -- char *at; -- -- at = strchr(name, '@'); -- if (at != NULL && at[1] == '\0') { -- *at = '\0'; -- descendent = B_TRUE; -- } -- -- error = dsl_dataset_hold(name, FTAG, &ds); -- if (at != NULL) -- *at = '@'; -- if (error != 0) -- return (error); - -@@ -357,7 +442,5 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) - error = secpolicy_zfs(cr); -- if (error) -- error = dsl_deleg_access_impl(ds, descendent, perm, cr); -+ if (error != 0) -+ error = dsl_deleg_access_impl(ds, perm, cr); - } -- -- dsl_dataset_rele(ds, FTAG); - return (error); -@@ -365,14 +448,23 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) - --int --zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, -- const char *perm, cred_t *cr) -+static int -+zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) - { - int error; -+ dsl_dataset_t *ds; -+ dsl_pool_t *dp; - -- error = zfs_dozonecheck_ds(name, ds, cr); -- if (error == 0) { -- error = secpolicy_zfs(cr); -- if (error) -- error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); -+ error = dsl_pool_hold(name, FTAG, &dp); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dataset_hold(dp, name, FTAG, &ds); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); - } -+ -+ error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); -+ -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -399,4 +491,4 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - 1, sizeof (ds_hexsl), &ds_hexsl, NULL); -- if (error) -- return (EPERM); -+ if (error != 0) -+ return (SET_ERROR(EPERM)); - -@@ -407,3 +499,3 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -416,3 +508,3 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - if (new_default || !blequal(&new_sl, CR_SL(CRED()))) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - return (0); -@@ -427,6 +519,6 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - if (!zoned) { - if (zfs_check_global_label(name, strval) != 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -449,4 +541,4 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - setsl_tag, &os); -- if (error) -- return (EPERM); -+ if (error != 0) -+ return (SET_ERROR(EPERM)); - -@@ -460,3 +552,3 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - if (hexstr_to_label(strval, &new_sl) != 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -477,3 +569,3 @@ out_check: - #else -- return ENOTSUP; -+ return (ENOTSUP); - #endif /* HAVE_MLSLABEL */ -@@ -498,3 +590,3 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - if (!INGLOBALZONE(curproc)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - break; -@@ -512,5 +604,5 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - setpoint)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - if (!zoned || strlen(dsname) <= strlen(setpoint)) -- return (EPERM); -+ return 
(SET_ERROR(EPERM)); - } -@@ -520,3 +612,3 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - if (!is_system_labeled()) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -535,4 +627,5 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - --int --zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) -+/* ARGSUSED */ -+static int -+zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -541,3 +634,3 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) - error = zfs_dozonecheck(zc->zc_name, cr); -- if (error) -+ if (error != 0) - return (error); -@@ -551,4 +644,5 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) - --int --zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) -+/* ARGSUSED */ -+static int -+zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -558,6 +652,6 @@ zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) - --int --zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) -+/* ARGSUSED */ -+static int -+zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- spa_t *spa; - dsl_pool_t *dp; -@@ -573,14 +667,12 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) - if (cp == NULL) -- return (EINVAL); -- error = spa_open(zc->zc_name, &spa, FTAG); -- if (error) -+ return (SET_ERROR(EINVAL)); -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) - return (error); - -- dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- spa_close(spa, FTAG); -- if (error) -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); - return (error); -+ } - -@@ -591,2 +683,3 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) - dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); - -@@ -595,5 +688,14 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ -+static int -+zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -+{ -+ return (zfs_secpolicy_write_perms(zc->zc_name, -+ ZFS_DELEG_PERM_SEND, cr)); -+} -+ - #ifdef HAVE_SMB_SHARE -+/* ARGSUSED */ - static int --zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -612,3 +714,3 @@ zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) - VN_RELE(vp); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -622,3 +724,3 @@ zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) - int --zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -626,3 +728,3 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) - if (!INGLOBALZONE(curproc)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -631,6 +733,6 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) - } else { -- return (zfs_secpolicy_deleg_share(zc, cr)); -+ return (zfs_secpolicy_deleg_share(zc, innvl, cr)); - } - #else -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - #endif /* HAVE_SMB_SHARE */ -@@ -639,3 +741,3 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) - int --zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -643,3 +745,3 @@ zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) - if (!INGLOBALZONE(curproc)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -648,6 +750,6 @@ zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) - } else { -- return (zfs_secpolicy_deleg_share(zc, cr)); -+ return (zfs_secpolicy_deleg_share(zc, innvl, cr)); - 
} - #else -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - #endif /* HAVE_SMB_SHARE */ -@@ -670,3 +772,3 @@ zfs_get_parent(const char *datasetname, char *parent, int parentsize) - if (cp == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - cp[0] = '\0'; -@@ -689,4 +791,5 @@ zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -697,17 +800,47 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) - * Destroying snapshots with delegated permissions requires -- * descendent mount and destroy permissions. -+ * descendant mount and destroy permissions. - */ -+/* ARGSUSED */ - static int --zfs_secpolicy_destroy_recursive(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- int error; -- char *dsname; -+ nvlist_t *snaps; -+ nvpair_t *pair, *nextpair; -+ int error = 0; - -- dsname = kmem_asprintf("%s@", zc->zc_name); -+ if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) -+ return (SET_ERROR(EINVAL)); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nextpair) { -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds; - -- error = zfs_secpolicy_destroy_perms(dsname, cr); -- if (error == ENOENT) -- error = zfs_secpolicy_destroy_perms(zc->zc_name, cr); -+ error = dsl_pool_hold(nvpair_name(pair), FTAG, &dp); -+ if (error != 0) -+ break; -+ nextpair = nvlist_next_nvpair(snaps, pair); -+ error = dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds); -+ if (error == 0) -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ -+ if (error == 0) { -+ error = zfs_secpolicy_destroy_perms(nvpair_name(pair), -+ cr); -+ } else if (error == ENOENT) { -+ /* -+ * Ignore any snapshots that don't exist (we consider -+ * them "already destroyed"). Remove the name from the -+ * nvl here in case the snapshot is created between -+ * now and when we try to destroy it (in which case -+ * we don't want to destroy it since we haven't -+ * checked for permission). 
-+ */ -+ fnvlist_remove_nvpair(snaps, pair); -+ error = 0; -+ } -+ if (error != 0) -+ break; -+ } - -- strfree(dsname); - return (error); -@@ -744,4 +877,5 @@ zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -750,7 +884,8 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- char parentname[MAXNAMELEN]; -- objset_t *clone; -+ dsl_pool_t *dp; -+ dsl_dataset_t *clone; - int error; -@@ -759,18 +894,22 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) - ZFS_DELEG_PERM_PROMOTE, cr); -- if (error) -+ if (error != 0) -+ return (error); -+ -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) - return (error); - -- error = dmu_objset_hold(zc->zc_name, FTAG, &clone); -+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); - - if (error == 0) { -- dsl_dataset_t *pclone = NULL; -+ char parentname[MAXNAMELEN]; -+ dsl_dataset_t *origin = NULL; - dsl_dir_t *dd; -- dd = clone->os_dsl_dataset->ds_dir; -+ dd = clone->ds_dir; - -- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dd->dd_pool, -- dd->dd_phys->dd_origin_obj, FTAG, &pclone); -- rw_exit(&dd->dd_pool->dp_config_rwlock); -- if (error) { -- dmu_objset_rele(clone, FTAG); -+ dd->dd_phys->dd_origin_obj, FTAG, &origin); -+ if (error != 0) { -+ dsl_dataset_rele(clone, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -778,12 +917,14 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) - -- error = zfs_secpolicy_write_perms(zc->zc_name, -+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, - ZFS_DELEG_PERM_MOUNT, cr); - -- dsl_dataset_name(pclone, parentname); -- dmu_objset_rele(clone, FTAG); -- dsl_dataset_rele(pclone, FTAG); -- if (error == 0) -- error = zfs_secpolicy_write_perms(parentname, -+ dsl_dataset_name(origin, parentname); -+ if (error == 0) { -+ error = zfs_secpolicy_write_perms_ds(parentname, origin, - ZFS_DELEG_PERM_PROMOTE, cr); -+ } -+ dsl_dataset_rele(clone, FTAG); -+ dsl_dataset_rele(origin, FTAG); - } -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -791,4 +932,5 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -815,7 +957,44 @@ zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) - -+/* -+ * Check for permission to create each snapshot in the nvlist. 
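The permission callbacks above receive an nvlist of full snapshot names and check the delegated permission against the containing filesystem, which they locate by temporarily cutting the name at '@' and restoring it afterwards. A standalone sketch of that name-splitting loop, with check_fs_perm() as an invented stand-in for the real delegation check:

/* Sketch: per-snapshot permission checks on the containing filesystem.
 * check_fs_perm() is a stand-in for the real delegation check. */
#include <stdio.h>
#include <string.h>
#include <errno.h>

static int check_fs_perm(const char *fsname)
{
	/* Pretend only "tank/data" grants the permission. */
	return (strcmp(fsname, "tank/data") == 0 ? 0 : EPERM);
}

static int check_snapshot_perms(char *names[], int count)
{
	int i, error = 0;

	for (i = 0; i < count; i++) {
		char *atp = strchr(names[i], '@');

		if (atp == NULL)
			return (EINVAL);        /* not a snapshot name */

		*atp = '\0';                    /* temporarily isolate the fs name */
		error = check_fs_perm(names[i]);
		*atp = '@';                     /* restore the caller's string */
		if (error != 0)
			break;
	}
	return (error);
}

int main(void)
{
	char a[] = "tank/data@today", b[] = "tank/secret@today";
	char *ok[] = { a }, *bad[] = { b };

	printf("allowed set -> %d\n", check_snapshot_perms(ok, 1));
	printf("denied set  -> %d\n", check_snapshot_perms(bad, 1));
	return (0);
}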
-+ */ -+/* ARGSUSED */ - static int --zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -+ nvlist_t *snaps; -+ int error = 0; -+ nvpair_t *pair; -+ -+ if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) -+ return (SET_ERROR(EINVAL)); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { -+ char *name = nvpair_name(pair); -+ char *atp = strchr(name, '@'); -+ -+ if (atp == NULL) { -+ error = SET_ERROR(EINVAL); -+ break; -+ } -+ *atp = '\0'; -+ error = zfs_secpolicy_snapshot_perms(name, cr); -+ *atp = '@'; -+ if (error != 0) -+ break; -+ } -+ return (error); -+} - -- return (zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); -+/* ARGSUSED */ -+static int -+zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -+{ -+ /* -+ * Even root must have a proper TSD so that we know what pool -+ * to log to. -+ */ -+ if (tsd_get(zfs_allow_log_key) == NULL) -+ return (SET_ERROR(EPERM)); -+ return (0); - } -@@ -823,3 +1002,3 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -827,2 +1006,3 @@ zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) - int error; -+ char *origin; - -@@ -832,7 +1012,6 @@ zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) - -- if (zc->zc_value[0] != '\0') { -- if ((error = zfs_secpolicy_write_perms(zc->zc_value, -- ZFS_DELEG_PERM_CLONE, cr)) != 0) -- return (error); -- } -+ if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && -+ (error = zfs_secpolicy_write_perms(origin, -+ ZFS_DELEG_PERM_CLONE, cr)) != 0) -+ return (error); - -@@ -842,6 +1021,4 @@ zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) - -- error = zfs_secpolicy_write_perms(parentname, -- ZFS_DELEG_PERM_MOUNT, cr); -- -- return (error); -+ return (zfs_secpolicy_write_perms(parentname, -+ ZFS_DELEG_PERM_MOUNT, cr)); - } -@@ -854,6 +1031,6 @@ zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { - if (secpolicy_sys_config(cr, B_FALSE) != 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -867,3 +1044,3 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -883,3 +1060,3 @@ zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -888,4 +1065,5 @@ zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -895,3 +1073,3 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) - if (!zfs_prop_user(zc->zc_value)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - return (zfs_secpolicy_write_perms(zc->zc_name, -@@ -905,5 +1083,5 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- int err = zfs_secpolicy_read(zc, cr); -+ int err = zfs_secpolicy_read(zc, innvl, cr); - if (err) -@@ -912,3 +1090,3 @@ zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) - if 
(zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -934,5 +1112,5 @@ zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- int err = zfs_secpolicy_read(zc, cr); -+ int err = zfs_secpolicy_read(zc, innvl, cr); - if (err) -@@ -941,3 +1119,3 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -947,4 +1125,5 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -954,14 +1133,47 @@ zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- return (zfs_secpolicy_write_perms(zc->zc_name, -- ZFS_DELEG_PERM_HOLD, cr)); -+ nvpair_t *pair; -+ nvlist_t *holds; -+ int error; -+ -+ error = nvlist_lookup_nvlist(innvl, "holds", &holds); -+ if (error != 0) -+ return (SET_ERROR(EINVAL)); -+ -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ char fsname[MAXNAMELEN]; -+ error = dmu_fsname(nvpair_name(pair), fsname); -+ if (error != 0) -+ return (error); -+ error = zfs_secpolicy_write_perms(fsname, -+ ZFS_DELEG_PERM_HOLD, cr); -+ if (error != 0) -+ return (error); -+ } -+ return (0); - } - -+/* ARGSUSED */ - static int --zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- return (zfs_secpolicy_write_perms(zc->zc_name, -- ZFS_DELEG_PERM_RELEASE, cr)); -+ nvpair_t *pair; -+ int error; -+ -+ for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(innvl, pair)) { -+ char fsname[MAXNAMELEN]; -+ error = dmu_fsname(nvpair_name(pair), fsname); -+ if (error != 0) -+ return (error); -+ error = zfs_secpolicy_write_perms(fsname, -+ ZFS_DELEG_PERM_RELEASE, cr); -+ if (error != 0) -+ return (error); -+ } -+ return (0); - } -@@ -972,3 +1184,3 @@ zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -985,9 +1197,9 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr) - -- error = zfs_secpolicy_snapshot(zc, cr); -- if (!error) -- error = zfs_secpolicy_hold(zc, cr); -- if (!error) -- error = zfs_secpolicy_release(zc, cr); -- if (!error) -- error = zfs_secpolicy_destroy(zc, cr); -+ error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); -+ if (error == 0) -+ error = zfs_secpolicy_hold(zc, innvl, cr); -+ if (error == 0) -+ error = zfs_secpolicy_release(zc, innvl, cr); -+ if (error == 0) -+ error = zfs_secpolicy_destroy(zc, innvl, cr); - return (error); -@@ -1009,3 +1221,3 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) - if (size == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1030,4 +1242,10 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) - -+/* -+ * Reduce the size of this nvlist until it can be serialized in 'max' bytes. 
-+ * Entries will be removed from the end of the nvlist, and one int32 entry -+ * named "N_MORE_ERRORS" will be added indicating how many entries were -+ * removed. -+ */ - static int --fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) -+nvlist_smush(nvlist_t *errors, size_t max) - { -@@ -1035,5 +1253,5 @@ fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) - -- VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); -+ size = fnvlist_size(errors); - -- if (size > zc->zc_nvlist_dst_size) { -+ if (size > max) { - nvpair_t *more_errors; -@@ -1041,21 +1259,19 @@ fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) - -- if (zc->zc_nvlist_dst_size < 1024) -- return (ENOMEM); -+ if (max < 1024) -+ return (SET_ERROR(ENOMEM)); - -- VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0); -- more_errors = nvlist_prev_nvpair(*errors, NULL); -+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); -+ more_errors = nvlist_prev_nvpair(errors, NULL); - - do { -- nvpair_t *pair = nvlist_prev_nvpair(*errors, -+ nvpair_t *pair = nvlist_prev_nvpair(errors, - more_errors); -- VERIFY(nvlist_remove_nvpair(*errors, pair) == 0); -+ fnvlist_remove_nvpair(errors, pair); - n++; -- VERIFY(nvlist_size(*errors, &size, -- NV_ENCODE_NATIVE) == 0); -- } while (size > zc->zc_nvlist_dst_size); -+ size = fnvlist_size(errors); -+ } while (size > max); - -- VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0); -- VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0); -- ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); -- ASSERT(size <= zc->zc_nvlist_dst_size); -+ fnvlist_remove_nvpair(errors, more_errors); -+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); -+ ASSERT3U(fnvlist_size(errors), <=, max); - } -@@ -1072,14 +1288,12 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) - -- VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); -+ size = fnvlist_size(nvl); - - if (size > zc->zc_nvlist_dst_size) { -- error = ENOMEM; -+ error = SET_ERROR(ENOMEM); - } else { -- packed = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); -- VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, -- KM_SLEEP) == 0); -+ packed = fnvlist_pack(nvl, &size); - if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size, zc->zc_iflags) != 0) -- error = EFAULT; -- kmem_free(packed, size); -+ error = SET_ERROR(EFAULT); -+ fnvlist_pack_free(packed, size); - } -@@ -1087,2 +1301,3 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) - zc->zc_nvlist_dst_size = size; -+ zc->zc_nvlist_dst_filled = B_TRUE; - return (error); -@@ -1097,3 +1312,3 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) - error = dmu_objset_hold(dsname, FTAG, &os); -- if (error) -+ if (error != 0) - return (error); -@@ -1101,3 +1316,3 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) - dmu_objset_rele(os, FTAG); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1109,3 +1324,3 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) - } else { -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } -@@ -1136,6 +1351,6 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer) - * thread should be just about to disassociate the -- * objset from the zfsvfs. -+ * objset from the zsb. 
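nvlist_smush(), introduced above, trims an errors nvlist from the tail until its packed form fits the caller's buffer and records how many entries were dropped. The following sketch models the same loop with a fixed per-entry size instead of a real nvlist (the byte counts are made up):

/* Sketch of the "smush" idea: drop trailing entries until the encoded size
 * fits, and remember how many were dropped (analogous to ZPROP_N_MORE_ERRORS).
 * The sizes are invented; the real code asks the nvlist for its packed size. */
#include <stdio.h>

#define HEADER_BYTES	16
#define ENTRY_BYTES	64

static size_t encoded_size(int nentries)
{
	return (HEADER_BYTES + (size_t)nentries * ENTRY_BYTES);
}

/* Returns the number of entries kept; *more gets the number dropped. */
static int smush(int nentries, size_t max, int *more)
{
	*more = 0;
	while (nentries > 0 && encoded_size(nentries) > max) {
		nentries--;          /* remove from the end */
		(*more)++;
	}
	return (nentries);
}

int main(void)
{
	int more, kept = smush(100, 1024, &more);

	printf("kept %d entries, %d more errors not reported\n", kept, more);
	return (0);
}

The real function also adds the "how many more" counter entry before measuring, and refuses buffers smaller than 1024 bytes with ENOMEM; the sketch keeps only the truncate-to-fit loop.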
- */ - rrw_exit(&(*zsbp)->z_teardown_lock, tag); -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - } -@@ -1165,3 +1380,2 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - nvlist_t *zplprops = NULL; -- char *buf; - -@@ -1185,3 +1399,3 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - if (!SPA_VERSION_IS_SUPPORTED(version)) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto pool_props_bad; -@@ -1201,3 +1415,3 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - zplprops, NULL); -- if (error) -+ if (error != 0) - goto pool_props_bad; -@@ -1205,5 +1419,3 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - -- buf = history_str_get(zc); -- -- error = spa_create(zc->zc_name, config, props, buf, zplprops); -+ error = spa_create(zc->zc_name, config, props, zplprops); - -@@ -1216,5 +1428,2 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - -- if (buf != NULL) -- history_str_free(buf); -- - pool_props_bad: -@@ -1259,3 +1468,3 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) - guid != zc->zc_guid) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - else -@@ -1299,3 +1508,3 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - -@@ -1363,3 +1572,3 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) - if (config == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1421,3 +1630,3 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc) - spa_close(spa, FTAG); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1439,3 +1648,3 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) - if ((size = zc->zc_history_len) == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1446,3 +1655,3 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) - spa_close(spa, FTAG); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1479,8 +1688,3 @@ zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) - { -- int error; -- -- if ((error = dsl_dsobj_to_dsname(zc->zc_name,zc->zc_obj,zc->zc_value))) -- return (error); -- -- return (0); -+ return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); - } -@@ -1506,3 +1710,3 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc) - dmu_objset_rele(os, FTAG); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1535,3 +1739,3 @@ zfs_ioc_obj_to_stats(zfs_cmd_t *zc) - dmu_objset_rele(os, FTAG); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1549,4 +1753,3 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) - int error; -- nvlist_t *config, **l2cache, **spares; -- uint_t nl2cache = 0, nspares = 0; -+ nvlist_t *config; - -@@ -1558,24 +1761,2 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) - zc->zc_iflags, &config); -- (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, -- &l2cache, &nl2cache); -- -- (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, -- &spares, &nspares); -- -- /* -- * A root pool with concatenated devices is not supported. -- * Thus, can not add a device to a root pool. -- * -- * Intent log device can not be added to a rootpool because -- * during mountroot, zil is replayed, a seperated log device -- * can not be accessed during the mountroot time. -- * -- * l2cache and spare devices are ok to be added to a rootpool. 
-- */ -- if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { -- nvlist_free(config); -- spa_close(spa, FTAG); -- return (EDOM); -- } -- - if (error == 0) { -@@ -1643,3 +1824,3 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc) - default: -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } -@@ -1801,11 +1982,10 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) - { -- objset_t *os = NULL; -+ objset_t *os; - int error; - -- if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) -- return (error); -- -- error = zfs_ioc_objset_stats_impl(zc, os); -- -- dmu_objset_rele(os, FTAG); -+ error = dmu_objset_hold(zc->zc_name, FTAG, &os); -+ if (error == 0) { -+ error = zfs_ioc_objset_stats_impl(zc, os); -+ dmu_objset_rele(os, FTAG); -+ } - -@@ -1830,9 +2010,5 @@ zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) - { -- objset_t *os = NULL; -- int error; -+ int error = 0; - nvlist_t *nv; - -- if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) -- return (error); -- - /* -@@ -1842,9 +2018,7 @@ zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) - */ -- if (!dsl_prop_get_hasrecvd(os)) { -- dmu_objset_rele(os, FTAG); -- return (ENOTSUP); -- } -+ if (!dsl_prop_get_hasrecvd(zc->zc_name)) -+ return (SET_ERROR(ENOTSUP)); - - if (zc->zc_nvlist_dst != 0 && -- (error = dsl_prop_get_received(os, &nv)) == 0) { -+ (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { - error = put_nvlist(zc, nv); -@@ -1853,3 +2027,2 @@ zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) - -- dmu_objset_rele(os, FTAG); - return (error); -@@ -1912,3 +2085,3 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) - } else { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } -@@ -1918,3 +2091,3 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) - --static boolean_t -+boolean_t - dataset_name_hidden(const char *name) -@@ -1959,3 +2132,3 @@ top: - if (error == ENOENT) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - return (error); -@@ -1968,16 +2141,2 @@ top: - -- /* -- * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 -- * but is not declared void because its called by dmu_objset_find(). -- */ -- if (zc->zc_cookie == 0) { -- uint64_t cookie = 0; -- int len = sizeof (zc->zc_name) - (p - zc->zc_name); -- -- while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { -- if (!dataset_name_hidden(zc->zc_name)) -- (void) dmu_objset_prefetch(zc->zc_name, NULL); -- } -- } -- - do { -@@ -1987,3 +2146,3 @@ top: - if (error == ENOENT) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } while (error == 0 && dataset_name_hidden(zc->zc_name)); -@@ -2024,10 +2183,6 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) - --top: -- if (zc->zc_cookie == 0 && !zc->zc_simple) -- (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, -- NULL, DS_FIND_SNAPSHOTS); -- - error = dmu_objset_hold(zc->zc_name, FTAG, &os); -- if (error) -+ if (error != 0) { - return (error == ENOENT ? ESRCH : error); -+ } - -@@ -2039,3 +2194,3 @@ top: - dmu_objset_rele(os, FTAG); -- return (ESRCH); -+ return (SET_ERROR(ESRCH)); - } -@@ -2051,20 +2206,4 @@ top: - -- /* -- * Since we probably don't have a hold on this snapshot, -- * it's possible that the objsetid could have been destroyed -- * and reused for a new objset. It's OK if this happens during -- * a zfs send operation, since the new createtxg will be -- * beyond the range we're interested in. -- */ -- rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- if (error) { -- if (error == ENOENT) { -- /* Racing with destroy, get the next one. 
*/ -- *strchr(zc->zc_name, '@') = '\0'; -- dmu_objset_rele(os, FTAG); -- goto top; -- } -- } else { -+ if (error == 0) { - objset_t *ossnap; -@@ -2077,3 +2216,3 @@ top: - } else if (error == ENOENT) { -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } -@@ -2082,3 +2221,3 @@ top: - /* if we failed, undo the @ that we tacked on to zc_name */ -- if (error) -+ if (error != 0) - *strchr(zc->zc_name, '@') = '\0'; -@@ -2106,3 +2245,3 @@ zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) - &pair) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2116,3 +2255,3 @@ zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) - vallen != 3) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2172,3 +2311,3 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - case ZFS_PROP_REFQUOTA: -- err = dsl_dataset_set_quota(dsname, source, intval); -+ err = dsl_dataset_set_refquota(dsname, source, intval); - break; -@@ -2178,3 +2317,3 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - case ZFS_PROP_REFRESERVATION: -- err = dsl_dataset_set_reservation(dsname, source, intval); -+ err = dsl_dataset_set_refreservation(dsname, source, intval); - break; -@@ -2213,3 +2352,2 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - spa_t *spa; -- dsl_pool_t *dp; - -@@ -2218,4 +2356,2 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - -- dp = spa->spa_dsl_pool; -- - /* -@@ -2225,3 +2361,3 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - if (!spa_feature_is_active(spa, feature)) { -- if ((err = zfs_prop_activate_feature(dp, -+ if ((err = zfs_prop_activate_feature(spa, - feature)) != 0) { -@@ -2251,10 +2387,9 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - * This function is best effort. If it fails to set any of the given properties, -- * it continues to set as many as it can and returns the first error -- * encountered. If the caller provides a non-NULL errlist, it also gives the -- * complete list of names of all the properties it failed to set along with the -- * corresponding error numbers. The caller is responsible for freeing the -- * returned errlist. -+ * it continues to set as many as it can and returns the last error -+ * encountered. If the caller provides a non-NULL errlist, it will be filled in -+ * with the list of names of all the properties that failed along with the -+ * corresponding error numbers. - * -- * If every property is set successfully, zero is returned and the list pointed -- * at by errlist is NULL. -+ * If every property is set successfully, zero is returned and errlist is not -+ * modified. 
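The comment above describes zfs_set_prop_nvlist() as best effort: it keeps going past failures, notes each failed property in errlist, and returns the last error seen. A self-contained sketch of those semantics, with set_one() standing in for the real per-property setters:

/* Sketch of best-effort property application with an error list. */
#include <stdio.h>
#include <errno.h>

struct prop { const char *name; long value; };
struct err  { const char *name; int error; };

static int set_one(const struct prop *p)
{
	return (p->value < 0 ? ERANGE : 0);     /* pretend negatives are invalid */
}

static int set_prop_list(const struct prop *props, int n,
    struct err *errlist, int *nerrs)
{
	int i, rv = 0;

	*nerrs = 0;
	for (i = 0; i < n; i++) {
		int err = set_one(&props[i]);

		if (err != 0) {
			errlist[*nerrs].name = props[i].name;
			errlist[*nerrs].error = err;
			(*nerrs)++;
			rv = err;               /* keep going; remember last error */
		}
	}
	return (rv);
}

int main(void)
{
	struct prop props[] = { { "quota", 1024 }, { "recordsize", -1 } };
	struct err errs[2];
	int nerrs, rv = set_prop_list(props, 2, errs, &nerrs);

	printf("rv=%d, %d failed (first: %s=%d)\n", rv, nerrs,
	    nerrs ? errs[0].name : "-", nerrs ? errs[0].error : 0);
	return (0);
}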
- */ -@@ -2262,3 +2397,3 @@ int - zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, -- nvlist_t **errlist) -+ nvlist_t *errlist) - { -@@ -2269,10 +2404,5 @@ zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, - char *strval; -- nvlist_t *genericnvl; -- nvlist_t *errors; -- nvlist_t *retrynvl; -- -- VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); -- VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); -- VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); - -+ nvlist_t *genericnvl = fnvlist_alloc(); -+ nvlist_t *retrynvl = fnvlist_alloc(); - retry: -@@ -2288,6 +2418,6 @@ retry: - nvlist_t *attrs; -- VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); -+ attrs = fnvpair_value_nvlist(pair); - if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &propval) != 0) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } -@@ -2298,3 +2428,3 @@ retry: - if (nvpair_type(propval) != DATA_TYPE_STRING) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else if (zfs_prop_userquota(propname)) { -@@ -2302,5 +2432,5 @@ retry: - DATA_TYPE_UINT64_ARRAY) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } -@@ -2309,3 +2439,3 @@ retry: - if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { -@@ -2313,4 +2443,3 @@ retry: - -- VERIFY(nvpair_value_uint64(propval, -- &intval) == 0); -+ intval = fnvpair_value_uint64(propval); - -@@ -2320,3 +2449,3 @@ retry: - case PROP_TYPE_STRING: -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - break; -@@ -2325,3 +2454,3 @@ retry: - intval, &unused) != 0) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - break; -@@ -2332,3 +2461,3 @@ retry: - } else { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } -@@ -2358,4 +2487,7 @@ retry: - -- if (err != 0) -- VERIFY(nvlist_add_int32(errors, propname, err) == 0); -+ if (err != 0) { -+ if (errlist != NULL) -+ fnvlist_add_int32(errlist, propname, err); -+ rv = err; -+ } - } -@@ -2381,5 +2513,5 @@ retry: - nvlist_t *attrs; -- VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); -- VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, -- &propval) == 0); -+ attrs = fnvpair_value_nvlist(pair); -+ propval = fnvlist_lookup_nvpair(attrs, -+ ZPROP_VALUE); - } -@@ -2387,11 +2519,9 @@ retry: - if (nvpair_type(propval) == DATA_TYPE_STRING) { -- VERIFY(nvpair_value_string(propval, -- &strval) == 0); -- err = dsl_prop_set(dsname, propname, source, 1, -- strlen(strval) + 1, strval); -+ strval = fnvpair_value_string(propval); -+ err = dsl_prop_set_string(dsname, propname, -+ source, strval); - } else { -- VERIFY(nvpair_value_uint64(propval, -- &intval) == 0); -- err = dsl_prop_set(dsname, propname, source, 8, -- 1, &intval); -+ intval = fnvpair_value_uint64(propval); -+ err = dsl_prop_set_int(dsname, propname, source, -+ intval); - } -@@ -2399,4 +2529,7 @@ retry: - if (err != 0) { -- VERIFY(nvlist_add_int32(errors, propname, -- err) == 0); -+ if (errlist != NULL) { -+ fnvlist_add_int32(errlist, propname, -+ err); -+ } -+ rv = err; - } -@@ -2407,14 +2540,2 @@ retry: - -- if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { -- nvlist_free(errors); -- errors = NULL; -- } else { -- VERIFY(nvpair_value_int32(pair, &rv) == 0); -- } -- -- if (errlist == NULL) -- nvlist_free(errors); -- else -- *errlist = errors; -- - return (rv); -@@ -2426,3 +2547,3 @@ retry: - static int --zfs_check_userprops(char *fsname, nvlist_t *nvl) -+zfs_check_userprops(const char 
*fsname, nvlist_t *nvl) - { -@@ -2437,3 +2558,3 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl) - nvpair_type(pair) != DATA_TYPE_STRING) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2444,3 +2565,3 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl) - if (strlen(propname) >= ZAP_MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - -@@ -2448,3 +2569,3 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl) - if (strlen(valstr) >= ZAP_MAXVALUELEN) -- return (E2BIG); -+ return (SET_ERROR(E2BIG)); - } -@@ -2470,3 +2591,3 @@ props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) - static int --clear_received_props(objset_t *os, const char *fs, nvlist_t *props, -+clear_received_props(const char *dsname, nvlist_t *props, - nvlist_t *skipped) -@@ -2482,4 +2603,4 @@ clear_received_props(objset_t *os, const char *fs, nvlist_t *props, - zprop_source_t flags = (ZPROP_SRC_NONE | -- (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0)); -- err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL); -+ (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); -+ err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); - } -@@ -2506,3 +2627,3 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) - ZPROP_SRC_LOCAL); -- nvlist_t *errors = NULL; -+ nvlist_t *errors; - int error; -@@ -2515,17 +2636,15 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) - nvlist_t *origprops; -- objset_t *os; -- -- if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) { -- if (dsl_prop_get_received(os, &origprops) == 0) { -- (void) clear_received_props(os, -- zc->zc_name, origprops, nvl); -- nvlist_free(origprops); -- } - -- dsl_prop_set_hasrecvd(os); -- dmu_objset_rele(os, FTAG); -+ if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { -+ (void) clear_received_props(zc->zc_name, -+ origprops, nvl); -+ nvlist_free(origprops); - } -+ -+ error = dsl_prop_set_hasrecvd(zc->zc_name); - } - -- error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors); -+ errors = fnvlist_alloc(); -+ if (error == 0) -+ error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); - -@@ -2570,3 +2689,3 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) - if (!zfs_prop_user(propname)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2575,3 +2694,3 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) - prop == ZFS_PROP_VERSION) { -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else { -@@ -2592,3 +2711,3 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) - nvlist_free(dummy); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2608,7 +2727,7 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) - if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } - -- /* the property name has been validated by zfs_secpolicy_inherit() */ -- return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL)); -+ /* property name has been validated by zfs_secpolicy_inherit_prop() */ -+ return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); - } -@@ -2685,3 +2804,3 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) - else -- error = EFAULT; -+ error = SET_ERROR(EFAULT); - -@@ -2693,26 +2812,2 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) - * inputs: -- * zc_name name of volume -- * -- * outputs: none -- */ --static int --zfs_ioc_create_minor(zfs_cmd_t *zc) --{ -- return (zvol_create_minor(zc->zc_name)); --} -- --/* -- * inputs: -- * zc_name name of volume -- * -- * outputs: none -- */ --static int --zfs_ioc_remove_minor(zfs_cmd_t *zc) --{ -- return (zvol_remove_minor(zc->zc_name)); --} -- --/* -- * inputs: - * zc_name name of 
filesystem -@@ -2738,3 +2833,3 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) - nvlist_free(fsaclnv); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2748,3 +2843,3 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) - error = secpolicy_zfs(CRED()); -- if (error) { -+ if (error != 0) { - if (zc->zc_perm_action == B_FALSE) { -@@ -2799,6 +2894,6 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) - * inputs: -- * createprops list of properties requested by creator -- * default_zplver zpl version to use if unspecified in createprops -- * fuids_ok fuids allowed in this version of the spa? - * os parent objset pointer (NULL if root fs) -+ * fuids_ok fuids allowed in this version of the spa? -+ * sa_ok SAs allowed in this version of the spa? -+ * createprops list of properties requested by creator - * -@@ -2861,3 +2956,3 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, - sense != ZFS_PROP_UNDEFINED))) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -2957,22 +3052,26 @@ zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, - /* -- * inputs: -- * zc_objset_type type of objset to create (fs vs zvol) -- * zc_name name of new objset -- * zc_value name of snapshot to clone from (may be empty) -- * zc_nvlist_src{_size} nvlist of properties to apply -+ * innvl: { -+ * "type" -> dmu_objset_type_t (int32) -+ * (optional) "props" -> { prop -> value } -+ * } - * -- * outputs: none -+ * outnvl: propname -> error code (int32) - */ - static int --zfs_ioc_create(zfs_cmd_t *zc) -+zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) - { -- objset_t *clone; - int error = 0; -- zfs_creat_t zct; -+ zfs_creat_t zct = { 0 }; - nvlist_t *nvprops = NULL; - void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -- dmu_objset_type_t type = zc->zc_objset_type; -+ int32_t type32; -+ dmu_objset_type_t type; -+ boolean_t is_insensitive = B_FALSE; - -- switch (type) { -+ if (nvlist_lookup_int32(innvl, "type", &type32) != 0) -+ return (SET_ERROR(EINVAL)); -+ type = type32; -+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); - -+ switch (type) { - case DMU_OST_ZFS: -@@ -2989,96 +3088,104 @@ zfs_ioc_create(zfs_cmd_t *zc) - } -- if (strchr(zc->zc_name, '@') || -- strchr(zc->zc_name, '%')) -- return (EINVAL); -- -- if (zc->zc_nvlist_src != 0 && -- (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, -- zc->zc_iflags, &nvprops)) != 0) -- return (error); -+ if (strchr(fsname, '@') || -+ strchr(fsname, '%')) -+ return (SET_ERROR(EINVAL)); - -- zct.zct_zplprops = NULL; - zct.zct_props = nvprops; - -- if (zc->zc_value[0] != '\0') { -- /* -- * We're creating a clone of an existing snapshot. 
-- */ -- zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; -- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { -- nvlist_free(nvprops); -- return (EINVAL); -- } -+ if (cbfunc == NULL) -+ return (SET_ERROR(EINVAL)); -+ -+ if (type == DMU_OST_ZVOL) { -+ uint64_t volsize, volblocksize; -+ -+ if (nvprops == NULL) -+ return (SET_ERROR(EINVAL)); -+ if (nvlist_lookup_uint64(nvprops, -+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) -+ return (SET_ERROR(EINVAL)); - -- error = dmu_objset_hold(zc->zc_value, FTAG, &clone); -- if (error) { -- nvlist_free(nvprops); -+ if ((error = nvlist_lookup_uint64(nvprops, -+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), -+ &volblocksize)) != 0 && error != ENOENT) -+ return (SET_ERROR(EINVAL)); -+ -+ if (error != 0) -+ volblocksize = zfs_prop_default_numeric( -+ ZFS_PROP_VOLBLOCKSIZE); -+ -+ if ((error = zvol_check_volblocksize( -+ volblocksize)) != 0 || -+ (error = zvol_check_volsize(volsize, -+ volblocksize)) != 0) - return (error); -- } -+ } else if (type == DMU_OST_ZFS) { -+ int error; - -- error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0); -- dmu_objset_rele(clone, FTAG); -- if (error) { -- nvlist_free(nvprops); -+ /* -+ * We have to have normalization and -+ * case-folding flags correct when we do the -+ * file system creation, so go figure them out -+ * now. -+ */ -+ VERIFY(nvlist_alloc(&zct.zct_zplprops, -+ NV_UNIQUE_NAME, KM_SLEEP) == 0); -+ error = zfs_fill_zplprops(fsname, nvprops, -+ zct.zct_zplprops, &is_insensitive); -+ if (error != 0) { -+ nvlist_free(zct.zct_zplprops); - return (error); - } -- } else { -- boolean_t is_insensitive = B_FALSE; -+ } - -- if (cbfunc == NULL) { -- nvlist_free(nvprops); -- return (EINVAL); -- } -+ error = dmu_objset_create(fsname, type, -+ is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); -+ nvlist_free(zct.zct_zplprops); - -- if (type == DMU_OST_ZVOL) { -- uint64_t volsize, volblocksize; -+ /* -+ * It would be nice to do this atomically. 
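As the "It would be nice to do this atomically" comment notes, creation and property application are two separate steps here; when the second step fails, the newly created dataset is destroyed again. The shape of that pattern, with all helper functions as invented stand-ins:

/* Sketch of create-then-configure with rollback on failure. */
#include <stdio.h>
#include <errno.h>

static int create_dataset(const char *name)        { printf("create %s\n", name); return (0); }
static int apply_props(const char *name, int bad)  { return (bad ? EINVAL : 0); }
static void destroy_dataset(const char *name)      { printf("destroy %s\n", name); }

static int create_with_props(const char *name, int bad_props)
{
	int error = create_dataset(name);

	if (error == 0) {
		error = apply_props(name, bad_props);
		if (error != 0)
			destroy_dataset(name);  /* undo the partial creation */
	}
	return (error);
}

int main(void)
{
	printf("good props -> %d\n", create_with_props("tank/a", 0));
	printf("bad props  -> %d\n", create_with_props("tank/b", 1));
	return (0);
}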
-+ */ -+ if (error == 0) { -+ error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, -+ nvprops, outnvl); -+ if (error != 0) -+ (void) dsl_destroy_head(fsname); -+ } - -- if (nvprops == NULL || -- nvlist_lookup_uint64(nvprops, -- zfs_prop_to_name(ZFS_PROP_VOLSIZE), -- &volsize) != 0) { -- nvlist_free(nvprops); -- return (EINVAL); -- } -+#ifdef _KERNEL -+ if (error == 0 && type == DMU_OST_ZVOL) -+ zvol_create_minors(fsname); -+#endif - -- if ((error = nvlist_lookup_uint64(nvprops, -- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), -- &volblocksize)) != 0 && error != ENOENT) { -- nvlist_free(nvprops); -- return (EINVAL); -- } -+ return (error); -+} - -- if (error != 0) -- volblocksize = zfs_prop_default_numeric( -- ZFS_PROP_VOLBLOCKSIZE); -+/* -+ * innvl: { -+ * "origin" -> name of origin snapshot -+ * (optional) "props" -> { prop -> value } -+ * } -+ * -+ * outputs: -+ * outnvl: propname -> error code (int32) -+ */ -+static int -+zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -+{ -+ int error = 0; -+ nvlist_t *nvprops = NULL; -+ char *origin_name; - -- if ((error = zvol_check_volblocksize( -- volblocksize)) != 0 || -- (error = zvol_check_volsize(volsize, -- volblocksize)) != 0) { -- nvlist_free(nvprops); -- return (error); -- } -- } else if (type == DMU_OST_ZFS) { -- int error; -+ if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) -+ return (SET_ERROR(EINVAL)); -+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); - -- /* -- * We have to have normalization and -- * case-folding flags correct when we do the -- * file system creation, so go figure them out -- * now. -- */ -- VERIFY(nvlist_alloc(&zct.zct_zplprops, -- NV_UNIQUE_NAME, KM_SLEEP) == 0); -- error = zfs_fill_zplprops(zc->zc_name, nvprops, -- zct.zct_zplprops, &is_insensitive); -- if (error != 0) { -- nvlist_free(nvprops); -- nvlist_free(zct.zct_zplprops); -- return (error); -- } -- } -- error = dmu_objset_create(zc->zc_name, type, -- is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); -- nvlist_free(zct.zct_zplprops); -- } -+ if (strchr(fsname, '@') || -+ strchr(fsname, '%')) -+ return (SET_ERROR(EINVAL)); -+ -+ if (dataset_namecheck(origin_name, NULL, NULL) != 0) -+ return (SET_ERROR(EINVAL)); -+ error = dmu_objset_clone(fsname, origin_name); -+ if (error != 0) -+ return (error); - -@@ -3088,8 +3195,13 @@ zfs_ioc_create(zfs_cmd_t *zc) - if (error == 0) { -- error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, -- nvprops, NULL); -+ error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, -+ nvprops, outnvl); - if (error != 0) -- (void) dmu_objset_destroy(zc->zc_name, B_FALSE); -+ (void) dsl_destroy_head(fsname); - } -- nvlist_free(nvprops); -+ -+#ifdef _KERNEL -+ if (error == 0) -+ zvol_create_minors(fsname); -+#endif -+ - return (error); -@@ -3098,41 +3210,64 @@ zfs_ioc_create(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of filesystem -- * zc_value short name of snapshot -- * zc_cookie recursive flag -- * zc_nvlist_src[_size] property list -+ * innvl: { -+ * "snaps" -> { snapshot1, snapshot2 } -+ * (optional) "props" -> { prop -> value (string) } -+ * } - * -- * outputs: -- * zc_value short snapname (i.e. 
part after the '@') -+ * outnvl: snapshot -> error code (int32) - */ - static int --zfs_ioc_snapshot(zfs_cmd_t *zc) -+zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) - { -- nvlist_t *nvprops = NULL; -- int error; -- boolean_t recursive = zc->zc_cookie; -- -- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) -- return (EINVAL); -+ nvlist_t *snaps; -+ nvlist_t *props = NULL; -+ int error, poollen; -+ nvpair_t *pair, *pair2; - -- if (zc->zc_nvlist_src != 0 && -- (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, -- zc->zc_iflags, &nvprops)) != 0) -+ (void) nvlist_lookup_nvlist(innvl, "props", &props); -+ if ((error = zfs_check_userprops(poolname, props)) != 0) - return (error); - -- error = zfs_check_userprops(zc->zc_name, nvprops); -- if (error) -- goto out; -+ if (!nvlist_empty(props) && -+ zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) -+ return (SET_ERROR(ENOTSUP)); - -- if (!nvlist_empty(nvprops) && -- zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { -- error = ENOTSUP; -- goto out; -+ if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) -+ return (SET_ERROR(EINVAL)); -+ poollen = strlen(poolname); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { -+ const char *name = nvpair_name(pair); -+ const char *cp = strchr(name, '@'); -+ -+ /* -+ * The snap name must contain an @, and the part after it must -+ * contain only valid characters. -+ */ -+ if (cp == NULL || snapshot_namecheck(cp + 1, NULL, NULL) != 0) -+ return (SET_ERROR(EINVAL)); -+ -+ /* -+ * The snap must be in the specified pool. -+ */ -+ if (strncmp(name, poolname, poollen) != 0 || -+ (name[poollen] != '/' && name[poollen] != '@')) -+ return (SET_ERROR(EXDEV)); -+ -+ /* This must be the only snap of this fs. */ -+ for (pair2 = nvlist_next_nvpair(snaps, pair); -+ pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { -+ if (strncmp(name, nvpair_name(pair2), cp - name + 1) -+ == 0) { -+ return (SET_ERROR(EXDEV)); -+ } -+ } - } - -- error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL, -- nvprops, recursive, B_FALSE, -1); -+ error = dsl_dataset_snapshot(snaps, props, outnvl); -+ -+#ifdef _KERNEL -+ if (error == 0) -+ zvol_create_minors(poolname); -+#endif - --out: -- nvlist_free(nvprops); - return (error); -@@ -3141,8 +3276,59 @@ out: - /* -- * inputs: -- * name dataset name, or when 'arg == NULL' the full snapshot name -- * arg short snapshot name (i.e. part after the '@') -+ * innvl: "message" -> string -+ */ -+/* ARGSUSED */ -+static int -+zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) -+{ -+ char *message; -+ spa_t *spa; -+ int error; -+ char *poolname; -+ -+ /* -+ * The poolname in the ioctl is not set, we get it from the TSD, -+ * which was set at the end of the last successful ioctl that allows -+ * logging. The secpolicy func already checked that it is set. -+ * Only one log ioctl is allowed after each successful ioctl, so -+ * we clear the TSD here. 
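zfs_ioc_snapshot() above rejects a request unless every name contains '@', lives in the pool the ioctl was addressed to, and names a distinct filesystem within the request. A standalone sketch of those three checks:

/* Sketch of the snapshot-request validation described above. */
#include <stdio.h>
#include <string.h>
#include <errno.h>

static int validate_snaps(const char *pool, const char *names[], int n)
{
	size_t poollen = strlen(pool);
	int i, j;

	for (i = 0; i < n; i++) {
		const char *cp = strchr(names[i], '@');

		if (cp == NULL)
			return (EINVAL);        /* not a snapshot name */

		/* Must be "<pool>@..." or "<pool>/...@..." */
		if (strncmp(names[i], pool, poollen) != 0 ||
		    (names[i][poollen] != '/' && names[i][poollen] != '@'))
			return (EXDEV);

		/* Only one snapshot of any given filesystem per request. */
		for (j = i + 1; j < n; j++) {
			if (strncmp(names[i], names[j],
			    (size_t)(cp - names[i]) + 1) == 0)
				return (EXDEV);
		}
	}
	return (0);
}

int main(void)
{
	const char *ok[]  = { "tank/a@s1", "tank/b@s1" };
	const char *dup[] = { "tank/a@s1", "tank/a@s2" };

	printf("ok  -> %d\n", validate_snaps("tank", ok, 2));
	printf("dup -> %d\n", validate_snaps("tank", dup, 2));
	return (0);
}

Note that the duplicate check compares up to and including the '@', so "tank/a@s1" and "tank/ab@s1" are correctly treated as snapshots of different filesystems.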
-+ */ -+ poolname = tsd_get(zfs_allow_log_key); -+ (void) tsd_set(zfs_allow_log_key, NULL); -+ error = spa_open(poolname, &spa, FTAG); -+ strfree(poolname); -+ if (error != 0) -+ return (error); -+ -+ if (nvlist_lookup_string(innvl, "message", &message) != 0) { -+ spa_close(spa, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } -+ -+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { -+ spa_close(spa, FTAG); -+ return (SET_ERROR(ENOTSUP)); -+ } -+ -+ error = spa_history_log(spa, message); -+ spa_close(spa, FTAG); -+ return (error); -+} -+ -+/* -+ * The dp_config_rwlock must not be held when calling this, because the -+ * unmount may need to write out data. -+ * -+ * This function is best-effort. Callers must deal gracefully if it -+ * remains mounted (or is remounted after this call). -+ * -+ * XXX: This function should detect a failure to unmount a snapdir of a dataset -+ * and return the appropriate error code when it is mounted. Its Illumos and -+ * FreeBSD counterparts do this. We do not do this on Linux because there is no -+ * clear way to access the mount information that FreeBSD and Illumos use to -+ * distinguish between things with mounted snapshot directories, and things -+ * without mounted snapshot directories, which include zvols. Returning a -+ * failure for the latter causes `zfs destroy` to fail on zvol snapshots. - */ - int --zfs_unmount_snap(const char *name, void *arg) -+zfs_unmount_snap(const char *snapname) - { -@@ -3150,38 +3336,29 @@ zfs_unmount_snap(const char *name, void *arg) - char *dsname; -- char *snapname; - char *fullname; - char *ptr; -- int error; - -- if (arg) { -- dsname = strdup(name); -- snapname = strdup(arg); -- } else { -- ptr = strchr(name, '@'); -- if (ptr) { -- dsname = strdup(name); -- dsname[ptr - name] = '\0'; -- snapname = strdup(ptr + 1); -- } else { -- return (0); -- } -- } -+ if ((ptr = strchr(snapname, '@')) == NULL) -+ return (0); - -- fullname = kmem_asprintf("%s@%s", dsname, snapname); -+ dsname = kmem_alloc(ptr - snapname + 1, KM_SLEEP); -+ strlcpy(dsname, snapname, ptr - snapname + 1); -+ fullname = strdup(snapname); - -- error = zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE); -- if (error == 0) { -- error = zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE); -+ if (zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE) == 0) { -+ ASSERT(!dsl_pool_config_held(dmu_objset_pool(zsb->z_os))); -+ (void) zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE); - zfs_sb_rele(zsb, FTAG); -- -- /* Allow ENOENT for consistency with upstream */ -- if (error == ENOENT) -- error = 0; - } - -- strfree(dsname); -- strfree(snapname); -+ kmem_free(dsname, ptr - snapname + 1); - strfree(fullname); - -- return (error); -+ return (0); -+} -+ -+/* ARGSUSED */ -+static int -+zfs_unmount_snap_cb(const char *snapname, void *arg) -+{ -+ return (zfs_unmount_snap(snapname)); - } -@@ -3189,36 +3366,62 @@ zfs_unmount_snap(const char *name, void *arg) - /* -- * inputs: -- * zc_name name of filesystem, snaps must be under it -- * zc_nvlist_src[_size] full names of snapshots to destroy -- * zc_defer_destroy mark for deferred destroy -+ * When a clone is destroyed, its origin may also need to be destroyed, -+ * in which case it must be unmounted. This routine will do that unmount -+ * if necessary. 
-+ */ -+void -+zfs_destroy_unmount_origin(const char *fsname) -+{ -+ int error; -+ objset_t *os; -+ dsl_dataset_t *ds; -+ -+ error = dmu_objset_hold(fsname, FTAG, &os); -+ if (error != 0) -+ return; -+ ds = dmu_objset_ds(os); -+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { -+ char originname[MAXNAMELEN]; -+ dsl_dataset_name(ds->ds_prev, originname); -+ dmu_objset_rele(os, FTAG); -+ (void) zfs_unmount_snap(originname); -+ } else { -+ dmu_objset_rele(os, FTAG); -+ } -+} -+ -+/* -+ * innvl: { -+ * "snaps" -> { snapshot1, snapshot2 } -+ * (optional boolean) "defer" -+ * } - * -- * outputs: -- * zc_name on failure, name of failed snapshot -+ * outnvl: snapshot -> error code (int32) - */ - static int --zfs_ioc_destroy_snaps_nvl(zfs_cmd_t *zc) -+zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) - { -- int err, len; -- nvlist_t *nvl; -+ int error, poollen; -+ nvlist_t *snaps; - nvpair_t *pair; -+ boolean_t defer; - -- if ((err = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, -- zc->zc_iflags, &nvl)) != 0) -- return (err); -+ if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) -+ return (SET_ERROR(EINVAL)); -+ defer = nvlist_exists(innvl, "defer"); - -- len = strlen(zc->zc_name); -- for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; -- pair = nvlist_next_nvpair(nvl, pair)) { -+ poollen = strlen(poolname); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { - const char *name = nvpair_name(pair); -+ - /* -- * The snap name must be underneath the zc_name. This ensures -- * that our permission checks were legitimate. -+ * The snap must be in the specified pool. - */ -- if (strncmp(zc->zc_name, name, len) != 0 || -- (name[len] != '@' && name[len] != '/')) { -- nvlist_free(nvl); -- return (EINVAL); -- } -+ if (strncmp(name, poolname, poollen) != 0 || -+ (name[poollen] != '/' && name[poollen] != '@')) -+ return (SET_ERROR(EXDEV)); - -- (void) zfs_unmount_snap(name, NULL); -+ error = zfs_unmount_snap(name); -+ if (error != 0) -+ return (error); - (void) zvol_remove_minor(name); -@@ -3226,6 +3429,3 @@ zfs_ioc_destroy_snaps_nvl(zfs_cmd_t *zc) - -- err = dmu_snapshots_destroy_nvl(nvl, zc->zc_defer_destroy, -- zc->zc_name); -- nvlist_free(nvl); -- return (err); -+ return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); - } -@@ -3244,5 +3444,6 @@ zfs_ioc_destroy(zfs_cmd_t *zc) - int err; -- if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { -- err = zfs_unmount_snap(zc->zc_name, NULL); -- if (err) -+ -+ if (zc->zc_objset_type == DMU_OST_ZFS) { -+ err = zfs_unmount_snap(zc->zc_name); -+ if (err != 0) - return (err); -@@ -3250,3 +3451,6 @@ zfs_ioc_destroy(zfs_cmd_t *zc) - -- err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); -+ if (strchr(zc->zc_name, '@')) -+ err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); -+ else -+ err = dsl_destroy_head(zc->zc_name); - if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) -@@ -3257,47 +3461,17 @@ zfs_ioc_destroy(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of dataset to rollback (to most recent snapshot) -+ * fsname is name of dataset to rollback (to most recent snapshot) - * -- * outputs: none -+ * innvl is not used. 
-+ * -+ * outnvl: "target" -> name of most recent snapshot -+ * } - */ -+/* ARGSUSED */ - static int --zfs_ioc_rollback(zfs_cmd_t *zc) -+zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl) - { -- dsl_dataset_t *ds, *clone; -- int error; - zfs_sb_t *zsb; -- char *clone_name; -- -- error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); -- if (error) -- return (error); -- -- /* must not be a snapshot */ -- if (dsl_dataset_is_snapshot(ds)) { -- dsl_dataset_rele(ds, FTAG); -- return (EINVAL); -- } -- -- /* must have a most recent snapshot */ -- if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { -- dsl_dataset_rele(ds, FTAG); -- return (EINVAL); -- } -- -- /* -- * Create clone of most recent snapshot. -- */ -- clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); -- error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); -- if (error) -- goto out; -- -- error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); -- if (error) -- goto out; -+ int error; - -- /* -- * Do clone swap. -- */ -- if (get_zfs_sb(zc->zc_name, &zsb) == 0) { -+ if (get_zfs_sb(fsname, &zsb) == 0) { - error = zfs_suspend_fs(zsb); -@@ -3306,11 +3480,4 @@ zfs_ioc_rollback(zfs_cmd_t *zc) - -- if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { -- error = dsl_dataset_clone_swap(clone, ds, -- B_TRUE); -- dsl_dataset_disown(ds, FTAG); -- ds = NULL; -- } else { -- error = EBUSY; -- } -- resume_err = zfs_resume_fs(zsb, zc->zc_name); -+ error = dsl_dataset_rollback(fsname, zsb, outnvl); -+ resume_err = zfs_resume_fs(zsb, fsname); - error = error ? error : resume_err; -@@ -3319,20 +3486,18 @@ zfs_ioc_rollback(zfs_cmd_t *zc) - } else { -- if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { -- error = dsl_dataset_clone_swap(clone, ds, B_TRUE); -- dsl_dataset_disown(ds, FTAG); -- ds = NULL; -- } else { -- error = EBUSY; -- } -+ error = dsl_dataset_rollback(fsname, NULL, outnvl); - } -+ return (error); -+} - -- /* -- * Destroy clone (which also closes it). -- */ -- (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); -+static int -+recursive_unmount(const char *fsname, void *arg) -+{ -+ const char *snapname = arg; -+ char *fullname; -+ int error; -+ -+ fullname = kmem_asprintf("%s@%s", fsname, snapname); -+ error = zfs_unmount_snap(fullname); -+ strfree(fullname); - --out: -- strfree(clone_name); -- if (ds) -- dsl_dataset_rele(ds, FTAG); - return (error); -@@ -3352,3 +3517,3 @@ zfs_ioc_rename(zfs_cmd_t *zc) - boolean_t recursive = zc->zc_cookie & 1; -- int err; -+ char *at; - -@@ -3357,23 +3522,29 @@ zfs_ioc_rename(zfs_cmd_t *zc) - strchr(zc->zc_value, '%')) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -- /* -- * Unmount snapshot unless we're doing a recursive rename, -- * in which case the dataset code figures out which snapshots -- * to unmount. -- */ -- if (!recursive && strchr(zc->zc_name, '@') != NULL && -- zc->zc_objset_type == DMU_OST_ZFS) { -- err = zfs_unmount_snap(zc->zc_name, NULL); -- if (err) -- return (err); -- } -+ at = strchr(zc->zc_name, '@'); -+ if (at != NULL) { -+ /* snaps must be in same fs */ -+ int error; - -- err = dmu_objset_rename(zc->zc_name, zc->zc_value, recursive); -- if ((err == 0) && (zc->zc_objset_type == DMU_OST_ZVOL)) { -- (void) zvol_remove_minor(zc->zc_name); -- (void) zvol_create_minor(zc->zc_value); -- } -+ if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) -+ return (SET_ERROR(EXDEV)); -+ *at = '\0'; -+ if (zc->zc_objset_type == DMU_OST_ZFS) { -+ error = dmu_objset_find(zc->zc_name, -+ recursive_unmount, at + 1, -+ recursive ? 
DS_FIND_CHILDREN : 0); -+ if (error != 0) { -+ *at = '@'; -+ return (error); -+ } -+ } -+ error = dsl_dataset_rename_snapshot(zc->zc_name, -+ at + 1, strchr(zc->zc_value, '@') + 1, recursive); -+ *at = '@'; - -- return (err); -+ return (error); -+ } else { -+ return (dsl_dir_rename(zc->zc_name, zc->zc_value)); -+ } - } -@@ -3412,3 +3583,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - /* USERUSED and GROUPUSED are read-only */ -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -3420,3 +3591,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -3424,3 +3595,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - if (issnap) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -3453,3 +3624,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - SPA_VERSION_GZIP_COMPRESSION)) { -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -3459,3 +3630,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - SPA_VERSION_ZLE_COMPRESSION)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -3472,3 +3643,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - spa_close(spa, FTAG); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -3486,3 +3657,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - !BOOTFS_COMPRESS_VALID(intval)) { -- return (ERANGE); -+ return (SET_ERROR(ERANGE)); - } -@@ -3493,3 +3664,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - break; -@@ -3498,3 +3669,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - break; -@@ -3503,3 +3674,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - break; -@@ -3512,3 +3683,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - SPA_VERSION_PASSTHROUGH_X)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -3523,22 +3694,2 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - /* -- * Activates a feature on a pool in response to a property setting. This -- * creates a new sync task which modifies the pool to reflect the feature -- * as being active. 
-- */ --static int --zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature) --{ -- int err; -- -- /* EBUSY here indicates that the feature is already active */ -- err = dsl_sync_task_do(dp, zfs_prop_activate_feature_check, -- zfs_prop_activate_feature_sync, dp->dp_spa, feature, 2); -- -- if (err != 0 && err != EBUSY) -- return (err); -- else -- return (0); --} -- --/* - * Checks for a race condition to make sure we don't increment a feature flag -@@ -3546,8 +3697,7 @@ zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature) - */ --/*ARGSUSED*/ - static int --zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) -+zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - -@@ -3556,3 +3706,3 @@ zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) - else -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - } -@@ -3564,6 +3714,6 @@ zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --zfs_prop_activate_feature_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - -@@ -3573,2 +3723,23 @@ zfs_prop_activate_feature_sync(void *arg1, void *arg2, dmu_tx_t *tx) - /* -+ * Activates a feature on a pool in response to a property setting. This -+ * creates a new sync task which modifies the pool to reflect the feature -+ * as being active. -+ */ -+static int -+zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature) -+{ -+ int err; -+ -+ /* EBUSY here indicates that the feature is already active */ -+ err = dsl_sync_task(spa_name(spa), -+ zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, -+ feature, 2); -+ -+ if (err != 0 && err != EBUSY) -+ return (err); -+ else -+ return (0); -+} -+ -+/* - * Removes properties from the given props list that fail permission checks -@@ -3607,3 +3778,3 @@ zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) - if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || -- (err = zfs_secpolicy_inherit(zc, CRED())) != 0) { -+ (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { - VERIFY(nvlist_remove_nvpair(props, pair) == 0); -@@ -3726,3 +3897,2 @@ zfs_ioc_recv(zfs_cmd_t *zc) - file_t *fp; -- objset_t *os; - dmu_recv_cookie_t drc; -@@ -3736,3 +3906,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - nvlist_t *origprops = NULL; /* existing properties */ -- objset_t *origin = NULL; -+ char *origin = NULL; - char *tosnap; -@@ -3744,3 +3914,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - strchr(zc->zc_value, '%')) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -3759,3 +3929,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - nvlist_free(props); -- return (EBADF); -+ return (SET_ERROR(EBADF)); - } -@@ -3764,7 +3934,20 @@ zfs_ioc_recv(zfs_cmd_t *zc) - -- if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) { -- if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) && -- !dsl_prop_get_hasrecvd(os)) { -+ if (zc->zc_string[0]) -+ origin = zc->zc_string; -+ -+ error = dmu_recv_begin(tofs, tosnap, -+ &zc->zc_begin_record, force, origin, &drc); -+ if (error != 0) -+ goto out; -+ -+ /* -+ * Set properties before we receive the stream so that they are applied -+ * to the new data. Note that we must call dmu_recv_stream() if -+ * dmu_recv_begin() succeeds. 
-+ */ -+ if (props != NULL && !drc.drc_newfs) { -+ if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= -+ SPA_VERSION_RECVD_PROPS && -+ !dsl_prop_get_hasrecvd(tofs)) - first_recvd_props = B_TRUE; -- } - -@@ -3775,3 +3958,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - */ -- if (dsl_prop_get_received(os, &origprops) == 0) { -+ if (dsl_prop_get_received(tofs, &origprops) == 0) { - nvlist_t *errlist = NULL; -@@ -3787,56 +3970,26 @@ zfs_ioc_recv(zfs_cmd_t *zc) - props_reduce(props, origprops); -- if (zfs_check_clearable(tofs, origprops, -- &errlist) != 0) -+ if (zfs_check_clearable(tofs, origprops, &errlist) != 0) - (void) nvlist_merge(errors, errlist, 0); - nvlist_free(errlist); -- } -- -- dmu_objset_rele(os, FTAG); -- } -- -- if (zc->zc_string[0]) { -- error = dmu_objset_hold(zc->zc_string, FTAG, &origin); -- if (error) -- goto out; -- } - -- error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds, -- &zc->zc_begin_record, force, origin, &drc); -- if (origin) -- dmu_objset_rele(origin, FTAG); -- if (error) -- goto out; -- -- /* -- * Set properties before we receive the stream so that they are applied -- * to the new data. Note that we must call dmu_recv_stream() if -- * dmu_recv_begin() succeeds. -- */ -- if (props) { -- nvlist_t *errlist; -- -- if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) { -- if (drc.drc_newfs) { -- if (spa_version(os->os_spa) >= -- SPA_VERSION_RECVD_PROPS) -- first_recvd_props = B_TRUE; -- } else if (origprops != NULL) { -- if (clear_received_props(os, tofs, origprops, -- first_recvd_props ? NULL : props) != 0) -- zc->zc_obj |= ZPROP_ERR_NOCLEAR; -- } else { -+ if (clear_received_props(tofs, origprops, -+ first_recvd_props ? NULL : props) != 0) - zc->zc_obj |= ZPROP_ERR_NOCLEAR; -- } -- dsl_prop_set_hasrecvd(os); -- } else if (!drc.drc_newfs) { -+ } else { - zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } -+ } - -- (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, -- props, &errlist); -- (void) nvlist_merge(errors, errlist, 0); -- nvlist_free(errlist); -+ if (props != NULL) { -+ props_error = dsl_prop_set_hasrecvd(tofs); -+ -+ if (props_error == 0) { -+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, -+ props, errors); -+ } - } - -- if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) { -+ if (zc->zc_nvlist_dst_size != 0 && -+ (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || -+ put_nvlist(zc, errors) != 0)) { - /* -@@ -3845,3 +3998,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - */ -- props_error = EINVAL; -+ props_error = SET_ERROR(EINVAL); - } -@@ -3864,3 +4017,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - */ -- end_err = dmu_recv_end(&drc); -+ end_err = dmu_recv_end(&drc, zsb); - if (error == 0) -@@ -3870,3 +4023,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - } else { -- error = dmu_recv_end(&drc); -+ error = dmu_recv_end(&drc, NULL); - } -@@ -3884,2 +4037,8 @@ zfs_ioc_recv(zfs_cmd_t *zc) - #endif -+ -+#ifdef _KERNEL -+ if (error == 0) -+ zvol_create_minors(tofs); -+#endif -+ - /* -@@ -3887,18 +4046,12 @@ zfs_ioc_recv(zfs_cmd_t *zc) - */ -- if (error && props) { -- if (dmu_objset_hold(tofs, FTAG, &os) == 0) { -- if (clear_received_props(os, tofs, props, NULL) != 0) { -- /* -- * We failed to clear the received properties. -- * Since we may have left a $recvd value on the -- * system, we can't clear the $hasrecvd flag. -- */ -- zc->zc_obj |= ZPROP_ERR_NORESTORE; -- } else if (first_recvd_props) { -- dsl_prop_unset_hasrecvd(os); -- } -- dmu_objset_rele(os, FTAG); -- } else if (!drc.drc_newfs) { -- /* We failed to clear the received properties. 
*/ -+ if (error != 0 && props != NULL && !drc.drc_newfs) { -+ if (clear_received_props(tofs, props, NULL) != 0) { -+ /* -+ * We failed to clear the received properties. -+ * Since we may have left a $recvd value on the -+ * system, we can't clear the $hasrecvd flag. -+ */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; -+ } else if (first_recvd_props) { -+ dsl_prop_unset_hasrecvd(tofs); - } -@@ -3954,66 +4107,66 @@ zfs_ioc_send(zfs_cmd_t *zc) - { -- objset_t *fromsnap = NULL; -- objset_t *tosnap; - int error; - offset_t off; -- dsl_dataset_t *ds; -- dsl_dataset_t *dsfrom = NULL; -- spa_t *spa; -- dsl_pool_t *dp; - boolean_t estimate = (zc->zc_guid != 0); - -- error = spa_open(zc->zc_name, &spa, FTAG); -- if (error) -- return (error); -+ if (zc->zc_obj != 0) { -+ dsl_pool_t *dp; -+ dsl_dataset_t *tosnap; - -- dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- if (error) { -- spa_close(spa, FTAG); -- return (error); -- } -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) -+ return (error); - -- error = dmu_objset_from_ds(ds, &tosnap); -- if (error) { -- dsl_dataset_rele(ds, FTAG); -- spa_close(spa, FTAG); -- return (error); -+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ -+ if (dsl_dir_is_clone(tosnap->ds_dir)) -+ zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj; -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); - } - -- if (zc->zc_fromobj != 0) { -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom); -- rw_exit(&dp->dp_config_rwlock); -- spa_close(spa, FTAG); -- if (error) { -- dsl_dataset_rele(ds, FTAG); -+ if (estimate) { -+ dsl_pool_t *dp; -+ dsl_dataset_t *tosnap; -+ dsl_dataset_t *fromsnap = NULL; -+ -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) - return (error); -- } -- error = dmu_objset_from_ds(dsfrom, &fromsnap); -- if (error) { -- dsl_dataset_rele(dsfrom, FTAG); -- dsl_dataset_rele(ds, FTAG); -+ -+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); - return (error); - } -- } else { -- spa_close(spa, FTAG); -- } - -- if (estimate) { -- error = dmu_send_estimate(tosnap, fromsnap, zc->zc_obj, -+ if (zc->zc_fromobj != 0) { -+ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, -+ FTAG, &fromsnap); -+ if (error != 0) { -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ } -+ -+ error = dmu_send_estimate(tosnap, fromsnap, - &zc->zc_objset_type); -+ -+ if (fromsnap != NULL) -+ dsl_dataset_rele(fromsnap, FTAG); -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); - } else { - file_t *fp = getf(zc->zc_cookie); -- if (fp == NULL) { -- dsl_dataset_rele(ds, FTAG); -- if (dsfrom) -- dsl_dataset_rele(dsfrom, FTAG); -- return (EBADF); -- } -+ if (fp == NULL) -+ return (SET_ERROR(EBADF)); - - off = fp->f_offset; -- error = dmu_send(tosnap, fromsnap, zc->zc_obj, -- zc->zc_cookie, fp->f_vnode, &off); -+ error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, -+ zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off); - -@@ -4023,5 +4176,2 @@ zfs_ioc_send(zfs_cmd_t *zc) - } -- if (dsfrom) -- dsl_dataset_rele(dsfrom, FTAG); -- dsl_dataset_rele(ds, FTAG); - return (error); -@@ -4040,2 +4190,3 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) - { -+ dsl_pool_t *dp; - dsl_dataset_t *ds; -@@ -4044,5 
+4195,12 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) - -- if ((error = dsl_dataset_hold(zc->zc_name, FTAG, &ds)) != 0) -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) - return (error); - -+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ - mutex_enter(&ds->ds_sendstream_lock); -@@ -4066,3 +4224,3 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) - else -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - -@@ -4070,2 +4228,3 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) - dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -4143,3 +4302,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - mutex_exit(&spa_namespace_lock); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -4159,3 +4318,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - if (zc->zc_nvlist_src == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -4176,3 +4335,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - -- if (error) -+ if (error != 0) - return (error); -@@ -4188,3 +4347,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - spa_close(spa, FTAG); -- return (ENODEV); -+ return (SET_ERROR(ENODEV)); - } -@@ -4200,3 +4359,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - if (zio_resume(spa) != 0) -- error = EIO; -+ error = SET_ERROR(EIO); - -@@ -4214,3 +4373,3 @@ zfs_ioc_pool_reopen(zfs_cmd_t *zc) - error = spa_open(zc->zc_name, &spa, FTAG); -- if (error) -+ if (error != 0) - return (error); -@@ -4254,3 +4413,3 @@ zfs_ioc_promote(zfs_cmd_t *zc) - (void) dmu_objset_find(zc->zc_value, -- zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); -+ zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); - return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); -@@ -4277,6 +4436,6 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE); -- if (error) -+ if (error != 0) - return (error); -@@ -4310,6 +4469,6 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc) - if (bufsize <= 0) -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - - error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE); -- if (error) -+ if (error != 0) - return (error); -@@ -4354,4 +4513,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) - error = zfs_suspend_fs(zsb); -- if (error == 0) -+ if (error == 0) { -+ dmu_objset_refresh_ownership(zsb->z_os, -+ zsb); - error = zfs_resume_fs(zsb, zc->zc_name); -+ } - } -@@ -4363,3 +4525,3 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) - error = dmu_objset_hold(zc->zc_name, FTAG, &os); -- if (error) -+ if (error != 0) - return (error); -@@ -4376,3 +4538,3 @@ zfs_ioc_share(zfs_cmd_t *zc) - { -- return (ENOSYS); -+ return (SET_ERROR(ENOSYS)); - } -@@ -4398,3 +4560,3 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) - error = dmu_objset_hold(zc->zc_name, FTAG, &os); -- if (error) -+ if (error != 0) - return (error); -@@ -4415,2 +4577,3 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) - * outputs: -+ * zc_value short name of new snapshot - */ -@@ -4420,3 +4583,9 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) - char *snap_name; -+ char *hold_name; - int error; -+ minor_t minor; -+ -+ error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); -+ if (error != 0) -+ return (error); - -@@ -4424,18 +4593,12 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) - (u_longlong_t)ddi_get_lbolt64()); -+ hold_name = kmem_asprintf("%%%s", zc->zc_value); - -- if (strlen(snap_name) >= MAXNAMELEN) { -- strfree(snap_name); -- return (E2BIG); -- } -- -- error = dmu_objset_snapshot(zc->zc_name, snap_name, snap_name, -- NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd); 
-- if (error != 0) { -- strfree(snap_name); -- return (error); -- } -- -- (void) strcpy(zc->zc_value, snap_name); -+ error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, -+ hold_name); -+ if (error == 0) -+ (void) strcpy(zc->zc_value, snap_name); - strfree(snap_name); -- return (0); -+ strfree(hold_name); -+ zfs_onexit_fd_rele(zc->zc_cleanup_fd); -+ return (error); - } -@@ -4454,4 +4617,2 @@ zfs_ioc_diff(zfs_cmd_t *zc) - { -- objset_t *fromsnap; -- objset_t *tosnap; - file_t *fp; -@@ -4460,18 +4621,5 @@ zfs_ioc_diff(zfs_cmd_t *zc) - -- error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); -- if (error) -- return (error); -- -- error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap); -- if (error) { -- dmu_objset_rele(tosnap, FTAG); -- return (error); -- } -- - fp = getf(zc->zc_cookie); -- if (fp == NULL) { -- dmu_objset_rele(fromsnap, FTAG); -- dmu_objset_rele(tosnap, FTAG); -- return (EBADF); -- } -+ if (fp == NULL) -+ return (SET_ERROR(EBADF)); - -@@ -4479,3 +4627,3 @@ zfs_ioc_diff(zfs_cmd_t *zc) - -- error = dmu_diff(tosnap, fromsnap, fp->f_vnode, &off); -+ error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); - -@@ -4485,4 +4633,2 @@ zfs_ioc_diff(zfs_cmd_t *zc) - -- dmu_objset_rele(fromsnap, FTAG); -- dmu_objset_rele(tosnap, FTAG); - return (error); -@@ -4539,3 +4685,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - VN_RELE(vp); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4558,3 +4704,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - error = dmu_tx_assign(tx, TXG_WAIT); -- if (error) { -+ if (error != 0) { - dmu_tx_abort(tx); -@@ -4564,3 +4710,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - } -- if (error) { -+ if (error != 0) { - mutex_exit(&zsb->z_lock); -@@ -4629,3 +4775,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - default: -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -4640,3 +4786,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - #else -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - #endif /* HAVE_SMB_SHARE */ -@@ -4645,21 +4791,18 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of filesystem -- * zc_value short name of snap -- * zc_string user-supplied tag for this hold -- * zc_cookie recursive flag -- * zc_temphold set if hold is temporary -- * zc_cleanup_fd cleanup-on-exit file descriptor for calling process -- * zc_sendobj if non-zero, the objid for zc_name@zc_value -- * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg -+ * innvl: { -+ * "holds" -> { snapname -> holdname (string), ... } -+ * (optional) "cleanup_fd" -> fd (int32) -+ * } - * -- * outputs: none -+ * outnvl: { -+ * snapname -> error value (int32) -+ * ... 
-+ * } - */ -+/* ARGSUSED */ - static int --zfs_ioc_hold(zfs_cmd_t *zc) -+zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) - { -- boolean_t recursive = zc->zc_cookie; -- spa_t *spa; -- dsl_pool_t *dp; -- dsl_dataset_t *ds; -+ nvlist_t *holds; -+ int cleanup_fd = -1; - int error; -@@ -4667,55 +4810,15 @@ zfs_ioc_hold(zfs_cmd_t *zc) - -- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) -- return (EINVAL); -- -- if (zc->zc_sendobj == 0) { -- return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, -- zc->zc_string, recursive, zc->zc_temphold, -- zc->zc_cleanup_fd)); -- } -- -- if (recursive) -- return (EINVAL); -- -- error = spa_open(zc->zc_name, &spa, FTAG); -- if (error) -- return (error); -- -- dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- spa_close(spa, FTAG); -- if (error) -- return (error); -- -- /* -- * Until we have a hold on this snapshot, it's possible that -- * zc_sendobj could've been destroyed and reused as part -- * of a later txg. Make sure we're looking at the right object. -- */ -- if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) { -- dsl_dataset_rele(ds, FTAG); -- return (ENOENT); -- } -+ error = nvlist_lookup_nvlist(args, "holds", &holds); -+ if (error != 0) -+ return (SET_ERROR(EINVAL)); - -- if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) { -- error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); -- if (error) { -- dsl_dataset_rele(ds, FTAG); -+ if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { -+ error = zfs_onexit_fd_hold(cleanup_fd, &minor); -+ if (error != 0) - return (error); -- } -- } -- -- error = dsl_dataset_user_hold_for_send(ds, zc->zc_string, -- zc->zc_temphold); -- if (minor != 0) { -- if (error == 0) { -- dsl_register_onexit_hold_cleanup(ds, zc->zc_string, -- minor); -- } -- zfs_onexit_fd_rele(zc->zc_cleanup_fd); - } -- dsl_dataset_rele(ds, FTAG); - -+ error = dsl_dataset_user_hold(holds, minor, errlist); -+ if (minor != 0) -+ zfs_onexit_fd_rele(cleanup_fd); - return (error); -@@ -4724,20 +4827,14 @@ zfs_ioc_hold(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of dataset from which we're releasing a user hold -- * zc_value short name of snap -- * zc_string user-supplied tag for this hold -- * zc_cookie recursive flag -+ * innvl is not used. - * -- * outputs: none -+ * outnvl: { -+ * holdname -> time added (uint64 seconds since epoch) -+ * ... -+ * } - */ -+/* ARGSUSED */ - static int --zfs_ioc_release(zfs_cmd_t *zc) -+zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) - { -- boolean_t recursive = zc->zc_cookie; -- -- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) -- return (EINVAL); -- -- return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, -- zc->zc_string, recursive)); -+ return (dsl_dataset_get_holds(snapname, outnvl)); - } -@@ -4745,20 +4842,17 @@ zfs_ioc_release(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of filesystem -+ * innvl: { -+ * snapname -> { holdname, ... } -+ * ... -+ * } - * -- * outputs: -- * zc_nvlist_src{_size} nvlist of snapshot holds -+ * outnvl: { -+ * snapname -> error value (int32) -+ * ... 
-+ * } - */ -+/* ARGSUSED */ - static int --zfs_ioc_get_holds(zfs_cmd_t *zc) -+zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) - { -- nvlist_t *nvp; -- int error; -- -- if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { -- error = put_nvlist(zc, nvp); -- nvlist_free(nvp); -- } -- -- return (error); -+ return (dsl_dataset_user_release(holds, errlist)); - } -@@ -4768,2 +4862,3 @@ zfs_ioc_get_holds(zfs_cmd_t *zc) - * zc_guid flags (ZEVENT_NONBLOCK) -+ * zc_cleanup_fd zevent file descriptor - * -@@ -4772,3 +4867,2 @@ zfs_ioc_get_holds(zfs_cmd_t *zc) - * zc_cookie dropped events since last get -- * zc_cleanup_fd cleanup-on-exit file descriptor - */ -@@ -4803,3 +4897,3 @@ zfs_ioc_events_next(zfs_cmd_t *zc) - error = zfs_zevent_wait(ze); -- if (error) -+ if (error != 0) - break; -@@ -4824,3 +4918,25 @@ zfs_ioc_events_clear(zfs_cmd_t *zc) - -- return 0; -+ return (0); -+} -+ -+/* -+ * inputs: -+ * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END -+ * zc_cleanup zevent file descriptor -+ */ -+static int -+zfs_ioc_events_seek(zfs_cmd_t *zc) -+{ -+ zfs_zevent_t *ze; -+ minor_t minor; -+ int error; -+ -+ error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); -+ if (error != 0) -+ return (error); -+ -+ error = zfs_zevent_seek(ze, zc->zc_guid); -+ zfs_zevent_fd_rele(zc->zc_cleanup_fd); -+ -+ return (error); - } -@@ -4841,10 +4957,17 @@ zfs_ioc_space_written(zfs_cmd_t *zc) - int error; -+ dsl_pool_t *dp; - dsl_dataset_t *new, *old; - -- error = dsl_dataset_hold(zc->zc_name, FTAG, &new); -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); -- error = dsl_dataset_hold(zc->zc_value, FTAG, &old); -+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -4856,2 +4979,3 @@ zfs_ioc_space_written(zfs_cmd_t *zc) - dsl_dataset_rele(new, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -4860,23 +4984,37 @@ zfs_ioc_space_written(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name full name of last snapshot -- * zc_value full name of first snapshot -+ * innvl: { -+ * "firstsnap" -> snapshot name -+ * } - * -- * outputs: -- * zc_cookie space in bytes -- * zc_objset_type compressed space in bytes -- * zc_perm_action uncompressed space in bytes -+ * outnvl: { -+ * "used" -> space in bytes -+ * "compressed" -> compressed space in bytes -+ * "uncompressed" -> uncompressed space in bytes -+ * } - */ - static int --zfs_ioc_space_snaps(zfs_cmd_t *zc) -+zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) - { - int error; -+ dsl_pool_t *dp; - dsl_dataset_t *new, *old; -+ char *firstsnap; -+ uint64_t used, comp, uncomp; -+ -+ if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) -+ return (SET_ERROR(EINVAL)); - -- error = dsl_dataset_hold(zc->zc_name, FTAG, &new); -+ error = dsl_pool_hold(lastsnap, FTAG, &dp); - if (error != 0) - return (error); -- error = dsl_dataset_hold(zc->zc_value, FTAG, &old); -+ -+ error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -4884,6 +5022,9 @@ zfs_ioc_space_snaps(zfs_cmd_t *zc) - -- error = dsl_dataset_space_wouldfree(old, new, 
&zc->zc_cookie, -- &zc->zc_objset_type, &zc->zc_perm_action); -+ error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); - dsl_dataset_rele(old, FTAG); - dsl_dataset_rele(new, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ fnvlist_add_uint64(outnvl, "used", used); -+ fnvlist_add_uint64(outnvl, "compressed", comp); -+ fnvlist_add_uint64(outnvl, "uncompressed", uncomp); - return (error); -@@ -4892,142 +5033,389 @@ zfs_ioc_space_snaps(zfs_cmd_t *zc) - /* -- * pool create, destroy, and export don't log the history as part of -- * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export -- * do the logging of those commands. -+ * innvl: { -+ * "fd" -> file descriptor to write stream to (int32) -+ * (optional) "fromsnap" -> full snap name to send an incremental from -+ * } -+ * -+ * outnvl is unused - */ --static zfs_ioc_vec_t zfs_ioc_vec[] = { -- { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, -- POOL_CHECK_READONLY }, -- { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_create_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_remove_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { 
zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_NONE }, -- { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_destroy_snaps_nvl, zfs_secpolicy_destroy_recursive, -- DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, DATASET_NAME, -- B_FALSE, POOL_CHECK_NONE }, -- { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, DATASET_NAME, -- B_FALSE, POOL_CHECK_NONE }, -- { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, -- DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME, -- B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_events_next, 
zfs_secpolicy_config, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_reguid, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_space_written, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_send_progress, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE } --}; -+/* ARGSUSED */ -+static int -+zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) -+{ -+ int error; -+ offset_t off; -+ char *fromname = NULL; -+ int fd; -+ file_t *fp; -+ -+ error = nvlist_lookup_int32(innvl, "fd", &fd); -+ if (error != 0) -+ return (SET_ERROR(EINVAL)); -+ -+ (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); -+ -+ if ((fp = getf(fd)) == NULL) -+ return (SET_ERROR(EBADF)); -+ -+ off = fp->f_offset; -+ error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off); -+ -+ if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) -+ fp->f_offset = off; -+ -+ releasef(fd); -+ return (error); -+} -+ -+/* -+ * Determine approximately how large a zfs send stream will be -- the number -+ * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). -+ * -+ * innvl: { -+ * (optional) "fromsnap" -> full snap name to send an incremental from -+ * } -+ * -+ * outnvl: { -+ * "space" -> bytes of space (uint64) -+ * } -+ */ -+static int -+zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) -+{ -+ dsl_pool_t *dp; -+ dsl_dataset_t *fromsnap = NULL; -+ dsl_dataset_t *tosnap; -+ int error; -+ char *fromname; -+ uint64_t space; -+ -+ error = dsl_pool_hold(snapname, FTAG, &dp); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ -+ error = nvlist_lookup_string(innvl, "fromsnap", &fromname); -+ if (error == 0) { -+ error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); -+ if (error != 0) { -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ } -+ -+ error = dmu_send_estimate(tosnap, fromsnap, &space); -+ fnvlist_add_uint64(outnvl, "space", space); -+ -+ if (fromsnap != NULL) -+ dsl_dataset_rele(fromsnap, FTAG); -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+} -+ -+ -+static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; -+ -+static void -+zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, -+ boolean_t log_history, zfs_ioc_poolcheck_t pool_check) -+{ -+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; -+ -+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST); -+ ASSERT3U(ioc, <, ZFS_IOC_LAST); -+ ASSERT3P(vec->zvec_legacy_func, ==, NULL); -+ ASSERT3P(vec->zvec_func, ==, NULL); -+ -+ vec->zvec_legacy_func = func; -+ vec->zvec_secpolicy = secpolicy; -+ vec->zvec_namecheck = namecheck; -+ vec->zvec_allow_log = log_history; -+ vec->zvec_pool_check = pool_check; -+} -+ -+/* -+ * See the block comment at the beginning of this file for details on -+ * each argument to this function. 
-+ */ -+static void -+zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, -+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, -+ zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, -+ boolean_t allow_log) -+{ -+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; -+ -+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST); -+ ASSERT3U(ioc, <, ZFS_IOC_LAST); -+ ASSERT3P(vec->zvec_legacy_func, ==, NULL); -+ ASSERT3P(vec->zvec_func, ==, NULL); -+ -+ /* if we are logging, the name must be valid */ -+ ASSERT(!allow_log || namecheck != NO_NAME); -+ -+ vec->zvec_name = name; -+ vec->zvec_func = func; -+ vec->zvec_secpolicy = secpolicy; -+ vec->zvec_namecheck = namecheck; -+ vec->zvec_pool_check = pool_check; -+ vec->zvec_smush_outnvlist = smush_outnvlist; -+ vec->zvec_allow_log = allow_log; -+} -+ -+static void -+zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy, boolean_t log_history, -+ zfs_ioc_poolcheck_t pool_check) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ POOL_NAME, log_history, pool_check); -+} -+ -+static void -+zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ DATASET_NAME, B_FALSE, pool_check); -+} -+ -+static void -+zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) -+{ -+ zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, -+ POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -+} -+ -+static void -+zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ NO_NAME, B_FALSE, POOL_CHECK_NONE); -+} -+ -+static void -+zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, -+ zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); -+} -+ -+static void -+zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) -+{ -+ zfs_ioctl_register_dataset_read_secpolicy(ioc, func, -+ zfs_secpolicy_read); -+} -+ -+static void -+zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -+} -+ -+static void -+zfs_ioctl_init(void) -+{ -+ zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, -+ zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, -+ zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, -+ zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, -+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, -+ zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, -+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, -+ zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, -+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("create", ZFS_IOC_CREATE, -+ zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, -+ POOL_CHECK_SUSPENDED | 
POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("clone", ZFS_IOC_CLONE, -+ zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, -+ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("hold", ZFS_IOC_HOLD, -+ zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ zfs_ioctl_register("release", ZFS_IOC_RELEASE, -+ zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, -+ zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, -+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, -+ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); -+ -+ /* IOCTLS that use the legacy function signature */ -+ -+ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, -+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, -+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, -+ zfs_ioc_pool_scan); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, -+ zfs_ioc_pool_upgrade); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, -+ zfs_ioc_vdev_add); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, -+ zfs_ioc_vdev_remove); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, -+ zfs_ioc_vdev_set_state); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, -+ zfs_ioc_vdev_attach); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, -+ zfs_ioc_vdev_detach); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, -+ zfs_ioc_vdev_setpath); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, -+ zfs_ioc_vdev_setfru); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, -+ zfs_ioc_pool_set_props); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, -+ zfs_ioc_vdev_split); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, -+ zfs_ioc_pool_reguid); -+ -+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, -+ zfs_ioc_pool_configs, zfs_secpolicy_none); -+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, -+ zfs_ioc_pool_tryimport, zfs_secpolicy_config); -+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, -+ zfs_ioc_inject_fault, zfs_secpolicy_inject); -+ zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, -+ zfs_ioc_clear_fault, zfs_secpolicy_inject); -+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, -+ zfs_ioc_inject_list_next, zfs_secpolicy_inject); -+ -+ /* -+ * pool destroy, and export don't log the history as part of -+ * zfsdev_ioctl, but rather zfs_ioc_pool_export -+ * does the logging of those commands. 
-+ */ -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, -+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, -+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, -+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, -+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, -+ zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); -+ zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, -+ zfs_ioc_dsobj_to_dsname, -+ zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, -+ zfs_ioc_pool_get_history, -+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, -+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, -+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, -+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); -+ -+ zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, -+ zfs_ioc_space_written); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, -+ zfs_ioc_objset_recvd_props); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, -+ zfs_ioc_next_obj); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, -+ zfs_ioc_get_fsacl); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, -+ zfs_ioc_objset_stats); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, -+ zfs_ioc_objset_zplprops); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, -+ zfs_ioc_dataset_list_next); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, -+ zfs_ioc_snapshot_list_next); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, -+ zfs_ioc_send_progress); -+ -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, -+ zfs_ioc_diff, zfs_secpolicy_diff); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, -+ zfs_ioc_obj_to_stats, zfs_secpolicy_diff); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, -+ zfs_ioc_obj_to_path, zfs_secpolicy_diff); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, -+ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, -+ zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, -+ zfs_ioc_send, zfs_secpolicy_send); -+ -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, -+ zfs_secpolicy_none); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, -+ zfs_secpolicy_destroy); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, -+ zfs_secpolicy_rename); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, -+ zfs_secpolicy_recv); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, -+ zfs_secpolicy_promote); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, -+ zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, -+ zfs_secpolicy_set_fsacl); -+ -+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, -+ zfs_secpolicy_share, POOL_CHECK_NONE); -+ 
zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, -+ zfs_secpolicy_smb_acl, POOL_CHECK_NONE); -+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, -+ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, -+ zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -+ -+ /* -+ * ZoL functions -+ */ -+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, -+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); -+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, -+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); -+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, -+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); -+} - -@@ -5048,5 +5436,5 @@ pool_status_check(const char *name, zfs_ioc_namecheck_t type, - if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) -- error = EAGAIN; -+ error = SET_ERROR(EAGAIN); - else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) -- error = EROFS; -+ error = SET_ERROR(EROFS); - spa_close(spa, FTAG); -@@ -5064,8 +5452,11 @@ zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) - for (zs = list_head(&zfsdev_state_list); zs != NULL; -- zs = list_next(&zfsdev_state_list, zs)) { -+ zs = list_next(&zfsdev_state_list, zs)) { - if (zs->zs_minor == minor) { - switch (which) { -- case ZST_ONEXIT: return (zs->zs_onexit); -- case ZST_ZEVENT: return (zs->zs_zevent); -- case ZST_ALL: return (zs); -+ case ZST_ONEXIT: -+ return (zs->zs_onexit); -+ case ZST_ZEVENT: -+ return (zs->zs_zevent); -+ case ZST_ALL: -+ return (zs); - } -@@ -5074,3 +5465,3 @@ zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) - -- return NULL; -+ return (NULL); - } -@@ -5086,3 +5477,3 @@ zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) - -- return ptr; -+ return (ptr); - } -@@ -5130,7 +5521,7 @@ zfsdev_state_init(struct file *filp) - -- minor = zfsdev_minor_alloc(); -- if (minor == 0) -- return (ENXIO); -+ minor = zfsdev_minor_alloc(); -+ if (minor == 0) -+ return (SET_ERROR(ENXIO)); - -- zs = kmem_zalloc( sizeof(zfsdev_state_t), KM_SLEEP); -+ zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); - -@@ -5161,5 +5552,5 @@ zfsdev_state_destroy(struct file *filp) - list_remove(&zfsdev_state_list, zs); -- kmem_free(zs, sizeof(zfsdev_state_t)); -+ kmem_free(zs, sizeof (zfsdev_state_t)); - -- return 0; -+ return (0); - } -@@ -5194,8 +5585,19 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - zfs_cmd_t *zc; -- uint_t vec; -- int error, rc, flag = 0; -+ uint_t vecnum; -+ int error, rc, len = 0, flag = 0; -+ const zfs_ioc_vec_t *vec; -+ char *saved_poolname = NULL; -+ nvlist_t *innvl = NULL; -+ -+ vecnum = cmd - ZFS_IOC_FIRST; -+ if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) -+ return (-SET_ERROR(EINVAL)); -+ vec = &zfs_ioc_vec[vecnum]; - -- vec = cmd - ZFS_IOC; -- if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) -- return (-EINVAL); -+ /* -+ * The registered ioctl list may be sparse, verify that either -+ * a normal or legacy handler are registered. 
-+ */ -+ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) -+ return (-SET_ERROR(EINVAL)); - -@@ -5204,7 +5606,14 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); -- if (error != 0) -- error = EFAULT; -+ if (error != 0) { -+ error = SET_ERROR(EFAULT); -+ goto out; -+ } - -- if ((error == 0) && !(flag & FKIOCTL)) -- error = zfs_ioc_vec[vec].zvec_secpolicy(zc, CRED()); -+ zc->zc_iflags = flag & FKIOCTL; -+ if (zc->zc_nvlist_src_size != 0) { -+ error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, -+ zc->zc_iflags, &innvl); -+ if (error != 0) -+ goto out; -+ } - -@@ -5214,36 +5623,103 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - */ -- if (error == 0) { -- zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; -- zc->zc_iflags = flag & FKIOCTL; -- switch (zfs_ioc_vec[vec].zvec_namecheck) { -- case POOL_NAME: -- if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) -- error = EINVAL; -+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; -+ switch (vec->zvec_namecheck) { -+ case POOL_NAME: -+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) -+ error = SET_ERROR(EINVAL); -+ else - error = pool_status_check(zc->zc_name, -- zfs_ioc_vec[vec].zvec_namecheck, -- zfs_ioc_vec[vec].zvec_pool_check); -- break; -+ vec->zvec_namecheck, vec->zvec_pool_check); -+ break; - -- case DATASET_NAME: -- if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) -- error = EINVAL; -+ case DATASET_NAME: -+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) -+ error = SET_ERROR(EINVAL); -+ else - error = pool_status_check(zc->zc_name, -- zfs_ioc_vec[vec].zvec_namecheck, -- zfs_ioc_vec[vec].zvec_pool_check); -- break; -+ vec->zvec_namecheck, vec->zvec_pool_check); -+ break; - -- case NO_NAME: -- break; -- } -+ case NO_NAME: -+ break; - } - -- if (error == 0) -- error = zfs_ioc_vec[vec].zvec_func(zc); - -+ if (error == 0 && !(flag & FKIOCTL)) -+ error = vec->zvec_secpolicy(zc, innvl, CRED()); -+ -+ if (error != 0) -+ goto out; -+ -+ /* legacy ioctls can modify zc_name */ -+ len = strcspn(zc->zc_name, "/@#") + 1; -+ saved_poolname = kmem_alloc(len, KM_SLEEP); -+ (void) strlcpy(saved_poolname, zc->zc_name, len); -+ -+ if (vec->zvec_func != NULL) { -+ nvlist_t *outnvl; -+ int puterror = 0; -+ spa_t *spa; -+ nvlist_t *lognv = NULL; -+ -+ ASSERT(vec->zvec_legacy_func == NULL); -+ -+ /* -+ * Add the innvl to the lognv before calling the func, -+ * in case the func changes the innvl. 
-+ */ -+ if (vec->zvec_allow_log) { -+ lognv = fnvlist_alloc(); -+ fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, -+ vec->zvec_name); -+ if (!nvlist_empty(innvl)) { -+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, -+ innvl); -+ } -+ } -+ -+ VERIFY0(nvlist_alloc(&outnvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ error = vec->zvec_func(zc->zc_name, innvl, outnvl); -+ -+ if (error == 0 && vec->zvec_allow_log && -+ spa_open(zc->zc_name, &spa, FTAG) == 0) { -+ if (!nvlist_empty(outnvl)) { -+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, -+ outnvl); -+ } -+ (void) spa_history_log_nvl(spa, lognv); -+ spa_close(spa, FTAG); -+ } -+ fnvlist_free(lognv); -+ -+ if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { -+ int smusherror = 0; -+ if (vec->zvec_smush_outnvlist) { -+ smusherror = nvlist_smush(outnvl, -+ zc->zc_nvlist_dst_size); -+ } -+ if (smusherror == 0) -+ puterror = put_nvlist(zc, outnvl); -+ } -+ -+ if (puterror != 0) -+ error = puterror; -+ -+ nvlist_free(outnvl); -+ } else { -+ error = vec->zvec_legacy_func(zc); -+ } -+ -+out: -+ nvlist_free(innvl); - rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); -- if (error == 0) { -- if (rc != 0) -- error = EFAULT; -- if (zfs_ioc_vec[vec].zvec_his_log) -- zfs_log_history(zc); -+ if (error == 0 && rc != 0) -+ error = SET_ERROR(EFAULT); -+ if (error == 0 && vec->zvec_allow_log) { -+ char *s = tsd_get(zfs_allow_log_key); -+ if (s != NULL) -+ strfree(s); -+ (void) tsd_set(zfs_allow_log_key, saved_poolname); -+ } else { -+ if (saved_poolname != NULL) -+ kmem_free(saved_poolname, len); - } -@@ -5258,6 +5734,6 @@ zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - { -- return zfsdev_ioctl(filp, cmd, arg); -+ return (zfsdev_ioctl(filp, cmd, arg)); - } - #else --#define zfsdev_compat_ioctl NULL -+#define zfsdev_compat_ioctl NULL - #endif -@@ -5265,7 +5741,7 @@ zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - static const struct file_operations zfsdev_fops = { -- .open = zfsdev_open, -- .release = zfsdev_release, -- .unlocked_ioctl = zfsdev_ioctl, -- .compat_ioctl = zfsdev_compat_ioctl, -- .owner = THIS_MODULE, -+ .open = zfsdev_open, -+ .release = zfsdev_release, -+ .unlocked_ioctl = zfsdev_ioctl, -+ .compat_ioctl = zfsdev_compat_ioctl, -+ .owner = THIS_MODULE, - }; -@@ -5273,5 +5749,5 @@ static const struct file_operations zfsdev_fops = { - static struct miscdevice zfs_misc = { -- .minor = MISC_DYNAMIC_MINOR, -- .name = ZFS_DRIVER, -- .fops = &zfsdev_fops, -+ .minor = MISC_DYNAMIC_MINOR, -+ .name = ZFS_DRIVER, -+ .fops = &zfsdev_fops, - }; -@@ -5288,3 +5764,3 @@ zfs_attach(void) - error = misc_register(&zfs_misc); -- if (error) { -+ if (error != 0) { - printk(KERN_INFO "ZFS: misc_register() failed %d\n", error); -@@ -5302,3 +5778,3 @@ zfs_detach(void) - error = misc_deregister(&zfs_misc); -- if (error) -+ if (error != 0) - printk(KERN_INFO "ZFS: misc_deregister() failed %d\n", error); -@@ -5309,9 +5785,13 @@ zfs_detach(void) - --uint_t zfs_fsyncer_key; --extern uint_t rrw_tsd_key; -+static void -+zfs_allow_log_destroy(void *arg) -+{ -+ char *poolname = arg; -+ strfree(poolname); -+} - - #ifdef DEBUG --#define ZFS_DEBUG_STR " (DEBUG mode)" -+#define ZFS_DEBUG_STR " (DEBUG mode)" - #else --#define ZFS_DEBUG_STR "" -+#define ZFS_DEBUG_STR "" - #endif -@@ -5329,2 +5809,4 @@ _init(void) - -+ zfs_ioctl_init(); -+ - if ((error = zfs_attach()) != 0) -@@ -5333,8 +5815,12 @@ _init(void) - tsd_create(&zfs_fsyncer_key, NULL); -- tsd_create(&rrw_tsd_key, NULL); -+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy); -+ 
tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); - - printk(KERN_NOTICE "ZFS: Loaded module v%s-%s%s, " -- "ZFS pool version %s, ZFS filesystem version %s\n", -- ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, -- SPA_VERSION_STRING, ZPL_VERSION_STRING); -+ "ZFS pool version %s, ZFS filesystem version %s\n", -+ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, -+ SPA_VERSION_STRING, ZPL_VERSION_STRING); -+#ifndef CONFIG_FS_POSIX_ACL -+ printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n"); -+#endif /* CONFIG_FS_POSIX_ACL */ - -@@ -5348,4 +5834,4 @@ out1: - printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s" -- ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE, -- ZFS_DEBUG_STR, error); -+ ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE, -+ ZFS_DEBUG_STR, error); - -@@ -5364,5 +5850,6 @@ _fini(void) - tsd_destroy(&rrw_tsd_key); -+ tsd_destroy(&zfs_allow_log_key); - - printk(KERN_NOTICE "ZFS: Unloaded module v%s-%s%s\n", -- ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); -+ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); - -@@ -5378,2 +5865,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - #endif /* HAVE_SPL */ -diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c -index cbd6f1c..cfce831 100644 ---- a/module/zfs/zfs_log.c -+++ b/module/zfs/zfs_log.c -@@ -214,5 +214,4 @@ zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) - /* -- * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, -- * TX_MKDIR_ATTR and TX_MKXATTR -- * transactions. -+ * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and -+ * TK_MKXATTR transactions. - * -@@ -241,3 +240,2 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - lr_acl_create_t *lracl; -- xvattr_t *xvap = (xvattr_t *)vap; - size_t aclsize = 0; -@@ -245,2 +243,3 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - size_t txsize; -+ xvattr_t *xvap = (xvattr_t *)vap; - void *end; -@@ -271,3 +270,2 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - } else { -- aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0; - txsize = -@@ -344,3 +342,3 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. -+ * Handles both TX_REMOVE and TX_RMDIR transactions. - */ -@@ -368,3 +366,3 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_link() handles TX_LINK transactions. -+ * Handles TX_LINK transactions. - */ -@@ -391,3 +389,3 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_symlink() handles TX_SYMLINK transactions. -+ * Handles TX_SYMLINK transactions. - */ -@@ -423,3 +421,3 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_rename() handles TX_RENAME transactions. -+ * Handles TX_RENAME transactions. - */ -@@ -449,3 +447,5 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_write() handles TX_WRITE transactions. -+ * zfs_log_write() handles TX_WRITE transactions. The specified callback is -+ * called as soon as the write is on stable storage (be it via a DMU sync or a -+ * ZIL commit). 
- */ -@@ -455,3 +455,4 @@ void - zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, -- znode_t *zp, offset_t off, ssize_t resid, int ioflag) -+ znode_t *zp, offset_t off, ssize_t resid, int ioflag, -+ zil_callback_t callback, void *callback_data) - { -@@ -462,4 +463,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - -- if (zil_replaying(zilog, tx) || zp->z_unlinked) -+ if (zil_replaying(zilog, tx) || zp->z_unlinked) { -+ if (callback != NULL) -+ callback(callback_data); - return; -+ } - -@@ -520,2 +524,4 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - -+ itx->itx_callback = callback; -+ itx->itx_callback_data = callback_data; - zil_itx_assign(zilog, itx, tx); -@@ -528,3 +534,3 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - /* -- * zfs_log_truncate() handles TX_TRUNCATE transactions. -+ * Handles TX_TRUNCATE transactions. - */ -@@ -551,3 +557,3 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - /* -- * zfs_log_setattr() handles TX_SETATTR transactions. -+ * Handles TX_SETATTR transactions. - */ -@@ -613,3 +619,3 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - /* -- * zfs_log_acl() handles TX_ACL transactions. -+ * Handles TX_ACL transactions. - */ -diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c -index 2f60b5e..2b286e7 100644 ---- a/module/zfs/zfs_onexit.c -+++ b/module/zfs/zfs_onexit.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -111,3 +112,3 @@ zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) - if (*zo == NULL) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -130,3 +131,3 @@ zfs_onexit_fd_hold(int fd, minor_t *minorp) - if (fp == NULL) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -157,3 +158,3 @@ zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - -- ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); -+ ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_PUSHPAGE); - list_link_init(&ap->za_link); -@@ -213,3 +214,3 @@ zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) - mutex_exit(&zo->zo_lock); -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } -@@ -242,3 +243,3 @@ zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) - else -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - mutex_exit(&zo->zo_lock); -diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c -index 813250c..6ac10e2 100644 ---- a/module/zfs/zfs_replay.c -+++ b/module/zfs/zfs_replay.c -@@ -23,2 +23,3 @@ - * Copyright (c) 2012 Cyril Plisko. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -75,3 +76,3 @@ zfs_replay_error(zfs_sb_t *zsb, lr_t *lr, boolean_t byteswap) - { -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -389,3 +390,3 @@ zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl, boolean_t byteswap) - default: -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } -@@ -515,3 +516,3 @@ zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap) - default: -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } -@@ -555,3 +556,3 @@ zfs_replay_remove(zfs_sb_t *zsb, lr_remove_t *lr, boolean_t byteswap) - default: -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } -@@ -679,3 +680,3 @@ zfs_replay_write(zfs_sb_t *zsb, lr_write_t *lr, boolean_t byteswap) - else if (written < length) -- error = EIO; /* short write */ -+ error = SET_ERROR(EIO); /* short write */ - -diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c -index 136972b..2533ced 100644 ---- a/module/zfs/zfs_rlock.c -+++ b/module/zfs/zfs_rlock.c -@@ -30,3 +30,3 @@ - * This file contains the code to implement file range locking in -- * ZFS, although there isn't much specific to ZFS (all that comes to mind -+ * ZFS, although there isn't much specific to ZFS (all that comes to mind is - * support for growing the blocksize). -@@ -552,3 +552,3 @@ zfs_range_unlock(rl_t *rl) - ASSERT(!rl->r_proxy); -- list_create(&free_list, sizeof(rl_t), offsetof(rl_t, rl_node)); -+ list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node)); - -diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c -index 621c5f9..ebe92bb 100644 ---- a/module/zfs/zfs_sa.c -+++ b/module/zfs/zfs_sa.c -@@ -266,3 +266,3 @@ out: - * since the SA code can read both old/new znode formats -- * with probably little to know performance difference. -+ * with probably little to no performance difference. - * -@@ -312,3 +312,3 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) - /* First do a bulk query of the attributes that aren't cached */ -- bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 20, KM_SLEEP); -+ bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 20, KM_SLEEP); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16); -@@ -326,3 +326,3 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) - if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) { -- kmem_free(bulk, sizeof(sa_bulk_attr_t) * 20); -+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * 20); - goto done; -@@ -335,3 +335,3 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) - count = 0; -- sa_attrs = kmem_zalloc(sizeof(sa_bulk_attr_t) * 20, KM_SLEEP); -+ sa_attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * 20, KM_SLEEP); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zsb), NULL, &mode, 8); -@@ -392,4 +392,4 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) - zp->z_is_sa = B_TRUE; -- kmem_free(sa_attrs, sizeof(sa_bulk_attr_t) * 20); -- kmem_free(bulk, sizeof(sa_bulk_attr_t) * 20); -+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * 20); -+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * 20); - done: -diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c -index 9ae7ab5..a27ac69 100644 ---- a/module/zfs/zfs_vfsops.c -+++ b/module/zfs/zfs_vfsops.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -139,2 +140,8 @@ atime_changed_cb(void *arg, uint64_t newval) - static void -+relatime_changed_cb(void *arg, uint64_t newval) -+{ -+ ((zfs_sb_t *)arg)->z_relatime = newval; -+} -+ -+static void - xattr_changed_cb(void *arg, uint64_t newval) -@@ -156,2 +163,26 @@ xattr_changed_cb(void *arg, uint64_t newval) - static void -+acltype_changed_cb(void *arg, uint64_t newval) -+{ -+ zfs_sb_t *zsb = arg; -+ -+ switch (newval) { -+ case ZFS_ACLTYPE_OFF: -+ zsb->z_acl_type = ZFS_ACLTYPE_OFF; -+ zsb->z_sb->s_flags &= ~MS_POSIXACL; -+ break; -+ case ZFS_ACLTYPE_POSIXACL: -+#ifdef CONFIG_FS_POSIX_ACL -+ zsb->z_acl_type = ZFS_ACLTYPE_POSIXACL; -+ zsb->z_sb->s_flags |= MS_POSIXACL; -+#else -+ zsb->z_acl_type = ZFS_ACLTYPE_OFF; -+ zsb->z_sb->s_flags &= ~MS_POSIXACL; -+#endif /* CONFIG_FS_POSIX_ACL */ -+ break; -+ default: -+ break; -+ } -+} -+ -+static void - blksz_changed_cb(void *arg, uint64_t newval) -@@ -249,24 +280,30 @@ zfs_register_callbacks(zfs_sb_t *zsb) - ds = dmu_objset_ds(os); -+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - error = dsl_prop_register(ds, -- "atime", atime_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zsb); -+ error = error ? error : dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "xattr", xattr_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "recordsize", blksz_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "readonly", readonly_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "devices", devices_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "setuid", setuid_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "exec", exec_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "snapdir", snapdir_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "aclinherit", acl_inherit_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "vscan", vscan_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "nbmand", nbmand_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zsb); -+ error = error ? 
error : dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zsb); -+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (error) -@@ -285,14 +322,28 @@ unregister: - */ -- (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, -- zsb); -- (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "nbmand", nbmand_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME), -+ atime_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RELATIME), -+ relatime_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR), -+ xattr_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), -+ blksz_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY), -+ readonly_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), -+ devices_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID), -+ setuid_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC), -+ exec_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), -+ snapdir_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), -+ acltype_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), -+ acl_inherit_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), -+ vscan_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), -+ nbmand_changed_cb, zsb); - -@@ -306,4 +357,2 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - { -- int error = 0; -- - /* -@@ -312,3 +361,3 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -321,3 +370,3 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - if (data == NULL) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - -@@ -364,3 +413,3 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - } -- return (error); -+ return (0); - } -@@ -397,3 +446,3 @@ zfs_userquota_prop_to_obj(zfs_sb_t *zsb, zfs_userquota_prop_t type) - default: -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -413,3 +462,3 @@ zfs_userspace_many(zfs_sb_t *zsb, zfs_userquota_prop_t type, - if (!dmu_objset_userspace_present(zsb->z_os)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -458,3 +507,3 @@ id_to_fuidstr(zfs_sb_t *zsb, const char *domain, uid_t rid, - if (domainid == -1) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -476,3 +525,3 @@ zfs_userspace_one(zfs_sb_t *zsb, zfs_userquota_prop_t type, - if (!dmu_objset_userspace_present(zsb->z_os)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -504,6 +553,6 @@ 
zfs_set_userquota(zfs_sb_t *zsb, zfs_userquota_prop_t type, - if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (zsb->z_version < ZPL_VERSION_USERSPACE) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -638,3 +687,3 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp) - (u_longlong_t)spa_version(dmu_objset_spa(os))); -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - goto out; -@@ -653,2 +702,6 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp) - -+ if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &zval)) != 0) -+ goto out; -+ zsb->z_acl_type = (uint_t)zval; -+ - /* -@@ -727,3 +780,3 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp) - offsetof(znode_t, z_link_node)); -- rrw_init(&zsb->z_teardown_lock); -+ rrw_init(&zsb->z_teardown_lock, B_FALSE); - rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); -@@ -873,2 +926,5 @@ zfs_unregister_callbacks(zfs_sb_t *zsb) - -+ VERIFY(dsl_prop_unregister(ds, "relatime", relatime_changed_cb, -+ zsb) == 0); -+ - VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, -@@ -894,2 +950,5 @@ zfs_unregister_callbacks(zfs_sb_t *zsb) - -+ VERIFY(dsl_prop_unregister(ds, "acltype", acltype_changed_cb, -+ zsb) == 0); -+ - VERIFY(dsl_prop_unregister(ds, "aclinherit", -@@ -908,9 +967,8 @@ EXPORT_SYMBOL(zfs_unregister_callbacks); - /* -- * zfs_check_global_label: -- * Check that the hex label string is appropriate for the dataset -- * being mounted into the global_zone proper. -+ * Check that the hex label string is appropriate for the dataset being -+ * mounted into the global_zone proper. - * -- * Return an error if the hex label string is not default or -- * admin_low/admin_high. For admin_low labels, the corresponding -- * dataset must be readonly. -+ * Return an error if the hex label string is not default or -+ * admin_low/admin_high. For admin_low labels, the corresponding -+ * dataset must be readonly. - */ -@@ -929,6 +987,6 @@ zfs_check_global_label(const char *dsname, const char *hexsl) - zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) -- return (EACCES); -+ return (SET_ERROR(EACCES)); - return (rdonly ? 0 : EACCES); - } -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -1048,2 +1106,10 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - -+ /* -+ * If someone has not already unmounted this file system, -+ * drain the iput_taskq to ensure all active references to the -+ * zfs_sb_t have been handled only then can it be safely destroyed. -+ */ -+ if (zsb->z_os) -+ taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool(zsb->z_os))); -+ - rrw_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG); -@@ -1062,10 +1128,2 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - /* -- * If someone has not already unmounted this file system, -- * drain the iput_taskq to ensure all active references to the -- * zfs_sb_t have been handled only then can it be safely destroyed. -- */ -- if (zsb->z_os) -- taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool(zsb->z_os))); -- -- /* - * Close the zil. 
NB: Can't close the zil while zfs_inactive -@@ -1088,3 +1146,3 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - rrw_exit(&zsb->z_teardown_lock, FTAG); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -1101,6 +1159,4 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - zp = list_next(&zsb->z_all_znodes, zp)) { -- if (zp->z_sa_hdl) { -- ASSERT(atomic_read(&ZTOI(zp)->i_count) > 0); -+ if (zp->z_sa_hdl) - zfs_znode_dmu_fini(zp); -- } - } -@@ -1139,3 +1195,3 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - txg_wait_synced(dmu_objset_pool(zsb->z_os), 0); -- (void) dmu_objset_evict_dbufs(zsb->z_os); -+ dmu_objset_evict_dbufs(zsb->z_os); - -@@ -1208,5 +1264,10 @@ zfs_domount(struct super_block *sb, void *data, int silent) - readonly_changed_cb(zsb, B_TRUE); -- if ((error = dsl_prop_get_integer(osname,"xattr",&pval,NULL))) -+ if ((error = dsl_prop_get_integer(osname, -+ "xattr", &pval, NULL))) - goto out; - xattr_changed_cb(zsb, pval); -+ if ((error = dsl_prop_get_integer(osname, -+ "acltype", &pval, NULL))) -+ goto out; -+ acltype_changed_cb(zsb, pval); - zsb->z_issnap = B_TRUE; -@@ -1232,3 +1293,3 @@ zfs_domount(struct super_block *sb, void *data, int silent) - (void) zfs_umount(sb); -- error = ENOMEM; -+ error = SET_ERROR(ENOMEM); - goto out; -@@ -1347,3 +1408,3 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) - if (err) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1362,3 +1423,3 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1396,3 +1457,3 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1412,3 +1473,5 @@ EXPORT_SYMBOL(zfs_vget); - * Note, if successful, then we return with the 'z_teardown_lock' and -- * 'z_teardown_inactive_lock' write held. -+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying -+ * dataset and objset intact so that they can be atomically handed off during -+ * a subsequent rollback or recv operation and the resume thereafter. - */ -@@ -1422,4 +1485,2 @@ zfs_suspend_fs(zfs_sb_t *zsb) - -- dmu_objset_disown(zsb->z_os, zsb); -- - return (0); -@@ -1435,2 +1496,4 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname) - int err, err2; -+ znode_t *zp; -+ uint64_t sa_obj = 0; - -@@ -1439,43 +1502,59 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname) - -- err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zsb, &zsb->z_os); -- if (err) { -- zsb->z_os = NULL; -- } else { -- znode_t *zp; -- uint64_t sa_obj = 0; -+ /* -+ * We already own this, so just hold and rele it to update the -+ * objset_t, as the one we had before may have been evicted. 
-+ */ -+ VERIFY0(dmu_objset_hold(osname, zsb, &zsb->z_os)); -+ VERIFY3P(zsb->z_os->os_dsl_dataset->ds_owner, ==, zsb); -+ VERIFY(dsl_dataset_long_held(zsb->z_os->os_dsl_dataset)); -+ dmu_objset_rele(zsb->z_os, zsb); - -- err2 = zap_lookup(zsb->z_os, MASTER_NODE_OBJ, -- ZFS_SA_ATTRS, 8, 1, &sa_obj); -+ /* -+ * Make sure version hasn't changed -+ */ - -- if ((err || err2) && zsb->z_version >= ZPL_VERSION_SA) -- goto bail; -+ err = zfs_get_zplprop(zsb->z_os, ZFS_PROP_VERSION, -+ &zsb->z_version); - -+ if (err) -+ goto bail; - -- if ((err = sa_setup(zsb->z_os, sa_obj, -- zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0) -- goto bail; -+ err = zap_lookup(zsb->z_os, MASTER_NODE_OBJ, -+ ZFS_SA_ATTRS, 8, 1, &sa_obj); - -- VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0); -- zsb->z_rollback_time = jiffies; -+ if (err && zsb->z_version >= ZPL_VERSION_SA) -+ goto bail; - -- /* -- * Attempt to re-establish all the active inodes with their -- * dbufs. If a zfs_rezget() fails, then we unhash the inode -- * and mark it stale. This prevents a collision if a new -- * inode/object is created which must use the same inode -- * number. The stale inode will be be released when the -- * VFS prunes the dentry holding the remaining references -- * on the stale inode. -- */ -- mutex_enter(&zsb->z_znodes_lock); -- for (zp = list_head(&zsb->z_all_znodes); zp; -- zp = list_next(&zsb->z_all_znodes, zp)) { -- err2 = zfs_rezget(zp); -- if (err2) { -- remove_inode_hash(ZTOI(zp)); -- zp->z_is_stale = B_TRUE; -- } -+ if ((err = sa_setup(zsb->z_os, sa_obj, -+ zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0) -+ goto bail; -+ -+ if (zsb->z_version >= ZPL_VERSION_SA) -+ sa_register_update_callback(zsb->z_os, -+ zfs_sa_upgrade); -+ -+ VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0); -+ -+ zfs_set_fuid_feature(zsb); -+ zsb->z_rollback_time = jiffies; -+ -+ /* -+ * Attempt to re-establish all the active inodes with their -+ * dbufs. If a zfs_rezget() fails, then we unhash the inode -+ * and mark it stale. This prevents a collision if a new -+ * inode/object is created which must use the same inode -+ * number. The stale inode will be be released when the -+ * VFS prunes the dentry holding the remaining references -+ * on the stale inode. -+ */ -+ mutex_enter(&zsb->z_znodes_lock); -+ for (zp = list_head(&zsb->z_all_znodes); zp; -+ zp = list_next(&zsb->z_all_znodes, zp)) { -+ err2 = zfs_rezget(zp); -+ if (err2) { -+ remove_inode_hash(ZTOI(zp)); -+ zp->z_is_stale = B_TRUE; - } -- mutex_exit(&zsb->z_znodes_lock); - } -+ mutex_exit(&zsb->z_znodes_lock); - -@@ -1488,4 +1567,4 @@ bail: - /* -- * Since we couldn't reopen zfs_sb_t or, setup the -- * sa framework, force unmount this file system. -+ * Since we couldn't setup the sa framework, try to force -+ * unmount this file system. 
- */ -@@ -1506,6 +1585,6 @@ zfs_set_version(zfs_sb_t *zsb, uint64_t newvers) - if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (newvers < zsb->z_version) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1513,3 +1592,3 @@ zfs_set_version(zfs_sb_t *zsb, uint64_t newvers) - spa_version(dmu_objset_spa(zsb->z_os))) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -1552,5 +1631,4 @@ zfs_set_version(zfs_sb_t *zsb, uint64_t newvers) - -- spa_history_log_internal(LOG_DS_UPGRADE, -- dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", -- zsb->z_version, newvers, dmu_objset_id(os)); -+ spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, -+ "from %llu to %llu", zsb->z_version, newvers); - -@@ -1560,4 +1638,3 @@ zfs_set_version(zfs_sb_t *zsb, uint64_t newvers) - -- if (zsb->z_version >= ZPL_VERSION_FUID) -- zfs_set_fuid_feature(zsb); -+ zfs_set_fuid_feature(zsb); - -@@ -1574,3 +1651,3 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) - const char *pname; -- int error = ENOENT; -+ int error = SET_ERROR(ENOENT); - -@@ -1601,2 +1678,5 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) - break; -+ case ZFS_PROP_ACLTYPE: -+ *value = ZFS_ACLTYPE_OFF; -+ break; - default: -@@ -1623,2 +1703,3 @@ zfs_fini(void) - { -+ taskq_wait(system_taskq); - unregister_filesystem(&zpl_fs_type); -diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c -index 876d44b..91f743a 100644 ---- a/module/zfs/zfs_vnops.c -+++ b/module/zfs/zfs_vnops.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -108,7 +108,14 @@ - * -- * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). -- * This is critical because we don't want to block while holding locks. -- * Note, in particular, that if a lock is sometimes acquired before -- * the tx assigns, and sometimes after (e.g. z_lock), then failing to -- * use a non-blocking assign can deadlock the system. The scenario: -+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to -+ * dmu_tx_assign(). This is critical because we don't want to block -+ * while holding locks. -+ * -+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This -+ * reduces lock contention and CPU usage when we must wait (note that if -+ * throughput is constrained by the storage, nearly every transaction -+ * must wait). -+ * -+ * Note, in particular, that if a lock is sometimes acquired before -+ * the tx assigns, and sometimes after (e.g. z_lock), then failing -+ * to use a non-blocking assign can deadlock the system. The scenario: - * -@@ -120,3 +127,7 @@ - * If dmu_tx_assign() returns ERESTART and zsb->z_assign is TXG_NOWAIT, -- * then drop all locks, call dmu_tx_wait(), and try again. -+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent -+ * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, -+ * to indicate that this operation has already called dmu_tx_wait(). -+ * This will ensure that we don't retry forever, waiting a short bit -+ * each time. - * -@@ -125,3 +136,3 @@ - * in the intent log matches the order in which they actually occurred. 
-- * During ZIL replay the zfs_log_* functions will update the sequence -+ * During ZIL replay the zfs_log_* functions will update the sequence - * number to indicate the zil transaction has replayed. -@@ -142,3 +153,3 @@ - * dmu_tx_hold_*(); // hold each object you might modify -- * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign -+ * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - * if (error) { -@@ -148,2 +159,3 @@ - * if (error == ERESTART) { -+ * waited = B_TRUE; - * dmu_tx_wait(tx); -@@ -194,3 +206,3 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -202,3 +214,3 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -225,9 +237,5 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) - -- /* -- * Zero the synchronous opens in the znode. Under Linux the -- * zfs_close() hook is not symmetric with zfs_open(), it is -- * only called once when the last reference is dropped. -- */ -+ /* Decrement the synchronous opens in the znode */ - if (flag & O_SYNC) -- zp->z_sync_cnt = 0; -+ atomic_dec_32(&zp->z_sync_cnt); - -@@ -258,3 +266,3 @@ zfs_holey_common(struct inode *ip, int cmd, loff_t *off) - if (noff >= file_sz) { -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -277,3 +285,3 @@ zfs_holey_common(struct inode *ip, int cmd, loff_t *off) - } -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -359,3 +367,3 @@ update_pages(struct inode *ip, int64_t start, int len, - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when -- * the file is memory mapped. -+ * the file is memory mapped. - */ -@@ -420,4 +428,3 @@ unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. 
- * -@@ -446,3 +453,3 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -454,3 +461,3 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -471,3 +478,3 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - } -@@ -540,3 +547,3 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - if (error == ECKSUM) -- error = EIO; -+ error = SET_ERROR(EIO); - break; -@@ -550,3 +557,2 @@ out: - ZFS_ACCESSTIME_STAMP(zsb, zp); -- zfs_inode_update(zp); - ZFS_EXIT(zsb); -@@ -629,3 +635,3 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -640,3 +646,3 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -649,6 +655,5 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - } - --#ifdef HAVE_UIO_ZEROCOPY - /* -@@ -658,2 +663,3 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - */ -+#ifdef HAVE_UIO_ZEROCOPY - if ((uio->uio_extflg == UIO_XUIO) && -@@ -662,4 +668,4 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - else -+#endif - uio_prefaultpages(MIN(n, max_blksz), uio); --#endif /* HAVE_UIO_ZEROCOPY */ - -@@ -696,3 +702,3 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EFBIG); -+ return (SET_ERROR(EFBIG)); - } -@@ -715,3 +721,2 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - woff = uio->uio_loffset; --again: - if (zfs_owner_overquota(zsb, zp, B_FALSE) || -@@ -720,3 +725,3 @@ again: - dmu_return_arcbuf(abuf); -- error = EDQUOT; -+ error = SET_ERROR(EDQUOT); - break; -@@ -765,9 +770,4 @@ again: - zfs_sa_upgrade_txholds(tx, zp); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { -- if (error == ERESTART) { -- dmu_tx_wait(tx); -- dmu_tx_abort(tx); -- goto again; -- } - dmu_tx_abort(tx); -@@ -895,3 +895,4 @@ again: - -- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); -+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, -+ NULL, NULL); - dmu_tx_commit(tx); -@@ -989,3 +990,3 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (zfs_zget(zsb, object, &zp) != 0) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - if (zp->z_unlinked) { -@@ -996,3 +997,3 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os))); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1014,3 +1015,3 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (offset >= zp->z_size) { -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } else { -@@ -1041,6 +1042,6 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (lr->lr_offset >= zp->z_size) -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - #ifdef DEBUG - if (zil_fault_io) { -- error = EIO; -+ error = SET_ERROR(EIO); - zil_fault_io = 0; -@@ -1053,2 +1054,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (error == 0) { -+ blkptr_t *obp = dmu_buf_get_blkptr(db); -+ if (obp) { -+ ASSERT(BP_IS_HOLE(bp)); -+ *bp = *obp; -+ } -+ - zgd->zgd_db = 
db; -@@ -1118,4 +1125,3 @@ EXPORT_SYMBOL(zfs_access); - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -1137,5 +1143,5 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - if (!S_ISDIR(dip->i_mode)) { -- return (ENOTDIR); -+ return (SET_ERROR(ENOTDIR)); - } else if (zdp->z_sa_hdl == NULL) { -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -1162,3 +1168,3 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - iput(tvp); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } else { -@@ -1184,3 +1190,3 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1208,3 +1214,3 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - ZFS_EXIT(zsb); -- return (ENOTDIR); -+ return (SET_ERROR(ENOTDIR)); - } -@@ -1223,3 +1229,3 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -1251,4 +1257,3 @@ EXPORT_SYMBOL(zfs_lookup); - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -1276,2 +1281,3 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, - boolean_t have_acl = B_FALSE; -+ boolean_t waited = B_FALSE; - -@@ -1287,3 +1293,3 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1297,3 +1303,3 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -1331,3 +1337,3 @@ top: - if (strcmp(name, "..") == 0) -- error = EISDIR; -+ error = SET_ERROR(EISDIR); - ZFS_EXIT(zsb); -@@ -1358,3 +1364,3 @@ top: - zfs_acl_ids_free(&acl_ids); -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -1369,3 +1375,3 @@ top: - zfs_acl_ids_free(&acl_ids); -- error = EDQUOT; -+ error = SET_ERROR(EDQUOT); - goto out; -@@ -1388,3 +1394,3 @@ top: - } -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -1392,2 +1398,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -1428,3 +1435,3 @@ top: - if (excl) { -- error = EEXIST; -+ error = SET_ERROR(EEXIST); - goto out; -@@ -1435,3 +1442,3 @@ top: - if (S_ISDIR(ZTOI(zp)->i_mode)) { -- error = EISDIR; -+ error = SET_ERROR(EISDIR); - goto out; -@@ -1521,2 +1528,3 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr) - int zflg = ZEXISTS; -+ boolean_t waited = B_FALSE; - -@@ -1560,3 +1568,3 @@ top: - if (S_ISDIR(ip->i_mode)) { -- error = EPERM; -+ error = SET_ERROR(EPERM); - goto out; -@@ -1596,3 +1604,3 @@ top: - -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? 
TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -1603,2 +1611,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -1707,2 +1716,3 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - boolean_t fuid_dirtied; -+ boolean_t waited = B_FALSE; - -@@ -1718,3 +1728,3 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1726,3 +1736,3 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1732,3 +1742,3 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -1778,3 +1788,3 @@ top: - ZFS_EXIT(zsb); -- return (EDQUOT); -+ return (SET_ERROR(EDQUOT)); - } -@@ -1798,3 +1808,3 @@ top: - -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -1802,2 +1812,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -1860,4 +1871,3 @@ EXPORT_SYMBOL(zfs_mkdir); - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -1880,2 +1890,3 @@ zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr, - int zflg = ZEXISTS; -+ boolean_t waited = B_FALSE; - -@@ -1906,3 +1917,3 @@ top: - if (!S_ISDIR(ip->i_mode)) { -- error = ENOTDIR; -+ error = SET_ERROR(ENOTDIR); - goto out; -@@ -1911,3 +1922,3 @@ top: - if (ip == cwd) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -1933,3 +1944,3 @@ top: - zfs_sa_upgrade_txholds(tx, dzp); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? 
TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -1940,2 +1951,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -2006,8 +2018,8 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - zap_attribute_t zap; -- int outcount; - int error; - uint8_t prefetch; -+ uint8_t type; - int done = 0; - uint64_t parent; -- loff_t *pos = &(ctx->pos); -+ uint64_t offset; /* must be unsigned; checks for < 1 */ - -@@ -2023,3 +2035,2 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- error = 0; - if (zp->z_unlinked) -@@ -2027,3 +2038,5 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - -+ error = 0; - os = zsb->z_os; -+ offset = ctx->pos; - prefetch = zp->z_zn_prefetch; -@@ -2033,3 +2046,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- if (*pos <= 3) { -+ if (offset <= 3) { - /* -@@ -2042,3 +2055,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- zap_cursor_init_serialized(&zc, os, zp->z_id, *pos); -+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); - } -@@ -2048,4 +2061,2 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- outcount = 0; -- - while (!done) { -@@ -2055,3 +2066,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- if (*pos == 0) { -+ if (offset == 0) { - (void) strcpy(zap.za_name, "."); -@@ -2059,3 +2070,4 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - objnum = zp->z_id; -- } else if (*pos == 1) { -+ type = DT_DIR; -+ } else if (offset == 1) { - (void) strcpy(zap.za_name, ".."); -@@ -2063,3 +2075,4 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - objnum = parent; -- } else if (*pos == 2 && zfs_show_ctldir(zp)) { -+ type = DT_DIR; -+ } else if (offset == 2 && zfs_show_ctldir(zp)) { - (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); -@@ -2067,2 +2080,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - objnum = ZFSCTL_INO_ROOT; -+ type = DT_DIR; - } else { -@@ -2091,6 +2105,6 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - (u_longlong_t)zp->z_id, -- (u_longlong_t)*pos, -+ (u_longlong_t)offset, - zap.za_integer_length, - (u_longlong_t)zap.za_num_integers); -- error = ENXIO; -+ error = SET_ERROR(ENXIO); - goto update; -@@ -2099,2 +2113,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); -+ type = ZFS_DIRENT_TYPE(zap.za_first_integer); - } -@@ -2102,3 +2117,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - done = !dir_emit(ctx, zap.za_name, strlen(zap.za_name), -- objnum, ZFS_DIRENT_TYPE(zap.za_first_integer)); -+ objnum, type); - if (done) -@@ -2111,8 +2126,12 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - -- if (*pos > 2 || (*pos == 2 && !zfs_show_ctldir(zp))) { -+ /* -+ * Move to the next entry, fill in the previous offset. 
-+ */ -+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { - zap_cursor_advance(&zc); -- *pos = zap_cursor_serialize(&zc); -+ offset = zap_cursor_serialize(&zc); - } else { -- (*pos)++; -+ offset += 1; - } -+ ctx->pos = offset; - } -@@ -2126,3 +2145,2 @@ update: - ZFS_ACCESSTIME_STAMP(zsb, zp); -- zfs_inode_update(zp); - -@@ -2376,2 +2394,4 @@ zfs_getattr_fast(struct inode *ip, struct kstat *sp) - zfs_sb_t *zsb = ITOZSB(ip); -+ uint32_t blksize; -+ u_longlong_t nblocks; - -@@ -2385,3 +2405,6 @@ zfs_getattr_fast(struct inode *ip, struct kstat *sp) - -- sa_object_size(zp->z_sa_hdl, (uint32_t *)&sp->blksize, &sp->blocks); -+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); -+ sp->blksize = blksize; -+ sp->blocks = nblocks; -+ - if (unlikely(zp->z_blksz == 0)) { -@@ -2429,3 +2452,3 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - uint_t mask = vap->va_mask; -- uint_t saved_mask; -+ uint_t saved_mask = 0; - int trim_mask = 0; -@@ -2465,3 +2488,3 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2470,3 +2493,3 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - ZFS_EXIT(zsb); -- return (EISDIR); -+ return (SET_ERROR(EISDIR)); - } -@@ -2475,3 +2498,3 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2484,7 +2507,7 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - -- tmpxvattr = kmem_alloc(sizeof(xvattr_t), KM_SLEEP); -+ tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); - xva_init(tmpxvattr); - -- bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 7, KM_SLEEP); -- xattr_bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 7, KM_SLEEP); -+ bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP); -+ xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP); - -@@ -2512,4 +2535,6 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - if (mask & (ATTR_ATIME | ATTR_MTIME)) { -- if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || -- ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { -+ if (((mask & ATTR_ATIME) && -+ TIMESPEC_OVERFLOW(&vap->va_atime)) || -+ ((mask & ATTR_MTIME) && -+ TIMESPEC_OVERFLOW(&vap->va_mtime))) { - err = EOVERFLOW; -@@ -2825,8 +2850,5 @@ top: - -- err = dmu_tx_assign(tx, TXG_NOWAIT); -- if (err) { -- if (err == ERESTART) -- dmu_tx_wait(tx); -+ err = dmu_tx_assign(tx, TXG_WAIT); -+ if (err) - goto out; -- } - -@@ -3027,5 +3049,5 @@ out2: - out3: -- kmem_free(xattr_bulk, sizeof(sa_bulk_attr_t) * 7); -- kmem_free(bulk, sizeof(sa_bulk_attr_t) * 7); -- kmem_free(tmpxvattr, sizeof(xvattr_t)); -+ kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * 7); -+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * 7); -+ kmem_free(tmpxvattr, sizeof (xvattr_t)); - ZFS_EXIT(zsb); -@@ -3111,3 +3133,3 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) - if (oidp == szp->z_id) /* We're a descendant of szp */ -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -3143,4 +3165,3 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. 
- * -@@ -3164,2 +3185,3 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, - int zflg = 0; -+ boolean_t waited = B_FALSE; - -@@ -3169,5 +3191,5 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, - -- if (tdip->i_sb != sdip->i_sb) { -+ if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) { - ZFS_EXIT(zsb); -- return (EXDEV); -+ return (SET_ERROR(EXDEV)); - } -@@ -3179,3 +3201,3 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -3197,3 +3219,3 @@ top: - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -3342,3 +3364,3 @@ top: - if (!S_ISDIR(ZTOI(tzp)->i_mode)) { -- error = ENOTDIR; -+ error = SET_ERROR(ENOTDIR); - goto out; -@@ -3347,3 +3369,3 @@ top: - if (S_ISDIR(ZTOI(tzp)->i_mode)) { -- error = EISDIR; -+ error = SET_ERROR(EISDIR); - goto out; -@@ -3378,3 +3400,3 @@ top: - dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -3392,2 +3414,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -3478,4 +3501,3 @@ EXPORT_SYMBOL(zfs_rename); - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -3500,2 +3522,3 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, - uint64_t txtype = TX_SYMLINK; -+ boolean_t waited = B_FALSE; - -@@ -3510,3 +3533,3 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -3517,3 +3540,3 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, - ZFS_EXIT(zsb); -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - } -@@ -3549,3 +3572,3 @@ top: - ZFS_EXIT(zsb); -- return (EDQUOT); -+ return (SET_ERROR(EDQUOT)); - } -@@ -3564,3 +3587,3 @@ top: - zfs_fuid_txhold(zsb, tx); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? 
TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -3568,2 +3591,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -3660,3 +3684,2 @@ zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) - ZFS_ACCESSTIME_STAMP(zsb, zp); -- zfs_inode_update(zp); - ZFS_EXIT(zsb); -@@ -3695,2 +3718,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - uid_t owner; -+ boolean_t waited = B_FALSE; - -@@ -3708,8 +3732,8 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } - -- if (sip->i_sb != tdip->i_sb) { -+ if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) { - ZFS_EXIT(zsb); -- return (EXDEV); -+ return (SET_ERROR(EXDEV)); - } -@@ -3728,3 +3752,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -3734,3 +3758,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -3749,3 +3773,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -3755,3 +3779,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -3778,3 +3802,3 @@ top: - zfs_sa_upgrade_txholds(tx, dzp); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -3782,2 +3806,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -3817,3 +3842,3 @@ EXPORT_SYMBOL(zfs_link); - static void --zfs_putpage_commit_cb(void *arg, int error) -+zfs_putpage_commit_cb(void *arg) - { -@@ -3821,11 +3846,3 @@ zfs_putpage_commit_cb(void *arg, int error) - -- if (error) { -- __set_page_dirty_nobuffers(pp); -- -- if (error != ECANCELED) -- SetPageError(pp); -- } else { -- ClearPageError(pp); -- } -- -+ ClearPageError(pp); - end_page_writeback(pp); -@@ -3863,3 +3880,2 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - int cnt = 0; -- int sync; - -@@ -3870,5 +3886,5 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- pgoff = page_offset(pp); /* Page byte-offset in file */ -- offset = i_size_read(ip); /* File length in bytes */ -- pglen = MIN(PAGE_CACHE_SIZE, /* Page length in bytes */ -+ pgoff = page_offset(pp); /* Page byte-offset in file */ -+ offset = i_size_read(ip); /* File length in bytes */ -+ pglen = MIN(PAGE_CACHE_SIZE, /* Page length in bytes */ - P2ROUNDUP(offset, PAGE_CACHE_SIZE)-pgoff); -@@ -3904,7 +3920,2 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- sync = ((zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) || -- (wbc->sync_mode == WB_SYNC_ALL)); -- if (!sync) -- dmu_tx_callback_register(tx, zfs_putpage_commit_cb, pp); -- - dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); -@@ -3918,12 +3929,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- /* Will call all registered commit callbacks */ - dmu_tx_abort(tx); -- -- /* -- * For the synchronous case the commit callback must be -- * explicitly called because there is no registered callback. 
-- */ -- if (sync) -- zfs_putpage_commit_cb(pp, ECANCELED); -- -+ __set_page_dirty_nobuffers(pp); -+ ClearPageError(pp); -+ end_page_writeback(pp); - zfs_range_unlock(rl); -@@ -3950,3 +3955,4 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0); -+ zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, -+ zfs_putpage_commit_cb, pp); - dmu_tx_commit(tx); -@@ -3955,5 +3961,9 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- if (sync) { -+ if (wbc->sync_mode != WB_SYNC_NONE) { -+ /* -+ * Note that this is rarely called under writepages(), because -+ * writepages() normally handles the entire commit for -+ * performance reasons. -+ */ - zil_commit(zsb->z_log, zp->z_id); -- zfs_putpage_commit_cb(pp, err); - } -@@ -3966,3 +3976,3 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - * Update the system attributes when the inode has been dirtied. For the -- * moment we're conservative and only update the atime, mtime, and ctime. -+ * moment we only update the mode, atime, mtime, and ctime. - */ -@@ -3974,4 +3984,4 @@ zfs_dirty_inode(struct inode *ip, int flags) - dmu_tx_t *tx; -- uint64_t atime[2], mtime[2], ctime[2]; -- sa_bulk_attr_t bulk[3]; -+ uint64_t mode, atime[2], mtime[2], ctime[2]; -+ sa_bulk_attr_t bulk[4]; - int error; -@@ -3994,2 +4004,3 @@ zfs_dirty_inode(struct inode *ip, int flags) - mutex_enter(&zp->z_lock); -+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zsb), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zsb), NULL, &atime, 16); -@@ -3998,3 +4009,3 @@ zfs_dirty_inode(struct inode *ip, int flags) - -- /* Preserve the mtime and ctime provided by the inode */ -+ /* Preserve the mode, mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_atime, atime); -@@ -4002,2 +4013,5 @@ zfs_dirty_inode(struct inode *ip, int flags) - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); -+ mode = ip->i_mode; -+ -+ zp->z_mode = mode; - zp->z_atime_dirty = 0; -@@ -4083,13 +4097,13 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) - { -- znode_t *zp = ITOZ(ip); -- zfs_sb_t *zsb = ITOZSB(ip); -- objset_t *os; -+ znode_t *zp = ITOZ(ip); -+ zfs_sb_t *zsb = ITOZSB(ip); -+ objset_t *os; - struct page *cur_pp; -- u_offset_t io_off, total; -- size_t io_len; -- loff_t i_size; -- unsigned page_idx; -- int err; -+ u_offset_t io_off, total; -+ size_t io_len; -+ loff_t i_size; -+ unsigned page_idx; -+ int err; - -- os = zsb->z_os; -+ os = zsb->z_os; - io_len = nr_pages << PAGE_CACHE_SHIFT; -@@ -4116,3 +4130,3 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) - if (err == ECKSUM) -- err = EIO; -+ err = SET_ERROR(EIO); - return (err); -@@ -4132,4 +4146,3 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. 
- * -@@ -4188,3 +4201,3 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -4194,3 +4207,3 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, - ZFS_EXIT(zsb); -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -4199,3 +4212,3 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, - ZFS_EXIT(zsb); -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -4232,3 +4245,3 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) - default: -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4236,3 +4249,3 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) - if (lckdat->l_start < 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -4248,3 +4261,3 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) - default: -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4268,4 +4281,3 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -4289,3 +4301,3 @@ zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4299,3 +4311,3 @@ zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4350,3 +4362,3 @@ zfs_fid(struct inode *ip, fid_t *fidp) - ZFS_EXIT(zsb); -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - } -@@ -4454,3 +4466,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - if (xuio->xu_type != UIOTYPE_ZEROCOPY) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -4467,3 +4479,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4532,3 +4544,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4541,3 +4553,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4546,3 +4558,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c -index aaf17e1..2ab896f 100644 ---- a/module/zfs/zfs_znode.c -+++ b/module/zfs/zfs_znode.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -357,2 +357,3 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz, - struct inode *ip; -+ uint64_t mode; - uint64_t parent; -@@ -388,3 +389,3 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz, - -- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, &zp->z_mode, 8); -+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zsb), NULL, &zp->z_gen, 8); -@@ -408,2 +409,4 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz, - -+ zp->z_mode = mode; -+ - /* -@@ -442,3 +445,3 @@ error: - iput(ip); -- return NULL; -+ return (NULL); - } -@@ -649,3 +652,3 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - */ -- sa_attrs = kmem_alloc(sizeof(sa_bulk_attr_t) * ZPL_END, KM_PUSHPAGE); -+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_PUSHPAGE); - -@@ -751,3 +754,3 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - } -- kmem_free(sa_attrs, sizeof(sa_bulk_attr_t) * ZPL_END); -+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); - ZFS_OBJ_HOLD_EXIT(zsb, obj); -@@ -756,5 +759,4 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - /* -- * zfs_xvattr_set only updates the in-core attributes -- * it is assumed the caller will be doing an sa_bulk_update -- * to push the changes out -+ * Update in-core attributes. It is assumed the caller will be doing an -+ * sa_bulk_update to push the changes out. - */ -@@ -859,3 +861,2 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) - sa_handle_t *hdl; -- struct inode *ip; - -@@ -864,4 +865,2 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) - again: -- ip = ilookup(zsb->z_sb, obj_num); -- - ZFS_OBJ_HOLD_ENTER(zsb, obj_num); -@@ -871,3 +870,2 @@ again: - ZFS_OBJ_HOLD_EXIT(zsb, obj_num); -- iput(ip); - return (err); -@@ -882,4 +880,3 @@ again: - ZFS_OBJ_HOLD_EXIT(zsb, obj_num); -- iput(ip); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -888,18 +885,5 @@ again: - if (hdl != NULL) { -- if (ip == NULL) { -- /* -- * ilookup returned NULL, which means -- * the znode is dying - but the SA handle isn't -- * quite dead yet, we need to drop any locks -- * we're holding, re-schedule the task and try again. -- */ -- sa_buf_rele(db, NULL); -- ZFS_OBJ_HOLD_EXIT(zsb, obj_num); -- -- schedule(); -- goto again; -- } -- - zp = sa_get_userdata(hdl); - -+ - /* -@@ -915,5 +899,24 @@ again: - if (zp->z_unlinked) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { -- igrab(ZTOI(zp)); -+ /* -+ * If igrab() returns NULL the VFS has independently -+ * determined the inode should be evicted and has -+ * called iput_final() to start the eviction process. -+ * The SA handle is still valid but because the VFS -+ * requires that the eviction succeed we must drop -+ * our locks and references to allow the eviction to -+ * complete. The zfs_zget() may then be retried. -+ * -+ * This unlikely case could be optimized by registering -+ * a sops->drop_inode() callback. The callback would -+ * need to detect the active SA hold thereby informing -+ * the VFS that this inode should not be evicted. 
-+ */ -+ if (igrab(ZTOI(zp)) == NULL) { -+ mutex_exit(&zp->z_lock); -+ sa_buf_rele(db, NULL); -+ ZFS_OBJ_HOLD_EXIT(zsb, obj_num); -+ goto again; -+ } - *zpp = zp; -@@ -921,6 +924,5 @@ again: - } -- sa_buf_rele(db, NULL); - mutex_exit(&zp->z_lock); -+ sa_buf_rele(db, NULL); - ZFS_OBJ_HOLD_EXIT(zsb, obj_num); -- iput(ip); - return (err); -@@ -928,4 +930,2 @@ again: - -- ASSERT3P(ip, ==, NULL); -- - /* -@@ -943,3 +943,3 @@ again: - if (zp == NULL) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { -@@ -999,3 +999,3 @@ zfs_rezget(znode_t *zp) - ZFS_OBJ_HOLD_EXIT(zsb, obj_num); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1025,3 +1025,3 @@ zfs_rezget(znode_t *zp) - ZFS_OBJ_HOLD_EXIT(zsb, obj_num); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -1033,3 +1033,3 @@ zfs_rezget(znode_t *zp) - ZFS_OBJ_HOLD_EXIT(zsb, obj_num); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -1109,2 +1109,59 @@ zfs_zinactive(znode_t *zp) - -+static inline int -+zfs_compare_timespec(struct timespec *t1, struct timespec *t2) -+{ -+ if (t1->tv_sec < t2->tv_sec) -+ return (-1); -+ -+ if (t1->tv_sec > t2->tv_sec) -+ return (1); -+ -+ return (t1->tv_nsec - t2->tv_nsec); -+} -+ -+/* -+ * Determine whether the znode's atime must be updated. The logic mostly -+ * duplicates the Linux kernel's relatime_need_update() functionality. -+ * This function is only called if the underlying filesystem actually has -+ * atime updates enabled. -+ */ -+static inline boolean_t -+zfs_atime_need_update(znode_t *zp, timestruc_t *now) -+{ -+ if (!ZTOZSB(zp)->z_relatime) -+ return (B_TRUE); -+ -+ /* -+ * In relatime mode, only update the atime if the previous atime -+ * is earlier than either the ctime or mtime or if at least a day -+ * has passed since the last update of atime. -+ */ -+ if (zfs_compare_timespec(&ZTOI(zp)->i_mtime, &ZTOI(zp)->i_atime) >= 0) -+ return (B_TRUE); -+ -+ if (zfs_compare_timespec(&ZTOI(zp)->i_ctime, &ZTOI(zp)->i_atime) >= 0) -+ return (B_TRUE); -+ -+ if ((long)now->tv_sec - ZTOI(zp)->i_atime.tv_sec >= 24*60*60) -+ return (B_TRUE); -+ -+ return (B_FALSE); -+} -+ -+/* -+ * Prepare to update znode time stamps. -+ * -+ * IN: zp - znode requiring timestamp update -+ * flag - ATTR_MTIME, ATTR_CTIME, ATTR_ATIME flags -+ * have_tx - true of caller is creating a new txg -+ * -+ * OUT: zp - new atime (via underlying inode's i_atime) -+ * mtime - new mtime -+ * ctime - new ctime -+ * -+ * NOTE: The arguments are somewhat redundant. The following condition -+ * is always true: -+ * -+ * have_tx == !(flag & ATTR_ATIME) -+ */ - void -@@ -1115,13 +1172,22 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], - -+ ASSERT(have_tx == !(flag & ATTR_ATIME)); - gethrestime(&now); - -- if (have_tx) { /* will sa_bulk_update happen really soon? */ -+ /* -+ * NOTE: The following test intentionally does not update z_atime_dirty -+ * in the case where an ATIME update has been requested but for which -+ * the update is omitted due to relatime logic. The rationale being -+ * that if the flag was set somewhere else, we should leave it alone -+ * here. 
-+ */ -+ if (flag & ATTR_ATIME) { -+ if (zfs_atime_need_update(zp, &now)) { -+ ZFS_TIME_ENCODE(&now, zp->z_atime); -+ ZTOI(zp)->i_atime.tv_sec = zp->z_atime[0]; -+ ZTOI(zp)->i_atime.tv_nsec = zp->z_atime[1]; -+ zp->z_atime_dirty = 1; -+ } -+ } else { - zp->z_atime_dirty = 0; - zp->z_seq++; -- } else { -- zp->z_atime_dirty = 1; -- } -- -- if (flag & ATTR_ATIME) { -- ZFS_TIME_ENCODE(&now, zp->z_atime); - } -@@ -1185,4 +1251,3 @@ zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure - */ -@@ -1209,3 +1274,2 @@ zfs_extend(znode_t *zp, uint64_t end) - } --top: - tx = dmu_tx_create(zsb->z_os); -@@ -1229,9 +1293,4 @@ top: - -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { -- if (error == ERESTART) { -- dmu_tx_wait(tx); -- dmu_tx_abort(tx); -- goto top; -- } - dmu_tx_abort(tx); -@@ -1263,4 +1322,3 @@ top: - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure - */ -@@ -1302,4 +1360,3 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure - */ -@@ -1376,4 +1433,3 @@ top: - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure - */ -@@ -1410,3 +1466,3 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) - if (!lock_may_write(ip, off, length)) -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - } -@@ -1426,9 +1482,4 @@ log: - zfs_sa_upgrade_txholds(tx, zp); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { -- if (error == ERESTART) { -- dmu_tx_wait(tx); -- dmu_tx_abort(tx); -- goto log; -- } - dmu_tx_abort(tx); -@@ -1631,3 +1682,3 @@ zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, - sa_buf_rele(*db, tag); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1718,6 +1769,6 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, - for (;;) { -- uint64_t pobj; -+ uint64_t pobj = 0; - char component[MAXNAMELEN + 2]; - size_t complen; -- int is_xattrdir; -+ int is_xattrdir = 0; - -diff --git a/module/zfs/zil.c b/module/zfs/zil.c -index c179693..b69a7bf 100644 ---- a/module/zfs/zil.c -+++ b/module/zfs/zil.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -72,15 +72,15 @@ - zil_stats_t zil_stats = { -- { "zil_commit_count", KSTAT_DATA_UINT64 }, -- { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, -- { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, -- { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, -- { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, -- { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_commit_count", KSTAT_DATA_UINT64 }, -+ { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, - }; -@@ -90,5 +90,5 @@ static kstat_t *zil_ksp; - /* -- * This global ZIL switch affects all pools -+ * Disable intent logging replay. This global ZIL switch affects all pools. - */ --int zil_replay_disable = 0; /* disable intent logging replay */ -+int zil_replay_disable = 0; - -@@ -166,3 +166,3 @@ zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) - if (avl_find(t, dva, &where) != NULL) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - -@@ -237,3 +237,3 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { -- error = ECKSUM; -+ error = SET_ERROR(ECKSUM); - } else { -@@ -251,3 +251,3 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, - (zilc->zc_nused > (size - sizeof (*zilc)))) { -- error = ECKSUM; -+ error = SET_ERROR(ECKSUM); - } else { -@@ -259,3 +259,3 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, - -- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); -+ VERIFY(arc_buf_remove_ref(abuf, &abuf)); - } -@@ -321,3 +321,3 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - -- bzero(&next_blk, sizeof(blkptr_t)); -+ bzero(&next_blk, sizeof (blkptr_t)); - -@@ -358,3 +358,3 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); -- if (error) -+ if (error != 0) - break; -@@ -494,3 +494,3 @@ zilog_dirty(zilog_t *zilog, uint64_t txg) - -- if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg) == 0) { -+ if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { - /* up the hold count until we can be written out */ -@@ -660,4 +660,4 @@ zil_claim(const char *osname, void *txarg) - -- error = dmu_objset_hold(osname, FTAG, &os); -- if (error) { -+ error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os); -+ if (error != 0) { - cmn_err(CE_WARN, "can't open objset for %s", osname); -@@ -674,3 +674,3 @@ zil_claim(const char *osname, void *txarg) - dsl_dataset_dirty(dmu_objset_ds(os), tx); -- dmu_objset_rele(os, FTAG); -+ dmu_objset_disown(os, FTAG); - return (0); -@@ 
-699,3 +699,3 @@ zil_claim(const char *osname, void *txarg) - ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); -- dmu_objset_rele(os, FTAG); -+ dmu_objset_disown(os, FTAG); - return (0); -@@ -719,3 +719,3 @@ zil_check_log_chain(const char *osname, void *tx) - error = dmu_objset_hold(osname, FTAG, &os); -- if (error) { -+ if (error != 0) { - cmn_err(CE_WARN, "can't open objset for %s", osname); -@@ -915,3 +915,3 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) - 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), -- zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, -+ zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | -@@ -924,2 +924,3 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) - * Define a limited set of intent log block sizes. -+ * - * These must be a multiple of 4KB. Note only the amount used (again -@@ -1016,10 +1017,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) - use_slog = USE_SLOG(zilog); -- error = zio_alloc_zil(spa, txg, bp, zil_blksz, USE_SLOG(zilog)); -- if (use_slog) -- { -+ error = zio_alloc_zil(spa, txg, bp, zil_blksz, -+ USE_SLOG(zilog)); -+ if (use_slog) { - ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); - ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused); -- } -- else -- { -+ } else { - ZIL_STAT_BUMP(zil_itx_metaslab_normal_count); -@@ -1027,3 +1026,3 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) - } -- if (!error) { -+ if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); -@@ -1134,3 +1133,4 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) - ZIL_STAT_BUMP(zil_itx_needcopy_count); -- ZIL_STAT_INCR(zil_itx_needcopy_bytes, lrw->lr_length); -+ ZIL_STAT_INCR(zil_itx_needcopy_bytes, -+ lrw->lr_length); - } else { -@@ -1139,3 +1139,4 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) - ZIL_STAT_BUMP(zil_itx_indirect_count); -- ZIL_STAT_INCR(zil_itx_indirect_bytes, lrw->lr_length); -+ ZIL_STAT_INCR(zil_itx_indirect_bytes, -+ lrw->lr_length); - } -@@ -1147,3 +1148,3 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) - } -- if (error) { -+ if (error != 0) { - ASSERT(error == ENOENT || error == EEXIST || -@@ -1184,2 +1185,4 @@ zil_itx_create(uint64_t txtype, size_t lrsize) - itx->itx_sync = B_TRUE; /* default is synchronous */ -+ itx->itx_callback = NULL; -+ itx->itx_callback_data = NULL; - -@@ -1209,2 +1212,4 @@ zil_itxg_clean(itxs_t *itxs) - while ((itx = list_head(list)) != NULL) { -+ if (itx->itx_callback != NULL) -+ itx->itx_callback(itx->itx_callback_data); - list_remove(list, itx); -@@ -1219,2 +1224,4 @@ zil_itxg_clean(itxs_t *itxs) - while ((itx = list_head(list)) != NULL) { -+ if (itx->itx_callback != NULL) -+ itx->itx_callback(itx->itx_callback_data); - list_remove(list, itx); -@@ -1285,2 +1292,4 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) - while ((itx = list_head(&clean_list)) != NULL) { -+ if (itx->itx_callback != NULL) -+ itx->itx_callback(itx->itx_callback_data); - list_remove(&clean_list, itx); -@@ -1336,3 +1345,4 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) - itxg->itxg_txg = txg; -- itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_PUSHPAGE); -+ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), -+ KM_PUSHPAGE); - -@@ -1356,3 +1366,4 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) - if (ian == NULL) { -- ian = kmem_alloc(sizeof (itx_async_node_t), KM_PUSHPAGE); -+ ian = kmem_alloc(sizeof (itx_async_node_t), -+ KM_PUSHPAGE); - list_create(&ian->ia_list, sizeof (itx_t), -@@ -1530,3 +1541,4 @@ 
zil_commit_writer(zilog_t *zilog) - DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); -- while ((itx = list_head(&zilog->zl_itx_commit_list))) { -+ for (itx = list_head(&zilog->zl_itx_commit_list); itx != NULL; -+ itx = list_next(&zilog->zl_itx_commit_list, itx)) { - txg = itx->itx_lr.lrc_txg; -@@ -1536,5 +1548,2 @@ zil_commit_writer(zilog_t *zilog) - lwb = zil_lwb_commit(zilog, itx, lwb); -- list_remove(&zilog->zl_itx_commit_list, itx); -- kmem_free(itx, offsetof(itx_t, itx_lr) -- + itx->itx_lr.lrc_reclen); - } -@@ -1560,2 +1569,13 @@ zil_commit_writer(zilog_t *zilog) - -+ while ((itx = list_head(&zilog->zl_itx_commit_list))) { -+ txg = itx->itx_lr.lrc_txg; -+ ASSERT(txg); -+ -+ if (itx->itx_callback != NULL) -+ itx->itx_callback(itx->itx_callback_data); -+ list_remove(&zilog->zl_itx_commit_list, itx); -+ kmem_free(itx, offsetof(itx_t, itx_lr) -+ + itx->itx_lr.lrc_reclen); -+ } -+ - mutex_enter(&zilog->zl_lock); -@@ -1727,3 +1747,3 @@ zil_init(void) - zil_ksp = kstat_create("zfs", 0, "zil", "misc", -- KSTAT_TYPE_NAMED, sizeof(zil_stats) / sizeof(kstat_named_t), -+ KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); -@@ -1809,2 +1829,5 @@ zil_free(zilog_t *zilog) - -+ ASSERT0(zilog->zl_suspend); -+ ASSERT0(zilog->zl_suspending); -+ - ASSERT(list_is_empty(&zilog->zl_lwb_list)); -@@ -1907,2 +1930,4 @@ zil_close(zilog_t *zilog) - -+static char *suspend_tag = "zil suspending"; -+ - /* -@@ -1910,20 +1935,67 @@ zil_close(zilog_t *zilog) - * synchronous semantics, but we rely on txg_wait_synced() to do it. -- * We suspend the log briefly when taking a snapshot so that the snapshot -- * contains all the data it's supposed to, and has an empty intent log. -+ * On old version pools, we suspend the log briefly when taking a -+ * snapshot so that it will have an empty intent log. -+ * -+ * Long holds are not really intended to be used the way we do here -- -+ * held for such a short time. A concurrent caller of dsl_dataset_long_held() -+ * could fail. Therefore we take pains to only put a long hold if it is -+ * actually necessary. Fortunately, it will only be necessary if the -+ * objset is currently mounted (or the ZVOL equivalent). In that case it -+ * will already have a long hold, so we are not really making things any worse. -+ * -+ * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or -+ * zvol_state_t), and use their mechanism to prevent their hold from being -+ * dropped (e.g. VFS_HOLD()). However, that would be even more pain for -+ * very little gain. -+ * -+ * if cookiep == NULL, this does both the suspend & resume. -+ * Otherwise, it returns with the dataset "long held", and the cookie -+ * should be passed into zil_resume(). - */ - int --zil_suspend(zilog_t *zilog) -+zil_suspend(const char *osname, void **cookiep) - { -- const zil_header_t *zh = zilog->zl_header; -+ objset_t *os; -+ zilog_t *zilog; -+ const zil_header_t *zh; -+ int error; -+ -+ error = dmu_objset_hold(osname, suspend_tag, &os); -+ if (error != 0) -+ return (error); -+ zilog = dmu_objset_zil(os); - - mutex_enter(&zilog->zl_lock); -+ zh = zilog->zl_header; -+ - if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ - mutex_exit(&zilog->zl_lock); -- return (EBUSY); -+ dmu_objset_rele(os, suspend_tag); -+ return (SET_ERROR(EBUSY)); - } -- if (zilog->zl_suspend++ != 0) { -+ -+ /* -+ * Don't put a long hold in the cases where we can avoid it. This -+ * is when there is no cookie so we are doing a suspend & resume -+ * (i.e. 
called from zil_vdev_offline()), and there's nothing to do -+ * for the suspend because it's already suspended, or there's no ZIL. -+ */ -+ if (cookiep == NULL && !zilog->zl_suspending && -+ (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { -+ mutex_exit(&zilog->zl_lock); -+ dmu_objset_rele(os, suspend_tag); -+ return (0); -+ } -+ -+ dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); -+ dsl_pool_rele(dmu_objset_pool(os), suspend_tag); -+ -+ zilog->zl_suspend++; -+ -+ if (zilog->zl_suspend > 1) { - /* -- * Someone else already began a suspend. -+ * Someone else is already suspending it. - * Just wait for them to finish. - */ -+ - while (zilog->zl_suspending) -@@ -1931,4 +2003,23 @@ zil_suspend(zilog_t *zilog) - mutex_exit(&zilog->zl_lock); -+ -+ if (cookiep == NULL) -+ zil_resume(os); -+ else -+ *cookiep = os; -+ return (0); -+ } -+ -+ /* -+ * If there is no pointer to an on-disk block, this ZIL must not -+ * be active (e.g. filesystem not mounted), so there's nothing -+ * to clean up. -+ */ -+ if (BP_IS_HOLE(&zh->zh_log)) { -+ ASSERT(cookiep != NULL); /* fast path already handled */ -+ -+ *cookiep = os; -+ mutex_exit(&zilog->zl_lock); - return (0); - } -+ - zilog->zl_suspending = B_TRUE; -@@ -1945,2 +2036,6 @@ zil_suspend(zilog_t *zilog) - -+ if (cookiep == NULL) -+ zil_resume(os); -+ else -+ *cookiep = os; - return (0); -@@ -1949,4 +2044,7 @@ zil_suspend(zilog_t *zilog) - void --zil_resume(zilog_t *zilog) -+zil_resume(void *cookie) - { -+ objset_t *os = cookie; -+ zilog_t *zilog = dmu_objset_zil(os); -+ - mutex_enter(&zilog->zl_lock); -@@ -1955,2 +2053,4 @@ zil_resume(zilog_t *zilog) - mutex_exit(&zilog->zl_lock); -+ dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); -+ dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); - } -@@ -2027,3 +2127,3 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) - zr->zr_lr + reclen); -- if (error) -+ if (error != 0) - return (zil_replay_error(zilog, lr, error)); -@@ -2048,3 +2148,3 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); -- if (error) { -+ if (error != 0) { - /* -@@ -2058,3 +2158,3 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); -- if (error) -+ if (error != 0) - return (zil_replay_error(zilog, lr, error)); -@@ -2130,17 +2230,8 @@ zil_vdev_offline(const char *osname, void *arg) - { -- objset_t *os; -- zilog_t *zilog; - int error; - -- error = dmu_objset_hold(osname, FTAG, &os); -- if (error) -- return (error); -- -- zilog = dmu_objset_zil(os); -- if (zil_suspend(zilog) != 0) -- error = EEXIST; -- else -- zil_resume(zilog); -- dmu_objset_rele(os, FTAG); -- return (error); -+ error = zil_suspend(osname, NULL); -+ if (error != 0) -+ return (SET_ERROR(EEXIST)); -+ return (0); - } -diff --git a/module/zfs/zio.c b/module/zfs/zio.c -index ccefaf8..97f2549 100644 ---- a/module/zfs/zio.c -+++ b/module/zfs/zio.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
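The zil.c hunks above rework zil_suspend() to take an objset name plus an optional cookie pointer, and zil_resume() to take that cookie, so the dataset stays long-held for the duration of the suspend. A minimal sketch of how a caller might drive the reworked interface is shown below; the function name, the placeholder operation, and the error handling are illustrative assumptions, not code from the patch.

    #include <sys/zil.h>

    /*
     * Illustrative sketch only (not part of the patch): quiesce the intent
     * log around some operation. Passing a cookie pointer keeps the dataset
     * long-held until zil_resume(); passing NULL (as the reworked
     * zil_vdev_offline() does) performs the suspend and resume in one call.
     */
    static int
    example_quiesce_zil(const char *osname)
    {
            void *cookie;
            int error;

            /* Empty the ZIL; on success the dataset is long-held via cookie. */
            error = zil_suspend(osname, &cookie);
            if (error != 0)
                    return (error);

            /* ... perform the operation that needs an empty intent log ... */

            /* Drop the long hold taken by zil_suspend(). */
            zil_resume(cookie);

            return (0);
    }
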
-@@ -41,22 +41,2 @@ - * ========================================================================== -- * I/O priority table -- * ========================================================================== -- */ --uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { -- 0, /* ZIO_PRIORITY_NOW */ -- 0, /* ZIO_PRIORITY_SYNC_READ */ -- 0, /* ZIO_PRIORITY_SYNC_WRITE */ -- 0, /* ZIO_PRIORITY_LOG_WRITE */ -- 1, /* ZIO_PRIORITY_CACHE_FILL */ -- 1, /* ZIO_PRIORITY_AGG */ -- 4, /* ZIO_PRIORITY_FREE */ -- 4, /* ZIO_PRIORITY_ASYNC_WRITE */ -- 6, /* ZIO_PRIORITY_ASYNC_READ */ -- 10, /* ZIO_PRIORITY_RESILVER */ -- 20, /* ZIO_PRIORITY_SCRUB */ -- 2, /* ZIO_PRIORITY_DDT_PREFETCH */ --}; -- --/* -- * ========================================================================== - * I/O type descriptions -@@ -64,3 +44,3 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { - */ --char *zio_type_name[ZIO_TYPES] = { -+const char *zio_type_name[ZIO_TYPES] = { - "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl" -@@ -154,3 +134,3 @@ zio_init(void) - sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM); -- zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof(vdev_io_t), -+ zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof (vdev_io_t), - PAGESIZE, NULL, NULL, NULL, NULL, NULL, KMC_VMEM); -@@ -171,7 +151,17 @@ zio_init(void) - -+#ifndef _KERNEL -+ /* -+ * If we are using watchpoints, put each buffer on its own page, -+ * to eliminate the performance overhead of trapping to the -+ * kernel when modifying a non-watched buffer that shares the -+ * page with a watched buffer. -+ */ -+ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) -+ continue; -+#endif - if (size <= 4 * SPA_MINBLOCKSIZE) { - align = SPA_MINBLOCKSIZE; -- } else if (P2PHASE(size, PAGESIZE) == 0) { -+ } else if (IS_P2ALIGNED(size, PAGESIZE)) { - align = PAGESIZE; -- } else if (P2PHASE(size, p2 >> 2) == 0) { -+ } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = p2 >> 2; -@@ -219,3 +209,4 @@ zio_init(void) - */ -- zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); -+ if (zfs_mg_alloc_failures == 0) -+ zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); - -@@ -397,3 +388,3 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) - zio->io_data, data, zio->io_size, size) != 0) -- zio->io_error = EIO; -+ zio->io_error = SET_ERROR(EIO); - } -@@ -540,3 +531,6 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) - ASSERT3U(*countp, >, 0); -- if (--*countp == 0 && pio->io_stall == countp) { -+ -+ (*countp)--; -+ -+ if (*countp == 0 && pio->io_stall == countp) { - pio->io_stall = NULL; -@@ -564,3 +558,3 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, -- zio_type_t type, int priority, enum zio_flag flags, -+ zio_type_t type, zio_priority_t priority, enum zio_flag flags, - vdev_t *vd, uint64_t offset, const zbookmark_t *zb, -@@ -611,2 +605,3 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio->io_ready = NULL; -+ zio->io_physdone = NULL; - zio->io_done = done; -@@ -620,3 +615,2 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio->io_offset = offset; -- zio->io_deadline = 0; - zio->io_timestamp = 0; -@@ -637,2 +631,3 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio->io_child_count = 0; -+ zio->io_phys_children = 0; - zio->io_parent_count = 0; -@@ -697,3 +692,3 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - void *data, uint64_t size, 
zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, const zbookmark_t *zb) -+ zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb) - { -@@ -713,4 +708,5 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, -- zio_done_func_t *ready, zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, const zbookmark_t *zb) -+ zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, -+ void *private, -+ zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb) - { -@@ -725,5 +721,3 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zp->zp_copies > 0 && -- zp->zp_copies <= spa_max_replication(spa) && -- zp->zp_dedup <= 1 && -- zp->zp_dedup_verify <= 1); -+ zp->zp_copies <= spa_max_replication(spa)); - -@@ -735,2 +729,3 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio->io_ready = ready; -+ zio->io_physdone = physdone; - zio->io_prop = *zp; -@@ -742,4 +737,4 @@ zio_t * - zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, -- uint64_t size, zio_done_func_t *done, void *private, int priority, -- enum zio_flag flags, zbookmark_t *zb) -+ uint64_t size, zio_done_func_t *done, void *private, -+ zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb) - { -@@ -755,3 +750,3 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, - void --zio_write_override(zio_t *zio, blkptr_t *bp, int copies) -+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) - { -@@ -762,2 +757,9 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies) - -+ /* -+ * We must reset the io_prop to match the values that existed -+ * when the bp was first written by dmu_sync() keeping in mind -+ * that nopwrite and dedup are mutually exclusive. -+ */ -+ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; -+ zio->io_prop.zp_nopwrite = nopwrite; - zio->io_prop.zp_copies = copies; -@@ -769,3 +771,17 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) - { -- bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); -+ metaslab_check_free(spa, bp); -+ -+ /* -+ * Frees that are for the currently-syncing txg, are not going to be -+ * deferred, and which will not need to do a read (i.e. not GANG or -+ * DEDUP), can be processed immediately. Otherwise, put them on the -+ * in-memory list for later processing. -+ */ -+ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || -+ txg != spa->spa_syncing_txg || -+ spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { -+ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); -+ } else { -+ VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0))); -+ } - } -@@ -777,2 +793,3 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio_t *zio; -+ enum zio_stage stage = ZIO_FREE_PIPELINE; - -@@ -785,7 +802,16 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - -+ metaslab_check_free(spa, bp); - arc_freed(spa, bp); - -+ /* -+ * GANG and DEDUP blocks can induce a read (for the gang block header, -+ * or the DDT), so issue them asynchronously so that this thread is -+ * not tied up. 
-+ */ -+ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) -+ stage |= ZIO_STAGE_ISSUE_ASYNC; -+ - zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), -- NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, -- NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); -+ NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, -+ NULL, 0, NULL, ZIO_STAGE_OPEN, stage); - -@@ -825,3 +851,3 @@ zio_t * - zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, -- zio_done_func_t *done, void *private, int priority, enum zio_flag flags) -+ zio_done_func_t *done, void *private, enum zio_flag flags) - { -@@ -832,3 +858,3 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, -- ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, -+ ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); -@@ -841,3 +867,3 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, -- done, private, priority, flags)); -+ done, private, flags)); - } -@@ -850,3 +876,3 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, boolean_t labels) -+ zio_priority_t priority, enum zio_flag flags, boolean_t labels) - { -@@ -871,3 +897,3 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, boolean_t labels) -+ zio_priority_t priority, enum zio_flag flags, boolean_t labels) - { -@@ -906,4 +932,4 @@ zio_t * - zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, -- void *data, uint64_t size, int type, int priority, enum zio_flag flags, -- zio_done_func_t *done, void *private) -+ void *data, uint64_t size, int type, zio_priority_t priority, -+ enum zio_flag flags, zio_done_func_t *done, void *private) - { -@@ -942,2 +968,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - -+ zio->io_physdone = pio->io_physdone; -+ if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) -+ zio->io_logical->io_phys_children++; -+ - return (zio); -@@ -947,3 +977,3 @@ zio_t * - zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, -- int type, int priority, enum zio_flag flags, -+ int type, zio_priority_t priority, enum zio_flag flags, - zio_done_func_t *done, void *private) -@@ -956,3 +986,3 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, - data, size, done, private, type, priority, -- flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, -+ flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, - vd, offset, NULL, -@@ -967,3 +997,3 @@ zio_flush(zio_t *zio, vdev_t *vd) - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, -- NULL, NULL, ZIO_PRIORITY_NOW, -+ NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); -@@ -1051,2 +1081,15 @@ zio_write_bp_init(zio_t *zio) - -+ /* -+ * If we've been overridden and nopwrite is set then -+ * set the flag accordingly to indicate that a nopwrite -+ * has already occurred. 
-+ */ -+ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { -+ ASSERT(!zp->zp_dedup); -+ zio->io_flags |= ZIO_FLAG_NOPWRITE; -+ return (ZIO_PIPELINE_CONTINUE); -+ } -+ -+ ASSERT(!zp->zp_nopwrite); -+ - if (BP_IS_HOLE(bp) || !zp->zp_dedup) -@@ -1138,2 +1181,7 @@ zio_write_bp_init(zio_t *zio) - } -+ if (zp->zp_nopwrite) { -+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); -+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); -+ zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; -+ } - } -@@ -1404,2 +1452,3 @@ zio_reexecute(zio_t *pio) - pio->io_reexecute = 0; -+ pio->io_flags |= ZIO_FLAG_REEXECUTED; - pio->io_error = 0; -@@ -1805,3 +1854,2 @@ zio_write_gang_member_ready(zio_t *zio) - zio_t *pio = zio_unique_parent(zio); -- ASSERTV(zio_t *gio = zio->io_gang_leader;) - dva_t *cdva = zio->io_bp->blk_dva; -@@ -1810,2 +1858,3 @@ zio_write_gang_member_ready(zio_t *zio) - int d; -+ ASSERTV(zio_t *gio = zio->io_gang_leader); - -@@ -1887,4 +1936,5 @@ zio_write_gang_block(zio_t *pio) - zp.zp_copies = gio->io_prop.zp_copies; -- zp.zp_dedup = 0; -- zp.zp_dedup_verify = 0; -+ zp.zp_dedup = B_FALSE; -+ zp.zp_dedup_verify = B_FALSE; -+ zp.zp_nopwrite = B_FALSE; - -@@ -1892,3 +1942,3 @@ zio_write_gang_block(zio_t *pio) - (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, -- zio_write_gang_member_ready, NULL, &gn->gn_child[g], -+ zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), -@@ -1913,2 +1963,58 @@ zio_write_gang_block(zio_t *pio) - /* -+ * The zio_nop_write stage in the pipeline determines if allocating -+ * a new bp is necessary. By leveraging a cryptographically secure checksum, -+ * such as SHA256, we can compare the checksums of the new data and the old -+ * to determine if allocating a new block is required. The nopwrite -+ * feature can handle writes in either syncing or open context (i.e. zil -+ * writes) and as a result is mutually exclusive with dedup. -+ */ -+static int -+zio_nop_write(zio_t *zio) -+{ -+ blkptr_t *bp = zio->io_bp; -+ blkptr_t *bp_orig = &zio->io_bp_orig; -+ zio_prop_t *zp = &zio->io_prop; -+ -+ ASSERT(BP_GET_LEVEL(bp) == 0); -+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); -+ ASSERT(zp->zp_nopwrite); -+ ASSERT(!zp->zp_dedup); -+ ASSERT(zio->io_bp_override == NULL); -+ ASSERT(IO_IS_ALLOCATING(zio)); -+ -+ /* -+ * Check to see if the original bp and the new bp have matching -+ * characteristics (i.e. same checksum, compression algorithms, etc). -+ * If they don't then just continue with the pipeline which will -+ * allocate a new bp. -+ */ -+ if (BP_IS_HOLE(bp_orig) || -+ !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || -+ BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || -+ BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || -+ BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || -+ zp->zp_copies != BP_GET_NDVAS(bp_orig)) -+ return (ZIO_PIPELINE_CONTINUE); -+ -+ /* -+ * If the checksums match then reset the pipeline so that we -+ * avoid allocating a new bp and issuing any I/O. 
-+ */ -+ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { -+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); -+ ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); -+ ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); -+ ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); -+ ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, -+ sizeof (uint64_t)) == 0); -+ -+ *bp = *bp_orig; -+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; -+ zio->io_flags |= ZIO_FLAG_NOPWRITE; -+ } -+ -+ return (ZIO_PIPELINE_CONTINUE); -+} -+ -+/* - * ========================================================================== -@@ -2061,4 +2167,4 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) - zio->io_orig_size) != 0) -- error = EEXIST; -- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); -+ error = SET_ERROR(EEXIST); -+ VERIFY(arc_buf_remove_ref(abuf, &abuf)); - } -@@ -2186,3 +2292,3 @@ zio_ddt_write(zio_t *zio) - } else { -- zp->zp_dedup = 0; -+ zp->zp_dedup = B_FALSE; - } -@@ -2220,3 +2326,3 @@ zio_ddt_write(zio_t *zio) - dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, -- zio->io_orig_size, &czp, NULL, -+ zio->io_orig_size, &czp, NULL, NULL, - zio_ddt_ditto_write_done, dde, zio->io_priority, -@@ -2242,3 +2348,3 @@ zio_ddt_write(zio_t *zio) - cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, -- zio->io_orig_size, zp, zio_ddt_child_write_ready, -+ zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, - zio_ddt_child_write_done, dde, zio->io_priority, -@@ -2404,3 +2510,3 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, - new_bp, 1, txg, NULL, -- METASLAB_FASTWRITE | METASLAB_GANG_AVOID); -+ METASLAB_FASTWRITE); - } -@@ -2529,3 +2635,3 @@ zio_vdev_io_start(zio_t *zio) - if (!vdev_accessible(vd, zio)) { -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); -@@ -2566,3 +2672,3 @@ zio_vdev_io_done(zio_t *zio) - if (!vdev_accessible(vd, zio)) { -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - } else { -@@ -2651,3 +2757,3 @@ zio_vdev_io_assess(zio_t *zio) - !vdev_accessible(vd, zio)) -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - -@@ -2658,4 +2764,5 @@ zio_vdev_io_assess(zio_t *zio) - if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && -- vd != NULL && !vd->vdev_ops->vdev_op_leaf) -+ vd != NULL && !vd->vdev_ops->vdev_op_leaf) { - vd->vdev_cant_write = B_TRUE; -+ } - -@@ -2664,2 +2771,9 @@ zio_vdev_io_assess(zio_t *zio) - -+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf && -+ zio->io_physdone != NULL) { -+ ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); -+ ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); -+ zio->io_physdone(zio->io_logical); -+ } -+ - return (ZIO_PIPELINE_CONTINUE); -@@ -2814,3 +2928,4 @@ zio_ready(zio_t *zio) - ASSERT(IO_IS_ALLOCATING(zio)); -- ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); -+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || -+ (zio->io_flags & ZIO_FLAG_NOPWRITE)); - ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); -@@ -2882,3 +2997,4 @@ zio_done(zio_t *zio) - ASSERT(zio->io_bp->blk_pad[1] == 0); -- ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || -+ ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, -+ sizeof (blkptr_t)) == 0 || - (zio->io_bp == zio_unique_parent(zio)->io_bp)); -@@ -2888,6 +3004,10 @@ zio_done(zio_t *zio) - ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp)); -- ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); -+ ASSERT3U(zio->io_prop.zp_copies, <=, -+ BP_GET_NDVAS(zio->io_bp)); - 
ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || -- (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); -+ (BP_COUNT_GANG(zio->io_bp) == -+ BP_GET_NDVAS(zio->io_bp))); - } -+ if (zio->io_flags & ZIO_FLAG_NOPWRITE) -+ VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); - } -@@ -2915,3 +3035,3 @@ zio_done(zio_t *zio) - bcopy(zio->io_data, abuf, zio->io_size); -- bzero(abuf + zio->io_size, asize - zio->io_size); -+ bzero(abuf+zio->io_size, asize-zio->io_size); - } -@@ -2940,3 +3060,3 @@ zio_done(zio_t *zio) - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, -- zio->io_vd, zio, 0, 0); -+ zio->io_vd, zio, 0, 0); - } -@@ -2963,4 +3083,4 @@ zio_done(zio_t *zio) - spa_log_error(zio->io_spa, zio); -- zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, zio, -- 0, 0); -+ zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, -+ NULL, zio, 0, 0); - } -@@ -3014,3 +3134,3 @@ zio_done(zio_t *zio) - IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && -- !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) -+ !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) - zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); -@@ -3111,3 +3231,3 @@ zio_done(zio_t *zio) - if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && -- !BP_IS_HOLE(zio->io_bp)) { -+ !BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { - metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); -@@ -3158,2 +3278,3 @@ static zio_pipe_stage_t *zio_pipeline[] = { - zio_checksum_generate, -+ zio_nop_write, - zio_ddt_read_start, -@@ -3226,3 +3347,2 @@ EXPORT_SYMBOL(zio_handle_device_injection); - EXPORT_SYMBOL(zio_handle_label_injection); --EXPORT_SYMBOL(zio_priority_table); - EXPORT_SYMBOL(zio_type_name); -@@ -3240,3 +3360,3 @@ module_param(zfs_sync_pass_deferred_free, int, 0644); - MODULE_PARM_DESC(zfs_sync_pass_deferred_free, -- "defer frees starting in this pass"); -+ "Defer frees starting in this pass"); - -@@ -3244,3 +3364,3 @@ module_param(zfs_sync_pass_dont_compress, int, 0644); - MODULE_PARM_DESC(zfs_sync_pass_dont_compress, -- "don't compress starting in this pass"); -+ "Don't compress starting in this pass"); - -@@ -3248,3 +3368,3 @@ module_param(zfs_sync_pass_rewrite, int, 0644); - MODULE_PARM_DESC(zfs_sync_pass_rewrite, -- "rewrite new bps starting in this pass"); -+ "Rewrite new bps starting in this pass"); - #endif -diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c -index c8fe20f..bc73317 100644 ---- a/module/zfs/zio_checksum.c -+++ b/module/zfs/zio_checksum.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -203,3 +204,3 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) - if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -218,6 +219,6 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) - else -- return (ECKSUM); -+ return (SET_ERROR(ECKSUM)); - - if (nused > size) -- return (ECKSUM); -+ return (SET_ERROR(ECKSUM)); - -@@ -263,3 +264,3 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) - if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) -- return (ECKSUM); -+ return (SET_ERROR(ECKSUM)); - -diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c -index 1dc780d..5b63f0a 100644 ---- a/module/zfs/zio_compress.c -+++ b/module/zfs/zio_compress.c -@@ -29,2 +29,6 @@ - -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
-+ */ -+ - #include -@@ -55,3 +59,3 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {zle_compress, zle_decompress, 64, "zle"}, -- {lz4_compress, lz4_decompress, 0, "lz4"}, -+ {lz4_compress_zfs, lz4_decompress_zfs, 0, "lz4"}, - }; -@@ -132,3 +136,3 @@ zio_decompress_data(enum zio_compress c, void *src, void *dst, - if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c -index eb589c4..39ec590 100644 ---- a/module/zfs/zio_inject.c -+++ b/module/zfs/zio_inject.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -278,3 +278,3 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) - if (handler->zi_record.zi_error == ENXIO) { -- ret = EIO; -+ ret = SET_ERROR(EIO); - break; -@@ -418,3 +418,3 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) - if ((spa = spa_inject_addref(name)) == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -470,3 +470,3 @@ zio_inject_list_next(int *id, char *name, size_t buflen, - } else { -- ret = ENOENT; -+ ret = SET_ERROR(ENOENT); - } -@@ -497,3 +497,3 @@ zio_clear_fault(int id) - rw_exit(&inject_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -diff --git a/module/zfs/zpl_ctldir.c b/module/zfs/zpl_ctldir.c -index 1bb646f..9e587e3 100644 ---- a/module/zfs/zpl_ctldir.c -+++ b/module/zfs/zpl_ctldir.c -@@ -45,3 +45,3 @@ zpl_common_open(struct inode *ip, struct file *filp) - -- return generic_file_open(ip, filp); -+ return (generic_file_open(ip, filp)); - } -@@ -131,8 +131,8 @@ zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) - if (error == -ENOENT) -- return d_splice_alias(NULL, dentry); -+ return (d_splice_alias(NULL, dentry)); - else -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - } - -- return d_splice_alias(ip, dentry); -+ return (d_splice_alias(ip, dentry)); - } -@@ -176,3 +176,3 @@ zpl_snapdir_automount(struct path *path) - if (error) -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - -@@ -200,3 +200,3 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) - { -- return 0; -+ return (0); - } -@@ -239,3 +239,3 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, - if (error && error != -ENOENT) -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - -@@ -245,3 +245,3 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, - -- return d_splice_alias(ip, dentry); -+ return (d_splice_alias(ip, dentry)); - } -@@ -263,4 +263,6 @@ zpl_snapdir_iterate(struct file *filp, struct dir_context *ctx) - while (error == 0) { -+ dsl_pool_config_enter(dmu_objset_pool(zsb->z_os), FTAG); - error = -dmu_snapshot_list_next(zsb->z_os, MAXNAMELEN, -- snapname, &id, &(ctx->pos), &case_conflict); -+ snapname, &id, &ctx->pos, &case_conflict); -+ dsl_pool_config_exit(dmu_objset_pool(zsb->z_os), FTAG); - if (error) -@@ -334,3 +336,3 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dip, mode | S_IFDIR, cr); -@@ -344,3 +346,3 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof 
(vattr_t)); - ASSERT3S(error, <=, 0); -@@ -423,8 +425,8 @@ zpl_shares_lookup(struct inode *dip, struct dentry *dentry, - if (error == -ENOENT) -- return d_splice_alias(NULL, dentry); -+ return (d_splice_alias(NULL, dentry)); - else -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - } - -- return d_splice_alias(ip, dentry); -+ return (d_splice_alias(ip, dentry)); - } -@@ -497,6 +499,7 @@ zpl_shares_getattr(struct vfsmount *mnt, struct dentry *dentry, - error = -zfs_zget(zsb, zsb->z_shares_dir, &dzp); -- if (error == 0) -- error = -zfs_getattr_fast(dentry->d_inode, stat); -+ if (error == 0) { -+ error = -zfs_getattr_fast(ZTOI(dzp), stat); -+ iput(ZTOI(dzp)); -+ } - -- iput(ZTOI(dzp)); - ZFS_EXIT(zsb); -diff --git a/module/zfs/zpl_export.c b/module/zfs/zpl_export.c -index 94625e1..ac94494 100644 ---- a/module/zfs/zpl_export.c -+++ b/module/zfs/zpl_export.c -@@ -47,3 +47,3 @@ zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) - if (len_bytes < offsetof(fid_t, fid_data)) -- return 255; -+ return (255); - -@@ -78,3 +78,3 @@ zpl_dentry_obtain_alias(struct inode *ip) - -- return result; -+ return (result); - } -@@ -94,3 +94,3 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, - len_bytes < offsetof(fid_t, fid_data) + fid->fid_len) -- return ERR_PTR(-EINVAL); -+ return (ERR_PTR(-EINVAL)); - -@@ -99,3 +99,3 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, - if (rc != 0) -- return ERR_PTR(-rc); -+ return (ERR_PTR(-rc)); - -@@ -103,3 +103,3 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, - -- return zpl_dentry_obtain_alias(ip); -+ return (zpl_dentry_obtain_alias(ip)); - } -@@ -119,5 +119,5 @@ zpl_get_parent(struct dentry *child) - if (error) -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - -- return zpl_dentry_obtain_alias(ip); -+ return (zpl_dentry_obtain_alias(ip)); - } -@@ -136,3 +136,3 @@ zpl_commit_metadata(struct inode *inode) - -- return error; -+ return (error); - } -@@ -141,7 +141,7 @@ zpl_commit_metadata(struct inode *inode) - const struct export_operations zpl_export_operations = { -- .encode_fh = zpl_encode_fh, -- .fh_to_dentry = zpl_fh_to_dentry, -- .get_parent = zpl_get_parent, -+ .encode_fh = zpl_encode_fh, -+ .fh_to_dentry = zpl_fh_to_dentry, -+ .get_parent = zpl_get_parent, - #ifdef HAVE_COMMIT_METADATA -- .commit_metadata= zpl_commit_metadata, -+ .commit_metadata = zpl_commit_metadata, - #endif /* HAVE_COMMIT_METADATA */ -diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c -index 6598c17..3737bb5 100644 ---- a/module/zfs/zpl_file.c -+++ b/module/zfs/zpl_file.c -@@ -25,2 +25,3 @@ - -+#include - #include -@@ -37,2 +38,6 @@ zpl_open(struct inode *ip, struct file *filp) - -+ error = generic_file_open(ip, filp); -+ if (error) -+ return (error); -+ - crhold(cr); -@@ -42,6 +47,3 @@ zpl_open(struct inode *ip, struct file *filp) - -- if (error) -- return (error); -- -- return generic_file_open(ip, filp); -+ return (error); - } -@@ -169,5 +171,6 @@ ssize_t - zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t pos, -- uio_seg_t segment, int flags, cred_t *cr) -+ uio_seg_t segment, int flags, cred_t *cr) - { - int error; -+ ssize_t read; - struct iovec iov; -@@ -189,3 +192,6 @@ zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t pos, - -- return (len - uio.uio_resid); -+ read = len - uio.uio_resid; -+ task_io_account_read(read); -+ -+ return (read); - } -@@ -215,2 +221,3 @@ zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t pos, - int error; -+ ssize_t wrote; - 
struct iovec iov; -@@ -232,3 +239,6 @@ zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t pos, - -- return (len - uio.uio_resid); -+ wrote = len - uio.uio_resid; -+ task_io_account_write(wrote); -+ -+ return (wrote); - } -@@ -272,3 +282,3 @@ zpl_llseek(struct file *filp, loff_t offset, int whence) - -- return generic_file_llseek(filp, offset, whence); -+ return (generic_file_llseek(filp, offset, whence)); - } -@@ -373,3 +383,3 @@ zpl_readpage(struct file *filp, struct page *pp) - unlock_page(pp); -- return error; -+ return (error); - } -@@ -414,3 +424,39 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) - { -- return write_cache_pages(mapping, wbc, zpl_putpage, mapping); -+ znode_t *zp = ITOZ(mapping->host); -+ zfs_sb_t *zsb = ITOZSB(mapping->host); -+ enum writeback_sync_modes sync_mode; -+ int result; -+ -+ ZFS_ENTER(zsb); -+ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) -+ wbc->sync_mode = WB_SYNC_ALL; -+ ZFS_EXIT(zsb); -+ sync_mode = wbc->sync_mode; -+ -+ /* -+ * We don't want to run write_cache_pages() in SYNC mode here, because -+ * that would make putpage() wait for a single page to be committed to -+ * disk every single time, resulting in atrocious performance. Instead -+ * we run it once in non-SYNC mode so that the ZIL gets all the data, -+ * and then we commit it all in one go. -+ */ -+ wbc->sync_mode = WB_SYNC_NONE; -+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); -+ if (sync_mode != wbc->sync_mode) { -+ ZFS_ENTER(zsb); -+ ZFS_VERIFY_ZP(zp); -+ zil_commit(zsb->z_log, zp->z_id); -+ ZFS_EXIT(zsb); -+ -+ /* -+ * We need to call write_cache_pages() again (we can't just -+ * return after the commit) because the previous call in -+ * non-SYNC mode does not guarantee that we got all the dirty -+ * pages (see the implementation of write_cache_pages() for -+ * details). That being said, this is a no-op in most cases. 
-+ */ -+ wbc->sync_mode = sync_mode; -+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); -+ } -+ return (result); - } -@@ -426,3 +472,6 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) - { -- return zpl_putpage(pp, wbc, pp->mapping); -+ if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) -+ wbc->sync_mode = WB_SYNC_ALL; -+ -+ return (zpl_putpage(pp, wbc, pp->mapping)); - } -@@ -489,3 +538,3 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) - { -- return zpl_ioctl(filp, cmd, arg); -+ return (zpl_ioctl(filp, cmd, arg)); - } -@@ -498,3 +547,3 @@ const struct address_space_operations zpl_address_space_operations = { - .writepage = zpl_writepage, -- .writepages = zpl_writepages, -+ .writepages = zpl_writepages, - }; -@@ -510,7 +559,7 @@ const struct file_operations zpl_file_operations = { - #ifdef HAVE_FILE_FALLOCATE -- .fallocate = zpl_fallocate, -+ .fallocate = zpl_fallocate, - #endif /* HAVE_FILE_FALLOCATE */ -- .unlocked_ioctl = zpl_ioctl, -+ .unlocked_ioctl = zpl_ioctl, - #ifdef CONFIG_COMPAT -- .compat_ioctl = zpl_compat_ioctl, -+ .compat_ioctl = zpl_compat_ioctl, - #endif -diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c -index ab1fe68..c009807 100644 ---- a/module/zfs/zpl_inode.c -+++ b/module/zfs/zpl_inode.c -@@ -44,3 +44,3 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) - if (dlen(dentry) > ZFS_MAXNAMELEN) -- return ERR_PTR(-ENAMETOOLONG); -+ return (ERR_PTR(-ENAMETOOLONG)); - -@@ -60,8 +60,8 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) - if (error == -ENOENT) -- return d_splice_alias(NULL, dentry); -+ return (d_splice_alias(NULL, dentry)); - else -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - } - -- return d_splice_alias(ip, dentry); -+ return (d_splice_alias(ip, dentry)); - } -@@ -99,3 +99,3 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); -@@ -104,4 +104,4 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - if (error == 0) { -- error = zpl_xattr_security_init(ip, dir, &dentry->d_name); -- VERIFY3S(error, ==, 0); -+ VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name)); -+ VERIFY0(zpl_init_acl(ip, dir)); - d_instantiate(dentry, ip); -@@ -109,3 +109,3 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -133,3 +133,3 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); -@@ -138,6 +138,9 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); -- if (error == 0) -+ if (error == 0) { -+ VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name)); -+ VERIFY0(zpl_init_acl(ip, dir)); - d_instantiate(dentry, ip); -+ } - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -145,3 +148,3 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - -- return (-error); -+ return (error); - } -@@ -171,3 +174,3 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) - crhold(cr); -- vap = 
kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode | S_IFDIR, cr); -@@ -175,6 +178,9 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) - error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL); -- if (error == 0) -+ if (error == 0) { -+ VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name)); -+ VERIFY0(zpl_init_acl(ip, dir)); - d_instantiate(dentry, ip); -+ } - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -225,2 +231,3 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) - { -+ struct inode *ip = dentry->d_inode; - cred_t *cr = CRED(); -@@ -229,3 +236,3 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) - -- error = inode_change_ok(dentry->d_inode, ia); -+ error = inode_change_ok(ip, ia); - if (error) -@@ -234,3 +241,3 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; -@@ -244,5 +251,7 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) - -- error = -zfs_setattr(dentry->d_inode, vap, 0, cr); -+ error = -zfs_setattr(ip, vap, 0, cr); -+ if (!error && (ia->ia_valid & ATTR_MODE)) -+ error = zpl_chmod_acl(ip); - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -277,3 +286,3 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); -@@ -281,6 +290,8 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) - error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0); -- if (error == 0) -+ if (error == 0) { -+ VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name)); - d_instantiate(dentry, ip); -+ } - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -340,3 +351,3 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) - if (ip->i_nlink >= ZFS_LINK_MAX) -- return -EMLINK; -+ return (-EMLINK); - -@@ -362,3 +373,3 @@ out: - static void --zpl_truncate_range(struct inode* ip, loff_t start, loff_t end) -+zpl_truncate_range(struct inode *ip, loff_t start, loff_t end) - { -@@ -393,3 +404,3 @@ zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len) - { -- return zpl_fallocate_common(ip, mode, offset, len); -+ return (zpl_fallocate_common(ip, mode, offset, len)); - } -@@ -457,2 +468,11 @@ const struct inode_operations zpl_inode_operations = { - #endif /* HAVE_INODE_FALLOCATE */ -+#if defined(CONFIG_FS_POSIX_ACL) -+#if defined(HAVE_GET_ACL) -+ .get_acl = zpl_get_acl, -+#elif defined(HAVE_CHECK_ACL) -+ .check_acl = zpl_check_acl, -+#elif defined(HAVE_PERMISSION) -+ .permission = zpl_permission, -+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ -+#endif /* CONFIG_FS_POSIX_ACL */ - }; -@@ -475,2 +495,11 @@ const struct inode_operations zpl_dir_inode_operations = { - .listxattr = zpl_xattr_list, -+#if defined(CONFIG_FS_POSIX_ACL) -+#if defined(HAVE_GET_ACL) -+ .get_acl = zpl_get_acl, -+#elif defined(HAVE_CHECK_ACL) -+ .check_acl = zpl_check_acl, -+#elif defined(HAVE_PERMISSION) -+ .permission = zpl_permission, -+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ -+#endif /* CONFIG_FS_POSIX_ACL */ - }; -@@ -496,2 +525,11 @@ const struct 
inode_operations zpl_special_inode_operations = { - .listxattr = zpl_xattr_list, -+#if defined(CONFIG_FS_POSIX_ACL) -+#if defined(HAVE_GET_ACL) -+ .get_acl = zpl_get_acl, -+#elif defined(HAVE_CHECK_ACL) -+ .check_acl = zpl_check_acl, -+#elif defined(HAVE_PERMISSION) -+ .permission = zpl_permission, -+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ -+#endif /* CONFIG_FS_POSIX_ACL */ - }; -diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c -index eee4a50..45639a6 100644 ---- a/module/zfs/zpl_super.c -+++ b/module/zfs/zpl_super.c -@@ -46,3 +46,3 @@ zpl_inode_destroy(struct inode *ip) - { -- ASSERT(atomic_read(&ip->i_count) == 0); -+ ASSERT(atomic_read(&ip->i_count) == 0); - zfs_inode_destroy(ip); -@@ -181,15 +181,41 @@ zpl_umount_begin(struct super_block *sb) - /* -- * The Linux VFS automatically handles the following flags: -- * MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, MNT_NOATIME, MNT_READONLY -+ * ZFS specific features must be explicitly handled here, the VFS will -+ * automatically handled the following generic functionality. -+ * -+ * MNT_NOSUID, -+ * MNT_NODEV, -+ * MNT_NOEXEC, -+ * MNT_NOATIME, -+ * MNT_NODIRATIME, -+ * MNT_READONLY, -+ * MNT_STRICTATIME, -+ * MS_SYNCHRONOUS, -+ * MS_DIRSYNC, -+ * MS_MANDLOCK. - */ --#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY - static int --zpl_show_options(struct seq_file *seq, struct dentry *root) -+__zpl_show_options(struct seq_file *seq, zfs_sb_t *zsb) - { -- zfs_sb_t *zsb = root->d_sb->s_fs_info; -- - seq_printf(seq, ",%s", zsb->z_flags & ZSB_XATTR ? "xattr" : "noxattr"); - -+#ifdef CONFIG_FS_POSIX_ACL -+ switch (zsb->z_acl_type) { -+ case ZFS_ACLTYPE_POSIXACL: -+ seq_puts(seq, ",posixacl"); -+ break; -+ default: -+ seq_puts(seq, ",noacl"); -+ break; -+ } -+#endif /* CONFIG_FS_POSIX_ACL */ -+ - return (0); - } -+ -+#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY -+static int -+zpl_show_options(struct seq_file *seq, struct dentry *root) -+{ -+ return (__zpl_show_options(seq, root->d_sb->s_fs_info)); -+} - #else -@@ -198,7 +224,3 @@ zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp) - { -- zfs_sb_t *zsb = vfsp->mnt_sb->s_fs_info; -- -- seq_printf(seq, ",%s", zsb->z_flags & ZSB_XATTR ? 
"xattr" : "noxattr"); -- -- return (0); -+ return (__zpl_show_options(seq, vfsp->mnt_sb->s_fs_info)); - } -@@ -224,3 +246,3 @@ zpl_mount(struct file_system_type *fs_type, int flags, - -- return mount_nodev(fs_type, flags, &zmd, zpl_fill_super); -+ return (mount_nodev(fs_type, flags, &zmd, zpl_fill_super)); - } -@@ -233,3 +255,3 @@ zpl_get_sb(struct file_system_type *fs_type, int flags, - -- return get_sb_nodev(fs_type, flags, &zmd, zpl_fill_super, mnt); -+ return (get_sb_nodev(fs_type, flags, &zmd, zpl_fill_super, mnt)); - } -@@ -267,4 +289,2 @@ zpl_prune_sb(struct super_block *sb, void *arg) - ASSERT3S(error, <=, 0); -- -- return; - } -@@ -274,3 +294,3 @@ zpl_prune_sbs(int64_t bytes_to_scan, void *private) - { -- unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t)); -+ unsigned long nr_to_scan = (bytes_to_scan / sizeof (znode_t)); - -@@ -291,7 +311,7 @@ zpl_prune_sbs(int64_t bytes_to_scan, void *private) - { -- unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t)); -+ unsigned long nr_to_scan = (bytes_to_scan / sizeof (znode_t)); - -- shrink_dcache_memory(nr_to_scan, GFP_KERNEL); -- shrink_icache_memory(nr_to_scan, GFP_KERNEL); -- kmem_reap(); -+ shrink_dcache_memory(nr_to_scan, GFP_KERNEL); -+ shrink_icache_memory(nr_to_scan, GFP_KERNEL); -+ kmem_reap(); - } -@@ -324,3 +344,3 @@ zpl_free_cached_objects(struct super_block *sb, int nr_to_scan) - { -- arc_adjust_meta(nr_to_scan * sizeof(znode_t), B_FALSE); -+ /* noop */ - } -diff --git a/module/zfs/zpl_xattr.c b/module/zfs/zpl_xattr.c -index d79d35b..c5c15a2 100644 ---- a/module/zfs/zpl_xattr.c -+++ b/module/zfs/zpl_xattr.c -@@ -96,3 +96,3 @@ zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) - { -- if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) -+ if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) - if (!(ITOZSB(xf->inode)->z_flags & ZSB_XATTR)) -@@ -100,3 +100,3 @@ zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) - -- if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) -+ if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) - if (!capable(CAP_SYS_ADMIN)) -@@ -196,3 +196,3 @@ zpl_xattr_list_sa(xattr_filldir_t *xf) - error = zpl_xattr_filldir(xf, nvpair_name(nvp), -- strlen(nvpair_name(nvp))); -+ strlen(nvpair_name(nvp))); - if (error) -@@ -357,8 +357,16 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, - ssize_t wrote; -- int error; -+ int lookup_flags, error; - const int xattr_mode = S_IFREG | 0644; - -- /* Lookup the xattr directory and create it if required. */ -- error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR | CREATE_XATTR_DIR, -- cr, NULL, NULL); -+ /* -+ * Lookup the xattr directory. When we're adding an entry pass -+ * CREATE_XATTR_DIR to ensure the xattr directory is created. -+ * When removing an entry this flag is not passed to avoid -+ * unnecessarily creating a new xattr directory. 
-+ */ -+ lookup_flags = LOOKUP_XATTR; -+ if (value != NULL) -+ lookup_flags |= CREATE_XATTR_DIR; -+ -+ error = -zfs_lookup(ip, NULL, &dxip, lookup_flags, cr, NULL, NULL); - if (error) -@@ -383,3 +391,3 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, - if (xip == NULL) { -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - vap->va_mode = xattr_mode; -@@ -407,3 +415,3 @@ out: - if (vap) -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - -@@ -440,6 +448,2 @@ zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, - } else { -- /* Do not allow SA xattrs in symlinks (issue #1648) */ -- if (S_ISLNK(ip->i_mode)) -- return (-EMLINK); -- - /* Limited to 32k to keep nvpair memory allocations small */ -@@ -495,3 +499,8 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value, - -- if ((error == -ENODATA) && (flags & XATTR_REPLACE)) -+ if (flags & XATTR_REPLACE) -+ goto out; -+ -+ /* The xattr to be removed already doesn't exist */ -+ error = 0; -+ if (value == NULL) - goto out; -@@ -527,6 +536,6 @@ __zpl_xattr_user_get(struct inode *ip, const char *name, - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - - if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) -- return -EOPNOTSUPP; -+ return (-EOPNOTSUPP); - -@@ -548,6 +557,6 @@ __zpl_xattr_user_set(struct inode *ip, const char *name, - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - - if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) -- return -EOPNOTSUPP; -+ return (-EOPNOTSUPP); - -@@ -575,6 +584,6 @@ __zpl_xattr_trusted_get(struct inode *ip, const char *name, - if (!capable(CAP_SYS_ADMIN)) -- return -EACCES; -+ return (-EACCES); - - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - -@@ -596,6 +605,6 @@ __zpl_xattr_trusted_set(struct inode *ip, const char *name, - if (!capable(CAP_SYS_ADMIN)) -- return -EACCES; -+ return (-EACCES); - - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - -@@ -623,3 +632,3 @@ __zpl_xattr_security_get(struct inode *ip, const char *name, - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - -@@ -641,3 +650,3 @@ __zpl_xattr_security_set(struct inode *ip, const char *name, - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - -@@ -689,6 +698,7 @@ zpl_xattr_security_init(struct inode *ip, struct inode *dip, - error = zpl_security_inode_init_security(ip, dip, qstr, -- &name, &value, &len); -+ &name, &value, &len); - if (error) { - if (error == -EOPNOTSUPP) -- return 0; -+ return (0); -+ - return (error); -@@ -711,2 +721,471 @@ xattr_handler_t zpl_xattr_security_handler = { - -+#ifdef CONFIG_FS_POSIX_ACL -+ -+int -+zpl_set_acl(struct inode *ip, int type, struct posix_acl *acl) -+{ -+ struct super_block *sb = ITOZSB(ip)->z_sb; -+ char *name, *value = NULL; -+ int error = 0; -+ size_t size = 0; -+ -+ if (S_ISLNK(ip->i_mode)) -+ return (-EOPNOTSUPP); -+ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ name = POSIX_ACL_XATTR_ACCESS; -+ if (acl) { -+ zpl_equivmode_t mode = ip->i_mode; -+ error = posix_acl_equiv_mode(acl, &mode); -+ if (error < 0) { -+ return (error); -+ } else { -+ /* -+ * The mode bits will have been set by -+ * ->zfs_setattr()->zfs_acl_chmod_setattr() -+ * using the ZFS ACL conversion. If they -+ * differ from the Posix ACL conversion dirty -+ * the inode to write the Posix mode bits. 
-+ */ -+ if (ip->i_mode != mode) { -+ ip->i_mode = mode; -+ ip->i_ctime = current_fs_time(sb); -+ mark_inode_dirty(ip); -+ } -+ -+ if (error == 0) -+ acl = NULL; -+ } -+ } -+ break; -+ -+ case ACL_TYPE_DEFAULT: -+ name = POSIX_ACL_XATTR_DEFAULT; -+ if (!S_ISDIR(ip->i_mode)) -+ return (acl ? -EACCES : 0); -+ break; -+ -+ default: -+ return (-EINVAL); -+ } -+ -+ if (acl) { -+ size = posix_acl_xattr_size(acl->a_count); -+ value = kmem_alloc(size, KM_SLEEP); -+ -+ error = zpl_acl_to_xattr(acl, value, size); -+ if (error < 0) { -+ kmem_free(value, size); -+ return (error); -+ } -+ } -+ -+ error = zpl_xattr_set(ip, name, value, size, 0); -+ if (value) -+ kmem_free(value, size); -+ -+ if (!error) { -+ if (acl) -+ zpl_set_cached_acl(ip, type, acl); -+ else -+ zpl_forget_cached_acl(ip, type); -+ } -+ -+ return (error); -+} -+ -+struct posix_acl * -+zpl_get_acl(struct inode *ip, int type) -+{ -+ struct posix_acl *acl; -+ void *value = NULL; -+ char *name; -+ int size; -+ -+#ifdef HAVE_POSIX_ACL_CACHING -+ acl = get_cached_acl(ip, type); -+ if (acl != ACL_NOT_CACHED) -+ return (acl); -+#endif /* HAVE_POSIX_ACL_CACHING */ -+ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ name = POSIX_ACL_XATTR_ACCESS; -+ break; -+ case ACL_TYPE_DEFAULT: -+ name = POSIX_ACL_XATTR_DEFAULT; -+ break; -+ default: -+ return (ERR_PTR(-EINVAL)); -+ } -+ -+ size = zpl_xattr_get(ip, name, NULL, 0); -+ if (size > 0) { -+ value = kmem_alloc(size, KM_PUSHPAGE); -+ size = zpl_xattr_get(ip, name, value, size); -+ } -+ -+ if (size > 0) { -+ acl = zpl_acl_from_xattr(value, size); -+ } else if (size == -ENODATA || size == -ENOSYS) { -+ acl = NULL; -+ } else { -+ acl = ERR_PTR(-EIO); -+ } -+ -+ if (size > 0) -+ kmem_free(value, size); -+ -+ if (!IS_ERR(acl)) -+ zpl_set_cached_acl(ip, type, acl); -+ -+ return (acl); -+} -+ -+#if !defined(HAVE_GET_ACL) -+static int -+__zpl_check_acl(struct inode *ip, int mask) -+{ -+ struct posix_acl *acl; -+ int error; -+ -+ acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); -+ if (IS_ERR(acl)) -+ return (PTR_ERR(acl)); -+ -+ if (acl) { -+ error = posix_acl_permission(ip, acl, mask); -+ zpl_posix_acl_release(acl); -+ return (error); -+ } -+ -+ return (-EAGAIN); -+} -+ -+#if defined(HAVE_CHECK_ACL_WITH_FLAGS) -+int -+zpl_check_acl(struct inode *ip, int mask, unsigned int flags) -+{ -+ return (__zpl_check_acl(ip, mask)); -+} -+#elif defined(HAVE_CHECK_ACL) -+int -+zpl_check_acl(struct inode *ip, int mask) -+{ -+ return (__zpl_check_acl(ip, mask)); -+} -+#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA) -+int -+zpl_permission(struct inode *ip, int mask, struct nameidata *nd) -+{ -+ return (generic_permission(ip, mask, __zpl_check_acl)); -+} -+#elif defined(HAVE_PERMISSION) -+int -+zpl_permission(struct inode *ip, int mask) -+{ -+ return (generic_permission(ip, mask, __zpl_check_acl)); -+} -+#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */ -+#endif /* !HAVE_GET_ACL */ -+ -+int -+zpl_init_acl(struct inode *ip, struct inode *dir) -+{ -+ struct posix_acl *acl = NULL; -+ int error = 0; -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (0); -+ -+ if (!S_ISLNK(ip->i_mode)) { -+ if (ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) { -+ acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT); -+ if (IS_ERR(acl)) -+ return (PTR_ERR(acl)); -+ } -+ -+ if (!acl) { -+ ip->i_mode &= ~current_umask(); -+ ip->i_ctime = current_fs_time(ITOZSB(ip)->z_sb); -+ mark_inode_dirty(ip); -+ return (0); -+ } -+ } -+ -+ if ((ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) && acl) { -+ umode_t mode; -+ -+ if (S_ISDIR(ip->i_mode)) { -+ error = 
zpl_set_acl(ip, ACL_TYPE_DEFAULT, acl); -+ if (error) -+ goto out; -+ } -+ -+ mode = ip->i_mode; -+ error = __posix_acl_create(&acl, GFP_KERNEL, &mode); -+ if (error >= 0) { -+ ip->i_mode = mode; -+ mark_inode_dirty(ip); -+ if (error > 0) -+ error = zpl_set_acl(ip, ACL_TYPE_ACCESS, acl); -+ } -+ } -+out: -+ zpl_posix_acl_release(acl); -+ -+ return (error); -+} -+ -+int -+zpl_chmod_acl(struct inode *ip) -+{ -+ struct posix_acl *acl; -+ int error; -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (0); -+ -+ if (S_ISLNK(ip->i_mode)) -+ return (-EOPNOTSUPP); -+ -+ acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); -+ if (IS_ERR(acl) || !acl) -+ return (PTR_ERR(acl)); -+ -+ error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); -+ if (!error) -+ error = zpl_set_acl(ip, ACL_TYPE_ACCESS, acl); -+ -+ zpl_posix_acl_release(acl); -+ -+ return (error); -+} -+ -+static size_t -+zpl_xattr_acl_list(struct inode *ip, char *list, size_t list_size, -+ const char *name, size_t name_len, int type) -+{ -+ char *xattr_name; -+ size_t xattr_size; -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (0); -+ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ xattr_name = POSIX_ACL_XATTR_ACCESS; -+ xattr_size = sizeof (xattr_name); -+ break; -+ case ACL_TYPE_DEFAULT: -+ xattr_name = POSIX_ACL_XATTR_DEFAULT; -+ xattr_size = sizeof (xattr_name); -+ break; -+ default: -+ return (0); -+ } -+ -+ if (list && xattr_size <= list_size) -+ memcpy(list, xattr_name, xattr_size); -+ -+ return (xattr_size); -+} -+ -+#ifdef HAVE_DENTRY_XATTR_LIST -+static size_t -+zpl_xattr_acl_list_access(struct dentry *dentry, char *list, -+ size_t list_size, const char *name, size_t name_len, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_ACCESS); -+ return zpl_xattr_acl_list(dentry->d_inode, -+ list, list_size, name, name_len, type); -+} -+ -+static size_t -+zpl_xattr_acl_list_default(struct dentry *dentry, char *list, -+ size_t list_size, const char *name, size_t name_len, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_DEFAULT); -+ return zpl_xattr_acl_list(dentry->d_inode, -+ list, list_size, name, name_len, type); -+} -+ -+#else -+ -+static size_t -+zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, -+ const char *name, size_t name_len) -+{ -+ return zpl_xattr_acl_list(ip, -+ list, list_size, name, name_len, ACL_TYPE_ACCESS); -+} -+ -+static size_t -+zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, -+ const char *name, size_t name_len) -+{ -+ return zpl_xattr_acl_list(ip, -+ list, list_size, name, name_len, ACL_TYPE_DEFAULT); -+} -+#endif /* HAVE_DENTRY_XATTR_LIST */ -+ -+static int -+zpl_xattr_acl_get(struct inode *ip, const char *name, -+ void *buffer, size_t size, int type) -+{ -+ struct posix_acl *acl; -+ int error; -+ -+ if (strcmp(name, "") != 0) -+ return (-EINVAL); -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (-EOPNOTSUPP); -+ -+ acl = zpl_get_acl(ip, type); -+ if (IS_ERR(acl)) -+ return (PTR_ERR(acl)); -+ if (acl == NULL) -+ return (-ENODATA); -+ -+ error = zpl_acl_to_xattr(acl, buffer, size); -+ zpl_posix_acl_release(acl); -+ -+ return (error); -+} -+ -+#ifdef HAVE_DENTRY_XATTR_GET -+static int -+zpl_xattr_acl_get_access(struct dentry *dentry, const char *name, -+ void *buffer, size_t size, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_ACCESS); -+ return (zpl_xattr_acl_get(dentry->d_inode, name, buffer, size, type)); -+} -+ -+static int -+zpl_xattr_acl_get_default(struct dentry *dentry, const char *name, -+ void *buffer, size_t size, int type) 
-+{ -+ ASSERT3S(type, ==, ACL_TYPE_DEFAULT); -+ return (zpl_xattr_acl_get(dentry->d_inode, name, buffer, size, type)); -+} -+ -+#else -+ -+static int -+zpl_xattr_acl_get_access(struct inode *ip, const char *name, -+ void *buffer, size_t size) -+{ -+ return (zpl_xattr_acl_get(ip, name, buffer, size, ACL_TYPE_ACCESS)); -+} -+ -+static int -+zpl_xattr_acl_get_default(struct inode *ip, const char *name, -+ void *buffer, size_t size) -+{ -+ return (zpl_xattr_acl_get(ip, name, buffer, size, ACL_TYPE_DEFAULT)); -+} -+#endif /* HAVE_DENTRY_XATTR_GET */ -+ -+static int -+zpl_xattr_acl_set(struct inode *ip, const char *name, -+ const void *value, size_t size, int flags, int type) -+{ -+ struct posix_acl *acl; -+ int error = 0; -+ -+ if (strcmp(name, "") != 0) -+ return (-EINVAL); -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (-EOPNOTSUPP); -+ -+ if (!zpl_inode_owner_or_capable(ip)) -+ return (-EPERM); -+ -+ if (value) { -+ acl = zpl_acl_from_xattr(value, size); -+ if (IS_ERR(acl)) -+ return (PTR_ERR(acl)); -+ else if (acl) { -+ error = posix_acl_valid(acl); -+ if (error) { -+ zpl_posix_acl_release(acl); -+ return (error); -+ } -+ } -+ } else { -+ acl = NULL; -+ } -+ -+ error = zpl_set_acl(ip, type, acl); -+ zpl_posix_acl_release(acl); -+ -+ return (error); -+} -+ -+#ifdef HAVE_DENTRY_XATTR_SET -+static int -+zpl_xattr_acl_set_access(struct dentry *dentry, const char *name, -+ const void *value, size_t size, int flags, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_ACCESS); -+ return (zpl_xattr_acl_set(dentry->d_inode, -+ name, value, size, flags, type)); -+} -+ -+static int -+zpl_xattr_acl_set_default(struct dentry *dentry, const char *name, -+ const void *value, size_t size, int flags, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_DEFAULT); -+ return zpl_xattr_acl_set(dentry->d_inode, -+ name, value, size, flags, type); -+} -+ -+#else -+ -+static int -+zpl_xattr_acl_set_access(struct inode *ip, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ return zpl_xattr_acl_set(ip, -+ name, value, size, flags, ACL_TYPE_ACCESS); -+} -+ -+static int -+zpl_xattr_acl_set_default(struct inode *ip, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ return zpl_xattr_acl_set(ip, -+ name, value, size, flags, ACL_TYPE_DEFAULT); -+} -+#endif /* HAVE_DENTRY_XATTR_SET */ -+ -+struct xattr_handler zpl_xattr_acl_access_handler = -+{ -+ .prefix = POSIX_ACL_XATTR_ACCESS, -+ .list = zpl_xattr_acl_list_access, -+ .get = zpl_xattr_acl_get_access, -+ .set = zpl_xattr_acl_set_access, -+#ifdef HAVE_DENTRY_XATTR_LIST -+ .flags = ACL_TYPE_ACCESS, -+#endif /* HAVE_DENTRY_XATTR_LIST */ -+}; -+ -+struct xattr_handler zpl_xattr_acl_default_handler = -+{ -+ .prefix = POSIX_ACL_XATTR_DEFAULT, -+ .list = zpl_xattr_acl_list_default, -+ .get = zpl_xattr_acl_get_default, -+ .set = zpl_xattr_acl_set_default, -+#ifdef HAVE_DENTRY_XATTR_LIST -+ .flags = ACL_TYPE_DEFAULT, -+#endif /* HAVE_DENTRY_XATTR_LIST */ -+}; -+ -+#endif /* CONFIG_FS_POSIX_ACL */ -+ - xattr_handler_t *zpl_xattr_handlers[] = { -@@ -715,6 +1194,6 @@ xattr_handler_t *zpl_xattr_handlers[] = { - &zpl_xattr_user_handler, --#ifdef HAVE_POSIX_ACLS -+#ifdef CONFIG_FS_POSIX_ACL - &zpl_xattr_acl_access_handler, - &zpl_xattr_acl_default_handler, --#endif /* HAVE_POSIX_ACLS */ -+#endif /* CONFIG_FS_POSIX_ACL */ - NULL -diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c -index b516156..fa5c7eb 100644 ---- a/module/zfs/zvol.c -+++ b/module/zfs/zvol.c -@@ -37,2 +37,3 @@ - -+#include - #include -@@ -63,4 +64,4 @@ typedef struct 
zvol_state { - char zv_name[MAXNAMELEN]; /* name */ -- uint64_t zv_volsize; /* advertised space */ -- uint64_t zv_volblocksize;/* volume block size */ -+ uint64_t zv_volsize; /* advertised space */ -+ uint64_t zv_volblocksize; /* volume block size */ - objset_t *zv_objset; /* objset handle */ -@@ -95,3 +96,3 @@ zvol_find_minor(unsigned *minor) - for (zv = list_head(&zvol_state_list); zv != NULL; -- zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) { -+ zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) { - if (MINOR(zv->zv_dev) != MINOR(*minor)) -@@ -102,5 +103,5 @@ zvol_find_minor(unsigned *minor) - if (*minor >= (1 << MINORBITS)) -- return ENXIO; -+ return (SET_ERROR(ENXIO)); - -- return 0; -+ return (0); - } -@@ -117,8 +118,8 @@ zvol_find_by_dev(dev_t dev) - for (zv = list_head(&zvol_state_list); zv != NULL; -- zv = list_next(&zvol_state_list, zv)) { -+ zv = list_next(&zvol_state_list, zv)) { - if (zv->zv_dev == dev) -- return zv; -+ return (zv); - } - -- return NULL; -+ return (NULL); - } -@@ -135,8 +136,8 @@ zvol_find_by_name(const char *name) - for (zv = list_head(&zvol_state_list); zv != NULL; -- zv = list_next(&zvol_state_list, zv)) { -- if (!strncmp(zv->zv_name, name, MAXNAMELEN)) -- return zv; -+ zv = list_next(&zvol_state_list, zv)) { -+ if (strncmp(zv->zv_name, name, MAXNAMELEN) == 0) -+ return (zv); - } - -- return NULL; -+ return (NULL); - } -@@ -161,3 +162,3 @@ zvol_is_zvol(const char *device) - if (major == zvol_major) -- return (B_TRUE); -+ return (B_TRUE); - -@@ -216,6 +217,6 @@ zvol_get_stats(objset_t *os, nvlist_t *nv) - if (error) -- return (error); -+ return (SET_ERROR(error)); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); -- doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); -+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); - error = dmu_object_info(os, ZVOL_OBJ, doi); -@@ -227,5 +228,30 @@ zvol_get_stats(objset_t *os, nvlist_t *nv) - -- kmem_free(doi, sizeof(dmu_object_info_t)); -+ kmem_free(doi, sizeof (dmu_object_info_t)); - -- return (error); -+ return (SET_ERROR(error)); -+} -+ -+static void -+zvol_size_changed(zvol_state_t *zv, uint64_t volsize) -+{ -+ struct block_device *bdev; -+ -+ bdev = bdget_disk(zv->zv_disk, 0); -+ if (bdev == NULL) -+ return; -+/* -+ * 2.6.28 API change -+ * Added check_disk_size_change() helper function. 
-+ */ -+#ifdef HAVE_CHECK_DISK_SIZE_CHANGE -+ set_capacity(zv->zv_disk, volsize >> 9); -+ zv->zv_volsize = volsize; -+ check_disk_size_change(zv->zv_disk, bdev); -+#else -+ zv->zv_volsize = volsize; -+ zv->zv_changed = 1; -+ (void) check_disk_change(bdev); -+#endif /* HAVE_CHECK_DISK_SIZE_CHANGE */ -+ -+ bdput(bdev); - } -@@ -239,6 +265,6 @@ zvol_check_volsize(uint64_t volsize, uint64_t blocksize) - if (volsize == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (volsize % blocksize != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -246,3 +272,3 @@ zvol_check_volsize(uint64_t volsize, uint64_t blocksize) - if (volsize - 1 > MAXOFFSET_T) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - #endif -@@ -255,5 +281,4 @@ zvol_check_volsize(uint64_t volsize, uint64_t blocksize) - static int --zvol_update_volsize(zvol_state_t *zv, uint64_t volsize, objset_t *os) -+zvol_update_volsize(uint64_t volsize, objset_t *os) - { -- struct block_device *bdev; - dmu_tx_t *tx; -@@ -268,3 +293,3 @@ zvol_update_volsize(zvol_state_t *zv, uint64_t volsize, objset_t *os) - dmu_tx_abort(tx); -- return (error); -+ return (SET_ERROR(error)); - } -@@ -275,28 +300,19 @@ zvol_update_volsize(zvol_state_t *zv, uint64_t volsize, objset_t *os) - -- if (error) -- return (error); -+ if (error == 0) -+ error = dmu_free_long_range(os, -+ ZVOL_OBJ, volsize, DMU_OBJECT_END); - -- error = dmu_free_long_range(os, -- ZVOL_OBJ, volsize, DMU_OBJECT_END); -- if (error) -- return (error); -+ return (error); -+} - -- bdev = bdget_disk(zv->zv_disk, 0); -- if (!bdev) -- return (EIO); --/* -- * 2.6.28 API change -- * Added check_disk_size_change() helper function. -- */ --#ifdef HAVE_CHECK_DISK_SIZE_CHANGE -- set_capacity(zv->zv_disk, volsize >> 9); -- zv->zv_volsize = volsize; -- check_disk_size_change(zv->zv_disk, bdev); --#else -- zv->zv_volsize = volsize; -- zv->zv_changed = 1; -- (void) check_disk_change(bdev); --#endif /* HAVE_CHECK_DISK_SIZE_CHANGE */ -+static int -+zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize) -+{ -+ zvol_size_changed(zv, volsize); - -- bdput(bdev); -+ /* -+ * We should post a event here describing the expansion. However, -+ * the zfs_ereport_post() interface doesn't nicely support posting -+ * events for zvols, it assumes events relate to vdevs or zios. 
-+ */ - -@@ -311,46 +327,50 @@ zvol_set_volsize(const char *name, uint64_t volsize) - { -- zvol_state_t *zv; -- dmu_object_info_t *doi; -+ zvol_state_t *zv = NULL; - objset_t *os = NULL; -- uint64_t readonly; - int error; -+ dmu_object_info_t *doi; -+ uint64_t readonly; -+ boolean_t owned = B_FALSE; - -- mutex_enter(&zvol_state_lock); -+ error = dsl_prop_get_integer(name, -+ zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); -+ if (error != 0) -+ return (SET_ERROR(error)); -+ if (readonly) -+ return (SET_ERROR(EROFS)); - -+ mutex_enter(&zvol_state_lock); - zv = zvol_find_by_name(name); -- if (zv == NULL) { -- error = ENXIO; -- goto out; -- } -- -- doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); - -- error = dmu_objset_hold(name, FTAG, &os); -- if (error) -- goto out_doi; -+ if (zv == NULL || zv->zv_objset == NULL) { -+ if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, -+ FTAG, &os)) != 0) { -+ mutex_exit(&zvol_state_lock); -+ return (SET_ERROR(error)); -+ } -+ owned = B_TRUE; -+ if (zv != NULL) -+ zv->zv_objset = os; -+ } else { -+ os = zv->zv_objset; -+ } - -- if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) != 0 || -- (error = zvol_check_volsize(volsize,doi->doi_data_block_size)) != 0) -- goto out_doi; -+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); - -- VERIFY(dsl_prop_get_integer(name, "readonly", &readonly, NULL) == 0); -- if (readonly) { -- error = EROFS; -- goto out_doi; -- } -+ if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || -+ (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) -+ goto out; - -- if (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY)) { -- error = EROFS; -- goto out_doi; -- } -+ error = zvol_update_volsize(volsize, os); -+ kmem_free(doi, sizeof (dmu_object_info_t)); - -- error = zvol_update_volsize(zv, volsize, os); --out_doi: -- kmem_free(doi, sizeof(dmu_object_info_t)); -+ if (error == 0 && zv != NULL) -+ error = zvol_update_live_volsize(zv, volsize); - out: -- if (os) -- dmu_objset_rele(os, FTAG); -- -+ if (owned) { -+ dmu_objset_disown(os, FTAG); -+ if (zv != NULL) -+ zv->zv_objset = NULL; -+ } - mutex_exit(&zvol_state_lock); -- - return (error); -@@ -367,3 +387,3 @@ zvol_check_volblocksize(uint64_t volblocksize) - !ISP2(volblocksize)) -- return (EDOM); -+ return (SET_ERROR(EDOM)); - -@@ -386,3 +406,3 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) - if (zv == NULL) { -- error = ENXIO; -+ error = SET_ERROR(ENXIO); - goto out; -@@ -390,4 +410,4 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) - -- if (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY)) { -- error = EROFS; -+ if (zv->zv_flags & ZVOL_RDONLY) { -+ error = SET_ERROR(EROFS); - goto out; -@@ -404,3 +424,3 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) - if (error == ENOTSUP) -- error = EBUSY; -+ error = SET_ERROR(EBUSY); - dmu_tx_commit(tx); -@@ -412,3 +432,3 @@ out: - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -442,3 +462,3 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -448,3 +468,3 @@ zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) - { -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -480,4 +500,4 @@ ssize_t zvol_immediate_write_sz = 32768; - static void --zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, -- uint64_t offset, uint64_t size, int sync) -+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, -+ uint64_t size, int sync) - { -@@ -656,3 +676,3 
@@ zvol_discard(void *arg) - -- error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start); -+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end-start); - -@@ -700,3 +720,3 @@ zvol_read(void *arg) - if (error == ECKSUM) -- error = EIO; -+ error = SET_ERROR(EIO); - -@@ -744,6 +764,6 @@ zvol_request(struct request_queue *q) - printk(KERN_INFO -- "%s: bad access: block=%llu, count=%lu\n", -- req->rq_disk->disk_name, -- (long long unsigned)blk_rq_pos(req), -- (long unsigned)blk_rq_sectors(req)); -+ "%s: bad access: block=%llu, count=%lu\n", -+ req->rq_disk->disk_name, -+ (long long unsigned)blk_rq_pos(req), -+ (long unsigned)blk_rq_sectors(req)); - __blk_end_request(req, -EIO, size); -@@ -754,3 +774,3 @@ zvol_request(struct request_queue *q) - printk(KERN_INFO "%s: non-fs cmd\n", -- req->rq_disk->disk_name); -+ req->rq_disk->disk_name); - __blk_end_request(req, -EIO, size); -@@ -764,4 +784,3 @@ zvol_request(struct request_queue *q) - case WRITE: -- if (unlikely(get_disk_ro(zv->zv_disk)) || -- unlikely(zv->zv_flags & ZVOL_RDONLY)) { -+ if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { - __blk_end_request(req, -EROFS, size); -@@ -781,3 +800,3 @@ zvol_request(struct request_queue *q) - printk(KERN_INFO "%s: unknown cmd: %d\n", -- req->rq_disk->disk_name, (int)rq_data_dir(req)); -+ req->rq_disk->disk_name, (int)rq_data_dir(req)); - __blk_end_request(req, -EIO, size); -@@ -810,4 +829,6 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - objset_t *os = zv->zv_objset; -+ uint64_t object = ZVOL_OBJ; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; -+ blkptr_t *bp = &lr->lr_blkptr; - dmu_buf_t *db; -@@ -831,3 +852,3 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (buf != NULL) { /* immediate write */ -- error = dmu_read(os, ZVOL_OBJ, offset, size, buf, -+ error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); -@@ -836,5 +857,11 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - offset = P2ALIGN_TYPED(offset, size, uint64_t); -- error = dmu_buf_hold(os, ZVOL_OBJ, offset, zgd, &db, -+ error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - if (error == 0) { -+ blkptr_t *obp = dmu_buf_get_blkptr(db); -+ if (obp) { -+ ASSERT(BP_IS_HOLE(bp)); -+ *bp = *obp; -+ } -+ - zgd->zgd_db = db; -@@ -856,3 +883,3 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -870,3 +897,3 @@ zvol_insert(zvol_state_t *zv_insert) - for (zv = list_head(&zvol_state_list); zv != NULL; -- zv = list_next(&zvol_state_list, zv)) { -+ zv = list_next(&zvol_state_list, zv)) { - if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev)) -@@ -917,3 +944,3 @@ zvol_first_open(zvol_state_t *zv) - if (!locked) -- return (-ERESTARTSYS); -+ return (-SET_ERROR(ERESTARTSYS)); - } -@@ -956,3 +983,3 @@ out_mutex: - -- return (-error); -+ return (SET_ERROR(-error)); - } -@@ -1005,4 +1032,3 @@ zvol_open(struct block_device *bdev, fmode_t flag) - -- if ((flag & FMODE_WRITE) && -- (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY))) { -+ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - error = -EROFS; -@@ -1023,3 +1049,3 @@ out_mutex: - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1057,3 +1083,3 @@ static int - zvol_ioctl(struct block_device *bdev, fmode_t mode, -- unsigned int cmd, unsigned long arg) -+ unsigned int cmd, unsigned long arg) - { -@@ -1063,3 +1089,3 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, - if 
(zv == NULL) -- return (-ENXIO); -+ return (SET_ERROR(-ENXIO)); - -@@ -1079,3 +1105,3 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1085,8 +1111,8 @@ static int - zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, -- unsigned cmd, unsigned long arg) -+ unsigned cmd, unsigned long arg) - { -- return zvol_ioctl(bdev, mode, cmd, arg); -+ return (zvol_ioctl(bdev, mode, cmd, arg)); - } - #else --#define zvol_compat_ioctl NULL -+#define zvol_compat_ioctl NULL - #endif -@@ -1097,3 +1123,3 @@ static int zvol_media_changed(struct gendisk *disk) - -- return zv->zv_changed; -+ return (zv->zv_changed); - } -@@ -1107,3 +1133,3 @@ static int zvol_revalidate_disk(struct gendisk *disk) - -- return 0; -+ return (0); - } -@@ -1133,3 +1159,3 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) - -- return 0; -+ return (0); - } -@@ -1147,3 +1173,3 @@ zvol_probe(dev_t dev, int *part, void *arg) - -- return kobj; -+ return (kobj); - } -@@ -1152,10 +1178,10 @@ zvol_probe(dev_t dev, int *part, void *arg) - static struct block_device_operations zvol_ops = { -- .open = zvol_open, -- .release = zvol_release, -- .ioctl = zvol_ioctl, -- .compat_ioctl = zvol_compat_ioctl, -- .media_changed = zvol_media_changed, -- .revalidate_disk = zvol_revalidate_disk, -- .getgeo = zvol_getgeo, -- .owner = THIS_MODULE, -+ .open = zvol_open, -+ .release = zvol_release, -+ .ioctl = zvol_ioctl, -+ .compat_ioctl = zvol_compat_ioctl, -+ .media_changed = zvol_media_changed, -+ .revalidate_disk = zvol_revalidate_disk, -+ .getgeo = zvol_getgeo, -+ .owner = THIS_MODULE, - }; -@@ -1167,3 +1193,3 @@ zvol_open_by_inode(struct inode *inode, struct file *file) - { -- return zvol_open(inode->i_bdev, file->f_mode); -+ return (zvol_open(inode->i_bdev, file->f_mode)); - } -@@ -1173,3 +1199,3 @@ zvol_release_by_inode(struct inode *inode, struct file *file) - { -- return zvol_release(inode->i_bdev->bd_disk, file->f_mode); -+ return (zvol_release(inode->i_bdev->bd_disk, file->f_mode)); - } -@@ -1178,32 +1204,34 @@ static int - zvol_ioctl_by_inode(struct inode *inode, struct file *file, -- unsigned int cmd, unsigned long arg) -+ unsigned int cmd, unsigned long arg) - { - if (file == NULL || inode == NULL) -- return -EINVAL; -- return zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg); -+ return (SET_ERROR(-EINVAL)); -+ -+ return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg)); - } - --# ifdef CONFIG_COMPAT -+#ifdef CONFIG_COMPAT - static long - zvol_compat_ioctl_by_inode(struct file *file, -- unsigned int cmd, unsigned long arg) -+ unsigned int cmd, unsigned long arg) - { - if (file == NULL) -- return -EINVAL; -- return zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev, -- file->f_mode, cmd, arg); -+ return (SET_ERROR(-EINVAL)); -+ -+ return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev, -+ file->f_mode, cmd, arg)); - } --# else --# define zvol_compat_ioctl_by_inode NULL --# endif -+#else -+#define zvol_compat_ioctl_by_inode NULL -+#endif - - static struct block_device_operations zvol_ops = { -- .open = zvol_open_by_inode, -- .release = zvol_release_by_inode, -- .ioctl = zvol_ioctl_by_inode, -- .compat_ioctl = zvol_compat_ioctl_by_inode, -- .media_changed = zvol_media_changed, -- .revalidate_disk = zvol_revalidate_disk, -- .getgeo = zvol_getgeo, -- .owner = THIS_MODULE, -+ .open = zvol_open_by_inode, -+ .release = zvol_release_by_inode, -+ .ioctl = zvol_ioctl_by_inode, -+ .compat_ioctl = zvol_compat_ioctl_by_inode, -+ .media_changed = zvol_media_changed, -+ 
.revalidate_disk = zvol_revalidate_disk, -+ .getgeo = zvol_getgeo, -+ .owner = THIS_MODULE, - }; -@@ -1221,3 +1249,3 @@ zvol_alloc(dev_t dev, const char *name) - -- zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); -+ zv = kmem_zalloc(sizeof (zvol_state_t), KM_PUSHPAGE); - -@@ -1267,3 +1295,3 @@ zvol_alloc(dev_t dev, const char *name) - -- return zv; -+ return (zv); - -@@ -1274,3 +1302,3 @@ out_kmem: - -- return NULL; -+ return (NULL); - } -@@ -1296,18 +1324,20 @@ __zvol_snapdev_hidden(const char *name) - { -- uint64_t snapdev; -- char *parent; -- char *atp; -- int error = 0; -- -- parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); -- (void) strlcpy(parent, name, MAXPATHLEN); -- -- if ((atp = strrchr(parent, '@')) != NULL) { -- *atp = '\0'; -- error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL); -- if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN)) -- error = ENODEV; -- } -- kmem_free(parent, MAXPATHLEN); -- return (error); -+ uint64_t snapdev; -+ char *parent; -+ char *atp; -+ int error = 0; -+ -+ parent = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); -+ (void) strlcpy(parent, name, MAXPATHLEN); -+ -+ if ((atp = strrchr(parent, '@')) != NULL) { -+ *atp = '\0'; -+ error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL); -+ if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN)) -+ error = SET_ERROR(ENODEV); -+ } -+ -+ kmem_free(parent, MAXPATHLEN); -+ -+ return (SET_ERROR(error)); - } -@@ -1328,3 +1358,3 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) - if (zv) { -- error = EEXIST; -+ error = SET_ERROR(EEXIST); - goto out; -@@ -1338,3 +1368,3 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) - -- doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); -+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_PUSHPAGE); - -@@ -1358,3 +1388,3 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) - if (zv == NULL) { -- error = EAGAIN; -+ error = SET_ERROR(EAGAIN); - goto out_dmu_objset_disown; -@@ -1397,3 +1427,3 @@ out_dmu_objset_disown: - out_doi: -- kmem_free(doi, sizeof(dmu_object_info_t)); -+ kmem_free(doi, sizeof (dmu_object_info_t)); - out: -@@ -1405,3 +1435,3 @@ out: - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1422,3 +1452,3 @@ zvol_create_minor(const char *name) - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1434,6 +1464,6 @@ __zvol_remove_minor(const char *name) - if (zv == NULL) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - - if (zv->zv_open_count > 0) -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - -@@ -1457,3 +1487,27 @@ zvol_remove_minor(const char *name) - -- return (error); -+ return (SET_ERROR(error)); -+} -+ -+/* -+ * Rename a block device minor mode for the specified volume. -+ */ -+static void -+__zvol_rename_minor(zvol_state_t *zv, const char *newname) -+{ -+ int readonly = get_disk_ro(zv->zv_disk); -+ -+ ASSERT(MUTEX_HELD(&zvol_state_lock)); -+ -+ strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); -+ -+ /* -+ * The block device's read-only state is briefly changed causing -+ * a KOBJ_CHANGE uevent to be issued. This ensures udev detects -+ * the name change and fixes the symlinks. This does not change -+ * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never -+ * changes. This would normally be done using kobject_uevent() but -+ * that is a GPL-only symbol which is why we need this workaround. 
-+ */ -+ set_disk_ro(zv->zv_disk, !readonly); -+ set_disk_ro(zv->zv_disk, readonly); - } -@@ -1461,9 +1515,6 @@ zvol_remove_minor(const char *name) - static int --zvol_create_minors_cb(spa_t *spa, uint64_t dsobj, -- const char *dsname, void *arg) -+zvol_create_minors_cb(const char *dsname, void *arg) - { -- if (strchr(dsname, '/') == NULL) -- return 0; -+ (void) zvol_create_minor(dsname); - -- (void) __zvol_create_minor(dsname, B_FALSE); - return (0); -@@ -1472,32 +1523,42 @@ zvol_create_minors_cb(spa_t *spa, uint64_t dsobj, - /* -- * Create minors for specified pool, if pool is NULL create minors -- * for all available pools. -+ * Create minors for specified dataset including children and snapshots. - */ - int --zvol_create_minors(const char *pool) -+zvol_create_minors(const char *name) - { -- spa_t *spa = NULL; - int error = 0; - -+ if (!zvol_inhibit_dev) -+ error = dmu_objset_find((char *)name, zvol_create_minors_cb, -+ NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -+ -+ return (SET_ERROR(error)); -+} -+ -+/* -+ * Remove minors for specified dataset including children and snapshots. -+ */ -+void -+zvol_remove_minors(const char *name) -+{ -+ zvol_state_t *zv, *zv_next; -+ int namelen = ((name) ? strlen(name) : 0); -+ - if (zvol_inhibit_dev) -- return (0); -+ return; - - mutex_enter(&zvol_state_lock); -- if (pool) { -- error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb, -- NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -- } else { -- mutex_enter(&spa_namespace_lock); -- while ((spa = spa_next(spa)) != NULL) { -- error = dmu_objset_find_spa(NULL, -- spa_name(spa), zvol_create_minors_cb, NULL, -- DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -- if (error) -- break; -+ -+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { -+ zv_next = list_next(&zvol_state_list, zv); -+ -+ if (name == NULL || strcmp(zv->zv_name, name) == 0 || -+ (strncmp(zv->zv_name, name, namelen) == 0 && -+ zv->zv_name[namelen] == '/')) { -+ zvol_remove(zv); -+ zvol_free(zv); - } -- mutex_exit(&spa_namespace_lock); - } -- mutex_exit(&zvol_state_lock); - -- return error; -+ mutex_exit(&zvol_state_lock); - } -@@ -1505,9 +1566,10 @@ zvol_create_minors(const char *pool) - /* -- * Remove minors for specified pool, if pool is NULL remove all minors. -+ * Rename minors for specified dataset including children and snapshots. 
- */ - void --zvol_remove_minors(const char *pool) -+zvol_rename_minors(const char *oldname, const char *newname) - { - zvol_state_t *zv, *zv_next; -- char *str; -+ int oldnamelen, newnamelen; -+ char *name; - -@@ -1516,9 +1578,8 @@ zvol_remove_minors(const char *pool) - -- str = kmem_zalloc(MAXNAMELEN, KM_SLEEP); -- if (pool) { -- (void) strncpy(str, pool, strlen(pool)); -- (void) strcat(str, "/"); -- } -+ oldnamelen = strlen(oldname); -+ newnamelen = strlen(newname); -+ name = kmem_alloc(MAXNAMELEN, KM_PUSHPAGE); - - mutex_enter(&zvol_state_lock); -+ - for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { -@@ -1526,9 +1587,17 @@ zvol_remove_minors(const char *pool) - -- if (pool == NULL || !strncmp(str, zv->zv_name, strlen(str))) { -- zvol_remove(zv); -- zvol_free(zv); -+ if (strcmp(zv->zv_name, oldname) == 0) { -+ __zvol_rename_minor(zv, newname); -+ } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && -+ (zv->zv_name[oldnamelen] == '/' || -+ zv->zv_name[oldnamelen] == '@')) { -+ snprintf(name, MAXNAMELEN, "%s%c%s", newname, -+ zv->zv_name[oldnamelen], -+ zv->zv_name + oldnamelen + 1); -+ __zvol_rename_minor(zv, name); - } - } -+ - mutex_exit(&zvol_state_lock); -- kmem_free(str, MAXNAMELEN); -+ -+ kmem_free(name, MAXNAMELEN); - } -@@ -1540,3 +1609,3 @@ snapdev_snapshot_changed_cb(const char *dsname, void *arg) { - if (strchr(dsname, '@') == NULL) -- return 0; -+ return (0); - -@@ -1552,3 +1621,4 @@ snapdev_snapshot_changed_cb(const char *dsname, void *arg) { - } -- return 0; -+ -+ return (0); - } -@@ -1563,3 +1633,2 @@ zvol_set_snapdev(const char *dsname, uint64_t snapdev) { - -- - int -@@ -1570,3 +1639,4 @@ zvol_init(void) - list_create(&zvol_state_list, sizeof (zvol_state_t), -- offsetof(zvol_state_t, zv_next)); -+ offsetof(zvol_state_t, zv_next)); -+ - mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); -@@ -1574,3 +1644,3 @@ zvol_init(void) - zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri, -- zvol_threads, INT_MAX, TASKQ_PREPOPULATE); -+ zvol_threads, INT_MAX, TASKQ_PREPOPULATE); - if (zvol_taskq == NULL) { -@@ -1588,3 +1658,3 @@ zvol_init(void) - blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, -- THIS_MODULE, zvol_probe, NULL, NULL); -+ THIS_MODULE, zvol_probe, NULL, NULL); - -@@ -1598,3 +1668,3 @@ out1: - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1622,2 +1692,2 @@ MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device"); - module_param(zvol_max_discard_blocks, ulong, 0444); --MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard at once"); -+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); -diff --git a/module/zpios/pios.c b/module/zpios/pios.c -index 53cc77b..f0bad6c 100644 ---- a/module/zpios/pios.c -+++ b/module/zpios/pios.c -@@ -1,2 +1,2 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. -@@ -31,3 +31,3 @@ - * with ZPIOS. If not, see . 
--\*****************************************************************************/ -+ */ - -@@ -36,2 +36,3 @@ - #include -+#include - #include -@@ -44,15 +45,16 @@ static char *zpios_tag = "zpios_tag"; - --static --int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) -+static int -+zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) - { -- /* This is stack heavy but it should be OK since we are only -+ /* -+ * This is stack heavy but it should be OK since we are only - * making the upcall between tests when the stack is shallow. - */ -- char id[16], chunk_size[16], region_size[16], thread_count[16]; -+ char id[16], chunk_size[16], region_size[16], thread_count[16]; - char region_count[16], offset[16], region_noise[16], chunk_noise[16]; -- char thread_delay[16], flags[16], result[8]; -- char *argv[16], *envp[4]; -+ char thread_delay[16], flags[16], result[8]; -+ char *argv[16], *envp[4]; - - if ((path == NULL) || (strlen(path) == 0)) -- return -ENOENT; -+ return (-ENOENT); - -@@ -60,3 +62,3 @@ int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) - snprintf(chunk_size, 15, "%lu", (long unsigned)run_args->chunk_size); -- snprintf(region_size, 15, "%lu",(long unsigned) run_args->region_size); -+ snprintf(region_size, 15, "%lu", (long unsigned) run_args->region_size); - snprintf(thread_count, 15, "%u", run_args->thread_count); -@@ -71,3 +73,3 @@ int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) - /* Passing 15 args to registered pre/post upcall */ -- argv[0] = path; -+ argv[0] = path; - argv[1] = phase; -@@ -89,8 +91,37 @@ int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) - /* Passing environment for user space upcall */ -- envp[0] = "HOME=/"; -- envp[1] = "TERM=linux"; -- envp[2] = "PATH=/sbin:/usr/sbin:/bin:/usr/bin"; -- envp[3] = NULL; -+ envp[0] = "HOME=/"; -+ envp[1] = "TERM=linux"; -+ envp[2] = "PATH=/sbin:/usr/sbin:/bin:/usr/bin"; -+ envp[3] = NULL; - -- return call_usermodehelper(path, argv, envp, UMH_WAIT_PROC); -+ return (call_usermodehelper(path, argv, envp, UMH_WAIT_PROC)); -+} -+ -+static int -+zpios_print(struct file *file, const char *format, ...) 
-+{ -+ zpios_info_t *info = (zpios_info_t *)file->private_data; -+ va_list adx; -+ int rc; -+ -+ ASSERT(info); -+ ASSERT(info->info_buffer); -+ -+ va_start(adx, format); -+ spin_lock(&info->info_lock); -+ -+ /* Don't allow the kernel to start a write in the red zone */ -+ if ((int)(info->info_head - info->info_buffer) > -+ (info->info_size - ZPIOS_INFO_BUFFER_REDZONE)) { -+ rc = -EOVERFLOW; -+ } else { -+ rc = vsprintf(info->info_head, format, adx); -+ if (rc >= 0) -+ info->info_head += rc; -+ } -+ -+ spin_unlock(&info->info_lock); -+ va_end(adx); -+ -+ return (rc); - } -@@ -101,3 +132,3 @@ zpios_dmu_object_create(run_args_t *run_args, objset_t *os) - struct dmu_tx *tx; -- uint64_t obj = 0ULL; -+ uint64_t obj = 0ULL; - int rc; -@@ -109,9 +140,8 @@ zpios_dmu_object_create(run_args_t *run_args, objset_t *os) - zpios_print(run_args->file, -- "dmu_tx_assign() failed: %d\n", rc); -+ "dmu_tx_assign() failed: %d\n", rc); - dmu_tx_abort(tx); -- return obj; -+ return (obj); - } - -- obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, -- DMU_OT_NONE, 0, tx); -+ obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, DMU_OT_NONE, 0, tx); - rc = dmu_object_set_blocksize(os, obj, 128ULL << 10, 0, tx); -@@ -120,4 +150,4 @@ zpios_dmu_object_create(run_args_t *run_args, objset_t *os) - "dmu_object_set_blocksize() failed: %d\n", rc); -- dmu_tx_abort(tx); -- return obj; -+ dmu_tx_abort(tx); -+ return (obj); - } -@@ -126,3 +156,3 @@ zpios_dmu_object_create(run_args_t *run_args, objset_t *os) - -- return obj; -+ return (obj); - } -@@ -136,3 +166,3 @@ zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) - tx = dmu_tx_create(os); -- dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); -+ dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); - rc = dmu_tx_assign(tx, TXG_WAIT); -@@ -142,3 +172,3 @@ zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) - dmu_tx_abort(tx); -- return rc; -+ return (rc); - } -@@ -149,4 +179,4 @@ zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) - "dmu_object_free() failed: %d\n", rc); -- dmu_tx_abort(tx); -- return rc; -+ dmu_tx_abort(tx); -+ return (rc); - } -@@ -155,3 +185,3 @@ zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) - -- return 0; -+ return (0); - } -@@ -167,6 +197,6 @@ zpios_dmu_setup(run_args_t *run_args) - -- (void)zpios_upcall(run_args->pre, PHASE_PRE_CREATE, run_args, 0); -+ (void) zpios_upcall(run_args->pre, PHASE_PRE_CREATE, run_args, 0); - t->start = zpios_timespec_now(); - -- (void)snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); -+ (void) snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); - rc = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL); -@@ -178,4 +208,4 @@ zpios_dmu_setup(run_args_t *run_args) - -- rc = dmu_objset_own(name, DMU_OST_OTHER, 0, zpios_tag, &os); -- if (rc) { -+ rc = dmu_objset_own(name, DMU_OST_OTHER, 0, zpios_tag, &os); -+ if (rc) { - zpios_print(run_args->file, "Error dmu_objset_own(%s, ...) 
" -@@ -183,3 +213,3 @@ zpios_dmu_setup(run_args_t *run_args) - goto out_destroy; -- } -+ } - -@@ -199,3 +229,3 @@ zpios_dmu_setup(run_args_t *run_args) - region = &run_args->regions[i]; -- mutex_init(®ion->lock, NULL, MUTEX_DEFAULT, NULL); -+ mutex_init(®ion->lock, NULL, MUTEX_DEFAULT, NULL); - -@@ -210,3 +240,3 @@ zpios_dmu_setup(run_args_t *run_args) - region->max_offset = run_args->offset + -- run_args->region_size; -+ run_args->region_size; - } else { -@@ -219,3 +249,3 @@ zpios_dmu_setup(run_args_t *run_args) - region->max_offset = run_args->offset * -- i + run_args->region_size; -+ i + run_args->region_size; - } -@@ -226,5 +256,5 @@ out_destroy: - if (rc) { -- rc2 = dmu_objset_destroy(name, B_FALSE); -+ rc2 = dsl_destroy_head(name); - if (rc2) -- zpios_print(run_args->file, "Error dmu_objset_destroy" -+ zpios_print(run_args->file, "Error dsl_destroy_head" - "(%s, ...) failed: %d\n", name, rc2); -@@ -234,5 +264,5 @@ out: - t->delta = zpios_timespec_sub(t->stop, t->start); -- (void)zpios_upcall(run_args->post, PHASE_POST_CREATE, run_args, rc); -+ (void) zpios_upcall(run_args->post, PHASE_POST_CREATE, run_args, rc); - -- return rc; -+ return (rc); - } -@@ -245,3 +275,3 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) - -- size = sizeof(*ra) + kcmd->cmd_region_count * sizeof(zpios_region_t); -+ size = sizeof (*ra) + kcmd->cmd_region_count * sizeof (zpios_region_t); - -@@ -251,3 +281,3 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) - "for regions\n", size); -- return -ENOMEM; -+ return (-ENOMEM); - } -@@ -259,22 +289,22 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) - strncpy(ra->log, kcmd->cmd_log, ZPIOS_PATH_SIZE - 1); -- ra->id = kcmd->cmd_id; -- ra->chunk_size = kcmd->cmd_chunk_size; -- ra->thread_count = kcmd->cmd_thread_count; -- ra->region_count = kcmd->cmd_region_count; -- ra->region_size = kcmd->cmd_region_size; -- ra->offset = kcmd->cmd_offset; -- ra->region_noise = kcmd->cmd_region_noise; -- ra->chunk_noise = kcmd->cmd_chunk_noise; -- ra->thread_delay = kcmd->cmd_thread_delay; -- ra->flags = kcmd->cmd_flags; -- ra->stats.wr_data = 0; -- ra->stats.wr_chunks = 0; -- ra->stats.rd_data = 0; -- ra->stats.rd_chunks = 0; -- ra->region_next = 0; -- ra->file = file; -- mutex_init(&ra->lock_work, NULL, MUTEX_DEFAULT, NULL); -- mutex_init(&ra->lock_ctl, NULL, MUTEX_DEFAULT, NULL); -- -- (void)zpios_upcall(ra->pre, PHASE_PRE_RUN, ra, 0); -+ ra->id = kcmd->cmd_id; -+ ra->chunk_size = kcmd->cmd_chunk_size; -+ ra->thread_count = kcmd->cmd_thread_count; -+ ra->region_count = kcmd->cmd_region_count; -+ ra->region_size = kcmd->cmd_region_size; -+ ra->offset = kcmd->cmd_offset; -+ ra->region_noise = kcmd->cmd_region_noise; -+ ra->chunk_noise = kcmd->cmd_chunk_noise; -+ ra->thread_delay = kcmd->cmd_thread_delay; -+ ra->flags = kcmd->cmd_flags; -+ ra->stats.wr_data = 0; -+ ra->stats.wr_chunks = 0; -+ ra->stats.rd_data = 0; -+ ra->stats.rd_chunks = 0; -+ ra->region_next = 0; -+ ra->file = file; -+ mutex_init(&ra->lock_work, NULL, MUTEX_DEFAULT, NULL); -+ mutex_init(&ra->lock_ctl, NULL, MUTEX_DEFAULT, NULL); -+ -+ (void) zpios_upcall(ra->pre, PHASE_PRE_RUN, ra, 0); - -@@ -282,4 +312,4 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) - if (rc) { -- mutex_destroy(&ra->lock_ctl); -- mutex_destroy(&ra->lock_work); -+ mutex_destroy(&ra->lock_ctl); -+ mutex_destroy(&ra->lock_work); - vmem_free(ra, size); -@@ -288,3 +318,3 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, 
struct file *file) - -- return rc; -+ return (rc); - } -@@ -298,3 +328,3 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - -- get_random_bytes(&random_int, sizeof(unsigned int)); -+ get_random_bytes(&random_int, sizeof (unsigned int)); - -@@ -303,3 +333,4 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - -- /* XXX: I don't much care for this chunk selection mechansim -+ /* -+ * XXX: I don't much care for this chunk selection mechansim - * there's the potential to burn a lot of time here doing nothing -@@ -341,4 +372,5 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - if (run_args->region_noise) { -- get_random_bytes(&random_int, sizeof(unsigned int)); -- run_args->region_next += random_int % run_args->region_noise; -+ get_random_bytes(&random_int, sizeof (unsigned int)); -+ run_args->region_next += -+ random_int % run_args->region_noise; - } else { -@@ -348,3 +380,3 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - mutex_exit(&run_args->lock_work); -- return 1; -+ return (1); - } -@@ -354,3 +386,3 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - -- return 0; -+ return (0); - } -@@ -365,6 +397,6 @@ zpios_remove_objset(run_args_t *run_args) - -- (void)zpios_upcall(run_args->pre, PHASE_PRE_REMOVE, run_args, 0); -+ (void) zpios_upcall(run_args->pre, PHASE_PRE_REMOVE, run_args, 0); - t->start = zpios_timespec_now(); - -- (void)snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); -+ (void) snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); - -@@ -375,8 +407,7 @@ zpios_remove_objset(run_args_t *run_args) - rc = zpios_dmu_object_free(run_args, -- region->obj.os, -- region->obj.obj); -+ region->obj.os, region->obj.obj); - if (rc) -- zpios_print(run_args->file, "Error " -- "removing object %d, %d\n", -- (int)region->obj.obj, rc); -+ zpios_print(run_args->file, -+ "Error removing object %d, %d\n", -+ (int)region->obj.obj, rc); - } -@@ -385,8 +416,7 @@ zpios_remove_objset(run_args_t *run_args) - rc = zpios_dmu_object_free(run_args, -- region->obj.os, -- region->obj.obj); -+ region->obj.os, region->obj.obj); - if (rc) -- zpios_print(run_args->file, "Error " -- "removing object %d, %d\n", -- (int)region->obj.obj, rc); -+ zpios_print(run_args->file, -+ "Error removing object %d, %d\n", -+ (int)region->obj.obj, rc); - } -@@ -397,6 +427,6 @@ zpios_remove_objset(run_args_t *run_args) - if (run_args->flags & DMU_REMOVE) { -- rc = dmu_objset_destroy(name, B_FALSE); -+ rc = dsl_destroy_head(name); - if (rc) -- zpios_print(run_args->file, "Error dmu_objset_destroy" -- "(%s, ...) failed: %d\n", name, rc); -+ zpios_print(run_args->file, "Error dsl_destroy_head" -+ "(%s, ...) 
failed: %d\n", name, rc); - } -@@ -405,3 +435,3 @@ zpios_remove_objset(run_args_t *run_args) - t->delta = zpios_timespec_sub(t->stop, t->start); -- (void)zpios_upcall(run_args->post, PHASE_POST_REMOVE, run_args, rc); -+ (void) zpios_upcall(run_args->post, PHASE_POST_REMOVE, run_args, rc); - } -@@ -421,3 +451,3 @@ zpios_cleanup_run(run_args_t *run_args) - kmem_free(run_args->threads[i], -- sizeof(thread_data_t)); -+ sizeof (thread_data_t)); - } -@@ -426,3 +456,3 @@ zpios_cleanup_run(run_args_t *run_args) - kmem_free(run_args->threads, -- sizeof(thread_data_t *) * run_args->thread_count); -+ sizeof (thread_data_t *) * run_args->thread_count); - } -@@ -434,5 +464,5 @@ zpios_cleanup_run(run_args_t *run_args) - mutex_destroy(&run_args->lock_ctl); -- size = run_args->region_count * sizeof(zpios_region_t); -+ size = run_args->region_count * sizeof (zpios_region_t); - -- vmem_free(run_args, sizeof(*run_args) + size); -+ vmem_free(run_args, sizeof (*run_args) + size); - } -@@ -464,3 +494,3 @@ zpios_dmu_write(run_args_t *run_args, objset_t *os, uint64_t object, - dmu_tx_abort(tx); -- return rc; -+ return (rc); - } -@@ -475,3 +505,3 @@ zpios_dmu_write(run_args_t *run_args, objset_t *os, uint64_t object, - -- return 0; -+ return (0); - } -@@ -480,3 +510,3 @@ static int - zpios_dmu_read(run_args_t *run_args, objset_t *os, uint64_t object, -- uint64_t offset, uint64_t size, void *buf) -+ uint64_t offset, uint64_t size, void *buf) - { -@@ -490,3 +520,3 @@ zpios_dmu_read(run_args_t *run_args, objset_t *os, uint64_t object, - -- return dmu_read(os, object, offset, size, buf, flags); -+ return (dmu_read(os, object, offset, size, buf, flags)); - } -@@ -512,3 +542,3 @@ zpios_thread_main(void *data) - if (chunk_noise) { -- get_random_bytes(&random_int, sizeof(unsigned int)); -+ get_random_bytes(&random_int, sizeof (unsigned int)); - chunk_noise_tmp = (random_int % (chunk_noise * 2))-chunk_noise; -@@ -516,3 +546,4 @@ zpios_thread_main(void *data) - -- /* It's OK to vmem_alloc() this memory because it will be copied -+ /* -+ * It's OK to vmem_alloc() this memory because it will be copied - * in to the slab and pointers to the slab copy will be setup in -@@ -536,5 +567,5 @@ zpios_thread_main(void *data) - while (zpios_get_work_item(run_args, &obj, &offset, -- &chunk_size, ®ion, DMU_WRITE)) { -+ &chunk_size, ®ion, DMU_WRITE)) { - if (thread_delay) { -- get_random_bytes(&random_int, sizeof(unsigned int)); -+ get_random_bytes(&random_int, sizeof (unsigned int)); - thread_delay_tmp = random_int % thread_delay; -@@ -546,3 +577,3 @@ zpios_thread_main(void *data) - rc = zpios_dmu_write(run_args, obj.os, obj.obj, -- offset, chunk_size, buf); -+ offset, chunk_size, buf); - t.stop = zpios_timespec_now(); -@@ -560,3 +591,3 @@ zpios_thread_main(void *data) - thr->stats.wr_time.delta = zpios_timespec_add( -- thr->stats.wr_time.delta, t.delta); -+ thr->stats.wr_time.delta, t.delta); - mutex_exit(&thr->lock); -@@ -567,3 +598,3 @@ zpios_thread_main(void *data) - region->stats.wr_time.delta = zpios_timespec_add( -- region->stats.wr_time.delta, t.delta); -+ region->stats.wr_time.delta, t.delta); - -@@ -602,5 +633,5 @@ zpios_thread_main(void *data) - while (zpios_get_work_item(run_args, &obj, &offset, -- &chunk_size, ®ion, DMU_READ)) { -+ &chunk_size, ®ion, DMU_READ)) { - if (thread_delay) { -- get_random_bytes(&random_int, sizeof(unsigned int)); -+ get_random_bytes(&random_int, sizeof (unsigned int)); - thread_delay_tmp = random_int % thread_delay; -@@ -630,5 +661,5 @@ zpios_thread_main(void *data) - zpios_print(run_args->file, -- 
"IO verify error: %d/%d/%d\n", -- (int)obj.obj, (int)offset, -- (int)chunk_size); -+ "IO verify error: %d/%d/%d\n", -+ (int)obj.obj, (int)offset, -+ (int)chunk_size); - break; -@@ -642,3 +673,3 @@ zpios_thread_main(void *data) - thr->stats.rd_time.delta = zpios_timespec_add( -- thr->stats.rd_time.delta, t.delta); -+ thr->stats.rd_time.delta, t.delta); - mutex_exit(&thr->lock); -@@ -649,3 +680,3 @@ zpios_thread_main(void *data) - region->stats.rd_time.delta = zpios_timespec_add( -- region->stats.rd_time.delta, t.delta); -+ region->stats.rd_time.delta, t.delta); - -@@ -672,3 +703,3 @@ out: - -- return rc; /* Unreachable, due to do_exit() */ -+ return (rc); /* Unreachable, due to do_exit() */ - } -@@ -692,3 +723,3 @@ zpios_threads_run(run_args_t *run_args) - -- tsks = kmem_zalloc(sizeof(struct task_struct *) * tc, KM_SLEEP); -+ tsks = kmem_zalloc(sizeof (struct task_struct *) * tc, KM_SLEEP); - if (tsks == NULL) { -@@ -698,3 +729,3 @@ zpios_threads_run(run_args_t *run_args) - -- run_args->threads = kmem_zalloc(sizeof(thread_data_t *) * tc, KM_SLEEP); -+ run_args->threads = kmem_zalloc(sizeof (thread_data_t *)*tc, KM_SLEEP); - if (run_args->threads == NULL) { -@@ -709,3 +740,3 @@ zpios_threads_run(run_args_t *run_args) - for (i = 0; i < tc; i++) { -- thr = kmem_zalloc(sizeof(thread_data_t), KM_SLEEP); -+ thr = kmem_zalloc(sizeof (thread_data_t), KM_SLEEP); - if (thr == NULL) { -@@ -722,3 +753,3 @@ zpios_threads_run(run_args_t *run_args) - tsk = kthread_create(zpios_thread_main, (void *)thr, -- "%s/%d", "zpios_io", i); -+ "%s/%d", "zpios_io", i); - if (IS_ERR(tsk)) { -@@ -734,3 +765,3 @@ zpios_threads_run(run_args_t *run_args) - /* Wake up all threads for write phase */ -- (void)zpios_upcall(run_args->pre, PHASE_PRE_WRITE, run_args, 0); -+ (void) zpios_upcall(run_args->pre, PHASE_PRE_WRITE, run_args, 0); - for (i = 0; i < tc; i++) -@@ -742,3 +773,3 @@ zpios_threads_run(run_args_t *run_args) - tw->stop = zpios_timespec_now(); -- (void)zpios_upcall(run_args->post, PHASE_POST_WRITE, run_args, rc); -+ (void) zpios_upcall(run_args->post, PHASE_POST_WRITE, run_args, rc); - -@@ -775,4 +806,4 @@ zpios_threads_run(run_args_t *run_args) - /* Wake up all threads for read phase */ -- (void)zpios_upcall(run_args->pre, PHASE_PRE_READ, run_args, 0); -- for (i = 0; i < tc; i++) -+ (void) zpios_upcall(run_args->pre, PHASE_PRE_READ, run_args, 0); -+ for (i = 0; i < tc; i++) - wake_up_process(tsks[i]); -@@ -783,3 +814,3 @@ zpios_threads_run(run_args_t *run_args) - tr->stop = zpios_timespec_now(); -- (void)zpios_upcall(run_args->post, PHASE_POST_READ, run_args, rc); -+ (void) zpios_upcall(run_args->post, PHASE_POST_READ, run_args, rc); - -@@ -804,6 +835,6 @@ out: - cleanup: -- kmem_free(tsks, sizeof(struct task_struct *) * tc); -+ kmem_free(tsks, sizeof (struct task_struct *) * tc); - cleanup2: - /* Returns first encountered thread error (if any) */ -- return rc; -+ return (rc); - -@@ -820,3 +851,3 @@ static int - zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, -- int data_size, void *data) -+ int data_size, void *data) - { -@@ -829,4 +860,4 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - zpios_print(file, "Invalid chunk_size, region_size, " -- "thread_count, or region_count, %d\n", -EINVAL); -- return -EINVAL; -+ "thread_count, or region_count, %d\n", -EINVAL); -+ return (-EINVAL); - } -@@ -836,4 +867,4 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - zpios_print(file, "Invalid flags, minimally DMU_WRITE " -- "and DMU_READ must be set, %d\n", -EINVAL); -- return -EINVAL; -+ "and 
DMU_READ must be set, %d\n", -EINVAL); -+ return (-EINVAL); - } -@@ -843,8 +874,9 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - zpios_print(file, "Invalid flags, DMU_*_ZC incompatible " -- "with DMU_VERIFY, used for performance analysis " -- "only, %d\n", -EINVAL); -- return -EINVAL; -+ "with DMU_VERIFY, used for performance analysis " -+ "only, %d\n", -EINVAL); -+ return (-EINVAL); - } - -- /* Opaque data on return contains structs of the following form: -+ /* -+ * Opaque data on return contains structs of the following form: - * -@@ -857,9 +889,9 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - */ -- size = (sizeof(zpios_stats_t) + -- (kcmd->cmd_thread_count * sizeof(zpios_stats_t)) + -- (kcmd->cmd_region_count * sizeof(zpios_stats_t))); -+ size = (sizeof (zpios_stats_t) + -+ (kcmd->cmd_thread_count * sizeof (zpios_stats_t)) + -+ (kcmd->cmd_region_count * sizeof (zpios_stats_t))); - if (data_size < size) { - zpios_print(file, "Invalid size, command data buffer " -- "size too small, (%d < %d)\n", data_size, size); -- return -ENOSPC; -+ "size too small, (%d < %d)\n", data_size, size); -+ return (-ENOSPC); - } -@@ -868,5 +900,5 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - if (rc) -- return rc; -+ return (rc); - -- rc = zpios_threads_run(run_args); -+ rc = zpios_threads_run(run_args); - zpios_remove_objset(run_args); -@@ -888,7 +920,7 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - cleanup: -- zpios_cleanup_run(run_args); -+ zpios_cleanup_run(run_args); - -- (void)zpios_upcall(kcmd->cmd_post, PHASE_POST_RUN, run_args, 0); -+ (void) zpios_upcall(kcmd->cmd_post, PHASE_POST_RUN, run_args, 0); - -- return rc; -+ return (rc); - } -@@ -902,7 +934,7 @@ zpios_open(struct inode *inode, struct file *file) - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -- info = (zpios_info_t *)kmem_alloc(sizeof(*info), KM_SLEEP); -+ info = (zpios_info_t *)kmem_alloc(sizeof (*info), KM_SLEEP); - if (info == NULL) -- return -ENOMEM; -+ return (-ENOMEM); - -@@ -910,6 +942,7 @@ zpios_open(struct inode *inode, struct file *file) - info->info_size = ZPIOS_INFO_BUFFER_SIZE; -- info->info_buffer = (char *)vmem_alloc(ZPIOS_INFO_BUFFER_SIZE,KM_SLEEP); -+ info->info_buffer = -+ (char *) vmem_alloc(ZPIOS_INFO_BUFFER_SIZE, KM_SLEEP); - if (info->info_buffer == NULL) { -- kmem_free(info, sizeof(*info)); -- return -ENOMEM; -+ kmem_free(info, sizeof (*info)); -+ return (-ENOMEM); - } -@@ -919,3 +952,3 @@ zpios_open(struct inode *inode, struct file *file) - -- return 0; -+ return (0); - } -@@ -929,3 +962,3 @@ zpios_release(struct inode *inode, struct file *file) - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -935,5 +968,5 @@ zpios_release(struct inode *inode, struct file *file) - vmem_free(info->info_buffer, ZPIOS_INFO_BUFFER_SIZE); -- kmem_free(info, sizeof(*info)); -+ kmem_free(info, sizeof (*info)); - -- return 0; -+ return (0); - } -@@ -953,3 +986,3 @@ zpios_buffer_clear(struct file *file, zpios_cfg_t *kcfg, unsigned long arg) - -- return 0; -+ return (0); - } -@@ -988,3 +1021,4 @@ zpios_buffer_size(struct file *file, zpios_cfg_t *kcfg, unsigned long arg) - -- if (copy_to_user((struct zpios_cfg_t __user *)arg, kcfg, sizeof(*kcfg))) -+ if (copy_to_user((struct zpios_cfg_t __user *)arg, -+ kcfg, sizeof (*kcfg))) - rc = -EFAULT; -@@ -993,3 +1027,3 @@ out: - -- return rc; -+ return (rc); - } -@@ -1002,4 +1036,4 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - -- if (copy_from_user(&kcfg, (zpios_cfg_t *)arg, sizeof(kcfg))) -- 
return -EFAULT; -+ if (copy_from_user(&kcfg, (zpios_cfg_t *)arg, sizeof (kcfg))) -+ return (-EFAULT); - -@@ -1007,4 +1041,4 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - zpios_print(file, "Bad config magic 0x%x != 0x%x\n", -- kcfg.cfg_magic, ZPIOS_CFG_MAGIC); -- return -EINVAL; -+ kcfg.cfg_magic, ZPIOS_CFG_MAGIC); -+ return (-EINVAL); - } -@@ -1013,3 +1047,4 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - case ZPIOS_CFG_BUFFER_CLEAR: -- /* cfg_arg1 - Unused -+ /* -+ * cfg_arg1 - Unused - * cfg_rc1 - Unused -@@ -1019,3 +1054,4 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - case ZPIOS_CFG_BUFFER_SIZE: -- /* cfg_arg1 - 0 - query size; >0 resize -+ /* -+ * cfg_arg1 - 0 - query size; >0 resize - * cfg_rc1 - Set to current buffer size -@@ -1031,3 +1067,3 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - -- return rc; -+ return (rc); - } -@@ -1041,10 +1077,10 @@ zpios_ioctl_cmd(struct file *file, unsigned long arg) - -- kcmd = kmem_alloc(sizeof(zpios_cmd_t), KM_SLEEP); -+ kcmd = kmem_alloc(sizeof (zpios_cmd_t), KM_SLEEP); - if (kcmd == NULL) { - zpios_print(file, "Unable to kmem_alloc() %ld byte for " -- "zpios_cmd_t\n", (long int)sizeof(zpios_cmd_t)); -- return -ENOMEM; -+ "zpios_cmd_t\n", (long int)sizeof (zpios_cmd_t)); -+ return (-ENOMEM); - } - -- rc = copy_from_user(kcmd, (zpios_cfg_t *)arg, sizeof(zpios_cmd_t)); -+ rc = copy_from_user(kcmd, (zpios_cfg_t *)arg, sizeof (zpios_cmd_t)); - if (rc) { -@@ -1057,4 +1093,4 @@ zpios_ioctl_cmd(struct file *file, unsigned long arg) - zpios_print(file, "Bad command magic 0x%x != 0x%x\n", -- kcmd->cmd_magic, ZPIOS_CFG_MAGIC); -- rc = -EINVAL; -+ kcmd->cmd_magic, ZPIOS_CFG_MAGIC); -+ rc = (-EINVAL); - goto out_cmd; -@@ -1074,3 +1110,3 @@ zpios_ioctl_cmd(struct file *file, unsigned long arg) - rc = copy_from_user(data, (void *)(arg + offsetof(zpios_cmd_t, -- cmd_data_str)), kcmd->cmd_data_size); -+ cmd_data_str)), kcmd->cmd_data_size); - if (rc) { -@@ -1090,3 +1126,3 @@ zpios_ioctl_cmd(struct file *file, unsigned long arg) - rc = copy_to_user((void *)(arg + offsetof(zpios_cmd_t, -- cmd_data_str)), data, kcmd->cmd_data_size); -+ cmd_data_str)), data, kcmd->cmd_data_size); - if (rc) { -@@ -1101,5 +1137,5 @@ out_data: - out_cmd: -- kmem_free(kcmd, sizeof(zpios_cmd_t)); -+ kmem_free(kcmd, sizeof (zpios_cmd_t)); - -- return rc; -+ return (rc); - } -@@ -1109,3 +1145,3 @@ zpios_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - { -- unsigned int minor = iminor(file->f_dentry->d_inode); -+ unsigned int minor = iminor(file->f_dentry->d_inode); - int rc = 0; -@@ -1114,6 +1150,6 @@ zpios_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - if ((cmd & 0xffffff00) == ((int)'T') << 8) -- return -ENOTTY; -+ return (-ENOTTY); - - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -1132,3 +1168,3 @@ zpios_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - -- return rc; -+ return (rc); - } -@@ -1140,3 +1176,3 @@ zpios_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - { -- return zpios_unlocked_ioctl(file, cmd, arg); -+ return (zpios_unlocked_ioctl(file, cmd, arg)); - } -@@ -1144,3 +1180,4 @@ zpios_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - --/* I'm not sure why you would want to write in to this buffer from -+/* -+ * I'm not sure why you would want to write in to this buffer from - * user space since its principle use is to pass test status info -@@ -1150,5 +1187,5 @@ static ssize_t - 
zpios_write(struct file *file, const char __user *buf, -- size_t count, loff_t *ppos) -+ size_t count, loff_t *ppos) - { -- unsigned int minor = iminor(file->f_dentry->d_inode); -+ unsigned int minor = iminor(file->f_dentry->d_inode); - zpios_info_t *info = (zpios_info_t *)file->private_data; -@@ -1157,3 +1194,3 @@ zpios_write(struct file *file, const char __user *buf, - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -1183,3 +1220,3 @@ out: - spin_unlock(&info->info_lock); -- return rc; -+ return (rc); - } -@@ -1187,6 +1224,5 @@ out: - static ssize_t --zpios_read(struct file *file, char __user *buf, -- size_t count, loff_t *ppos) -+zpios_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) - { -- unsigned int minor = iminor(file->f_dentry->d_inode); -+ unsigned int minor = iminor(file->f_dentry->d_inode); - zpios_info_t *info = (zpios_info_t *)file->private_data; -@@ -1195,3 +1231,3 @@ zpios_read(struct file *file, char __user *buf, - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -1219,3 +1255,3 @@ out: - spin_unlock(&info->info_lock); -- return rc; -+ return (rc); - } -@@ -1224,3 +1260,3 @@ static loff_t zpios_seek(struct file *file, loff_t offset, int origin) - { -- unsigned int minor = iminor(file->f_dentry->d_inode); -+ unsigned int minor = iminor(file->f_dentry->d_inode); - zpios_info_t *info = (zpios_info_t *)file->private_data; -@@ -1229,3 +1265,3 @@ static loff_t zpios_seek(struct file *file, loff_t offset, int origin) - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -1255,3 +1291,3 @@ static loff_t zpios_seek(struct file *file, loff_t offset, int origin) - -- return rc; -+ return (rc); - } -@@ -1304,7 +1340,8 @@ zpios_init(void) - zpios_device = spl_device_create(zpios_class, NULL, -- dev, NULL, ZPIOS_NAME); -- return 0; -+ dev, NULL, ZPIOS_NAME); -+ -+ return (0); - error: - printk(KERN_ERR "ZPIOS: Error registering zpios device, %d\n", rc); -- return rc; -+ return (rc); - } -@@ -1321,3 +1358,3 @@ zpios_fini(void) - -- return 0; -+ return (0); - } -@@ -1330 +1367,2 @@ MODULE_DESCRIPTION("Kernel PIOS implementation"); - MODULE_LICENSE("GPL"); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); -diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in -index a4b0b36..412feaf 100644 ---- a/rpm/generic/zfs-dkms.spec.in -+++ b/rpm/generic/zfs-dkms.spec.in -@@ -16,7 +16,3 @@ BuildArch: noarch - --%if 0%{?dkms_version:1} --Requires: dkms = %{dkms_version} --%else --Requires: dkms >= 2.2.0.2 --%endif -+Requires: dkms >= 2.2.0.3-20 - Requires: spl-dkms = %{version} -diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in -index 0797124..5c2196f 100644 ---- a/rpm/generic/zfs.spec.in -+++ b/rpm/generic/zfs.spec.in -@@ -7,3 +7,3 @@ - %global _udevdir /lib/udev --%global _dracutdir /lib/dracut -+%global _dracutdir %{_prefix}/share/dracut - %endif -@@ -12,4 +12,20 @@ - %bcond_with blkid --%bcond_with selinux -+%bcond_with systemd - -+# Generic enable switch for systemd -+%if %{with systemd} -+%define _systemd 1 -+%endif -+ -+# Fedora >= 15 comes with systemd, but only >= 18 has -+# the proper macros -+%if 0%{?fedora} >= 18 -+%define _systemd 1 -+%endif -+ -+# opensuse >= 12.1 comes with systemd, but only >= 13.1 -+# has the proper macros -+%if 0%{?suse_version} >= 1310 -+%define _systemd 1 -+%endif - -@@ -31,5 +47,9 @@ ExcludeArch: ppc ppc64 - Requires: spl = %{version} --Requires: %{name}-kmod >= %{version} -+Requires: %{name}-kmod = %{version} - Provides: %{name}-kmod-common 
= %{version} - -+# zfs-fuse provides the same commands and man pages that ZoL does. Renaming -+# those on either side would conflict with all available documentation. -+Conflicts: zfs-fuse -+ - %if 0%{?rhel}%{?fedora}%{?suse_version} -@@ -40,5 +60,8 @@ BuildRequires: libblkid-devel - %endif --%if %{with selinux} --BuildRequires: libselinux-devel - %endif -+%if 0%{?_systemd} -+Requires(post): systemd -+Requires(preun): systemd -+Requires(postun): systemd -+BuildRequires: systemd - %endif -@@ -91,6 +114,6 @@ image which is ZFS aware. - %endif --%if %{with selinux} -- %define selinux --with-selinux -+%if 0%{?_systemd} -+ %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --disable-sysvinit - %else -- %define selinux --without-selinux -+ %define systemd --enable-sysvinit --disable-systemd - %endif -@@ -107,3 +130,3 @@ image which is ZFS aware. - %{blkid} \ -- %{selinux} -+ %{systemd} - make %{?_smp_mflags} -@@ -117,3 +140,7 @@ find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; - /sbin/ldconfig -+%if 0%{?_systemd} -+%systemd_post zfs.target -+%else - [ -x /sbin/chkconfig ] && /sbin/chkconfig --add zfs -+%endif - exit 0 -@@ -121,2 +148,5 @@ exit 0 - %preun -+%if 0%{?_systemd} -+%systemd_preun zfs.target -+%else - if [ $1 -eq 0 ] ; then -@@ -124,5 +154,10 @@ if [ $1 -eq 0 ] ; then - fi -+%endif - exit 0 - --%postun -p /sbin/ldconfig -+%postun -+/sbin/ldconfig -+%if 0%{?_systemd} -+%systemd_postun zfs.target -+%endif - -@@ -133,3 +168,4 @@ exit 0 - %{_bindir}/* --%{_libdir}/*.so.1* -+%{_libdir}/*.so.* -+%{_libexecdir}/%{name} - %{_mandir}/man1/* -@@ -141,3 +177,9 @@ exit 0 - %config(noreplace) %{_sysconfdir}/%{name} -+%if 0%{?_systemd} -+/usr/lib/modules-load.d/* -+%{_unitdir}/* -+%{_presetdir}/* -+%else - %{_sysconfdir}/init.d/* -+%endif - -diff --git a/scripts/Makefile.am b/scripts/Makefile.am -index 08a32b4..7894db4 100644 ---- a/scripts/Makefile.am -+++ b/scripts/Makefile.am -@@ -2,3 +2,3 @@ SUBDIRS = zpool-config zpios-test zpios-profile - --EXTRA_DIST = dkms.mkconf dkms.postinst kmodtool -+EXTRA_DIST = dkms.mkconf dkms.postinst kmodtool zfs2zol-patch.sed cstyle.pl - -@@ -9,2 +9,3 @@ dist_pkgdata_SCRIPTS = \ - $(top_srcdir)/scripts/zfault.sh \ -+ $(top_srcdir)/scripts/zimport.sh \ - $(top_srcdir)/scripts/zfs.sh \ -@@ -19,2 +20,3 @@ ZCONFIG=$(top_builddir)/scripts/zconfig.sh - ZFAULT=$(top_builddir)/scripts/zfault.sh -+ZIMPORT=$(top_builddir)/scripts/zimport.sh - ZTEST=$(top_builddir)/cmd/ztest/ztest -diff --git a/scripts/common.sh.in b/scripts/common.sh.in -index 29b85d3..2fac2a9 100644 ---- a/scripts/common.sh.in -+++ b/scripts/common.sh.in -@@ -40,2 +40,3 @@ udevruledir=@udevruledir@ - sysconfdir=@sysconfdir@ -+localstatedir=@localstatedir@ - -@@ -74,2 +75,4 @@ AWK=${AWK:-/usr/bin/awk} - -+ZED_PIDFILE=${ZED_PIDFILE:-${localstatedir}/run/zed.pid} -+ - COLOR_BLACK="\033[0;30m" -@@ -206,3 +209,7 @@ load_module() { - -- ${LDMOD} $* &>/dev/null || ERROR="Failed to load $1" return 1 -+ ${LDMOD} $* &>/dev/null -+ if [ $? -ne 0 ]; then -+ echo "Failed to load ${NAME} ($@)" -+ return 1 -+ fi - -@@ -215,3 +222,3 @@ load_modules() { - for MOD in ${KERNEL_MODULES[*]}; do -- load_module ${MOD} -+ load_module ${MOD} >/dev/null - done -@@ -287,14 +294,41 @@ check_loop_utils() { - # --# Find and return an unused loopback device. -+# Find and return an unused loop device. A new /dev/loopN node will be -+# created if required. 
The kernel loop driver will automatically register -+# the minor as long as it's less than /sys/module/loop/parameters/max_loop. - # - unused_loop_device() { -- for DEVICE in `ls -1 /dev/loop[0-9]* 2>/dev/null`; do -- ${LOSETUP} ${DEVICE} &>/dev/null -- if [ $? -ne 0 ]; then -- echo ${DEVICE} -- return -+ local DEVICE=`${LOSETUP} -f` -+ local MAX_LOOP_PATH="/sys/module/loop/parameters/max_loop" -+ local MAX_LOOP; -+ -+ # An existing /dev/loopN device was available. -+ if [ -n "${DEVICE}" ]; then -+ echo "${DEVICE}" -+ return 0 -+ fi -+ -+ # Create a new /dev/loopN provided we are not at MAX_LOOP. -+ if [ -f "${MAX_LOOP_PATH}" ]; then -+ MAX_LOOP=`cat /sys/module/loop/parameters/max_loop` -+ if [ ${MAX_LOOP} -eq 0 ]; then -+ MAX_LOOP=255 - fi -- done - -- die "Error: Unable to find unused loopback device" -+ for (( i=0; i<=${MAX_LOOP}; i++ )); do -+ DEVICE="/dev/loop$i" -+ -+ if [ -b "${DEVICE}" ]; then -+ continue -+ else -+ mknod -m660 "${DEVICE}" b 7 $i -+ chown root.disk "${DEVICE}" -+ chmod 666 "${DEVICE}" -+ -+ echo "${DEVICE}" -+ return 0 -+ fi -+ done -+ fi -+ -+ die "Error: Unable to create new loopback device" - } -@@ -305,4 +339,4 @@ unused_loop_device() { - # in use we will not be able to remove them, and we only remove --# devices which include 'zpool' in the name. So any damage we might --# do should be limited to other zfs related testing. -+# devices which include 'zpool' or 'deleted' in the name. So any -+# damage we might do should be limited to other zfs related testing. - # -@@ -313,4 +347,4 @@ cleanup_loop_devices() { - ${AWK} -F":" -v losetup="$LOSETUP" \ -- '/zpool/ { system("losetup -d "$1) }' ${TMP_FILE} -- ${AWK} -F" " '/zpool/ { system("rm -f "$3) }' ${TMP_FILE} -+ '/zpool/ || /deleted/ { system("losetup -d "$1) }' ${TMP_FILE} -+ ${AWK} -F" " '/zpool/ || /deleted/ { system("rm -f "$3) }' ${TMP_FILE} - -@@ -335,3 +369,3 @@ destroy_loop_devices() { - # --# Create a device label. -+# Create a device label taking care to briefly wait if udev needs to settle. - # -@@ -341,3 +375,4 @@ label() { - -- ${PARTED} ${DEVICE} --script -- mklabel ${LABEL} || return 1 -+ wait_udev ${DEVICE} 30 || return 1 -+ ${PARTED} ${DEVICE} --script -- mklabel ${LABEL} || return 2 - -@@ -746 +781,7 @@ stack_check() { - } -+ -+kill_zed() { -+ if [ -f $ZED_PIDFILE ]; then -+ kill $(cat $ZED_PIDFILE) -+ fi -+} -diff --git a/scripts/cstyle.pl b/scripts/cstyle.pl -new file mode 100755 -index 0000000..083b30f ---- /dev/null -+++ b/scripts/cstyle.pl -@@ -0,0 +1,950 @@ -+#!/usr/bin/perl -w -+# -+# CDDL HEADER START -+# -+# The contents of this file are subject to the terms of the -+# Common Development and Distribution License (the "License"). -+# You may not use this file except in compliance with the License. -+# -+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+# or http://www.opensolaris.org/os/licensing. -+# See the License for the specific language governing permissions -+# and limitations under the License. -+# -+# When distributing Covered Code, include this CDDL HEADER in each -+# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+# If applicable, add the following below this CDDL HEADER, with the -+# fields enclosed by brackets "[]" replaced with your own identifying -+# information: Portions Copyright [yyyy] [name of copyright owner] -+# -+# CDDL HEADER END -+# -+# -+# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -+# Use is subject to license terms. 
-+# -+# @(#)cstyle 1.58 98/09/09 (from shannon) -+#ident "%Z%%M% %I% %E% SMI" -+# -+# cstyle - check for some common stylistic errors. -+# -+# cstyle is a sort of "lint" for C coding style. -+# It attempts to check for the style used in the -+# kernel, sometimes known as "Bill Joy Normal Form". -+# -+# There's a lot this can't check for, like proper indentation -+# of code blocks. There's also a lot more this could check for. -+# -+# A note to the non perl literate: -+# -+# perl regular expressions are pretty much like egrep -+# regular expressions, with the following special symbols -+# -+# \s any space character -+# \S any non-space character -+# \w any "word" character [a-zA-Z0-9_] -+# \W any non-word character -+# \d a digit [0-9] -+# \D a non-digit -+# \b word boundary (between \w and \W) -+# \B non-word boundary -+# -+ -+require 5.0; -+use IO::File; -+use Getopt::Std; -+use strict; -+ -+my $usage = -+"usage: cstyle [-chpvCP] [-o constructs] file ... -+ -c check continuation indentation inside functions -+ -h perform heuristic checks that are sometimes wrong -+ -p perform some of the more picky checks -+ -v verbose -+ -C don't check anything in header block comments -+ -P check for use of non-POSIX types -+ -o constructs -+ allow a comma-seperated list of optional constructs: -+ doxygen allow doxygen-style block comments (/** /*!) -+ splint allow splint-style lint comments (/*@ ... @*/) -+"; -+ -+my %opts; -+ -+if (!getopts("cho:pvCP", \%opts)) { -+ print $usage; -+ exit 2; -+} -+ -+my $check_continuation = $opts{'c'}; -+my $heuristic = $opts{'h'}; -+my $picky = $opts{'p'}; -+my $verbose = $opts{'v'}; -+my $ignore_hdr_comment = $opts{'C'}; -+my $check_posix_types = $opts{'P'}; -+ -+my $doxygen_comments = 0; -+my $splint_comments = 0; -+ -+if (defined($opts{'o'})) { -+ for my $x (split /,/, $opts{'o'}) { -+ if ($x eq "doxygen") { -+ $doxygen_comments = 1; -+ } elsif ($x eq "splint") { -+ $splint_comments = 1; -+ } else { -+ print "cstyle: unrecognized construct \"$x\"\n"; -+ print $usage; -+ exit 2; -+ } -+ } -+} -+ -+my ($filename, $line, $prev); # shared globals -+ -+my $fmt; -+my $hdr_comment_start; -+ -+if ($verbose) { -+ $fmt = "%s: %d: %s\n%s\n"; -+} else { -+ $fmt = "%s: %d: %s\n"; -+} -+ -+if ($doxygen_comments) { -+ # doxygen comments look like "/*!" or "/**"; allow them. -+ $hdr_comment_start = qr/^\s*\/\*[\!\*]?$/; -+} else { -+ $hdr_comment_start = qr/^\s*\/\*$/; -+} -+ -+# Note, following must be in single quotes so that \s and \w work right. -+my $typename = '(int|char|short|long|unsigned|float|double' . -+ '|\w+_t|struct\s+\w+|union\s+\w+|FILE)'; -+ -+# mapping of old types to POSIX compatible types -+my %old2posix = ( -+ 'unchar' => 'uchar_t', -+ 'ushort' => 'ushort_t', -+ 'uint' => 'uint_t', -+ 'ulong' => 'ulong_t', -+ 'u_int' => 'uint_t', -+ 'u_short' => 'ushort_t', -+ 'u_long' => 'ulong_t', -+ 'u_char' => 'uchar_t', -+ 'quad' => 'quad_t' -+); -+ -+my $lint_re = qr/\/\*(?: -+ ARGSUSED[0-9]*|NOTREACHED|LINTLIBRARY|VARARGS[0-9]*| -+ CONSTCOND|CONSTANTCOND|CONSTANTCONDITION|EMPTY| -+ FALLTHRU|FALLTHROUGH|LINTED.*?|PRINTFLIKE[0-9]*| -+ PROTOLIB[0-9]*|SCANFLIKE[0-9]*|CSTYLED.*? 
-+ )\*\//x; -+ -+my $splint_re = qr/\/\*@.*?@\*\//x; -+ -+my $warlock_re = qr/\/\*\s*(?: -+ VARIABLES\ PROTECTED\ BY| -+ MEMBERS\ PROTECTED\ BY| -+ ALL\ MEMBERS\ PROTECTED\ BY| -+ READ-ONLY\ VARIABLES:| -+ READ-ONLY\ MEMBERS:| -+ VARIABLES\ READABLE\ WITHOUT\ LOCK:| -+ MEMBERS\ READABLE\ WITHOUT\ LOCK:| -+ LOCKS\ COVERED\ BY| -+ LOCK\ UNNEEDED\ BECAUSE| -+ LOCK\ NEEDED:| -+ LOCK\ HELD\ ON\ ENTRY:| -+ READ\ LOCK\ HELD\ ON\ ENTRY:| -+ WRITE\ LOCK\ HELD\ ON\ ENTRY:| -+ LOCK\ ACQUIRED\ AS\ SIDE\ EFFECT:| -+ READ\ LOCK\ ACQUIRED\ AS\ SIDE\ EFFECT:| -+ WRITE\ LOCK\ ACQUIRED\ AS\ SIDE\ EFFECT:| -+ LOCK\ RELEASED\ AS\ SIDE\ EFFECT:| -+ LOCK\ UPGRADED\ AS\ SIDE\ EFFECT:| -+ LOCK\ DOWNGRADED\ AS\ SIDE\ EFFECT:| -+ FUNCTIONS\ CALLED\ THROUGH\ POINTER| -+ FUNCTIONS\ CALLED\ THROUGH\ MEMBER| -+ LOCK\ ORDER: -+ )/x; -+ -+my $err_stat = 0; # exit status -+ -+if ($#ARGV >= 0) { -+ foreach my $arg (@ARGV) { -+ my $fh = new IO::File $arg, "r"; -+ if (!defined($fh)) { -+ printf "%s: can not open\n", $arg; -+ } else { -+ &cstyle($arg, $fh); -+ close $fh; -+ } -+ } -+} else { -+ &cstyle("", *STDIN); -+} -+exit $err_stat; -+ -+my $no_errs = 0; # set for CSTYLED-protected lines -+ -+sub err($) { -+ my ($error) = @_; -+ unless ($no_errs) { -+ printf $fmt, $filename, $., $error, $line; -+ $err_stat = 1; -+ } -+} -+ -+sub err_prefix($$) { -+ my ($prevline, $error) = @_; -+ my $out = $prevline."\n".$line; -+ unless ($no_errs) { -+ printf $fmt, $filename, $., $error, $out; -+ $err_stat = 1; -+ } -+} -+ -+sub err_prev($) { -+ my ($error) = @_; -+ unless ($no_errs) { -+ printf $fmt, $filename, $. - 1, $error, $prev; -+ $err_stat = 1; -+ } -+} -+ -+sub cstyle($$) { -+ -+my ($fn, $filehandle) = @_; -+$filename = $fn; # share it globally -+ -+my $in_cpp = 0; -+my $next_in_cpp = 0; -+ -+my $in_comment = 0; -+my $in_header_comment = 0; -+my $comment_done = 0; -+my $in_warlock_comment = 0; -+my $in_function = 0; -+my $in_function_header = 0; -+my $in_declaration = 0; -+my $note_level = 0; -+my $nextok = 0; -+my $nocheck = 0; -+ -+my $in_string = 0; -+ -+my ($okmsg, $comment_prefix); -+ -+$line = ''; -+$prev = ''; -+reset_indent(); -+ -+line: while (<$filehandle>) { -+ s/\r?\n$//; # strip return and newline -+ -+ # save the original line, then remove all text from within -+ # double or single quotes, we do not want to check such text. -+ -+ $line = $_; -+ -+ # -+ # C allows strings to be continued with a backslash at the end of -+ # the line. We translate that into a quoted string on the previous -+ # line followed by an initial quote on the next line. -+ # -+ # (we assume that no-one will use backslash-continuation with character -+ # constants) -+ # -+ $_ = '"' . $_ if ($in_string && !$nocheck && !$in_comment); -+ -+ # -+ # normal strings and characters -+ # -+ s/'([^\\']|\\[^xX0]|\\0[0-9]*|\\[xX][0-9a-fA-F]*)'/''/g; -+ s/"([^\\"]|\\.)*"/\"\"/g; -+ -+ # -+ # detect string continuation -+ # -+ if ($nocheck || $in_comment) { -+ $in_string = 0; -+ } else { -+ # -+ # Now that all full strings are replaced with "", we check -+ # for unfinished strings continuing onto the next line. -+ # -+ $in_string = -+ (s/([^"](?:"")*)"([^\\"]|\\.)*\\$/$1""/ || -+ s/^("")*"([^\\"]|\\.)*\\$/""/); -+ } -+ -+ # -+ # figure out if we are in a cpp directive -+ # -+ $in_cpp = $next_in_cpp || /^\s*#/; # continued or started -+ $next_in_cpp = $in_cpp && /\\$/; # only if continued -+ -+ # strip off trailing backslashes, which appear in long macros -+ s/\s*\\$//; -+ -+ # an /* END CSTYLED */ comment ends a no-check block. 
-+ if ($nocheck) { -+ if (/\/\* *END *CSTYLED *\*\//) { -+ $nocheck = 0; -+ } else { -+ reset_indent(); -+ next line; -+ } -+ } -+ -+ # a /*CSTYLED*/ comment indicates that the next line is ok. -+ if ($nextok) { -+ if ($okmsg) { -+ err($okmsg); -+ } -+ $nextok = 0; -+ $okmsg = 0; -+ if (/\/\* *CSTYLED.*\*\//) { -+ /^.*\/\* *CSTYLED *(.*) *\*\/.*$/; -+ $okmsg = $1; -+ $nextok = 1; -+ } -+ $no_errs = 1; -+ } elsif ($no_errs) { -+ $no_errs = 0; -+ } -+ -+ # check length of line. -+ # first, a quick check to see if there is any chance of being too long. -+ if (($line =~ tr/\t/\t/) * 7 + length($line) > 80) { -+ # yes, there is a chance. -+ # replace tabs with spaces and check again. -+ my $eline = $line; -+ 1 while $eline =~ -+ s/\t+/' ' x (length($&) * 8 - length($`) % 8)/e; -+ if (length($eline) > 80) { -+ err("line > 80 characters"); -+ } -+ } -+ -+ # ignore NOTE(...) annotations (assumes NOTE is on lines by itself). -+ if ($note_level || /\b_?NOTE\s*\(/) { # if in NOTE or this is NOTE -+ s/[^()]//g; # eliminate all non-parens -+ $note_level += s/\(//g - length; # update paren nest level -+ next; -+ } -+ -+ # a /* BEGIN CSTYLED */ comment starts a no-check block. -+ if (/\/\* *BEGIN *CSTYLED *\*\//) { -+ $nocheck = 1; -+ } -+ -+ # a /*CSTYLED*/ comment indicates that the next line is ok. -+ if (/\/\* *CSTYLED.*\*\//) { -+ /^.*\/\* *CSTYLED *(.*) *\*\/.*$/; -+ $okmsg = $1; -+ $nextok = 1; -+ } -+ if (/\/\/ *CSTYLED/) { -+ /^.*\/\/ *CSTYLED *(.*)$/; -+ $okmsg = $1; -+ $nextok = 1; -+ } -+ -+ # universal checks; apply to everything -+ if (/\t +\t/) { -+ err("spaces between tabs"); -+ } -+ if (/ \t+ /) { -+ err("tabs between spaces"); -+ } -+ if (/\s$/) { -+ err("space or tab at end of line"); -+ } -+ if (/[^ \t(]\/\*/ && !/\w\(\/\*.*\*\/\);/) { -+ err("comment preceded by non-blank"); -+ } -+ -+ # is this the beginning or ending of a function? -+ # (not if "struct foo\n{\n") -+ if (/^{$/ && $prev =~ /\)\s*(const\s*)?(\/\*.*\*\/\s*)?\\?$/) { -+ $in_function = 1; -+ $in_declaration = 1; -+ $in_function_header = 0; -+ $prev = $line; -+ next line; -+ } -+ if (/^}\s*(\/\*.*\*\/\s*)*$/) { -+ if ($prev =~ /^\s*return\s*;/) { -+ err_prev("unneeded return at end of function"); -+ } -+ $in_function = 0; -+ reset_indent(); # we don't check between functions -+ $prev = $line; -+ next line; -+ } -+ if (/^\w*\($/) { -+ $in_function_header = 1; -+ } -+ -+ if ($in_warlock_comment && /\*\//) { -+ $in_warlock_comment = 0; -+ $prev = $line; -+ next line; -+ } -+ -+ # a blank line terminates the declarations within a function. -+ # XXX - but still a problem in sub-blocks. -+ if ($in_declaration && /^$/) { -+ $in_declaration = 0; -+ } -+ -+ if ($comment_done) { -+ $in_comment = 0; -+ $in_header_comment = 0; -+ $comment_done = 0; -+ } -+ # does this looks like the start of a block comment? -+ if (/$hdr_comment_start/) { -+ if (!/^\t*\/\*/) { -+ err("block comment not indented by tabs"); -+ } -+ $in_comment = 1; -+ /^(\s*)\//; -+ $comment_prefix = $1; -+ if ($comment_prefix eq "") { -+ $in_header_comment = 1; -+ } -+ $prev = $line; -+ next line; -+ } -+ # are we still in the block comment? 
-+ if ($in_comment) { -+ if (/^$comment_prefix \*\/$/) { -+ $comment_done = 1; -+ } elsif (/\*\//) { -+ $comment_done = 1; -+ err("improper block comment close") -+ unless ($ignore_hdr_comment && $in_header_comment); -+ } elsif (!/^$comment_prefix \*[ \t]/ && -+ !/^$comment_prefix \*$/) { -+ err("improper block comment") -+ unless ($ignore_hdr_comment && $in_header_comment); -+ } -+ } -+ -+ if ($in_header_comment && $ignore_hdr_comment) { -+ $prev = $line; -+ next line; -+ } -+ -+ # check for errors that might occur in comments and in code. -+ -+ # allow spaces to be used to draw pictures in all comments. -+ if (/[^ ] / && !/".* .*"/ && !$in_comment) { -+ err("spaces instead of tabs"); -+ } -+ if (/^ / && !/^ \*[ \t\/]/ && !/^ \*$/ && -+ (!/^ \w/ || $in_function != 0)) { -+ err("indent by spaces instead of tabs"); -+ } -+ if (/^\t+ [^ \t\*]/ || /^\t+ \S/ || /^\t+ \S/) { -+ err("continuation line not indented by 4 spaces"); -+ } -+ if (/$warlock_re/ && !/\*\//) { -+ $in_warlock_comment = 1; -+ $prev = $line; -+ next line; -+ } -+ if (/^\s*\/\*./ && !/^\s*\/\*.*\*\// && !/$hdr_comment_start/) { -+ err("improper first line of block comment"); -+ } -+ -+ if ($in_comment) { # still in comment, don't do further checks -+ $prev = $line; -+ next line; -+ } -+ -+ if ((/[^(]\/\*\S/ || /^\/\*\S/) && -+ !(/$lint_re/ || ($splint_comments && /$splint_re/))) { -+ err("missing blank after open comment"); -+ } -+ if (/\S\*\/[^)]|\S\*\/$/ && -+ !(/$lint_re/ || ($splint_comments && /$splint_re/))) { -+ err("missing blank before close comment"); -+ } -+ if (/\/\/\S/) { # C++ comments -+ err("missing blank after start comment"); -+ } -+ # check for unterminated single line comments, but allow them when -+ # they are used to comment out the argument list of a function -+ # declaration. -+ if (/\S.*\/\*/ && !/\S.*\/\*.*\*\// && !/\(\/\*/) { -+ err("unterminated single line comment"); -+ } -+ -+ if (/^(#else|#endif|#include)(.*)$/) { -+ $prev = $line; -+ if ($picky) { -+ my $directive = $1; -+ my $clause = $2; -+ # Enforce ANSI rules for #else and #endif: no noncomment -+ # identifiers are allowed after #endif or #else. Allow -+ # C++ comments since they seem to be a fact of life. -+ if ((($1 eq "#endif") || ($1 eq "#else")) && -+ ($clause ne "") && -+ (!($clause =~ /^\s+\/\*.*\*\/$/)) && -+ (!($clause =~ /^\s+\/\/.*$/))) { -+ err("non-comment text following " . -+ "$directive (or malformed $directive " . -+ "directive)"); -+ } -+ } -+ next line; -+ } -+ -+ # -+ # delete any comments and check everything else. Note that -+ # ".*?" is a non-greedy match, so that we don't get confused by -+ # multiple comments on the same line. -+ # -+ s/\/\*.*?\*\///g; -+ s/\/\/.*$//; # C++ comments -+ -+ # delete any trailing whitespace; we have already checked for that. -+ s/\s*$//; -+ -+ # following checks do not apply to text in comments. 
-+ -+ if (/[^<>\s][!<>=]=/ || /[^<>][!<>=]=[^\s,]/ || -+ (/[^->]>[^,=>\s]/ && !/[^->]>$/) || -+ (/[^<]<[^,=<\s]/ && !/[^<]<$/) || -+ /[^<\s]<[^<]/ || /[^->\s]>[^>]/) { -+ err("missing space around relational operator"); -+ } -+ if (/\S>>=/ || /\S<<=/ || />>=\S/ || /<<=\S/ || /\S[-+*\/&|^%]=/ || -+ (/[^-+*\/&|^%!<>=\s]=[^=]/ && !/[^-+*\/&|^%!<>=\s]=$/) || -+ (/[^!<>=]=[^=\s]/ && !/[^!<>=]=$/)) { -+ # XXX - should only check this for C++ code -+ # XXX - there are probably other forms that should be allowed -+ if (!/\soperator=/) { -+ err("missing space around assignment operator"); -+ } -+ } -+ if (/[,;]\S/ && !/\bfor \(;;\)/) { -+ err("comma or semicolon followed by non-blank"); -+ } -+ # allow "for" statements to have empty "while" clauses -+ if (/\s[,;]/ && !/^[\t]+;$/ && !/^\s*for \([^;]*; ;[^;]*\)/) { -+ err("comma or semicolon preceded by blank"); -+ } -+ if (/^\s*(&&|\|\|)/) { -+ err("improper boolean continuation"); -+ } -+ if (/\S *(&&|\|\|)/ || /(&&|\|\|) *\S/) { -+ err("more than one space around boolean operator"); -+ } -+ if (/\b(for|if|while|switch|sizeof|return|case)\(/) { -+ err("missing space between keyword and paren"); -+ } -+ if (/(\b(for|if|while|switch|return)\b.*){2,}/ && !/^#define/) { -+ # multiple "case" and "sizeof" allowed -+ err("more than one keyword on line"); -+ } -+ if (/\b(for|if|while|switch|sizeof|return|case)\s\s+\(/ && -+ !/^#if\s+\(/) { -+ err("extra space between keyword and paren"); -+ } -+ # try to detect "func (x)" but not "if (x)" or -+ # "#define foo (x)" or "int (*func)();" -+ if (/\w\s\(/) { -+ my $s = $_; -+ # strip off all keywords on the line -+ s/\b(for|if|while|switch|return|case|sizeof)\s\(/XXX(/g; -+ s/#elif\s\(/XXX(/g; -+ s/^#define\s+\w+\s+\(/XXX(/; -+ # do not match things like "void (*f)();" -+ # or "typedef void (func_t)();" -+ s/\w\s\(+\*/XXX(*/g; -+ s/\b($typename|void)\s+\(+/XXX(/og; -+ if (/\w\s\(/) { -+ err("extra space between function name and left paren"); -+ } -+ $_ = $s; -+ } -+ # try to detect "int foo(x)", but not "extern int foo(x);" -+ # XXX - this still trips over too many legitimate things, -+ # like "int foo(x,\n\ty);" -+# if (/^(\w+(\s|\*)+)+\w+\(/ && !/\)[;,](\s|)*$/ && -+# !/^(extern|static)\b/) { -+# err("return type of function not on separate line"); -+# } -+ # this is a close approximation -+ if (/^(\w+(\s|\*)+)+\w+\(.*\)(\s|)*$/ && -+ !/^(extern|static)\b/) { -+ err("return type of function not on separate line"); -+ } -+ if (/^#define /) { -+ err("#define followed by space instead of tab"); -+ } -+ if (/^\s*return\W[^;]*;/ && !/^\s*return\s*\(.*\);/) { -+ err("unparenthesized return expression"); -+ } -+ if (/\bsizeof\b/ && !/\bsizeof\s*\(.*\)/) { -+ err("unparenthesized sizeof expression"); -+ } -+ if (/\(\s/) { -+ err("whitespace after left paren"); -+ } -+ # allow "for" statements to have empty "continue" clauses -+ if (/\s\)/ && !/^\s*for \([^;]*;[^;]*; \)/) { -+ err("whitespace before right paren"); -+ } -+ if (/^\s*\(void\)[^ ]/) { -+ err("missing space after (void) cast"); -+ } -+ if (/\S{/ && !/{{/) { -+ err("missing space before left brace"); -+ } -+ if ($in_function && /^\s+{/ && -+ ($prev =~ /\)\s*$/ || $prev =~ /\bstruct\s+\w+$/)) { -+ err("left brace starting a line"); -+ } -+ if (/}(else|while)/) { -+ err("missing space after right brace"); -+ } -+ if (/}\s\s+(else|while)/) { -+ err("extra space after right brace"); -+ } -+ if (/\b_VOID\b|\bVOID\b|\bSTATIC\b/) { -+ err("obsolete use of VOID or STATIC"); -+ } -+ if (/\b$typename\*/o) { -+ err("missing space between type name and *"); -+ } -+ if 
(/^\s+#/) { -+ err("preprocessor statement not in column 1"); -+ } -+ if (/^#\s/) { -+ err("blank after preprocessor #"); -+ } -+ if (/!\s*(strcmp|strncmp|bcmp)\s*\(/) { -+ err("don't use boolean ! with comparison functions"); -+ } -+ -+ # -+ # We completely ignore, for purposes of indentation: -+ # * lines outside of functions -+ # * preprocessor lines -+ # -+ if ($check_continuation && $in_function && !$in_cpp) { -+ process_indent($_); -+ } -+ if ($picky) { -+ # try to detect spaces after casts, but allow (e.g.) -+ # "sizeof (int) + 1", "void (*funcptr)(int) = foo;", and -+ # "int foo(int) __NORETURN;" -+ if ((/^\($typename( \*+)?\)\s/o || -+ /\W\($typename( \*+)?\)\s/o) && -+ !/sizeof\s*\($typename( \*)?\)\s/o && -+ !/\($typename( \*+)?\)\s+=[^=]/o) { -+ err("space after cast"); -+ } -+ if (/\b$typename\s*\*\s/o && -+ !/\b$typename\s*\*\s+const\b/o) { -+ err("unary * followed by space"); -+ } -+ } -+ if ($check_posix_types) { -+ # try to detect old non-POSIX types. -+ # POSIX requires all non-standard typedefs to end in _t, -+ # but historically these have been used. -+ if (/\b(unchar|ushort|uint|ulong|u_int|u_short|u_long|u_char|quad)\b/) { -+ err("non-POSIX typedef $1 used: use $old2posix{$1} instead"); -+ } -+ } -+ if ($heuristic) { -+ # cannot check this everywhere due to "struct {\n...\n} foo;" -+ if ($in_function && !$in_declaration && -+ /}./ && !/}\s+=/ && !/{.*}[;,]$/ && !/}(\s|)*$/ && -+ !/} (else|while)/ && !/}}/) { -+ err("possible bad text following right brace"); -+ } -+ # cannot check this because sub-blocks in -+ # the middle of code are ok -+ if ($in_function && /^\s+{/) { -+ err("possible left brace starting a line"); -+ } -+ } -+ if (/^\s*else\W/) { -+ if ($prev =~ /^\s*}$/) { -+ err_prefix($prev, -+ "else and right brace should be on same line"); -+ } -+ } -+ $prev = $line; -+} -+ -+if ($prev eq "") { -+ err("last line in file is blank"); -+} -+ -+} -+ -+# -+# Continuation-line checking -+# -+# The rest of this file contains the code for the continuation checking -+# engine. It's a pretty simple state machine which tracks the expression -+# depth (unmatched '('s and '['s). -+# -+# Keep in mind that the argument to process_indent() has already been heavily -+# processed; all comments have been replaced by control-A, and the contents of -+# strings and character constants have been elided. -+# -+ -+my $cont_in; # currently inside of a continuation -+my $cont_off; # skipping an initializer or definition -+my $cont_noerr; # suppress cascading errors -+my $cont_start; # the line being continued -+my $cont_base; # the base indentation -+my $cont_first; # this is the first line of a statement -+my $cont_multiseg; # this continuation has multiple segments -+ -+my $cont_special; # this is a C statement (if, for, etc.) -+my $cont_macro; # this is a macro -+my $cont_case; # this is a multi-line case -+ -+my @cont_paren; # the stack of unmatched ( and [s we've seen -+ -+sub -+reset_indent() -+{ -+ $cont_in = 0; -+ $cont_off = 0; -+} -+ -+sub -+delabel($) -+{ -+ # -+ # replace labels with tabs. Note that there may be multiple -+ # labels on a line. 
-+ # -+ local $_ = $_[0]; -+ -+ while (/^(\t*)( *(?:(?:\w+\s*)|(?:case\b[^:]*)): *)(.*)$/) { -+ my ($pre_tabs, $label, $rest) = ($1, $2, $3); -+ $_ = $pre_tabs; -+ while ($label =~ s/^([^\t]*)(\t+)//) { -+ $_ .= "\t" x (length($2) + length($1) / 8); -+ } -+ $_ .= ("\t" x (length($label) / 8)).$rest; -+ } -+ -+ return ($_); -+} -+ -+sub -+process_indent($) -+{ -+ require strict; -+ local $_ = $_[0]; # preserve the global $_ -+ -+ s///g; # No comments -+ s/\s+$//; # Strip trailing whitespace -+ -+ return if (/^$/); # skip empty lines -+ -+ # regexps used below; keywords taking (), macros, and continued cases -+ my $special = '(?:(?:\}\s*)?else\s+)?(?:if|for|while|switch)\b'; -+ my $macro = '[A-Z_][A-Z_0-9]*\('; -+ my $case = 'case\b[^:]*$'; -+ -+ # skip over enumerations, array definitions, initializers, etc. -+ if ($cont_off <= 0 && !/^\s*$special/ && -+ (/(?:(?:\b(?:enum|struct|union)\s*[^\{]*)|(?:\s+=\s*)){/ || -+ (/^\s*{/ && $prev =~ /=\s*(?:\/\*.*\*\/\s*)*$/))) { -+ $cont_in = 0; -+ $cont_off = tr/{/{/ - tr/}/}/; -+ return; -+ } -+ if ($cont_off) { -+ $cont_off += tr/{/{/ - tr/}/}/; -+ return; -+ } -+ -+ if (!$cont_in) { -+ $cont_start = $line; -+ -+ if (/^\t* /) { -+ err("non-continuation indented 4 spaces"); -+ $cont_noerr = 1; # stop reporting -+ } -+ $_ = delabel($_); # replace labels with tabs -+ -+ # check if the statement is complete -+ return if (/^\s*\}?$/); -+ return if (/^\s*\}?\s*else\s*\{?$/); -+ return if (/^\s*do\s*\{?$/); -+ return if (/{$/); -+ return if (/}[,;]?$/); -+ -+ # Allow macros on their own lines -+ return if (/^\s*[A-Z_][A-Z_0-9]*$/); -+ -+ # cases we don't deal with, generally non-kosher -+ if (/{/) { -+ err("stuff after {"); -+ return; -+ } -+ -+ # Get the base line, and set up the state machine -+ /^(\t*)/; -+ $cont_base = $1; -+ $cont_in = 1; -+ @cont_paren = (); -+ $cont_first = 1; -+ $cont_multiseg = 0; -+ -+ # certain things need special processing -+ $cont_special = /^\s*$special/? 1 : 0; -+ $cont_macro = /^\s*$macro/? 1 : 0; -+ $cont_case = /^\s*$case/? 1 : 0; -+ } else { -+ $cont_first = 0; -+ -+ # Strings may be pulled back to an earlier (half-)tabstop -+ unless ($cont_noerr || /^$cont_base / || -+ (/^\t*(?: )?(?:gettext\()?\"/ && !/^$cont_base\t/)) { -+ err_prefix($cont_start, -+ "continuation should be indented 4 spaces"); -+ } -+ } -+ -+ my $rest = $_; # keeps the remainder of the line -+ -+ # -+ # The split matches 0 characters, so that each 'special' character -+ # is processed separately. Parens and brackets are pushed and -+ # popped off the @cont_paren stack. For normal processing, we wait -+ # until a ; or { terminates the statement. "special" processing -+ # (if/for/while/switch) is allowed to stop when the stack empties, -+ # as is macro processing. Case statements are terminated with a : -+ # and an empty paren stack. -+ # -+ foreach $_ (split /[^\(\)\[\]\{\}\;\:]*/) { -+ next if (length($_) == 0); -+ -+ # rest contains the remainder of the line -+ my $rxp = "[^\Q$_\E]*\Q$_\E"; -+ $rest =~ s/^$rxp//; -+ -+ if (/\(/ || /\[/) { -+ push @cont_paren, $_; -+ } elsif (/\)/ || /\]/) { -+ my $cur = $_; -+ tr/\)\]/\(\[/; -+ -+ my $old = (pop @cont_paren); -+ if (!defined($old)) { -+ err("unexpected '$cur'"); -+ $cont_in = 0; -+ last; -+ } elsif ($old ne $_) { -+ err("'$cur' mismatched with '$old'"); -+ $cont_in = 0; -+ last; -+ } -+ -+ # -+ # If the stack is now empty, do special processing -+ # for if/for/while/switch and macro statements. 
-+ # -+ next if (@cont_paren != 0); -+ if ($cont_special) { -+ if ($rest =~ /^\s*{?$/) { -+ $cont_in = 0; -+ last; -+ } -+ if ($rest =~ /^\s*;$/) { -+ err("empty if/for/while body ". -+ "not on its own line"); -+ $cont_in = 0; -+ last; -+ } -+ if (!$cont_first && $cont_multiseg == 1) { -+ err_prefix($cont_start, -+ "multiple statements continued ". -+ "over multiple lines"); -+ $cont_multiseg = 2; -+ } elsif ($cont_multiseg == 0) { -+ $cont_multiseg = 1; -+ } -+ # We've finished this section, start -+ # processing the next. -+ goto section_ended; -+ } -+ if ($cont_macro) { -+ if ($rest =~ /^$/) { -+ $cont_in = 0; -+ last; -+ } -+ } -+ } elsif (/\;/) { -+ if ($cont_case) { -+ err("unexpected ;"); -+ } elsif (!$cont_special) { -+ err("unexpected ;") if (@cont_paren != 0); -+ if (!$cont_first && $cont_multiseg == 1) { -+ err_prefix($cont_start, -+ "multiple statements continued ". -+ "over multiple lines"); -+ $cont_multiseg = 2; -+ } elsif ($cont_multiseg == 0) { -+ $cont_multiseg = 1; -+ } -+ if ($rest =~ /^$/) { -+ $cont_in = 0; -+ last; -+ } -+ if ($rest =~ /^\s*special/) { -+ err("if/for/while/switch not started ". -+ "on its own line"); -+ } -+ goto section_ended; -+ } -+ } elsif (/\{/) { -+ err("{ while in parens/brackets") if (@cont_paren != 0); -+ err("stuff after {") if ($rest =~ /[^\s}]/); -+ $cont_in = 0; -+ last; -+ } elsif (/\}/) { -+ err("} while in parens/brackets") if (@cont_paren != 0); -+ if (!$cont_special && $rest !~ /^\s*(while|else)\b/) { -+ if ($rest =~ /^$/) { -+ err("unexpected }"); -+ } else { -+ err("stuff after }"); -+ } -+ $cont_in = 0; -+ last; -+ } -+ } elsif (/\:/ && $cont_case && @cont_paren == 0) { -+ err("stuff after multi-line case") if ($rest !~ /$^/); -+ $cont_in = 0; -+ last; -+ } -+ next; -+section_ended: -+ # End of a statement or if/while/for loop. Reset -+ # cont_special and cont_macro based on the rest of the -+ # line. -+ $cont_special = ($rest =~ /^\s*$special/)? 1 : 0; -+ $cont_macro = ($rest =~ /^\s*$macro/)? 1 : 0; -+ $cont_case = 0; -+ next; -+ } -+ $cont_noerr = 0 if (!$cont_in); -+} -diff --git a/scripts/zconfig.sh b/scripts/zconfig.sh -index 281166c..d6695be 100755 ---- a/scripts/zconfig.sh -+++ b/scripts/zconfig.sh -@@ -405,10 +405,12 @@ test_7() { - -- # Mount the ext2 filesystem and copy some data to it. -- mkdir -p /tmp/${ZVOL_NAME}-part1 || fail 6 -- mount /dev/zvol/${FULL_ZVOL_NAME}-part1 /tmp/${ZVOL_NAME}-part1 \ -- || fail 7 -+ # Snapshot the pristine ext2 filesystem. -+ ${ZFS} snapshot ${FULL_SNAP_NAME} || fail 6 -+ wait_udev /dev/zvol/${FULL_SNAP_NAME}-part1 30 || fail 7 - -- # Snapshot the pristine ext2 filesystem and mount it read-only. -- ${ZFS} snapshot ${FULL_SNAP_NAME} || fail 8 -- wait_udev /dev/zvol/${FULL_SNAP_NAME}-part1 30 || fail 8 -+ # Mount the ext2 filesystem so some data can be copied to it. -+ mkdir -p /tmp/${ZVOL_NAME}-part1 || fail 7 -+ mount /dev/zvol/${FULL_ZVOL_NAME}-part1 \ -+ /tmp/${ZVOL_NAME}-part1 || fail 8 -+ -+ # Mount the pristine ext2 snapshot. - mkdir -p /tmp/${SNAP_NAME}-part1 || fail 9 -@@ -498,7 +500,10 @@ test_8() { - cp -RL ${SRC_DIR} /tmp/${FULL_ZVOL_NAME1}-part1 || fail 8 -- sync || fail 9 - -- # Snapshot the ext2 filesystem so it may be sent. -- ${ZFS} snapshot ${FULL_SNAP_NAME1} || fail 11 -- wait_udev /dev/zvol/${FULL_SNAP_NAME1} 30 || fail 11 -+ # Unmount, snapshot, mount the ext2 filesystem so it may be sent. -+ # We only unmount to ensure the ext2 filesystem is clean. 
-+ umount /tmp/${FULL_ZVOL_NAME1}-part1 || fail 9 -+ ${ZFS} snapshot ${FULL_SNAP_NAME1} || fail 10 -+ wait_udev /dev/zvol/${FULL_SNAP_NAME1} 30 || fail 10 -+ mount /dev/zvol/${FULL_ZVOL_NAME1}-part1 \ -+ /tmp/${FULL_ZVOL_NAME1}-part1 || 11 - -@@ -551,2 +556,3 @@ test_9() { - ${ZFS} create -V 300M ${FULL_NAME} || fail 3 -+ udev_trigger - -diff --git a/scripts/zfs-images b/scripts/zfs-images -new file mode 160000 -index 0000000..3331601 ---- /dev/null -+++ b/scripts/zfs-images -@@ -0,0 +1 @@ -+Subproject commit 3331601f6dc50ef2c9779c1656218701b48b276c -diff --git a/scripts/zfs.sh b/scripts/zfs.sh -index f44053e..b97a057 100755 ---- a/scripts/zfs.sh -+++ b/scripts/zfs.sh -@@ -67,2 +67,3 @@ fi - if [ ${UNLOAD} ]; then -+ kill_zed - umount -t zfs -a -@@ -73,4 +74,4 @@ else - check_modules || die "${ERROR}" -- load_modules "$@" -- wait_udev /dev/zfs 30 -+ load_modules "$@" || die "Failed to load modules" -+ wait_udev /dev/zfs 30 || die "'/dev/zfs' was not created" - fi -diff --git a/scripts/zfs2zol-patch.sed b/scripts/zfs2zol-patch.sed -new file mode 100755 -index 0000000..3a7280f ---- /dev/null -+++ b/scripts/zfs2zol-patch.sed -@@ -0,0 +1,15 @@ -+#!/bin/sed -f -+ -+s:usr/src/uts/common/fs/zfs/sys:include/sys:g -+s:usr/src/uts/common/fs/zfs:module/zfs:g -+s:usr/src/lib/libzpool:lib/libzpool:g -+s:usr/src/cmd:cmd:g -+s:usr/src/common/nvpair:module/nvpair:g -+s:usr/src/lib/libzfs/common/libzfs.h:include/libzfs.h:g -+s:usr/src/man/man1m/zfs.1m:man/man8/zfs.8:g -+s:usr/src/uts/common/sys:include/sys:g -+s:usr/src/lib/libzfs_core/common/libzfs_core.h:include/libzfs_core.h:g -+s:usr/src/lib/libzfs/common:lib/libzfs:g -+s:usr/src/lib/libzfs_core/common:lib/libzfs_core:g -+s:lib/libzpool/common/sys:include/sys:g -+s:lib/libzpool/common:lib/libzpool:g -diff --git a/scripts/zimport.sh b/scripts/zimport.sh -new file mode 100755 -index 0000000..8a6cdf0 ---- /dev/null -+++ b/scripts/zimport.sh -@@ -0,0 +1,495 @@ -+#!/bin/bash -+# -+# Verify that an assortment of known good reference pools can be imported -+# using different versions of the ZoL code. -+# -+# By default references pools for the major ZFS implementation will be -+# checked against the most recent ZoL tags and the master development branch. -+# Alternate tags or branches may be verified with the '-s option. -+# Passing the keyword "installed" will instruct the script to test whatever -+# version is installed. -+# -+# Preferentially a reference pool is used for all tests. However, if one -+# does not exist and the pool-tag matches one of the src-tags then a new -+# reference pool will be created using binaries from that source build. -+# This is particularly useful when you need to test your changes before -+# opening a pull request. The keyword 'all' can be used as short hand -+# refer to all available reference pools. -+# -+# New reference pools may be added by placing a bzip2 compressed tarball -+# of the pool in the scripts/zfs-images directory and then passing -+# the -p option. To increase the test coverage reference pools -+# should be collected for all the major ZFS implementations. Having these -+# pools easily available is also helpful to the developers. -+# -+# Care should be taken to run these tests with a kernel supported by all -+# the listed tags. Otherwise build failure will cause false positives. -+# -+# -+# EXAMPLES: -+# -+# The following example will verify the zfs-0.6.2 tag, the master branch, -+# and the installed zfs version can correctly import the listed pools. 
-+# Note there is no reference pool available for master and installed but -+# because binaries are available one is automatically constructed. The -+# working directory is also preserved between runs (-k) preventing the -+# need to rebuild from source for multiple runs. -+# -+# zimport.sh -k -f /var/tmp/zimport \ -+# -s "zfs-0.6.2 master installed" \ -+# -p "zevo-1.1.1 zol-0.6.2 zol-0.6.2-173 master installed" -+# -+# --------------------- ZFS on Linux Source Versions -------------- -+# zfs-0.6.2 master 0.6.2-175_g36eb554 -+# ----------------------------------------------------------------- -+# Clone SPL Local Local Skip -+# Clone ZFS Local Local Skip -+# Build SPL Pass Pass Skip -+# Build ZFS Pass Pass Skip -+# ----------------------------------------------------------------- -+# zevo-1.1.1 Pass Pass Pass -+# zol-0.6.2 Pass Pass Pass -+# zol-0.6.2-173 Fail Pass Pass -+# master Pass Pass Pass -+# installed Pass Pass Pass -+# -+basedir="$(dirname $0)" -+ -+SCRIPT_COMMON=common.sh -+if [ -f "${basedir}/${SCRIPT_COMMON}" ]; then -+. "${basedir}/${SCRIPT_COMMON}" -+else -+echo "Missing helper script ${SCRIPT_COMMON}" && exit 1 -+fi -+ -+PROG=zimport.sh -+ -+SRC_TAGS="zfs-0.6.1 zfs-0.6.2 master" -+POOL_TAGS="all master" -+TEST_DIR=`mktemp -u -d -p /var/tmp zimport.XXXXXXXX` -+KEEP=0 -+VERBOSE=0 -+COLOR=1 -+REPO="https://github.com/zfsonlinux" -+IMAGES_DIR="$SCRIPTDIR/zfs-images/" -+IMAGES_TAR="https://github.com/zfsonlinux/zfs-images/tarball/master" -+CPUS=`grep -c ^processor /proc/cpuinfo` -+ERROR=0 -+ -+usage() { -+cat << EOF -+USAGE: -+zimport.sh [hvl] [-r repo] [-s src-tag] [-i pool-dir] [-p pool-tag] [-f path] -+ -+DESCRIPTION: -+ ZPOOL import verification tests -+ -+OPTIONS: -+ -h Show this message -+ -v Verbose -+ -c No color -+ -k Keep temporary directory -+ -r Source repository ($REPO) -+ -s ... Verify ZoL versions with the listed tags -+ -i Pool image directory -+ -p ... Verify pools created with the listed tags -+ -f Temporary directory to use -+ -+EOF -+} -+ -+while getopts 'hvckr:s:i:p:f:?' OPTION; do -+ case $OPTION in -+ h) -+ usage -+ exit 1 -+ ;; -+ v) -+ VERBOSE=1 -+ ;; -+ c) -+ COLOR=0 -+ ;; -+ k) -+ KEEP=1 -+ ;; -+ r) -+ REPO="$OPTARG" -+ ;; -+ s) -+ SRC_TAGS="$OPTARG" -+ ;; -+ i) -+ IMAGES_DIR="$OPTARG" -+ ;; -+ p) -+ POOL_TAGS="$OPTARG" -+ ;; -+ f) -+ TEST_DIR="$OPTARG" -+ ;; -+ ?) -+ usage -+ exit -+ ;; -+ esac -+done -+ -+# Initialize the test suite -+init -+check_modules || die "ZFS modules must be unloaded" -+ -+SRC_DIR="$TEST_DIR/src" -+SRC_DIR_SPL="$SRC_DIR/spl" -+SRC_DIR_ZFS="$SRC_DIR/zfs" -+ -+if [ $COLOR -eq 0 ]; then -+ COLOR_GREEN="" -+ COLOR_BROWN="" -+ COLOR_RED="" -+ COLOR_RESET="" -+fi -+ -+pass_nonewline() { -+ echo -n -e "${COLOR_GREEN}Pass${COLOR_RESET}\t\t" -+} -+ -+skip_nonewline() { -+ echo -n -e "${COLOR_BROWN}Skip${COLOR_RESET}\t\t" -+} -+ -+fail_nonewline() { -+ echo -n -e "${COLOR_RED}Fail${COLOR_RESET}\t\t" -+} -+ -+# -+# Set several helper variables which are derived from a source tag. -+# -+# SPL_TAG - The tag zfs-x.y.z is translated to spl-x.y.z. -+# SPL_DIR - The spl directory name. -+# SPL_URL - The spl github URL to fetch the tarball. 
-+# ZFS_TAG - The passed zfs-x.y.z tag -+# ZFS_DIR - The zfs directory name -+# ZFS_URL - The zfs github URL to fetch the tarball -+# -+src_set_vars() { -+ local TAG=$1 -+ -+ SPL_TAG=`echo $TAG | sed -e 's/zfs/spl/'` -+ SPL_DIR=$SRC_DIR_SPL/$SPL_TAG -+ SPL_URL=$REPO/spl/tarball/$SPL_TAG -+ -+ ZFS_TAG=$TAG -+ ZFS_DIR=$SRC_DIR_ZFS/$ZFS_TAG -+ ZFS_URL=$REPO/zfs/tarball/$ZFS_TAG -+ -+ if [ "$TAG" = "installed" ]; then -+ ZPOOL_CMD=`which zpool` -+ ZFS_CMD=`which zfs` -+ ZFS_SH="/usr/share/zfs/zfs.sh" -+ ZPOOL_CREATE="/usr/share/zfs/zpool-create.sh" -+ else -+ ZPOOL_CMD="./cmd/zpool/zpool" -+ ZFS_CMD="./cmd/zfs/zfs" -+ ZFS_SH="./scripts/zfs.sh" -+ ZPOOL_CREATE="./scripts/zpool-create.sh" -+ fi -+} -+ -+# -+# Set several helper variables which are derived from a pool name such -+# as zol-0.6.x, zevo-1.1.1, etc. These refer to example pools from various -+# ZFS implementations which are used to verify compatibility. -+# -+# POOL_TAG - The example pools name in scripts/zfs-images/. -+# POOL_BZIP - The full path to the example bzip2 compressed pool. -+# POOL_DIR - The top level test path for this pool. -+# POOL_DIR_PRISTINE - The directory containing a pristine version of the pool. -+# POOL_DIR_COPY - The directory containing a working copy of the pool. -+# POOL_DIR_SRC - Location of a source build if it exists for this pool. -+# -+pool_set_vars() { -+ local TAG=$1 -+ -+ POOL_TAG=$TAG -+ POOL_BZIP=$IMAGES_DIR/$POOL_TAG.tar.bz2 -+ POOL_DIR=$TEST_DIR/pools/$POOL_TAG -+ POOL_DIR_PRISTINE=$POOL_DIR/pristine -+ POOL_DIR_COPY=$POOL_DIR/copy -+ POOL_DIR_SRC=`echo -n "$SRC_DIR_ZFS/"; \ -+ echo "$POOL_TAG" | sed -e 's/zol/zfs/'` -+} -+ -+# -+# Construct a non-trivial pool given a specific version of the source. More -+# interesting pools provide better test coverage so this function should -+# extended as needed to create more realistic pools. -+# -+pool_create() { -+ pool_set_vars $1 -+ src_set_vars $1 -+ -+ if [ "$POOL_TAG" != "installed" ]; then -+ cd $POOL_DIR_SRC -+ fi -+ -+ $ZFS_SH zfs="spa_config_path=$POOL_DIR_PRISTINE" || fail -+ -+ # Create a file vdev RAIDZ pool. -+ FILEDIR="$POOL_DIR_PRISTINE" $ZPOOL_CREATE \ -+ -c file-raidz -p $POOL_TAG -v >/dev/null || fail -+ -+ # Create a pool/fs filesystem with some random contents. -+ $ZFS_CMD create $POOL_TAG/fs || fail -+ populate /$POOL_TAG/fs/ 10 100 -+ -+ # Snapshot that filesystem, clone it, remove the files/dirs, -+ # replace them with new files/dirs. -+ $ZFS_CMD snap $POOL_TAG/fs@snap || fail -+ $ZFS_CMD clone $POOL_TAG/fs@snap $POOL_TAG/clone || fail -+ rm -Rf /$POOL_TAG/clone/* || fail -+ populate /$POOL_TAG/clone/ 10 100 -+ -+ # Scrub the pool, delay slightly, then export it. It is now -+ # somewhat interesting for testing purposes. -+ $ZPOOL_CMD scrub $POOL_TAG || fail -+ sleep 10 -+ $ZPOOL_CMD export $POOL_TAG || fail -+ -+ $ZFS_SH -u || fail -+} -+ -+# If the zfs-images directory doesn't exist fetch a copy from Github then -+# cache it in the $TEST_DIR and update $IMAGES_DIR. -+if [ ! -d $IMAGES_DIR ]; then -+ IMAGES_DIR="$TEST_DIR/zfs-images" -+ mkdir -p $IMAGES_DIR -+ curl -sL $IMAGES_TAR | \ -+ tar -xz -C $IMAGES_DIR --strip-components=1 || fail -+fi -+ -+# Given the available images in the zfs-images directory substitute the -+# list of available images for the reserved keywork 'all'. 
-+for TAG in $POOL_TAGS; do -+ -+ if [ "$TAG" = "all" ]; then -+ ALL_TAGS=`ls $IMAGES_DIR | grep "tar.bz2" | \ -+ sed 's/.tar.bz2//' | tr '\n' ' '` -+ NEW_TAGS="$NEW_TAGS $ALL_TAGS" -+ else -+ NEW_TAGS="$NEW_TAGS $TAG" -+ fi -+done -+POOL_TAGS="$NEW_TAGS" -+ -+if [ $VERBOSE -ne 0 ]; then -+ echo "---------------------------- Options ----------------------------" -+ echo "VERBOSE=$VERBOSE" -+ echo "KEEP=$KEEP" -+ echo "REPO=$REPO" -+ echo "SRC_TAGS="$SRC_TAGS"" -+ echo "POOL_TAGS="$POOL_TAGS"" -+ echo "PATH=$TEST_DIR" -+ echo -+fi -+ -+if [ ! -d $TEST_DIR ]; then -+ mkdir -p $TEST_DIR -+fi -+ -+# Print a header for all tags which are being tested. -+echo "--------------------- ZFS on Linux Source Versions --------------" -+printf "%-16s" " " -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ "$TAG" = "installed" ]; then -+ ZFS_VERSION=`modinfo zfs | awk '/version:/ { print $2; exit }'` -+ if [ -n "$ZFS_VERSION" ]; then -+ printf "%-16s" $ZFS_VERSION -+ else -+ echo "ZFS is not installed\n" -+ fail -+ fi -+ else -+ printf "%-16s" $TAG -+ fi -+done -+echo -e "\n-----------------------------------------------------------------" -+ -+# -+# Attempt to generate the tarball from your local git repository, if that -+# fails then attempt to download the tarball from Github. -+# -+printf "%-16s" "Clone SPL" -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ -d $SPL_DIR ]; then -+ skip_nonewline -+ elif [ "$SPL_TAG" = "installed" ]; then -+ skip_nonewline -+ else -+ cd $SPLSRC -+ -+ if [ ! -d $SRC_DIR_SPL ]; then -+ mkdir -p $SRC_DIR_SPL -+ fi -+ -+ git archive --format=tar --prefix=$SPL_TAG/ $SPL_TAG \ -+ -o $SRC_DIR_SPL/$SPL_TAG.tar &>/dev/nul || \ -+ rm $SRC_DIR_SPL/$SPL_TAG.tar -+ if [ -s $SRC_DIR_SPL/$SPL_TAG.tar ]; then -+ tar -xf $SRC_DIR_SPL/$SPL_TAG.tar -C $SRC_DIR_SPL -+ rm $SRC_DIR_SPL/$SPL_TAG.tar -+ echo -n -e "${COLOR_GREEN}Local${COLOR_RESET}\t\t" -+ else -+ mkdir -p $SPL_DIR || fail -+ curl -sL $SPL_URL | tar -xz -C $SPL_DIR \ -+ --strip-components=1 || fail -+ echo -n -e "${COLOR_GREEN}Remote${COLOR_RESET}\t\t" -+ fi -+ fi -+done -+printf "\n" -+ -+# -+# Attempt to generate the tarball from your local git repository, if that -+# fails then attempt to download the tarball from Github. -+# -+printf "%-16s" "Clone ZFS" -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ -d $ZFS_DIR ]; then -+ skip_nonewline -+ elif [ "$ZFS_TAG" = "installed" ]; then -+ skip_nonewline -+ else -+ cd $SRCDIR -+ -+ if [ ! 
-d $SRC_DIR_ZFS ]; then -+ mkdir -p $SRC_DIR_ZFS -+ fi -+ -+ git archive --format=tar --prefix=$ZFS_TAG/ $ZFS_TAG \ -+ -o $SRC_DIR_ZFS/$ZFS_TAG.tar &>/dev/nul || \ -+ rm $SRC_DIR_ZFS/$ZFS_TAG.tar -+ if [ -s $SRC_DIR_ZFS/$ZFS_TAG.tar ]; then -+ tar -xf $SRC_DIR_ZFS/$ZFS_TAG.tar -C $SRC_DIR_ZFS -+ rm $SRC_DIR_ZFS/$ZFS_TAG.tar -+ echo -n -e "${COLOR_GREEN}Local${COLOR_RESET}\t\t" -+ else -+ mkdir -p $ZFS_DIR || fail -+ curl -sL $ZFS_URL | tar -xz -C $ZFS_DIR \ -+ --strip-components=1 || fail -+ echo -n -e "${COLOR_GREEN}Remote${COLOR_RESET}\t\t" -+ fi -+ fi -+done -+printf "\n" -+ -+# Build the listed tags -+printf "%-16s" "Build SPL" -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ -f $SPL_DIR/module/spl/spl.ko ]; then -+ skip_nonewline -+ elif [ "$SPL_TAG" = "installed" ]; then -+ skip_nonewline -+ else -+ cd $SPL_DIR -+ make distclean &>/dev/null -+ sh ./autogen.sh &>/dev/null || fail -+ ./configure &>/dev/null || fail -+ make -s -j$CPUS &>/dev/null || fail -+ pass_nonewline -+ fi -+done -+printf "\n" -+ -+# Build the listed tags -+printf "%-16s" "Build ZFS" -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ -f $ZFS_DIR/module/zfs/zfs.ko ]; then -+ skip_nonewline -+ elif [ "$ZFS_TAG" = "installed" ]; then -+ skip_nonewline -+ else -+ cd $ZFS_DIR -+ make distclean &>/dev/null -+ sh ./autogen.sh &>/dev/null || fail -+ ./configure --with-spl=$SPL_DIR &>/dev/null || fail -+ make -s -j$CPUS &>/dev/null || fail -+ pass_nonewline -+ fi -+done -+printf "\n" -+echo "-----------------------------------------------------------------" -+ -+# Either create a new pool using 'zpool create', or alternately restore an -+# existing pool from another ZFS implementation for compatibility testing. -+for TAG in $POOL_TAGS; do -+ pool_set_vars $TAG -+ SKIP=0 -+ -+ printf "%-16s" $POOL_TAG -+ rm -Rf $POOL_DIR -+ mkdir -p $POOL_DIR_PRISTINE -+ -+ # Use the existing compressed image if available. -+ if [ -f $POOL_BZIP ]; then -+ tar -xjf $POOL_BZIP -C $POOL_DIR_PRISTINE \ -+ --strip-components=1 || fail -+ # Use the installed version to create the pool. -+ elif [ "$TAG" = "installed" ]; then -+ pool_create $TAG -+ # A source build is available to create the pool. -+ elif [ -d $POOL_DIR_SRC ]; then -+ pool_create $TAG -+ else -+ SKIP=1 -+ fi -+ -+ # Verify 'zpool import' works for all listed source versions. -+ for TAG in $SRC_TAGS; do -+ -+ if [ $SKIP -eq 1 ]; then -+ skip_nonewline -+ continue -+ fi -+ -+ src_set_vars $TAG -+ if [ "$TAG" != "installed" ]; then -+ cd $ZFS_DIR -+ fi -+ $ZFS_SH zfs="spa_config_path=$POOL_DIR_COPY" -+ -+ cp -a --sparse=always $POOL_DIR_PRISTINE $POOL_DIR_COPY || fail -+ POOL_NAME=`$ZPOOL_CMD import -d $POOL_DIR_COPY | \ -+ awk '/pool:/ { print $2; exit 0 }'` -+ -+ $ZPOOL_CMD import -N -d $POOL_DIR_COPY $POOL_NAME &>/dev/null -+ if [ $? -ne 0 ]; then -+ fail_nonewline -+ ERROR=1 -+ else -+ $ZPOOL_CMD export $POOL_NAME || fail -+ pass_nonewline -+ fi -+ -+ rm -Rf $POOL_DIR_COPY -+ -+ $ZFS_SH -u || fail -+ done -+ printf "\n" -+done -+ -+if [ ! 
$KEEP ]; then -+ rm -Rf $TEST_DIR -+fi -+ -+exit $ERROR -diff --git a/scripts/zpool-config/file-raid0.sh b/scripts/zpool-config/file-raid0.sh -index 5ec80b0..ff11836 100644 ---- a/scripts/zpool-config/file-raid0.sh -+++ b/scripts/zpool-config/file-raid0.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - -diff --git a/scripts/zpool-config/file-raid10.sh b/scripts/zpool-config/file-raid10.sh -index ae7f0ae..fa297b4 100644 ---- a/scripts/zpool-config/file-raid10.sh -+++ b/scripts/zpool-config/file-raid10.sh -@@ -5,6 +5,5 @@ - --FILES_M1="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1" --FILES_M2="/tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES_M1=${FILES_M1:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1"} -+FILES_M2=${FILES_M2:-"$FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - FILES="${FILES_M1} ${FILES_M2}" -diff --git a/scripts/zpool-config/file-raidz.sh b/scripts/zpool-config/file-raidz.sh -index 5b6c3ea..768e3de 100644 ---- a/scripts/zpool-config/file-raidz.sh -+++ b/scripts/zpool-config/file-raidz.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - -diff --git a/scripts/zpool-config/file-raidz2.sh b/scripts/zpool-config/file-raidz2.sh -index bc0e5ec..b1c18f4 100644 ---- a/scripts/zpool-config/file-raidz2.sh -+++ b/scripts/zpool-config/file-raidz2.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - -diff --git a/scripts/zpool-config/lo-faulty-raid0.sh b/scripts/zpool-config/lo-faulty-raid0.sh -index 10b8f88..bf057bb 100644 ---- a/scripts/zpool-config/lo-faulty-raid0.sh -+++ b/scripts/zpool-config/lo-faulty-raid0.sh -@@ -11,6 +11,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - LODEVICES="" -diff --git a/scripts/zpool-config/lo-faulty-raid10.sh b/scripts/zpool-config/lo-faulty-raid10.sh -index ef81abb..0a3720a 100644 ---- a/scripts/zpool-config/lo-faulty-raid10.sh -+++ b/scripts/zpool-config/lo-faulty-raid10.sh -@@ -11,6 +11,6 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES_M1=${FILES_M1:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1"} -+FILES_M2=${FILES_M2:-"$FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} -+FILES="${FILES_M1} ${FILES_M2}" - LODEVICES="" -diff --git a/scripts/zpool-config/lo-faulty-raidz.sh b/scripts/zpool-config/lo-faulty-raidz.sh -index 2f1f08a..07fd145 100644 ---- a/scripts/zpool-config/lo-faulty-raidz.sh -+++ b/scripts/zpool-config/lo-faulty-raidz.sh -@@ -11,6 +11,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - LODEVICES="" -diff --git a/scripts/zpool-config/lo-faulty-raidz2.sh 
b/scripts/zpool-config/lo-faulty-raidz2.sh -index 2522fa7..4456a56 100644 ---- a/scripts/zpool-config/lo-faulty-raidz2.sh -+++ b/scripts/zpool-config/lo-faulty-raidz2.sh -@@ -11,6 +11,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - LODEVICES="" -diff --git a/scripts/zpool-config/lo-raid0.sh b/scripts/zpool-config/lo-raid0.sh -index f24050f..1f23fe1 100644 ---- a/scripts/zpool-config/lo-raid0.sh -+++ b/scripts/zpool-config/lo-raid0.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - DEVICES="" -diff --git a/scripts/zpool-config/lo-raid10.sh b/scripts/zpool-config/lo-raid10.sh -index f9fe3c0..18c1dcb 100644 ---- a/scripts/zpool-config/lo-raid10.sh -+++ b/scripts/zpool-config/lo-raid10.sh -@@ -5,6 +5,5 @@ - --FILES_M1="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1" --FILES_M2="/tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES_M1=${FILES_M1:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1"} -+FILES_M2=${FILES_M2:-"$FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - FILES="${FILES_M1} ${FILES_M2}" -diff --git a/scripts/zpool-config/lo-raidz.sh b/scripts/zpool-config/lo-raidz.sh -index db5de7c..483baf7 100644 ---- a/scripts/zpool-config/lo-raidz.sh -+++ b/scripts/zpool-config/lo-raidz.sh -@@ -4,7 +4,5 @@ - # -- --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - DEVICES="" -diff --git a/scripts/zpool-config/lo-raidz2.sh b/scripts/zpool-config/lo-raidz2.sh -index 53a032e..ea52236 100644 ---- a/scripts/zpool-config/lo-raidz2.sh -+++ b/scripts/zpool-config/lo-raidz2.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - DEVICES="" -diff --git a/scripts/zpool-config/scsi_debug-raid0.sh b/scripts/zpool-config/scsi_debug-raid0.sh -index 797ea80..fc09798 100644 ---- a/scripts/zpool-config/scsi_debug-raid0.sh -+++ b/scripts/zpool-config/scsi_debug-raid0.sh -@@ -11,5 +11,4 @@ SDLUNS=${SDLUNS:-1} - LDMOD=/sbin/modprobe --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} - DEVICES="" -diff --git a/scripts/zpool-config/scsi_debug-raid10.sh b/scripts/zpool-config/scsi_debug-raid10.sh -index 4ec205b..3c1f733 100644 ---- a/scripts/zpool-config/scsi_debug-raid10.sh -+++ b/scripts/zpool-config/scsi_debug-raid10.sh -@@ -11,5 +11,4 @@ SDLUNS=${SDLUNS:-1} - LDMOD=/sbin/modprobe --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} - DEVICES_M1="" -diff --git a/scripts/zpool-config/scsi_debug-raidz.sh b/scripts/zpool-config/scsi_debug-raidz.sh -index c811a01..54a4565 100644 ---- a/scripts/zpool-config/scsi_debug-raidz.sh -+++ b/scripts/zpool-config/scsi_debug-raidz.sh -@@ -11,5 +11,4 @@ 
SDLUNS=${SDLUNS:-1} - LDMOD=/sbin/modprobe --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} - DEVICES="" -diff --git a/scripts/zpool-config/scsi_debug-raidz2.sh b/scripts/zpool-config/scsi_debug-raidz2.sh -index 429a841..fa6e77a 100644 ---- a/scripts/zpool-config/scsi_debug-raidz2.sh -+++ b/scripts/zpool-config/scsi_debug-raidz2.sh -@@ -11,5 +11,4 @@ SDLUNS=${SDLUNS:-1} - LDMOD=/sbin/modprobe --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} - DEVICES="" -diff --git a/zfs-script-config.sh.in b/zfs-script-config.sh.in -index ba676c8..10d24f0 100644 ---- a/zfs-script-config.sh.in -+++ b/zfs-script-config.sh.in -@@ -38,2 +38,4 @@ LDMOD=/sbin/insmod - -+ZED_PIDFILE=@runstatedir@/zed.pid -+ - KERNEL_MODULES=( \ --- -1.9.2 - diff --git a/zfs-utils-git/20140411-zfs-git-master.patch b/zfs-utils-git/20140411-zfs-git-master.patch deleted file mode 100644 index cd9fc1b..0000000 --- a/zfs-utils-git/20140411-zfs-git-master.patch +++ /dev/null @@ -1,57337 +0,0 @@ -diff --git a/.gitmodules b/.gitmodules -new file mode 100644 -index 0000000..d400f10 ---- /dev/null -+++ b/.gitmodules -@@ -0,0 +1,3 @@ -+[submodule "scripts/zfs-images"] -+ path = scripts/zfs-images -+ url = https://github.com/zfsonlinux/zfs-images -diff --git a/Makefile.am b/Makefile.am -index 9c299a9..dfb006b 100644 ---- a/Makefile.am -+++ b/Makefile.am -@@ -42,2 +42,6 @@ dist-hook: - -+checkstyle: -+ @find ${top_srcdir} -name '*.[hc]' ! -name 'zfs_config.*' \ -+ ! -name '*.mod.c' -type f -exec scripts/cstyle.pl {} \+ -+ - ctags: -diff --git a/cmd/Makefile.am b/cmd/Makefile.am -index bad1af6..968c6c1 100644 ---- a/cmd/Makefile.am -+++ b/cmd/Makefile.am -@@ -1,2 +1,2 @@ - SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios --SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat -+SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat zed -diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py -index e01dd8b..ba79235 100755 ---- a/cmd/arcstat/arcstat.py -+++ b/cmd/arcstat/arcstat.py -@@ -53,3 +53,3 @@ import copy - from decimal import Decimal --from signal import signal, SIGINT -+from signal import signal, SIGINT, SIG_DFL - -@@ -92,2 +92,3 @@ cols = { - "l2miss%": [7, 100, "L2ARC access miss percentage"], -+ "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], - "l2size": [6, 1024, "Size of the L2ARC"], -@@ -98,5 +99,5 @@ v = {} - hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis", -- "mm%", "arcsz", "c"] -+ "mm%", "arcsz", "c"] - xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "rmis", -- "dread", "pread", "read"] -+ "dread", "pread", "read"] - sint = 1 # Default interval is 1 second -@@ -108,4 +109,4 @@ version = "0.4" - l2exist = False --cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval " -- "[count]]\n") -+cmd = ("Usage: arcstat.py [-hvx] [-f fields] [-o file] [-s string] [interval " -+ "[count]]\n") - cur = {} -@@ -131,3 +132,3 @@ def usage(): - sys.stderr.write("\t -v : List all possible field headers and definitions" -- "\n") -+ "\n") - sys.stderr.write("\t -x : Print extended stats\n") -@@ -136,8 +137,8 @@ def usage(): - sys.stderr.write("\t -s : Override default field separator with custom " -- "character or string\n") -+ "character or string\n") - 
sys.stderr.write("\nExamples:\n") -- sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") -- sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") -- sys.stderr.write("\tarcstat -v\n") -- sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") -+ sys.stderr.write("\tarcstat.py -o /tmp/a.log 2 10\n") -+ sys.stderr.write("\tarcstat.py -s \",\" -o /tmp/a.log 2 10\n") -+ sys.stderr.write("\tarcstat.py -v\n") -+ sys.stderr.write("\tarcstat.py -f time,hit%,dh%,ph%,mh% 1\n") - sys.stderr.write("\n") -@@ -193,3 +194,3 @@ def prettynum(sz, scale, num=0): - # Rounding error, return 0 -- elif num > 0 and num < 1: -+ elif 0 < num < 1: - num = 0 -@@ -219,3 +220,3 @@ def print_values(): - sep -- )) -+ )) - sys.stdout.write("\n") -@@ -231,2 +232,10 @@ def print_header(): - -+def get_terminal_lines(): -+ try: -+ import fcntl, termios, struct -+ data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234') -+ sz = struct.unpack('hh', data) -+ return sz[0] -+ except: -+ pass - -@@ -236,2 +245,3 @@ def init(): - global hdr -+ global hdr_intr - global xhdr -@@ -261,6 +271,6 @@ def init(): - ) -- -- except getopt.error, msg: -+ except getopt.error as msg: - sys.stderr.write(msg) - usage() -+ opts = None - -@@ -305,2 +315,6 @@ def init(): - -+ lines = get_terminal_lines() -+ if lines: -+ hdr_intr = lines - 3 -+ - # check if L2ARC exists -@@ -328,5 +342,4 @@ def init(): - if len(incompat) > 0: -- sys.stderr.write("Incompatible field specified! -- %s\n" % ( -- incompat, -- )) -+ sys.stderr.write("Incompatible field specified! -- %s\n" % -+ incompat) - usage() -@@ -338,3 +351,3 @@ def init(): - -- except: -+ except IOError: - sys.stderr.write("Cannot open %s for writing\n" % opfile) -@@ -348,3 +361,3 @@ def calculate(): - -- v = {} -+ v = dict() - v["time"] = time.strftime("%H:%M:%S", time.localtime()) -@@ -365,3 +378,3 @@ def calculate(): - v["pmis"] = (d["prefetch_data_misses"] + -- d["prefetch_metadata_misses"]) / sint -+ d["prefetch_metadata_misses"]) / sint - -@@ -372,5 +385,5 @@ def calculate(): - v["mhit"] = (d["prefetch_metadata_hits"] + -- d["demand_metadata_hits"]) / sint -+ d["demand_metadata_hits"]) / sint - v["mmis"] = (d["prefetch_metadata_misses"] + -- d["demand_metadata_misses"]) / sint -+ d["demand_metadata_misses"]) / sint - -@@ -397,2 +410,3 @@ def calculate(): - v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 -+ v["l2asize"] = cur["l2_asize"] - v["l2size"] = cur["l2_size"] -@@ -401,6 +415,2 @@ def calculate(): - --def sighandler(*args): -- sys.exit(0) -- -- - def main(): -@@ -417,3 +427,3 @@ def main(): - -- signal(SIGINT, sighandler) -+ signal(SIGINT, SIG_DFL) - while True: -diff --git a/cmd/dbufstat/Makefile.am b/cmd/dbufstat/Makefile.am -new file mode 100644 -index 0000000..0548b24 ---- /dev/null -+++ b/cmd/dbufstat/Makefile.am -@@ -0,0 +1,2 @@ -+bin_SCRIPTS = dbufstat.py -+EXTRA_DIST = $(bin_SCRIPTS) -diff --git a/cmd/dbufstat/dbufstat.py b/cmd/dbufstat/dbufstat.py -new file mode 100755 -index 0000000..5f75376 ---- /dev/null -+++ b/cmd/dbufstat/dbufstat.py -@@ -0,0 +1,582 @@ -+#!/usr/bin/python -+# -+# Print out statistics for all cached dmu buffers. This information -+# is available through the dbufs kstat and may be post-processed as -+# needed by the script. -+# -+# CDDL HEADER START -+# -+# The contents of this file are subject to the terms of the -+# Common Development and Distribution License, Version 1.0 only -+# (the "License"). You may not use this file except in compliance -+# with the License. 
-+# -+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+# or http://www.opensolaris.org/os/licensing. -+# See the License for the specific language governing permissions -+# and limitations under the License. -+# -+# When distributing Covered Code, include this CDDL HEADER in each -+# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+# If applicable, add the following below this CDDL HEADER, with the -+# fields enclosed by brackets "[]" replaced with your own identifying -+# information: Portions Copyright [yyyy] [name of copyright owner] -+# -+# CDDL HEADER END -+# -+# Copyright (C) 2013 Lawrence Livermore National Security, LLC. -+# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). -+# -+ -+import sys -+import getopt -+import errno -+ -+bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] -+bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", -+ "meta", "state", "dbholds", "list", "atype", "index", "flags", -+ "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", -+ "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", -+ "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] -+bincompat = ["cached", "direct", "indirect", "bonus", "spill"] -+ -+dhdr = ["pool", "objset", "object", "dtype", "cached"] -+dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", -+ "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", -+ "indirect", "bonus", "spill"] -+dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", -+ "list", "atype", "index", "flags", "count", "asize", "access", -+ "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", -+ "l2_comp", "aholds"] -+ -+thdr = ["pool", "objset", "dtype", "cached"] -+txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", -+ "bonus", "spill"] -+tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", -+ "dbholds", "list", "atype", "index", "flags", "count", "asize", -+ "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", -+ "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs", -+ "bsize", "lvls", "dholds", "blocks", "dsize"] -+ -+cols = { -+ # hdr: [size, scale, description] -+ "pool": [15, -1, "pool name"], -+ "objset": [6, -1, "dataset identification number"], -+ "object": [10, -1, "object number"], -+ "level": [5, -1, "indirection level of buffer"], -+ "blkid": [8, -1, "block number of buffer"], -+ "offset": [12, 1024, "offset in object of buffer"], -+ "dbsize": [7, 1024, "size of buffer"], -+ "meta": [4, -1, "is this buffer metadata?"], -+ "state": [5, -1, "state of buffer (read, cached, etc)"], -+ "dbholds": [7, 1000, "number of holds on buffer"], -+ "list": [4, -1, "which ARC list contains this buffer"], -+ "atype": [7, -1, "ARC header type (data or metadata)"], -+ "index": [5, -1, "buffer's index into its ARC list"], -+ "flags": [8, -1, "ARC read flags"], -+ "count": [5, -1, "ARC data count"], -+ "asize": [7, 1024, "size of this ARC buffer"], -+ "access": [10, -1, "time this ARC buffer was last accessed"], -+ "mru": [5, 1000, "hits while on the ARC's MRU list"], -+ "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"], -+ "mfu": [5, 1000, "hits while on the ARC's MFU list"], -+ "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"], -+ "l2": [5, 1000, "hits while on the L2ARC"], -+ "l2_dattr": [8, -1, "L2ARC disk address/offset"], -+ "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on 
compression)"], -+ "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"], -+ "aholds": [6, 1000, "number of holds on this ARC buffer"], -+ "dtype": [27, -1, "dnode type"], -+ "btype": [27, -1, "bonus buffer type"], -+ "data_bs": [7, 1024, "data block size"], -+ "meta_bs": [7, 1024, "metadata block size"], -+ "bsize": [6, 1024, "bonus buffer size"], -+ "lvls": [6, -1, "number of indirection levels"], -+ "dholds": [6, 1000, "number of holds on dnode"], -+ "blocks": [8, 1000, "number of allocated blocks"], -+ "dsize": [12, 1024, "size of dnode"], -+ "cached": [6, 1024, "bytes cached for all blocks"], -+ "direct": [6, 1024, "bytes cached for direct blocks"], -+ "indirect": [8, 1024, "bytes cached for indirect blocks"], -+ "bonus": [5, 1024, "bytes cached for bonus buffer"], -+ "spill": [5, 1024, "bytes cached for spill block"], -+} -+ -+hdr = None -+xhdr = None -+sep = " " # Default separator is 2 spaces -+cmd = ("Usage: dbufstat.py [-bdhrtvx] [-i file] [-f fields] [-o file] " -+ "[-s string]\n") -+raw = 0 -+ -+ -+def print_incompat_helper(incompat): -+ cnt = 0 -+ for key in sorted(incompat): -+ if cnt is 0: -+ sys.stderr.write("\t") -+ elif cnt > 8: -+ sys.stderr.write(",\n\t") -+ cnt = 0 -+ else: -+ sys.stderr.write(", ") -+ -+ sys.stderr.write("%s" % key) -+ cnt += 1 -+ -+ sys.stderr.write("\n\n") -+ -+ -+def detailed_usage(): -+ sys.stderr.write("%s\n" % cmd) -+ -+ sys.stderr.write("Field definitions incompatible with '-b' option:\n") -+ print_incompat_helper(bincompat) -+ -+ sys.stderr.write("Field definitions incompatible with '-d' option:\n") -+ print_incompat_helper(dincompat) -+ -+ sys.stderr.write("Field definitions incompatible with '-t' option:\n") -+ print_incompat_helper(tincompat) -+ -+ sys.stderr.write("Field definitions are as follows:\n") -+ for key in sorted(cols.keys()): -+ sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) -+ sys.stderr.write("\n") -+ -+ sys.exit(1) -+ -+ -+def usage(): -+ sys.stderr.write("%s\n" % cmd) -+ sys.stderr.write("\t -b : Print table of information for each dbuf\n") -+ sys.stderr.write("\t -d : Print table of information for each dnode\n") -+ sys.stderr.write("\t -h : Print this help message\n") -+ sys.stderr.write("\t -r : Print raw values\n") -+ sys.stderr.write("\t -t : Print table of information for each dnode type" -+ "\n") -+ sys.stderr.write("\t -v : List all possible field headers and definitions" -+ "\n") -+ sys.stderr.write("\t -x : Print extended stats\n") -+ sys.stderr.write("\t -i : Redirect input from the specified file\n") -+ sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") -+ sys.stderr.write("\t -o : Redirect output to the specified file\n") -+ sys.stderr.write("\t -s : Override default field separator with custom " -+ "character or string\n") -+ sys.stderr.write("\nExamples:\n") -+ sys.stderr.write("\tdbufstat.py -d -o /tmp/d.log\n") -+ sys.stderr.write("\tdbufstat.py -t -s \",\" -o /tmp/t.log\n") -+ sys.stderr.write("\tdbufstat.py -v\n") -+ sys.stderr.write("\tdbufstat.py -d -f pool,object,objset,dsize,cached\n") -+ sys.stderr.write("\n") -+ -+ sys.exit(1) -+ -+ -+def prettynum(sz, scale, num=0): -+ global raw -+ -+ suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] -+ index = 0 -+ save = 0 -+ -+ if raw or scale == -1: -+ return "%*s" % (sz, num) -+ -+ # Rounding error, return 0 -+ elif 0 < num < 1: -+ num = 0 -+ -+ while num > scale and index < 5: -+ save = num -+ num = num / scale -+ index += 1 -+ -+ if index == 0: -+ return "%*d" % (sz, num) -+ -+ if (save / scale) < 10: -+ return "%*.1f%s" % 
(sz - 1, num, suffix[index]) -+ else: -+ return "%*d%s" % (sz - 1, num, suffix[index]) -+ -+ -+def print_values(v): -+ global hdr -+ global sep -+ -+ try: -+ for col in hdr: -+ sys.stdout.write("%s%s" % ( -+ prettynum(cols[col][0], cols[col][1], v[col]), sep)) -+ sys.stdout.write("\n") -+ except IOError as e: -+ if e.errno == errno.EPIPE: -+ sys.exit(1) -+ -+ -+def print_header(): -+ global hdr -+ global sep -+ -+ try: -+ for col in hdr: -+ sys.stdout.write("%*s%s" % (cols[col][0], col, sep)) -+ sys.stdout.write("\n") -+ except IOError as e: -+ if e.errno == errno.EPIPE: -+ sys.exit(1) -+ -+ -+def get_typestring(t): -+ type_strings = ["DMU_OT_NONE", -+ # general: -+ "DMU_OT_OBJECT_DIRECTORY", -+ "DMU_OT_OBJECT_ARRAY", -+ "DMU_OT_PACKED_NVLIST", -+ "DMU_OT_PACKED_NVLIST_SIZE", -+ "DMU_OT_BPOBJ", -+ "DMU_OT_BPOBJ_HDR", -+ # spa: -+ "DMU_OT_SPACE_MAP_HEADER", -+ "DMU_OT_SPACE_MAP", -+ # zil: -+ "DMU_OT_INTENT_LOG", -+ # dmu: -+ "DMU_OT_DNODE", -+ "DMU_OT_OBJSET", -+ # dsl: -+ "DMU_OT_DSL_DIR", -+ "DMU_OT_DSL_DIR_CHILD_MAP", -+ "DMU_OT_DSL_DS_SNAP_MAP", -+ "DMU_OT_DSL_PROPS", -+ "DMU_OT_DSL_DATASET", -+ # zpl: -+ "DMU_OT_ZNODE", -+ "DMU_OT_OLDACL", -+ "DMU_OT_PLAIN_FILE_CONTENTS", -+ "DMU_OT_DIRECTORY_CONTENTS", -+ "DMU_OT_MASTER_NODE", -+ "DMU_OT_UNLINKED_SET", -+ # zvol: -+ "DMU_OT_ZVOL", -+ "DMU_OT_ZVOL_PROP", -+ # other; for testing only! -+ "DMU_OT_PLAIN_OTHER", -+ "DMU_OT_UINT64_OTHER", -+ "DMU_OT_ZAP_OTHER", -+ # new object types: -+ "DMU_OT_ERROR_LOG", -+ "DMU_OT_SPA_HISTORY", -+ "DMU_OT_SPA_HISTORY_OFFSETS", -+ "DMU_OT_POOL_PROPS", -+ "DMU_OT_DSL_PERMS", -+ "DMU_OT_ACL", -+ "DMU_OT_SYSACL", -+ "DMU_OT_FUID", -+ "DMU_OT_FUID_SIZE", -+ "DMU_OT_NEXT_CLONES", -+ "DMU_OT_SCAN_QUEUE", -+ "DMU_OT_USERGROUP_USED", -+ "DMU_OT_USERGROUP_QUOTA", -+ "DMU_OT_USERREFS", -+ "DMU_OT_DDT_ZAP", -+ "DMU_OT_DDT_STATS", -+ "DMU_OT_SA", -+ "DMU_OT_SA_MASTER_NODE", -+ "DMU_OT_SA_ATTR_REGISTRATION", -+ "DMU_OT_SA_ATTR_LAYOUTS", -+ "DMU_OT_SCAN_XLATE", -+ "DMU_OT_DEDUP", -+ "DMU_OT_DEADLIST", -+ "DMU_OT_DEADLIST_HDR", -+ "DMU_OT_DSL_CLONES", -+ "DMU_OT_BPOBJ_SUBOBJ"] -+ -+ # If "-rr" option is used, don't convert to string representation -+ if raw > 1: -+ return "%i" % t -+ -+ try: -+ return type_strings[t] -+ except IndexError: -+ return "%i" % t -+ -+ -+def get_compstring(c): -+ comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON", -+ "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB", -+ "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1", -+ "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3", -+ "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5", -+ "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7", -+ "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9", -+ "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4", -+ "ZIO_COMPRESS_FUNCTION"] -+ -+ # If "-rr" option is used, don't convert to string representation -+ if raw > 1: -+ return "%i" % c -+ -+ try: -+ return comp_strings[c] -+ except IndexError: -+ return "%i" % c -+ -+ -+def parse_line(line, labels): -+ global hdr -+ -+ new = dict() -+ val = None -+ for col in hdr: -+ # These are "special" fields computed in the update_dict -+ # function, prevent KeyError exception on labels[col] for these. 
-+ if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']: -+ val = line[labels[col]] -+ -+ if col in ['pool', 'flags']: -+ new[col] = str(val) -+ elif col in ['dtype', 'btype']: -+ new[col] = get_typestring(int(val)) -+ elif col in ['l2_comp']: -+ new[col] = get_compstring(int(val)) -+ else: -+ new[col] = int(val) -+ -+ return new -+ -+ -+def update_dict(d, k, line, labels): -+ pool = line[labels['pool']] -+ objset = line[labels['objset']] -+ key = line[labels[k]] -+ -+ dbsize = int(line[labels['dbsize']]) -+ blkid = int(line[labels['blkid']]) -+ level = int(line[labels['level']]) -+ -+ if pool not in d: -+ d[pool] = dict() -+ -+ if objset not in d[pool]: -+ d[pool][objset] = dict() -+ -+ if key not in d[pool][objset]: -+ d[pool][objset][key] = parse_line(line, labels) -+ d[pool][objset][key]['bonus'] = 0 -+ d[pool][objset][key]['cached'] = 0 -+ d[pool][objset][key]['direct'] = 0 -+ d[pool][objset][key]['indirect'] = 0 -+ d[pool][objset][key]['spill'] = 0 -+ -+ d[pool][objset][key]['cached'] += dbsize -+ -+ if blkid == -1: -+ d[pool][objset][key]['bonus'] += dbsize -+ elif blkid == -2: -+ d[pool][objset][key]['spill'] += dbsize -+ else: -+ if level == 0: -+ d[pool][objset][key]['direct'] += dbsize -+ else: -+ d[pool][objset][key]['indirect'] += dbsize -+ -+ return d -+ -+ -+def print_dict(d): -+ print_header() -+ for pool in d.keys(): -+ for objset in d[pool].keys(): -+ for v in d[pool][objset].values(): -+ print_values(v) -+ -+ -+def dnodes_build_dict(filehandle): -+ labels = dict() -+ dnodes = dict() -+ -+ # First 3 lines are header information, skip the first two -+ for i in range(2): -+ next(filehandle) -+ -+ # The third line contains the labels and index locations -+ for i, v in enumerate(next(filehandle).split()): -+ labels[v] = i -+ -+ # The rest of the file is buffer information -+ for line in filehandle: -+ update_dict(dnodes, 'object', line.split(), labels) -+ -+ return dnodes -+ -+ -+def types_build_dict(filehandle): -+ labels = dict() -+ types = dict() -+ -+ # First 3 lines are header information, skip the first two -+ for i in range(2): -+ next(filehandle) -+ -+ # The third line contains the labels and index locations -+ for i, v in enumerate(next(filehandle).split()): -+ labels[v] = i -+ -+ # The rest of the file is buffer information -+ for line in filehandle: -+ update_dict(types, 'dtype', line.split(), labels) -+ -+ return types -+ -+ -+def buffers_print_all(filehandle): -+ labels = dict() -+ -+ # First 3 lines are header information, skip the first two -+ for i in range(2): -+ next(filehandle) -+ -+ # The third line contains the labels and index locations -+ for i, v in enumerate(next(filehandle).split()): -+ labels[v] = i -+ -+ print_header() -+ -+ # The rest of the file is buffer information -+ for line in filehandle: -+ print_values(parse_line(line.split(), labels)) -+ -+ -+def main(): -+ global hdr -+ global sep -+ global raw -+ -+ desired_cols = None -+ bflag = False -+ dflag = False -+ hflag = False -+ ifile = None -+ ofile = None -+ tflag = False -+ vflag = False -+ xflag = False -+ -+ try: -+ opts, args = getopt.getopt( -+ sys.argv[1:], -+ "bdf:hi:o:rs:tvx", -+ [ -+ "buffers", -+ "dnodes", -+ "columns", -+ "help", -+ "infile", -+ "outfile", -+ "seperator", -+ "types", -+ "verbose", -+ "extended" -+ ] -+ ) -+ except getopt.error: -+ usage() -+ opts = None -+ -+ for opt, arg in opts: -+ if opt in ('-b', '--buffers'): -+ bflag = True -+ if opt in ('-d', '--dnodes'): -+ dflag = True -+ if opt in ('-f', '--columns'): -+ desired_cols = arg -+ if opt in ('-h', 
'--help'): -+ hflag = True -+ if opt in ('-i', '--infile'): -+ ifile = arg -+ if opt in ('-o', '--outfile'): -+ ofile = arg -+ if opt in ('-r', '--raw'): -+ raw += 1 -+ if opt in ('-s', '--seperator'): -+ sep = arg -+ if opt in ('-t', '--types'): -+ tflag = True -+ if opt in ('-v', '--verbose'): -+ vflag = True -+ if opt in ('-x', '--extended'): -+ xflag = True -+ -+ if hflag or (xflag and desired_cols): -+ usage() -+ -+ if vflag: -+ detailed_usage() -+ -+ # Ensure at most only one of b, d, or t flags are set -+ if (bflag and dflag) or (bflag and tflag) or (dflag and tflag): -+ usage() -+ -+ if bflag: -+ hdr = bxhdr if xflag else bhdr -+ elif tflag: -+ hdr = txhdr if xflag else thdr -+ else: # Even if dflag is False, it's the default if none set -+ dflag = True -+ hdr = dxhdr if xflag else dhdr -+ -+ if desired_cols: -+ hdr = desired_cols.split(",") -+ -+ invalid = [] -+ incompat = [] -+ for ele in hdr: -+ if ele not in cols: -+ invalid.append(ele) -+ elif ((bflag and bincompat and ele in bincompat) or -+ (dflag and dincompat and ele in dincompat) or -+ (tflag and tincompat and ele in tincompat)): -+ incompat.append(ele) -+ -+ if len(invalid) > 0: -+ sys.stderr.write("Invalid column definition! -- %s\n" % invalid) -+ usage() -+ -+ if len(incompat) > 0: -+ sys.stderr.write("Incompatible field specified! -- %s\n" % -+ incompat) -+ usage() -+ -+ if ofile: -+ try: -+ tmp = open(ofile, "w") -+ sys.stdout = tmp -+ -+ except IOError: -+ sys.stderr.write("Cannot open %s for writing\n" % ofile) -+ sys.exit(1) -+ -+ if not ifile: -+ ifile = '/proc/spl/kstat/zfs/dbufs' -+ -+ if ifile is not "-": -+ try: -+ tmp = open(ifile, "r") -+ sys.stdin = tmp -+ except IOError: -+ sys.stderr.write("Cannot open %s for reading\n" % ifile) -+ sys.exit(1) -+ -+ if bflag: -+ buffers_print_all(sys.stdin) -+ -+ if dflag: -+ print_dict(dnodes_build_dict(sys.stdin)) -+ -+ if tflag: -+ print_dict(types_build_dict(sys.stdin)) -+ -+if __name__ == '__main__': -+ main() -diff --git a/cmd/mount_zfs/Makefile.am b/cmd/mount_zfs/Makefile.am -index 7abcc30..e5f3d08 100644 ---- a/cmd/mount_zfs/Makefile.am -+++ b/cmd/mount_zfs/Makefile.am -@@ -20,5 +20,3 @@ mount_zfs_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -- --mount_zfs_LDFLAGS = \ -- -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) $(LIBSELINUX) -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la -diff --git a/cmd/mount_zfs/mount_zfs.c b/cmd/mount_zfs/mount_zfs.c -index 4db33ed..6cb23d1 100644 ---- a/cmd/mount_zfs/mount_zfs.c -+++ b/cmd/mount_zfs/mount_zfs.c -@@ -33,5 +33,2 @@ - #include --#ifdef HAVE_LIBSELINUX --#include --#endif /* HAVE_LIBSELINUX */ - -@@ -63,2 +60,6 @@ static const option_map_t option_map[] = { - { MNTOPT_USERS, MS_USERS, ZS_COMMENT }, -+ /* acl flags passed with util-linux-2.24 mount command */ -+ { MNTOPT_ACL, MS_POSIXACL, ZS_COMMENT }, -+ { MNTOPT_NOACL, MS_COMMENT, ZS_COMMENT }, -+ { MNTOPT_POSIXACL, MS_POSIXACL, ZS_COMMENT }, - #ifdef MS_NOATIME -@@ -75,7 +76,6 @@ static const option_map_t option_map[] = { - #endif -- { MNTOPT_CONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -- { MNTOPT_NOCONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -- { MNTOPT_FSCONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -- { MNTOPT_DEFCONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -- { MNTOPT_ROOTCONTEXT, MS_COMMENT, ZS_NOCONTEXT }, -+ { MNTOPT_CONTEXT, MS_COMMENT, ZS_COMMENT }, -+ { MNTOPT_FSCONTEXT, MS_COMMENT, ZS_COMMENT }, -+ { MNTOPT_DEFCONTEXT, MS_COMMENT, ZS_COMMENT }, -+ { MNTOPT_ROOTCONTEXT, MS_COMMENT, ZS_COMMENT 
}, - #ifdef MS_I_VERSION -@@ -270,3 +270,3 @@ out: - /* Do not add one when cwd already ends in a trailing '/' */ -- if (!strncmp(cwd, dataset, len)) -+ if (strncmp(cwd, dataset, len) == 0) - return (dataset + len + (cwd[len-1] != '/')); -@@ -336,2 +336,26 @@ mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts) - -+static void -+__zfs_selinux_setcontext(const char *name, const char *context, char *mntopts, -+ char *mtabopt) -+{ -+ char tmp[MNT_LINE_MAX]; -+ -+ snprintf(tmp, MNT_LINE_MAX, ",%s=\"%s\"", name, context); -+ strlcat(mntopts, tmp, MNT_LINE_MAX); -+ strlcat(mtabopt, tmp, MNT_LINE_MAX); -+} -+ -+static void -+zfs_selinux_setcontext(zfs_handle_t *zhp, zfs_prop_t zpt, const char *name, -+ char *mntopts, char *mtabopt) -+{ -+ char context[ZFS_MAXPROPLEN]; -+ -+ if (zfs_prop_get(zhp, zpt, context, sizeof (context), -+ NULL, NULL, 0, B_FALSE) == 0) { -+ if (strcmp(context, "none") != 0) -+ __zfs_selinux_setcontext(name, context, mntopts, mtabopt); -+ } -+} -+ - int -@@ -340,3 +364,3 @@ main(int argc, char **argv) - zfs_handle_t *zhp; -- char legacy[ZFS_MAXPROPLEN]; -+ char prop[ZFS_MAXPROPLEN]; - char mntopts[MNT_LINE_MAX] = { '\0' }; -@@ -422,3 +446,3 @@ main(int argc, char **argv) - (void) fprintf(stderr, gettext("filesystem '%s' " -- "cannot be mounted of due invalid option " -+ "cannot be mounted due to invalid option " - "'%s'.\n"), dataset, badopt); -@@ -435,18 +459,2 @@ main(int argc, char **argv) - --#ifdef HAVE_LIBSELINUX -- /* -- * Automatically add the default zfs context when selinux is enabled -- * and the caller has not specified their own context. This must be -- * done until zfs is added to the default selinux policy configuration -- * as a known filesystem type which supports xattrs. -- */ -- if (is_selinux_enabled() && !(zfsflags & ZS_NOCONTEXT)) { -- (void) strlcat(mntopts, ",context=\"system_u:" -- "object_r:file_t:s0\"", sizeof (mntopts)); -- (void) strlcat(mtabopt, ",context=\"system_u:" -- "object_r:file_t:s0\"", sizeof (mtabopt)); -- } --#endif /* HAVE_LIBSELINUX */ -- -- - if (verbose) -@@ -478,8 +486,32 @@ main(int argc, char **argv) - -+ /* -+ * Checks to see if the ZFS_PROP_SELINUX_CONTEXT exists -+ * if it does, create a tmp variable in case it's needed -+ * checks to see if the selinux context is set to the default -+ * if it is, allow the setting of the other context properties -+ * this is needed because the 'context' property overrides others -+ * if it is not the default, set the 'context' property -+ */ -+ if (zfs_prop_get(zhp, ZFS_PROP_SELINUX_CONTEXT, prop, sizeof (prop), -+ NULL, NULL, 0, B_FALSE) == 0) { -+ if (strcmp(prop, "none") == 0) { -+ zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_FSCONTEXT, -+ MNTOPT_FSCONTEXT, mntopts, mtabopt); -+ zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_DEFCONTEXT, -+ MNTOPT_DEFCONTEXT, mntopts, mtabopt); -+ zfs_selinux_setcontext(zhp, -+ ZFS_PROP_SELINUX_ROOTCONTEXT, MNTOPT_ROOTCONTEXT, -+ mntopts, mtabopt); -+ } else { -+ __zfs_selinux_setcontext(MNTOPT_CONTEXT, -+ prop, mntopts, mtabopt); -+ } -+ } -+ - /* treat all snapshots as legacy mount points */ - if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) -- (void) strlcpy(legacy, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN); -+ (void) strlcpy(prop, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN); - else -- (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, legacy, -- sizeof (legacy), NULL, NULL, 0, B_FALSE); -+ (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, prop, -+ sizeof (prop), NULL, NULL, 0, B_FALSE); - -@@ -499,3 +531,3 @@ main(int argc, char **argv) - */ -- if (zfsutil 
&& !strcmp(legacy, ZFS_MOUNTPOINT_LEGACY)) { -+ if (zfsutil && (strcmp(prop, ZFS_MOUNTPOINT_LEGACY) == 0)) { - (void) fprintf(stderr, gettext( -@@ -504,3 +536,3 @@ main(int argc, char **argv) - "See zfs(8) for more information.\n"), -- dataset, mntpoint, dataset, mntpoint); -+ dataset, mntpoint, dataset, mntpoint); - return (MOUNT_USAGE); -@@ -509,3 +541,3 @@ main(int argc, char **argv) - if (!zfsutil && !(remount || fake) && -- strcmp(legacy, ZFS_MOUNTPOINT_LEGACY)) { -+ strcmp(prop, ZFS_MOUNTPOINT_LEGACY)) { - (void) fprintf(stderr, gettext( -@@ -530,3 +562,3 @@ main(int argc, char **argv) - "'%s' is already mounted\n"), dataset); -- return (MOUNT_SYSERR); -+ return (MOUNT_BUSY); - default: -diff --git a/cmd/vdev_id/vdev_id b/cmd/vdev_id/vdev_id -index 3cf1b58..b6752ba 100755 ---- a/cmd/vdev_id/vdev_id -+++ b/cmd/vdev_id/vdev_id -@@ -41,14 +41,16 @@ - # --# # Linux Mapped --# # Slot Slot --# slot 1 7 --# slot 2 10 --# slot 3 3 --# slot 4 6 --# slot 5 2 --# slot 6 8 --# slot 7 1 --# slot 8 4 --# slot 9 9 --# slot 10 5 -+# # Custom mapping for Channel A -+# -+# # Linux Mapped -+# # Slot Slot Channel -+# slot 1 7 A -+# slot 2 10 A -+# slot 3 3 A -+# slot 4 6 A -+# -+# # Default mapping for B, C, and D -+# slot 1 4 -+# slot 2 2 -+# slot 3 1 -+# slot 4 3 - -@@ -112,6 +114,7 @@ map_slot() { - local LINUX_SLOT=$1 -+ local CHANNEL=$2 - local MAPPED_SLOT= - -- MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} \ -- { print \\$3; exit }" $CONFIG` -+ MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \ -+ \\$4 ~ /^(${CHANNEL}|)$/ { print \\$3; exit }" $CONFIG` - if [ -z "$MAPPED_SLOT" ] ; then -@@ -181,4 +184,6 @@ sas_handler() { - -- # Get the raw scsi device name from multipath -l. -- DEV=`multipath -l $DM_NAME |awk '/running/{print $3 ; exit}'` -+ # Get the raw scsi device name from multipath -l. Strip off -+ # leading pipe symbols to make field numbering consistent. -+ DEV=`multipath -l $DM_NAME | -+ awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` - if [ -z "$DEV" ] ; then -@@ -254,4 +259,4 @@ sas_handler() { - -- SLOT=`map_slot $SLOT` - CHAN=`map_channel $PCI_ID $PORT` -+ SLOT=`map_slot $SLOT $CHAN` - if [ -z "$CHAN" ] ; then -diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am -index f82f1a3..854fbab 100644 ---- a/cmd/zdb/Makefile.am -+++ b/cmd/zdb/Makefile.am -@@ -16,4 +16,5 @@ zdb_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --zdb_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+zdb_LDADD += $(ZLIB) -diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c -index 82491ad..8e60b9b 100644 ---- a/cmd/zdb/zdb.c -+++ b/cmd/zdb/zdb.c -@@ -59,2 +59,3 @@ - #include -+#include - #undef ZFS_MAXNAMELEN -@@ -166,3 +167,4 @@ usage(void) - (void) fprintf(stderr, " -M -- " -- "specify the maximum number of checksumming I/Os [default is 200]\n"); -+ "specify the maximum number of checksumming I/Os " -+ "[default is 200]\n"); - (void) fprintf(stderr, "Specify an option more than once (e.g. 
-bb) " -@@ -211,2 +213,23 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) - -+/* ARGSUSED */ -+static void -+dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) -+{ -+ spa_history_phys_t *shp = data; -+ -+ if (shp == NULL) -+ return; -+ -+ (void) printf("\t\tpool_create_len = %llu\n", -+ (u_longlong_t)shp->sh_pool_create_len); -+ (void) printf("\t\tphys_max_off = %llu\n", -+ (u_longlong_t)shp->sh_phys_max_off); -+ (void) printf("\t\tbof = %llu\n", -+ (u_longlong_t)shp->sh_bof); -+ (void) printf("\t\teof = %llu\n", -+ (u_longlong_t)shp->sh_eof); -+ (void) printf("\t\trecords_lost = %llu\n", -+ (u_longlong_t)shp->sh_records_lost); -+} -+ - static void -@@ -871,2 +894,3 @@ dump_history(spa_t *spa) - char *cmd, *intstr; -+ boolean_t printed = B_FALSE; - -@@ -874,3 +898,3 @@ dump_history(spa_t *spa) - &time) != 0) -- continue; -+ goto next; - if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, -@@ -879,3 +903,3 @@ dump_history(spa_t *spa) - ZPOOL_HIST_INT_EVENT, &ievent) != 0) -- continue; -+ goto next; - verify(nvlist_lookup_uint64(events[i], -@@ -884,4 +908,4 @@ dump_history(spa_t *spa) - ZPOOL_HIST_INT_STR, &intstr) == 0); -- if (ievent >= LOG_END) -- continue; -+ if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) -+ goto next; - -@@ -898,2 +922,10 @@ dump_history(spa_t *spa) - (void) printf("%s %s\n", tbuf, cmd); -+ printed = B_TRUE; -+ -+next: -+ if (dump_opt['h'] > 1) { -+ if (!printed) -+ (void) printf("unrecognized record:\n"); -+ dump_nvlist(events[i], 2); -+ } - } -@@ -1204,3 +1236,3 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - static void --dump_bpobj(bpobj_t *bpo, char *name) -+dump_bpobj(bpobj_t *bpo, char *name, int indent) - { -@@ -1209,2 +1241,3 @@ dump_bpobj(bpobj_t *bpo, char *name) - char uncomp[32]; -+ uint64_t i; - -@@ -1214,13 +1247,35 @@ dump_bpobj(bpobj_t *bpo, char *name) - zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes); -- if (bpo->bpo_havesubobj) { -+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { - zdb_nicenum(bpo->bpo_phys->bpo_comp, comp); - zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp); -- (void) printf("\n %s: %llu local blkptrs, %llu subobjs, " -- "%s (%s/%s comp)\n", -- name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, -+ (void) printf(" %*s: object %llu, %llu local blkptrs, " -+ "%llu subobjs, %s (%s/%s comp)\n", -+ indent * 8, name, -+ (u_longlong_t)bpo->bpo_object, -+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, - (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, - bytes, comp, uncomp); -+ -+ for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { -+ uint64_t subobj; -+ bpobj_t subbpo; -+ int error; -+ VERIFY0(dmu_read(bpo->bpo_os, -+ bpo->bpo_phys->bpo_subobjs, -+ i * sizeof (subobj), sizeof (subobj), &subobj, 0)); -+ error = bpobj_open(&subbpo, bpo->bpo_os, subobj); -+ if (error != 0) { -+ (void) printf("ERROR %u while trying to open " -+ "subobj id %llu\n", -+ error, (u_longlong_t)subobj); -+ continue; -+ } -+ dump_bpobj(&subbpo, "subobj", indent + 1); -+ } - } else { -- (void) printf("\n %s: %llu blkptrs, %s\n", -- name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); -+ (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", -+ indent * 8, name, -+ (u_longlong_t)bpo->bpo_object, -+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, -+ bytes); - } -@@ -1230,5 +1285,7 @@ dump_bpobj(bpobj_t *bpo, char *name) - -- (void) printf("\n"); - -- (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); -+ if (indent == 0) { -+ (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, 
NULL); -+ (void) printf("\n"); -+ } - } -@@ -1239,2 +1296,3 @@ dump_deadlist(dsl_deadlist_t *dl) - dsl_deadlist_entry_t *dle; -+ uint64_t unused; - char bytes[32]; -@@ -1257,10 +1315,21 @@ dump_deadlist(dsl_deadlist_t *dl) - -+ /* force the tree to be loaded */ -+ dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); -+ - for (dle = avl_first(&dl->dl_tree); dle; - dle = AVL_NEXT(&dl->dl_tree, dle)) { -- (void) printf(" mintxg %llu -> obj %llu\n", -- (longlong_t)dle->dle_mintxg, -- (longlong_t)dle->dle_bpobj.bpo_object); -+ if (dump_opt['d'] >= 5) { -+ char buf[128]; -+ (void) snprintf(buf, sizeof (buf), -+ "mintxg %llu -> obj %llu", -+ (longlong_t)dle->dle_mintxg, -+ (longlong_t)dle->dle_bpobj.bpo_object); -+ -+ dump_bpobj(&dle->dle_bpobj, buf, 0); -+ } else { -+ (void) printf("mintxg %llu -> obj %llu\n", -+ (longlong_t)dle->dle_mintxg, -+ (longlong_t)dle->dle_bpobj.bpo_object); - -- if (dump_opt['d'] >= 5) -- dump_bpobj(&dle->dle_bpobj, ""); -+ } - } -@@ -1287,3 +1356,3 @@ fuid_table_destroy(void) - * For CIFS files with FUID the fuid is printed in hex followed by -- * the doman-rid string. -+ * the domain-rid string. - */ -@@ -1371,3 +1440,3 @@ dump_znode_sa_xattr(sa_handle_t *hdl) - nvpair_value_byte_array(elem, &value, &cnt); -- for (idx = 0 ; idx < cnt ; ++idx) { -+ for (idx = 0; idx < cnt; ++idx) { - if (isprint(value[idx])) -@@ -1531,3 +1600,3 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { - dump_uint8, /* SPA history */ -- dump_uint64, /* SPA history offsets */ -+ dump_history_offsets, /* SPA history offsets */ - dump_zap, /* Pool properties */ -@@ -1696,3 +1765,5 @@ dump_dir(objset_t *os) - -+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - dmu_objset_fast_stat(os, &dds); -+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - -@@ -2142,3 +2213,2 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); -- - } -@@ -2328,3 +2398,3 @@ dump_block_stats(spa_t *spa) - */ -- bzero(&zcb, sizeof(zdb_cb_t)); -+ bzero(&zcb, sizeof (zdb_cb_t)); - zdb_leak_init(spa, &zcb); -@@ -2336,4 +2406,6 @@ dump_block_stats(spa_t *spa) - count_block_cb, &zcb, NULL); -- (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, -- count_block_cb, &zcb, NULL); -+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { -+ (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, -+ count_block_cb, &zcb, NULL); -+ } - if (spa_feature_is_active(spa, -@@ -2637,6 +2709,7 @@ dump_zpool(spa_t *spa) - if (dump_opt['d'] >= 3) { -- dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees"); -+ dump_bpobj(&spa->spa_deferred_bpobj, -+ "Deferred frees", 0); - if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { - dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj, -- "Pool snapshot frees"); -+ "Pool snapshot frees", 0); - } -diff --git a/cmd/zed/.gitignore b/cmd/zed/.gitignore -new file mode 100644 -index 0000000..76557bb ---- /dev/null -+++ b/cmd/zed/.gitignore -@@ -0,0 +1 @@ -+/zed -diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am -new file mode 100644 -index 0000000..f1404de ---- /dev/null -+++ b/cmd/zed/Makefile.am -@@ -0,0 +1,66 @@ -+include $(top_srcdir)/config/Rules.am -+ -+DEFAULT_INCLUDES += \ -+ -I$(top_srcdir)/include \ -+ -I$(top_srcdir)/lib/libspl/include -+ -+sbin_PROGRAMS = zed -+ -+zed_SOURCES = \ -+ $(top_srcdir)/cmd/zed/zed.c \ -+ $(top_srcdir)/cmd/zed/zed.h \ -+ $(top_srcdir)/cmd/zed/zed_conf.c \ -+ $(top_srcdir)/cmd/zed/zed_conf.h \ -+ $(top_srcdir)/cmd/zed/zed_event.c \ -+ 
$(top_srcdir)/cmd/zed/zed_event.h \ -+ $(top_srcdir)/cmd/zed/zed_exec.c \ -+ $(top_srcdir)/cmd/zed/zed_exec.h \ -+ $(top_srcdir)/cmd/zed/zed_file.c \ -+ $(top_srcdir)/cmd/zed/zed_file.h \ -+ $(top_srcdir)/cmd/zed/zed_log.c \ -+ $(top_srcdir)/cmd/zed/zed_log.h \ -+ $(top_srcdir)/cmd/zed/zed_strings.c \ -+ $(top_srcdir)/cmd/zed/zed_strings.h -+ -+zed_LDADD = \ -+ $(top_builddir)/lib/libavl/libavl.la \ -+ $(top_builddir)/lib/libnvpair/libnvpair.la \ -+ $(top_builddir)/lib/libspl/libspl.la \ -+ $(top_builddir)/lib/libzfs/libzfs.la -+ -+zedconfdir = $(sysconfdir)/zfs/zed.d -+ -+dist_zedconf_DATA = \ -+ $(top_srcdir)/cmd/zed/zed.d/zed.rc -+ -+zedexecdir = $(libexecdir)/zfs/zed.d -+ -+dist_zedexec_SCRIPTS = \ -+ $(top_srcdir)/cmd/zed/zed.d/all-debug.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/all-syslog.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/checksum-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/checksum-spare.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/data-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/generic-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/io-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/io-spare.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/resilver.finish-email.sh \ -+ $(top_srcdir)/cmd/zed/zed.d/scrub.finish-email.sh -+ -+zedconfdefaults = \ -+ all-syslog.sh \ -+ checksum-email.sh \ -+ checksum-spare.sh \ -+ data-email.sh \ -+ io-email.sh \ -+ io-spare.sh \ -+ resilver.finish-email.sh \ -+ scrub.finish-email.sh -+ -+install-data-local: -+ $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" -+ for f in $(zedconfdefaults); do \ -+ test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \ -+ -L "$(DESTDIR)$(zedconfdir)/$${f}" || \ -+ ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \ -+ done -diff --git a/cmd/zed/zed.c b/cmd/zed/zed.c -new file mode 100644 -index 0000000..c54a59b ---- /dev/null -+++ b/cmd/zed/zed.c -@@ -0,0 +1,235 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed.h" -+#include "zed_conf.h" -+#include "zed_event.h" -+#include "zed_file.h" -+#include "zed_log.h" -+ -+static volatile sig_atomic_t _got_exit = 0; -+static volatile sig_atomic_t _got_hup = 0; -+ -+/* -+ * Signal handler for SIGINT & SIGTERM. -+ */ -+static void -+_exit_handler(int signum) -+{ -+ _got_exit = 1; -+} -+ -+/* -+ * Signal handler for SIGHUP. -+ */ -+static void -+_hup_handler(int signum) -+{ -+ _got_hup = 1; -+} -+ -+/* -+ * Register signal handlers. 
-+ */ -+static void -+_setup_sig_handlers(void) -+{ -+ struct sigaction sa; -+ -+ if (sigemptyset(&sa.sa_mask) < 0) -+ zed_log_die("Failed to initialize sigset"); -+ -+ sa.sa_flags = SA_RESTART; -+ sa.sa_handler = SIG_IGN; -+ -+ if (sigaction(SIGPIPE, &sa, NULL) < 0) -+ zed_log_die("Failed to ignore SIGPIPE"); -+ -+ sa.sa_handler = _exit_handler; -+ if (sigaction(SIGINT, &sa, NULL) < 0) -+ zed_log_die("Failed to register SIGINT handler"); -+ -+ if (sigaction(SIGTERM, &sa, NULL) < 0) -+ zed_log_die("Failed to register SIGTERM handler"); -+ -+ sa.sa_handler = _hup_handler; -+ if (sigaction(SIGHUP, &sa, NULL) < 0) -+ zed_log_die("Failed to register SIGHUP handler"); -+} -+ -+/* -+ * Lock all current and future pages in the virtual memory address space. -+ * Access to locked pages will never be delayed by a page fault. -+ * EAGAIN is tested up to max_tries in case this is a transient error. -+ */ -+static void -+_lock_memory(void) -+{ -+#if HAVE_MLOCKALL -+ int i = 0; -+ const int max_tries = 10; -+ -+ for (i = 0; i < max_tries; i++) { -+ if (mlockall(MCL_CURRENT | MCL_FUTURE) == 0) { -+ zed_log_msg(LOG_INFO, "Locked all pages in memory"); -+ return; -+ } -+ if (errno != EAGAIN) -+ break; -+ } -+ zed_log_die("Failed to lock memory pages: %s", strerror(errno)); -+ -+#else /* HAVE_MLOCKALL */ -+ zed_log_die("Failed to lock memory pages: mlockall() not supported"); -+#endif /* HAVE_MLOCKALL */ -+} -+ -+/* -+ * Transform the process into a daemon. -+ */ -+static void -+_become_daemon(void) -+{ -+ pid_t pid; -+ int fd; -+ -+ pid = fork(); -+ if (pid < 0) { -+ zed_log_die("Failed to create child process: %s", -+ strerror(errno)); -+ } else if (pid > 0) { -+ _exit(EXIT_SUCCESS); -+ } -+ if (setsid() < 0) -+ zed_log_die("Failed to create new session: %s", -+ strerror(errno)); -+ -+ pid = fork(); -+ if (pid < 0) { -+ zed_log_die("Failed to create grandchild process: %s", -+ strerror(errno)); -+ } else if (pid > 0) { -+ _exit(EXIT_SUCCESS); -+ } -+ fd = open("/dev/null", O_RDWR); -+ -+ if (fd < 0) -+ zed_log_die("Failed to open /dev/null: %s", strerror(errno)); -+ -+ if (dup2(fd, STDIN_FILENO) < 0) -+ zed_log_die("Failed to dup /dev/null onto stdin: %s", -+ strerror(errno)); -+ -+ if (dup2(fd, STDOUT_FILENO) < 0) -+ zed_log_die("Failed to dup /dev/null onto stdout: %s", -+ strerror(errno)); -+ -+ if (dup2(fd, STDERR_FILENO) < 0) -+ zed_log_die("Failed to dup /dev/null onto stderr: %s", -+ strerror(errno)); -+ -+ if (close(fd) < 0) -+ zed_log_die("Failed to close /dev/null: %s", strerror(errno)); -+} -+ -+/* -+ * ZFS Event Daemon (ZED). 
-+ */ -+int -+main(int argc, char *argv[]) -+{ -+ struct zed_conf *zcp; -+ uint64_t saved_eid; -+ int64_t saved_etime[2]; -+ -+ zed_log_init(argv[0]); -+ zed_log_stderr_open(LOG_NOTICE); -+ zcp = zed_conf_create(); -+ zed_conf_parse_opts(zcp, argc, argv); -+ if (zcp->do_verbose) -+ zed_log_stderr_open(LOG_INFO); -+ -+ if (geteuid() != 0) -+ zed_log_die("Must be run as root"); -+ -+ (void) umask(0); -+ -+ _setup_sig_handlers(); -+ -+ zed_conf_parse_file(zcp); -+ -+ zed_file_close_from(STDERR_FILENO + 1); -+ -+ if (chdir("/") < 0) -+ zed_log_die("Failed to change to root directory"); -+ -+ if (zed_conf_scan_dir(zcp) < 0) -+ exit(EXIT_FAILURE); -+ -+ if (zcp->do_memlock) -+ _lock_memory(); -+ -+ if (!zcp->do_foreground) { -+ _become_daemon(); -+ zed_log_syslog_open(LOG_DAEMON); -+ zed_log_stderr_close(); -+ } -+ zed_log_msg(LOG_NOTICE, -+ "ZFS Event Daemon %s-%s", ZFS_META_VERSION, ZFS_META_RELEASE); -+ -+ (void) zed_conf_write_pid(zcp); -+ -+ if (zed_conf_open_state(zcp) < 0) -+ exit(EXIT_FAILURE); -+ -+ if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0) -+ exit(EXIT_FAILURE); -+ -+ zed_event_init(zcp); -+ zed_event_seek(zcp, saved_eid, saved_etime); -+ -+ while (!_got_exit) { -+ if (_got_hup) { -+ _got_hup = 0; -+ (void) zed_conf_scan_dir(zcp); -+ } -+ zed_event_service(zcp); -+ } -+ zed_log_msg(LOG_NOTICE, "Exiting"); -+ zed_event_fini(zcp); -+ zed_conf_destroy(zcp); -+ zed_log_fini(); -+ exit(EXIT_SUCCESS); -+} -diff --git a/cmd/zed/zed.d/all-debug.sh b/cmd/zed/zed.d/all-debug.sh -new file mode 100755 -index 0000000..ae64e0a ---- /dev/null -+++ b/cmd/zed/zed.d/all-debug.sh -@@ -0,0 +1,17 @@ -+#!/bin/sh -+# -+# Log all environment variables to ZED_DEBUG_LOG. -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+# Override the default umask to restrict access to a newly-created logfile. -+umask 077 -+ -+# Append stdout to the logfile after obtaining an advisory lock. -+exec >> "${ZED_DEBUG_LOG:=/tmp/zed.debug.log}" -+flock -x 1 -+ -+printenv | sort -+echo -+ -+exit 0 -diff --git a/cmd/zed/zed.d/all-syslog.sh b/cmd/zed/zed.d/all-syslog.sh -new file mode 100755 -index 0000000..b8bd307 ---- /dev/null -+++ b/cmd/zed/zed.d/all-syslog.sh -@@ -0,0 +1,11 @@ -+#!/bin/sh -+# -+# Log the zevent via syslog. -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+logger -t "${ZED_SYSLOG_TAG:=zed}" -p "${ZED_SYSLOG_PRIORITY:=daemon.notice}" \ -+ eid="${ZEVENT_EID}" class="${ZEVENT_SUBCLASS}" \ -+ "${ZEVENT_POOL:+pool=$ZEVENT_POOL}" -+ -+exit 0 -diff --git a/cmd/zed/zed.d/checksum-email.sh b/cmd/zed/zed.d/checksum-email.sh -new file mode 120000 -index 0000000..f95bec2 ---- /dev/null -+++ b/cmd/zed/zed.d/checksum-email.sh -@@ -0,0 +1 @@ -+io-email.sh -\ No newline at end of file -diff --git a/cmd/zed/zed.d/checksum-spare.sh b/cmd/zed/zed.d/checksum-spare.sh -new file mode 120000 -index 0000000..f564f93 ---- /dev/null -+++ b/cmd/zed/zed.d/checksum-spare.sh -@@ -0,0 +1 @@ -+io-spare.sh -\ No newline at end of file -diff --git a/cmd/zed/zed.d/data-email.sh b/cmd/zed/zed.d/data-email.sh -new file mode 100755 -index 0000000..9f83161 ---- /dev/null -+++ b/cmd/zed/zed.d/data-email.sh -@@ -0,0 +1,81 @@ -+#!/bin/sh -+# -+# Send email to ZED_EMAIL in response to a DATA zevent. -+# Only one message per ZED_EMAIL_INTERVAL_SECS will be sent for a given -+# class/pool combination. This protects against spamming the recipient -+# should multiple events occur together in time for the same pool. 
-+# Exit codes: -+# 0: email sent -+# 1: email failed -+# 2: email suppressed -+# 3: missing executable -+# 4: unsupported event class -+# 5: internal error -+# State File Format: -+# POOL:TIME_OF_LAST_EMAIL -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+test -n "${ZEVENT_POOL}" || exit 5 -+test -n "${ZEVENT_SUBCLASS}" || exit 5 -+ -+if test "${ZEVENT_SUBCLASS}" != "data"; then \ -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" -+ exit 4 -+fi -+ -+# Only send email if ZED_EMAIL has been configured. -+test -n "${ZED_EMAIL}" || exit 2 -+ -+# Ensure requisite executables are installed. -+if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" not installed -+ exit 3 -+fi -+ -+NAME="zed.${ZEVENT_SUBCLASS}.email" -+LOCKFILE="${ZED_LOCKDIR:=/var/lock}/${NAME}.lock" -+STATEFILE="${ZED_RUNDIR:=/var/run}/${NAME}.state" -+ -+# Obtain lock to ensure mutual exclusion for accessing state. -+exec 8> "${LOCKFILE}" -+flock -x 8 -+ -+# Query state for last time email was sent for this pool. -+TIME_NOW=`date +%s` -+TIME_LAST=`egrep "^${ZEVENT_POOL}:" "${STATEFILE}" 2>/dev/null | cut -d: -f2` -+if test -n "${TIME_LAST}"; then -+ TIME_DELTA=`expr "${TIME_NOW}" - "${TIME_LAST}"` -+ if test "${TIME_DELTA}" -lt "${ZED_EMAIL_INTERVAL_SECS:=3600}"; then -+ exit 2 -+ fi -+fi -+ -+"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on `hostname`" \ -+ "${ZED_EMAIL}" </dev/null > "${STATEFILE}.$$" -+echo "${ZEVENT_POOL}:${TIME_NOW}" >> "${STATEFILE}.$$" -+mv -f "${STATEFILE}.$$" "${STATEFILE}" -+ -+if test "${MAIL_STATUS}" -ne 0; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" -+ exit 1 -+fi -+ -+exit 0 -diff --git a/cmd/zed/zed.d/generic-email.sh b/cmd/zed/zed.d/generic-email.sh -new file mode 100755 -index 0000000..16bbdb1 ---- /dev/null -+++ b/cmd/zed/zed.d/generic-email.sh -@@ -0,0 +1,59 @@ -+#!/bin/sh -+# -+# Send email to ZED_EMAIL in response to a given zevent. -+# This is a generic script than can be symlinked to a file in the zed -+# enabled-scripts directory in order to have email sent when a particular -+# class of zevents occurs. The symlink filename must begin with the zevent -+# (sub)class string (eg, "probe_failure-email.sh" for the "probe_failure" -+# subclass). Refer to the zed(8) manpage for details. -+# Exit codes: -+# 0: email sent -+# 1: email failed -+# 2: email suppressed -+# 3: missing executable -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+# Only send email if ZED_EMAIL has been configured. -+test -n "${ZED_EMAIL}" || exit 2 -+ -+# Ensure requisite executables are installed. -+if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" not installed -+ exit 3 -+fi -+ -+# Override the default umask to restrict access to the msgbody tmpfile. 
-+umask 077 -+ -+SUBJECT="ZFS ${ZEVENT_SUBCLASS} event" -+test -n "${ZEVENT_POOL}" && SUBJECT="${SUBJECT} for ${ZEVENT_POOL}" -+SUBJECT="${SUBJECT} on `hostname`" -+ -+MSGBODY="${TMPDIR:=/tmp}/`basename \"$0\"`.$$" -+{ -+ echo "A ZFS ${ZEVENT_SUBCLASS} event has been posted:" -+ echo -+ echo " eid: ${ZEVENT_EID}" -+ echo " host: `hostname`" -+ echo " time: ${ZEVENT_TIME_STRING}" -+ test -n "${ZEVENT_VDEV_TYPE}" -a -n "${ZEVENT_VDEV_PATH}" && \ -+ echo " vdev: ${ZEVENT_VDEV_TYPE}:${ZEVENT_VDEV_PATH}" -+ test -n "${ZEVENT_POOL}" -a -x "${ZPOOL}" && \ -+ "${ZPOOL}" status "${ZEVENT_POOL}" -+} > "${MSGBODY}" -+ -+test -f "${MSGBODY}" && "${MAIL}" -s "${SUBJECT}" "${ZED_EMAIL}" < "${MSGBODY}" -+MAIL_STATUS=$? -+rm -f "${MSGBODY}" -+ -+if test "${MAIL_STATUS}" -ne 0; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}" -+ exit 1 -+fi -+ -+exit 0 -diff --git a/cmd/zed/zed.d/io-email.sh b/cmd/zed/zed.d/io-email.sh -new file mode 100755 -index 0000000..6cfe3c7 ---- /dev/null -+++ b/cmd/zed/zed.d/io-email.sh -@@ -0,0 +1,86 @@ -+#!/bin/sh -+# -+# Send email to ZED_EMAIL in response to a CHECKSUM or IO zevent. -+# Only one message per ZED_EMAIL_INTERVAL_SECS will be sent for a given -+# class/pool/vdev combination. This protects against spamming the recipient -+# should multiple events occur together in time for the same pool/device. -+# Exit codes: -+# 0: email sent -+# 1: email failed -+# 2: email suppressed -+# 3: missing executable -+# 4: unsupported event class -+# 5: internal error -+# State File Format: -+# POOL:VDEV_PATH:TIME_OF_LAST_EMAIL -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+test -n "${ZEVENT_POOL}" || exit 5 -+test -n "${ZEVENT_SUBCLASS}" || exit 5 -+test -n "${ZEVENT_VDEV_PATH}" || exit 5 -+ -+if test "${ZEVENT_SUBCLASS}" != "checksum" \ -+ -a "${ZEVENT_SUBCLASS}" != "io"; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" -+ exit 4 -+fi -+ -+# Only send email if ZED_EMAIL has been configured. -+test -n "${ZED_EMAIL}" || exit 2 -+ -+# Ensure requisite executables are installed. -+if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" not installed -+ exit 3 -+fi -+ -+NAME="zed.${ZEVENT_SUBCLASS}.email" -+LOCKFILE="${ZED_LOCKDIR:=/var/lock}/${NAME}.lock" -+STATEFILE="${ZED_RUNDIR:=/var/run}/${NAME}.state" -+ -+# Obtain lock to ensure mutual exclusion for accessing state. -+exec 8> "${LOCKFILE}" -+flock -x 8 -+ -+# Query state for last time email was sent for this pool/vdev. 
-+TIME_NOW=`date +%s`
-+TIME_LAST=`egrep "^${ZEVENT_POOL}:${ZEVENT_VDEV_PATH}:" "${STATEFILE}" \
-+    2>/dev/null | cut -d: -f3`
-+if test -n "${TIME_LAST}"; then
-+    TIME_DELTA=`expr "${TIME_NOW}" - "${TIME_LAST}"`
-+    if test "${TIME_DELTA}" -lt "${ZED_EMAIL_INTERVAL_SECS:=3600}"; then
-+        exit 2
-+    fi
-+fi
-+
-+"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on `hostname`" \
-+    "${ZED_EMAIL}" </dev/null > "${STATEFILE}.$$"
-+MAIL_STATUS=$?
-+echo "${ZEVENT_POOL}:${ZEVENT_VDEV_PATH}:${TIME_NOW}" >> "${STATEFILE}.$$"
-+mv -f "${STATEFILE}.$$" "${STATEFILE}"
-+
-+if test "${MAIL_STATUS}" -ne 0; then
-+    logger -t "${ZED_SYSLOG_TAG:=zed}" \
-+        -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \
-+        `basename "$0"`: "${MAIL}" exit="${MAIL_STATUS}"
-+    exit 1
-+fi
-+
-+exit 0
-diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh
-new file mode 100755
-index 0000000..dd5bf4e
---- /dev/null
-+++ b/cmd/zed/zed.d/io-spare.sh
-@@ -0,0 +1,125 @@
-+#!/bin/sh
-+#
-+# Replace a device with a hot spare in response to IO or checksum errors.
-+# The following actions will be performed automatically when the number
-+# of errors exceeds the limit set by ZED_SPARE_ON_IO_ERRORS or
-+# ZED_SPARE_ON_CHECKSUM_ERRORS.
-+#
-+# 1) FAULT the device on IO errors; no further IO will be attempted.
-+#    DEGRADE the device on checksum errors; the device is still
-+#    functional and can be used to service IO requests.
-+# 2) Set the SES fault beacon for the device.
-+# 3) Replace the device with a hot spare if any are available.
-+#
-+# Once the hot sparing operation is complete, either the failed device or
-+# the hot spare must be manually retired using the 'zpool detach' command.
-+# The 'autoreplace' functionality, which would normally take care of this
-+# under Illumos, has not yet been implemented.
-+#
-+# Full support for autoreplace is planned, but it requires that the full
-+# ZFS Diagnosis Engine be ported. In the meantime, this script provides
-+# the majority of the expected hot spare functionality.
-+#
-+# Exit codes:
-+#   0: replaced by hot spare
-+#   1: no hot spare device available
-+#   2: hot sparing disabled
-+#   3: already faulted or degraded
-+#   4: unsupported event class
-+#   5: internal error
-+#
-+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc"
-+
-+test -n "${ZEVENT_POOL}" || exit 5
-+test -n "${ZEVENT_SUBCLASS}" || exit 5
-+test -n "${ZEVENT_VDEV_PATH}" || exit 5
-+test -n "${ZEVENT_VDEV_GUID}" || exit 5
-+
-+# Defaults to disabled; enable in the zed.rc file.
-+ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
-+ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}
-+
-+if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \
-+    ${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then
-+    exit 2
-+fi
-+
-+# A lock file is used to serialize execution.
-+ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
-+LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"
-+
-+exec 8> "${LOCKFILE}"
-+flock -x 8
-+
-+# Given a pool and a vdev, return the status (ONLINE, FAULTED, etc...).
-+vdev_status() {
-+    local POOL=$1
-+    local VDEV=`basename $2`
-+
-+    ${ZPOOL} status ${POOL} | \
-+        awk -v pat="${VDEV}|${VDEV/-part?}" '$0 ~ pat { print $1" "$2 }'
-+    return 0
-+}
-+
-+# Fault devices after N I/O errors.
-+if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
-+    ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}`
-+
-+    if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \
-+        ${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then
-+        ACTION="fault"
-+    fi
-+# Degrade devices after N checksum errors.
-+elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then -+ ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS} -+ -+ if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \ -+ ${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then -+ ACTION="degrade" -+ fi -+else -+ ACTION= -+fi -+ -+if [ -n "${ACTION}" ]; then -+ -+ # Device is already FAULTED or DEGRADED -+ set -- `vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}` -+ ZEVENT_VDEV_PATH_FOUND=$1 -+ STATUS=$2 -+ if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then -+ exit 3 -+ fi -+ -+ # Step 1) FAULT or DEGRADE the device -+ # -+ ${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL} -+ -+ # Step 2) Set the SES fault beacon. -+ # -+ # XXX: Set the 'fault' or 'ident' beacon for the device. This can -+ # be done through the sg_ses utility, the only hard part is to map -+ # the sd device to its corresponding enclosure and slot. We may -+ # be able to leverage the existing vdev_id scripts for this. -+ # -+ # $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3 -+ # $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3 -+ -+ # Step 3) Replace the device with a hot spare. -+ # -+ # Round robin through the spares selecting those which are available. -+ # -+ for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do -+ set -- `vdev_status ${ZEVENT_POOL} ${SPARE}` -+ SPARE_VDEV_FOUND=$1 -+ STATUS=$2 -+ if [ "${STATUS}" = "AVAIL" ]; then -+ ${ZPOOL} replace ${ZEVENT_POOL} \ -+ ${ZEVENT_VDEV_GUID} ${SPARE_VDEV_FOUND} && exit 0 -+ fi -+ done -+ -+ exit 1 -+fi -+ -+exit 4 -diff --git a/cmd/zed/zed.d/resilver.finish-email.sh b/cmd/zed/zed.d/resilver.finish-email.sh -new file mode 120000 -index 0000000..1afad32 ---- /dev/null -+++ b/cmd/zed/zed.d/resilver.finish-email.sh -@@ -0,0 +1 @@ -+scrub.finish-email.sh -\ No newline at end of file -diff --git a/cmd/zed/zed.d/scrub.finish-email.sh b/cmd/zed/zed.d/scrub.finish-email.sh -new file mode 100755 -index 0000000..b5ce3f7 ---- /dev/null -+++ b/cmd/zed/zed.d/scrub.finish-email.sh -@@ -0,0 +1,73 @@ -+#!/bin/sh -+# -+# Send email to ZED_EMAIL in response to a RESILVER.FINISH or SCRUB.FINISH. -+# By default, "zpool status" output will only be included in the email for -+# a scrub.finish zevent if the pool is not healthy; to always include its -+# output, set ZED_EMAIL_VERBOSE=1. -+# Exit codes: -+# 0: email sent -+# 1: email failed -+# 2: email suppressed -+# 3: missing executable -+# 4: unsupported event class -+# 5: internal error -+# -+test -f "${ZED_SCRIPT_DIR}/zed.rc" && . "${ZED_SCRIPT_DIR}/zed.rc" -+ -+test -n "${ZEVENT_POOL}" || exit 5 -+test -n "${ZEVENT_SUBCLASS}" || exit 5 -+ -+if test "${ZEVENT_SUBCLASS}" = "resilver.finish"; then -+ ACTION="resilvering" -+elif test "${ZEVENT_SUBCLASS}" = "scrub.finish"; then -+ ACTION="scrubbing" -+else -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: unsupported event class \"${ZEVENT_SUBCLASS}\" -+ exit 4 -+fi -+ -+# Only send email if ZED_EMAIL has been configured. -+test -n "${ZED_EMAIL}" || exit 2 -+ -+# Ensure requisite executables are installed. -+if ! command -v "${MAIL:=mail}" >/dev/null 2>&1; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${MAIL}" not installed -+ exit 3 -+fi -+if ! test -x "${ZPOOL}"; then -+ logger -t "${ZED_SYSLOG_TAG:=zed}" \ -+ -p "${ZED_SYSLOG_PRIORITY:=daemon.warning}" \ -+ `basename "$0"`: "${ZPOOL}" not installed -+ exit 3 -+fi -+ -+# For scrub, suppress email if pool is healthy and verbosity is not enabled. 
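The spare thresholds consumed by io-spare.sh default to 0 (disabled), and the scrub email suppression just described is controlled by ZED_EMAIL_VERBOSE; a hedged zed.rc sketch using those variable names, with example values only:

    # Illustrative zed.rc fragment (values are examples, not recommendations).
    ZED_SPARE_ON_IO_ERRORS=1
    ZED_SPARE_ON_CHECKSUM_ERRORS=10
    ZED_EMAIL_VERBOSE=1

The manual follow-up named in the io-spare.sh header, with placeholder pool and device names:

    # After the spare has taken over, retire the failed device by hand.
    zpool detach tank sdb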
-+if test "${ZEVENT_SUBCLASS}" = "scrub.finish"; then -+ HEALTHY=`"${ZPOOL}" status -x "${ZEVENT_POOL}" | \ -+ grep "'${ZEVENT_POOL}' is healthy"` -+ test -n "${HEALTHY}" -a "${ZED_EMAIL_VERBOSE:=0}" = 0 && exit 2 -+fi -+ -+"${MAIL}" -s "ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on `hostname`" \ -+ "${ZED_EMAIL}" <. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_H -+#define ZED_H -+ -+/* -+ * Absolute path for the default zed configuration file. -+ */ -+#define ZED_CONF_FILE SYSCONFDIR "/zfs/zed.conf" -+ -+/* -+ * Absolute path for the default zed pid file. -+ */ -+#define ZED_PID_FILE RUNSTATEDIR "/zed.pid" -+ -+/* -+ * Absolute path for the default zed state file. -+ */ -+#define ZED_STATE_FILE RUNSTATEDIR "/zed.state" -+ -+/* -+ * Absolute path for the default zed script directory. -+ */ -+#define ZED_SCRIPT_DIR SYSCONFDIR "/zfs/zed.d" -+ -+/* -+ * Reserved for future use. -+ */ -+#define ZED_MAX_EVENTS 0 -+ -+/* -+ * Reserved for future use. -+ */ -+#define ZED_MIN_EVENTS 0 -+ -+/* -+ * String prefix for ZED variables passed via environment variables. -+ */ -+#define ZED_VAR_PREFIX "ZED_" -+ -+/* -+ * String prefix for ZFS event names passed via environment variables. -+ */ -+#define ZEVENT_VAR_PREFIX "ZEVENT_" -+ -+#endif /* !ZED_H */ -diff --git a/cmd/zed/zed_conf.c b/cmd/zed/zed_conf.c -new file mode 100644 -index 0000000..78b45e9 ---- /dev/null -+++ b/cmd/zed/zed_conf.c -@@ -0,0 +1,680 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed.h" -+#include "zed_conf.h" -+#include "zed_file.h" -+#include "zed_log.h" -+#include "zed_strings.h" -+ -+/* -+ * Return a new configuration with default values. 
-+ */ -+struct zed_conf * -+zed_conf_create(void) -+{ -+ struct zed_conf *zcp; -+ -+ zcp = malloc(sizeof (*zcp)); -+ if (!zcp) -+ goto nomem; -+ -+ memset(zcp, 0, sizeof (*zcp)); -+ -+ zcp->syslog_facility = LOG_DAEMON; -+ zcp->min_events = ZED_MIN_EVENTS; -+ zcp->max_events = ZED_MAX_EVENTS; -+ zcp->scripts = NULL; /* created via zed_conf_scan_dir() */ -+ zcp->state_fd = -1; /* opened via zed_conf_open_state() */ -+ zcp->zfs_hdl = NULL; /* opened via zed_event_init() */ -+ zcp->zevent_fd = -1; /* opened via zed_event_init() */ -+ -+ if (!(zcp->conf_file = strdup(ZED_CONF_FILE))) -+ goto nomem; -+ -+ if (!(zcp->pid_file = strdup(ZED_PID_FILE))) -+ goto nomem; -+ -+ if (!(zcp->script_dir = strdup(ZED_SCRIPT_DIR))) -+ goto nomem; -+ -+ if (!(zcp->state_file = strdup(ZED_STATE_FILE))) -+ goto nomem; -+ -+ return (zcp); -+ -+nomem: -+ zed_log_die("Failed to create conf: %s", strerror(errno)); -+ return (NULL); -+} -+ -+/* -+ * Destroy the configuration [zcp]. -+ * Note: zfs_hdl & zevent_fd are destroyed via zed_event_fini(). -+ */ -+void -+zed_conf_destroy(struct zed_conf *zcp) -+{ -+ if (!zcp) -+ return; -+ -+ if (zcp->state_fd >= 0) { -+ if (close(zcp->state_fd) < 0) -+ zed_log_msg(LOG_WARNING, -+ "Failed to close state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ } -+ if (zcp->pid_file) { -+ if ((unlink(zcp->pid_file) < 0) && (errno != ENOENT)) -+ zed_log_msg(LOG_WARNING, -+ "Failed to remove pid file \"%s\": %s", -+ zcp->pid_file, strerror(errno)); -+ } -+ if (zcp->conf_file) -+ free(zcp->conf_file); -+ -+ if (zcp->pid_file) -+ free(zcp->pid_file); -+ -+ if (zcp->script_dir) -+ free(zcp->script_dir); -+ -+ if (zcp->state_file) -+ free(zcp->state_file); -+ -+ if (zcp->scripts) -+ zed_strings_destroy(zcp->scripts); -+ -+ free(zcp); -+} -+ -+/* -+ * Display command-line help and exit. -+ * If [got_err] is 0, output to stdout and exit normally; -+ * otherwise, output to stderr and exit with a failure status. -+ */ -+static void -+_zed_conf_display_help(const char *prog, int got_err) -+{ -+ FILE *fp = got_err ? stderr : stdout; -+ int w1 = 4; /* width of leading whitespace */ -+ int w2 = 8; /* width of L-justified option field */ -+ -+ fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? prog : "zed")); -+ fprintf(fp, "\n"); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h", -+ "Display help."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L", -+ "Display license information."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V", -+ "Display version information."); -+ fprintf(fp, "\n"); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v", -+ "Be verbose."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f", -+ "Force daemon to run."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F", -+ "Run daemon in the foreground."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M", -+ "Lock all pages in memory."); -+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z", -+ "Zero state file."); -+ fprintf(fp, "\n"); -+#if 0 -+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE", -+ "Read configuration from FILE.", ZED_CONF_FILE); -+#endif -+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR", -+ "Read enabled scripts from DIR.", ZED_SCRIPT_DIR); -+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE", -+ "Write daemon's PID to FILE.", ZED_PID_FILE); -+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE", -+ "Write daemon's state to FILE.", ZED_STATE_FILE); -+ fprintf(fp, "\n"); -+ -+ exit(got_err ? 
EXIT_FAILURE : EXIT_SUCCESS); -+} -+ -+/* -+ * Display license information to stdout and exit. -+ */ -+static void -+_zed_conf_display_license(void) -+{ -+ const char **pp; -+ const char *text[] = { -+ "The ZFS Event Daemon (ZED) is distributed under the terms of the", -+ " Common Development and Distribution License (CDDL-1.0)", -+ " .", -+ "Developed at Lawrence Livermore National Laboratory" -+ " (LLNL-CODE-403049).", -+ "Copyright (C) 2013-2014" -+ " Lawrence Livermore National Security, LLC.", -+ "", -+ NULL -+ }; -+ -+ for (pp = text; *pp; pp++) -+ printf("%s\n", *pp); -+ -+ exit(EXIT_SUCCESS); -+} -+ -+/* -+ * Display version information to stdout and exit. -+ */ -+static void -+_zed_conf_display_version(void) -+{ -+ printf("%s-%s-%s\n", -+ ZFS_META_NAME, ZFS_META_VERSION, ZFS_META_RELEASE); -+ -+ exit(EXIT_SUCCESS); -+} -+ -+/* -+ * Copy the [path] string to the [resultp] ptr. -+ * If [path] is not an absolute path, prefix it with the current working dir. -+ * If [resultp] is non-null, free its existing string before assignment. -+ */ -+static void -+_zed_conf_parse_path(char **resultp, const char *path) -+{ -+ char buf[PATH_MAX]; -+ -+ assert(resultp != NULL); -+ assert(path != NULL); -+ -+ if (*resultp) -+ free(*resultp); -+ -+ if (path[0] == '/') { -+ *resultp = strdup(path); -+ } else if (!getcwd(buf, sizeof (buf))) { -+ zed_log_die("Failed to get current working dir: %s", -+ strerror(errno)); -+ } else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) { -+ zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); -+ } else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) { -+ zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); -+ } else { -+ *resultp = strdup(buf); -+ } -+ if (!*resultp) -+ zed_log_die("Failed to copy path: %s", strerror(ENOMEM)); -+} -+ -+/* -+ * Parse the command-line options into the configuration [zcp]. -+ */ -+void -+zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) -+{ -+ const char * const opts = ":hLVc:d:p:s:vfFMZ"; -+ int opt; -+ -+ if (!zcp || !argv || !argv[0]) -+ zed_log_die("Failed to parse options: Internal error"); -+ -+ opterr = 0; /* suppress default getopt err msgs */ -+ -+ while ((opt = getopt(argc, argv, opts)) != -1) { -+ switch (opt) { -+ case 'h': -+ _zed_conf_display_help(argv[0], EXIT_SUCCESS); -+ break; -+ case 'L': -+ _zed_conf_display_license(); -+ break; -+ case 'V': -+ _zed_conf_display_version(); -+ break; -+ case 'c': -+ _zed_conf_parse_path(&zcp->conf_file, optarg); -+ break; -+ case 'd': -+ _zed_conf_parse_path(&zcp->script_dir, optarg); -+ break; -+ case 'p': -+ _zed_conf_parse_path(&zcp->pid_file, optarg); -+ break; -+ case 's': -+ _zed_conf_parse_path(&zcp->state_file, optarg); -+ break; -+ case 'v': -+ zcp->do_verbose = 1; -+ break; -+ case 'f': -+ zcp->do_force = 1; -+ break; -+ case 'F': -+ zcp->do_foreground = 1; -+ break; -+ case 'M': -+ zcp->do_memlock = 1; -+ break; -+ case 'Z': -+ zcp->do_zero = 1; -+ break; -+ case '?': -+ default: -+ if (optopt == '?') -+ _zed_conf_display_help(argv[0], EXIT_SUCCESS); -+ -+ fprintf(stderr, "%s: %s '-%c'\n\n", argv[0], -+ "Invalid option", optopt); -+ _zed_conf_display_help(argv[0], EXIT_FAILURE); -+ break; -+ } -+ } -+} -+ -+/* -+ * Parse the configuration file into the configuration [zcp]. -+ * FIXME: Not yet implemented. 
-+ */ -+void -+zed_conf_parse_file(struct zed_conf *zcp) -+{ -+ if (!zcp) -+ zed_log_die("Failed to parse config: %s", strerror(EINVAL)); -+} -+ -+/* -+ * Scan the [zcp] script_dir for files to exec based on the event class. -+ * Files must be executable by user, but not writable by group or other. -+ * Dotfiles are ignored. -+ * Return 0 on success with an updated set of scripts, -+ * or -1 on error with errno set. -+ * FIXME: Check if script_dir and all parent dirs are secure. -+ */ -+int -+zed_conf_scan_dir(struct zed_conf *zcp) -+{ -+ zed_strings_t *scripts; -+ DIR *dirp; -+ struct dirent *direntp; -+ char pathname[PATH_MAX]; -+ struct stat st; -+ int n; -+ -+ if (!zcp) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to scan script dir: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ scripts = zed_strings_create(); -+ if (!scripts) { -+ errno = ENOMEM; -+ zed_log_msg(LOG_WARNING, "Failed to scan dir \"%s\": %s", -+ zcp->script_dir, strerror(errno)); -+ return (-1); -+ } -+ dirp = opendir(zcp->script_dir); -+ if (!dirp) { -+ int errno_bak = errno; -+ zed_log_msg(LOG_WARNING, "Failed to open dir \"%s\": %s", -+ zcp->script_dir, strerror(errno)); -+ zed_strings_destroy(scripts); -+ errno = errno_bak; -+ return (-1); -+ } -+ while ((direntp = readdir(dirp))) { -+ if (direntp->d_name[0] == '.') -+ continue; -+ -+ n = snprintf(pathname, sizeof (pathname), -+ "%s/%s", zcp->script_dir, direntp->d_name); -+ if ((n < 0) || (n >= sizeof (pathname))) { -+ zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", -+ direntp->d_name, strerror(ENAMETOOLONG)); -+ continue; -+ } -+ if (stat(pathname, &st) < 0) { -+ zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", -+ pathname, strerror(errno)); -+ continue; -+ } -+ if (!S_ISREG(st.st_mode)) { -+ zed_log_msg(LOG_INFO, -+ "Ignoring \"%s\": not a regular file", -+ direntp->d_name); -+ continue; -+ } -+ if ((st.st_uid != 0) && !zcp->do_force) { -+ zed_log_msg(LOG_NOTICE, -+ "Ignoring \"%s\": not owned by root", -+ direntp->d_name); -+ continue; -+ } -+ if (!(st.st_mode & S_IXUSR)) { -+ zed_log_msg(LOG_INFO, -+ "Ignoring \"%s\": not executable by user", -+ direntp->d_name); -+ continue; -+ } -+ if ((st.st_mode & S_IWGRP) & !zcp->do_force) { -+ zed_log_msg(LOG_NOTICE, -+ "Ignoring \"%s\": writable by group", -+ direntp->d_name); -+ continue; -+ } -+ if ((st.st_mode & S_IWOTH) & !zcp->do_force) { -+ zed_log_msg(LOG_NOTICE, -+ "Ignoring \"%s\": writable by other", -+ direntp->d_name); -+ continue; -+ } -+ if (zed_strings_add(scripts, direntp->d_name) < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to register \"%s\": %s", -+ direntp->d_name, strerror(errno)); -+ continue; -+ } -+ if (zcp->do_verbose) -+ zed_log_msg(LOG_INFO, -+ "Registered script \"%s\"", direntp->d_name); -+ } -+ if (closedir(dirp) < 0) { -+ int errno_bak = errno; -+ zed_log_msg(LOG_WARNING, "Failed to close dir \"%s\": %s", -+ zcp->script_dir, strerror(errno)); -+ zed_strings_destroy(scripts); -+ errno = errno_bak; -+ return (-1); -+ } -+ if (zcp->scripts) -+ zed_strings_destroy(zcp->scripts); -+ -+ zcp->scripts = scripts; -+ return (0); -+} -+ -+/* -+ * Write the PID file specified in [zcp]. -+ * Return 0 on success, -1 on error. -+ * XXX: This must be called after fork()ing to become a daemon. 
-+ */ -+int -+zed_conf_write_pid(struct zed_conf *zcp) -+{ -+ char dirbuf[PATH_MAX]; -+ mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; -+ int n; -+ char *p; -+ mode_t mask; -+ FILE *fp; -+ -+ if (!zcp || !zcp->pid_file) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to write pid file: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ n = strlcpy(dirbuf, zcp->pid_file, sizeof (dirbuf)); -+ if (n >= sizeof (dirbuf)) { -+ errno = ENAMETOOLONG; -+ zed_log_msg(LOG_WARNING, "Failed to write pid file: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ p = strrchr(dirbuf, '/'); -+ if (p) -+ *p = '\0'; -+ -+ if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to create directory \"%s\": %s", -+ dirbuf, strerror(errno)); -+ return (-1); -+ } -+ (void) unlink(zcp->pid_file); -+ -+ mask = umask(0); -+ umask(mask | 022); -+ fp = fopen(zcp->pid_file, "w"); -+ umask(mask); -+ -+ if (!fp) { -+ zed_log_msg(LOG_WARNING, "Failed to open pid file \"%s\": %s", -+ zcp->pid_file, strerror(errno)); -+ } else if (fprintf(fp, "%d\n", (int) getpid()) == EOF) { -+ zed_log_msg(LOG_WARNING, "Failed to write pid file \"%s\": %s", -+ zcp->pid_file, strerror(errno)); -+ } else if (fclose(fp) == EOF) { -+ zed_log_msg(LOG_WARNING, "Failed to close pid file \"%s\": %s", -+ zcp->pid_file, strerror(errno)); -+ } else { -+ return (0); -+ } -+ (void) unlink(zcp->pid_file); -+ return (-1); -+} -+ -+/* -+ * Open and lock the [zcp] state_file. -+ * Return 0 on success, -1 on error. -+ * FIXME: If state_file exists, verify ownership & permissions. -+ * FIXME: Move lock to pid_file instead. -+ */ -+int -+zed_conf_open_state(struct zed_conf *zcp) -+{ -+ char dirbuf[PATH_MAX]; -+ mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; -+ int n; -+ char *p; -+ int rv; -+ -+ if (!zcp || !zcp->state_file) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to open state file: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ n = strlcpy(dirbuf, zcp->state_file, sizeof (dirbuf)); -+ if (n >= sizeof (dirbuf)) { -+ errno = ENAMETOOLONG; -+ zed_log_msg(LOG_WARNING, "Failed to open state file: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ p = strrchr(dirbuf, '/'); -+ if (p) -+ *p = '\0'; -+ -+ if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to create directory \"%s\": %s", -+ dirbuf, strerror(errno)); -+ return (-1); -+ } -+ if (zcp->state_fd >= 0) { -+ if (close(zcp->state_fd) < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to close state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ } -+ if (zcp->do_zero) -+ (void) unlink(zcp->state_file); -+ -+ zcp->state_fd = open(zcp->state_file, -+ (O_RDWR | O_CREAT), (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); -+ if (zcp->state_fd < 0) { -+ zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ rv = zed_file_lock(zcp->state_fd); -+ if (rv < 0) { -+ zed_log_msg(LOG_WARNING, "Failed to lock state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ if (rv > 0) { -+ pid_t pid = zed_file_is_locked(zcp->state_fd); -+ if (pid < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to test lock on state file \"%s\"", -+ zcp->state_file); -+ } else if (pid > 0) { -+ zed_log_msg(LOG_WARNING, -+ "Found pid %d bound to state file \"%s\"", -+ pid, zcp->state_file); -+ } else { -+ zed_log_msg(LOG_WARNING, -+ "Inconsistent lock state on state file \"%s\"", -+ 
zcp->state_file); -+ } -+ return (-1); -+ } -+ return (0); -+} -+ -+/* -+ * Read the opened [zcp] state_file to obtain the eid & etime -+ * of the last event processed. -+ * Write the state from the last event to the [eidp] & [etime] args -+ * passed by reference. -+ * Note that etime[] is an array of size 2. -+ * Return 0 on success, -1 on error. -+ */ -+int -+zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]) -+{ -+ ssize_t len; -+ struct iovec iov[3]; -+ ssize_t n; -+ -+ if (!zcp || !eidp || !etime) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, -+ "Failed to read state file: %s", strerror(errno)); -+ return (-1); -+ } -+ if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t) -1) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to reposition state file offset: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ len = 0; -+ iov[0].iov_base = eidp; -+ len += iov[0].iov_len = sizeof (*eidp); -+ iov[1].iov_base = &etime[0]; -+ len += iov[1].iov_len = sizeof (etime[0]); -+ iov[2].iov_base = &etime[1]; -+ len += iov[2].iov_len = sizeof (etime[1]); -+ -+ n = readv(zcp->state_fd, iov, 3); -+ if (n == 0) { -+ *eidp = 0; -+ } else if (n < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to read state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } else if (n != len) { -+ errno = EIO; -+ zed_log_msg(LOG_WARNING, -+ "Failed to read state file \"%s\": Read %d of %d bytes", -+ zcp->state_file, n, len); -+ return (-1); -+ } -+ return (0); -+} -+ -+/* -+ * Write the [eid] & [etime] of the last processed event to the opened -+ * [zcp] state_file. -+ * Note that etime[] is an array of size 2. -+ * Return 0 on success, -1 on error. -+ */ -+int -+zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]) -+{ -+ ssize_t len; -+ struct iovec iov[3]; -+ ssize_t n; -+ -+ if (!zcp) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, -+ "Failed to write state file: %s", strerror(errno)); -+ return (-1); -+ } -+ if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t) -1) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to reposition state file offset: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ len = 0; -+ iov[0].iov_base = &eid; -+ len += iov[0].iov_len = sizeof (eid); -+ iov[1].iov_base = &etime[0]; -+ len += iov[1].iov_len = sizeof (etime[0]); -+ iov[2].iov_base = &etime[1]; -+ len += iov[2].iov_len = sizeof (etime[1]); -+ -+ n = writev(zcp->state_fd, iov, 3); -+ if (n < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to write state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ if (n != len) { -+ errno = EIO; -+ zed_log_msg(LOG_WARNING, -+ "Failed to write state file \"%s\": Wrote %d of %d bytes", -+ zcp->state_file, n, len); -+ return (-1); -+ } -+ if (fdatasync(zcp->state_fd) < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to sync state file \"%s\": %s", -+ zcp->state_file, strerror(errno)); -+ return (-1); -+ } -+ return (0); -+} -diff --git a/cmd/zed/zed_conf.h b/cmd/zed/zed_conf.h -new file mode 100644 -index 0000000..51b98ea ---- /dev/null -+++ b/cmd/zed/zed_conf.h -@@ -0,0 +1,71 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. 
-+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_CONF_H -+#define ZED_CONF_H -+ -+#include -+#include -+#include "zed_strings.h" -+ -+struct zed_conf { -+ unsigned do_force:1; /* true if force enabled */ -+ unsigned do_foreground:1; /* true if run in foreground */ -+ unsigned do_memlock:1; /* true if locking memory */ -+ unsigned do_verbose:1; /* true if verbosity enabled */ -+ unsigned do_zero:1; /* true if zeroing state */ -+ int syslog_facility; /* syslog facility value */ -+ int min_events; /* RESERVED FOR FUTURE USE */ -+ int max_events; /* RESERVED FOR FUTURE USE */ -+ char *conf_file; /* abs path to config file */ -+ char *pid_file; /* abs path to pid file */ -+ char *script_dir; /* abs path to script dir */ -+ zed_strings_t *scripts; /* names of enabled scripts */ -+ char *state_file; /* abs path to state file */ -+ int state_fd; /* fd to state file */ -+ libzfs_handle_t *zfs_hdl; /* handle to libzfs */ -+ int zevent_fd; /* fd for access to zevents */ -+}; -+ -+struct zed_conf *zed_conf_create(void); -+ -+void zed_conf_destroy(struct zed_conf *zcp); -+ -+void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv); -+ -+void zed_conf_parse_file(struct zed_conf *zcp); -+ -+int zed_conf_scan_dir(struct zed_conf *zcp); -+ -+int zed_conf_write_pid(struct zed_conf *zcp); -+ -+int zed_conf_open_state(struct zed_conf *zcp); -+ -+int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]); -+ -+int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]); -+ -+#endif /* !ZED_CONF_H */ -diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c -new file mode 100644 -index 0000000..e504aef ---- /dev/null -+++ b/cmd/zed/zed_event.c -@@ -0,0 +1,829 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include /* FIXME: Replace with libzfs_core. 
*/ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed.h" -+#include "zed_conf.h" -+#include "zed_exec.h" -+#include "zed_file.h" -+#include "zed_log.h" -+#include "zed_strings.h" -+ -+/* -+ * Open the libzfs interface. -+ */ -+void -+zed_event_init(struct zed_conf *zcp) -+{ -+ if (!zcp) -+ zed_log_die("Failed zed_event_init: %s", strerror(EINVAL)); -+ -+ zcp->zfs_hdl = libzfs_init(); -+ if (!zcp->zfs_hdl) -+ zed_log_die("Failed to initialize libzfs"); -+ -+ zcp->zevent_fd = open(ZFS_DEV, O_RDWR); -+ if (zcp->zevent_fd < 0) -+ zed_log_die("Failed to open \"%s\": %s", -+ ZFS_DEV, strerror(errno)); -+} -+ -+/* -+ * Close the libzfs interface. -+ */ -+void -+zed_event_fini(struct zed_conf *zcp) -+{ -+ if (!zcp) -+ zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL)); -+ -+ if (zcp->zevent_fd >= 0) { -+ if (close(zcp->zevent_fd) < 0) -+ zed_log_msg(LOG_WARNING, "Failed to close \"%s\": %s", -+ ZFS_DEV, strerror(errno)); -+ -+ zcp->zevent_fd = -1; -+ } -+ if (zcp->zfs_hdl) { -+ libzfs_fini(zcp->zfs_hdl); -+ zcp->zfs_hdl = NULL; -+ } -+} -+ -+/* -+ * Seek to the event specified by [saved_eid] and [saved_etime]. -+ * This protects against processing a given event more than once. -+ * Return 0 upon a successful seek to the specified event, or -1 otherwise. -+ * A zevent is considered to be uniquely specified by its (eid,time) tuple. -+ * The unsigned 64b eid is set to 1 when the kernel module is loaded, and -+ * incremented by 1 for each new event. Since the state file can persist -+ * across a kernel module reload, the time must be checked to ensure a match. -+ */ -+int -+zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[]) -+{ -+ uint64_t eid; -+ int found; -+ nvlist_t *nvl; -+ int n_dropped; -+ int64_t *etime; -+ uint_t nelem; -+ int rv; -+ -+ if (!zcp) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to seek zevent: %s", -+ strerror(errno)); -+ return (-1); -+ } -+ eid = 0; -+ found = 0; -+ while ((eid < saved_eid) && !found) { -+ rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, -+ ZEVENT_NONBLOCK, zcp->zevent_fd); -+ -+ if ((rv != 0) || !nvl) -+ break; -+ -+ if (n_dropped > 0) { -+ zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); -+ /* -+ * FIXME: Increase max size of event nvlist in -+ * /sys/module/zfs/parameters/zfs_zevent_len_max ? -+ */ -+ } -+ if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { -+ zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); -+ } else if (nvlist_lookup_int64_array(nvl, "time", -+ &etime, &nelem) != 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent time (eid=%llu)", eid); -+ } else if (nelem != 2) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent time (eid=%llu, nelem=%u)", -+ eid, nelem); -+ } else if ((eid != saved_eid) || -+ (etime[0] != saved_etime[0]) || -+ (etime[1] != saved_etime[1])) { -+ /* no-op */ -+ } else { -+ found = 1; -+ } -+ free(nvl); -+ } -+ if (!found && (saved_eid > 0)) { -+ if (zpool_events_seek(zcp->zfs_hdl, ZEVENT_SEEK_START, -+ zcp->zevent_fd) < 0) -+ zed_log_msg(LOG_WARNING, "Failed to seek to eid=0"); -+ else -+ eid = 0; -+ } -+ zed_log_msg(LOG_NOTICE, "Processing events since eid=%llu", eid); -+ return (found ? 
0 : -1); -+} -+ -+static int -+_zed_event_convert_int8_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ int8_t *i8p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_int8_array(nvp, &i8p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%d ", i8p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_uint8_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ uint8_t *u8p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_uint8_array(nvp, &u8p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%u ", u8p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_int16_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ int16_t *i16p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_int16_array(nvp, &i16p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%d ", i16p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_uint16_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ uint16_t *u16p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_uint16_array(nvp, &u16p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%u ", u16p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_int32_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ int32_t *i32p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_int32_array(nvp, &i32p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%d ", i32p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_uint32_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ uint32_t *u32p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_uint32_array(nvp, &u32p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%u ", u32p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_int64_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ int64_t *i64p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_int64_array(nvp, &i64p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%lld ", (u_longlong_t) i64p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ 
p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_uint64_array(char *buf, int buflen, nvpair_t *nvp, -+ const char *fmt) -+{ -+ uint64_t *u64p; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_uint64_array(nvp, &u64p, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, fmt, (u_longlong_t) u64p[i]); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+static int -+_zed_event_convert_string_array(char *buf, int buflen, nvpair_t *nvp) -+{ -+ char **strp; -+ uint_t nelem; -+ uint_t i; -+ char *p; -+ int n; -+ -+ assert(buf != NULL); -+ -+ (void) nvpair_value_string_array(nvp, &strp, &nelem); -+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { -+ n = snprintf(p, buflen, "%s ", strp[i] ? strp[i] : ""); -+ if ((n < 0) || (n >= buflen)) { -+ *buf = '\0'; -+ return (-1); -+ } -+ p += n; -+ buflen -= n; -+ } -+ if (nelem > 0) -+ *--p = '\0'; -+ -+ return (p - buf); -+} -+ -+/* -+ * Return non-zero if nvpair [name] should be formatted in hex; o/w, return 0. -+ */ -+static int -+_zed_event_value_is_hex(const char *name) -+{ -+ const char *hex_suffix[] = { -+ "_guid", -+ "_guids", -+ NULL -+ }; -+ const char **pp; -+ char *p; -+ -+ if (!name) -+ return (0); -+ -+ for (pp = hex_suffix; *pp; pp++) { -+ p = strstr(name, *pp); -+ if (p && strlen(p) == strlen(*pp)) -+ return (1); -+ } -+ return (0); -+} -+ -+/* -+ * Convert the nvpair [nvp] to a string which is added to the environment -+ * of the child process. -+ * Return 0 on success, -1 on error. -+ * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()? -+ */ -+static void -+_zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp) -+{ -+ const char *name; -+ data_type_t type; -+ char buf[4096]; -+ int buflen; -+ int n; -+ char *p; -+ const char *q; -+ const char *fmt; -+ -+ boolean_t b; -+ double d; -+ uint8_t i8; -+ uint16_t i16; -+ uint32_t i32; -+ uint64_t i64; -+ char *str; -+ -+ assert(zsp != NULL); -+ assert(nvp != NULL); -+ -+ name = nvpair_name(nvp); -+ type = nvpair_type(nvp); -+ buflen = sizeof (buf); -+ -+ /* Copy NAME prefix for ZED zevent namespace. */ -+ n = strlcpy(buf, ZEVENT_VAR_PREFIX, sizeof (buf)); -+ if (n >= sizeof (buf)) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to convert nvpair \"%s\" for eid=%llu: %s", -+ name, eid, "Exceeded buffer size"); -+ return; -+ } -+ buflen -= n; -+ p = buf + n; -+ -+ /* Convert NAME to alphanumeric uppercase. */ -+ for (q = name; *q && (buflen > 0); q++) { -+ *p++ = isalnum(*q) ? toupper(*q) : '_'; -+ buflen--; -+ } -+ -+ /* Separate NAME from VALUE. */ -+ if (buflen > 0) { -+ *p++ = '='; -+ buflen--; -+ } -+ *p = '\0'; -+ -+ /* Convert VALUE. */ -+ switch (type) { -+ case DATA_TYPE_BOOLEAN: -+ n = snprintf(p, buflen, "%s", "1"); -+ break; -+ case DATA_TYPE_BOOLEAN_VALUE: -+ (void) nvpair_value_boolean_value(nvp, &b); -+ n = snprintf(p, buflen, "%s", b ? 
"1" : "0"); -+ break; -+ case DATA_TYPE_BYTE: -+ (void) nvpair_value_byte(nvp, &i8); -+ n = snprintf(p, buflen, "%d", i8); -+ break; -+ case DATA_TYPE_INT8: -+ (void) nvpair_value_int8(nvp, (int8_t *) &i8); -+ n = snprintf(p, buflen, "%d", i8); -+ break; -+ case DATA_TYPE_UINT8: -+ (void) nvpair_value_uint8(nvp, &i8); -+ n = snprintf(p, buflen, "%u", i8); -+ break; -+ case DATA_TYPE_INT16: -+ (void) nvpair_value_int16(nvp, (int16_t *) &i16); -+ n = snprintf(p, buflen, "%d", i16); -+ break; -+ case DATA_TYPE_UINT16: -+ (void) nvpair_value_uint16(nvp, &i16); -+ n = snprintf(p, buflen, "%u", i16); -+ break; -+ case DATA_TYPE_INT32: -+ (void) nvpair_value_int32(nvp, (int32_t *) &i32); -+ n = snprintf(p, buflen, "%d", i32); -+ break; -+ case DATA_TYPE_UINT32: -+ (void) nvpair_value_uint32(nvp, &i32); -+ n = snprintf(p, buflen, "%u", i32); -+ break; -+ case DATA_TYPE_INT64: -+ (void) nvpair_value_int64(nvp, (int64_t *) &i64); -+ n = snprintf(p, buflen, "%lld", (longlong_t) i64); -+ break; -+ case DATA_TYPE_UINT64: -+ (void) nvpair_value_uint64(nvp, &i64); -+ fmt = _zed_event_value_is_hex(name) ? "0x%.16llX" : "%llu"; -+ n = snprintf(p, buflen, fmt, (u_longlong_t) i64); -+ break; -+ case DATA_TYPE_DOUBLE: -+ (void) nvpair_value_double(nvp, &d); -+ n = snprintf(p, buflen, "%g", d); -+ break; -+ case DATA_TYPE_HRTIME: -+ (void) nvpair_value_hrtime(nvp, (hrtime_t *) &i64); -+ n = snprintf(p, buflen, "%llu", (u_longlong_t) i64); -+ break; -+ case DATA_TYPE_NVLIST: -+ /* FIXME */ -+ n = snprintf(p, buflen, "%s", "_NOT_IMPLEMENTED_"); -+ break; -+ case DATA_TYPE_STRING: -+ (void) nvpair_value_string(nvp, &str); -+ n = snprintf(p, buflen, "%s", (str ? str : "")); -+ break; -+ case DATA_TYPE_BOOLEAN_ARRAY: -+ /* FIXME */ -+ n = snprintf(p, buflen, "%s", "_NOT_IMPLEMENTED_"); -+ break; -+ case DATA_TYPE_BYTE_ARRAY: -+ /* FIXME */ -+ n = snprintf(p, buflen, "%s", "_NOT_IMPLEMENTED_"); -+ break; -+ case DATA_TYPE_INT8_ARRAY: -+ n = _zed_event_convert_int8_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_UINT8_ARRAY: -+ n = _zed_event_convert_uint8_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_INT16_ARRAY: -+ n = _zed_event_convert_int16_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_UINT16_ARRAY: -+ n = _zed_event_convert_uint16_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_INT32_ARRAY: -+ n = _zed_event_convert_int32_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_UINT32_ARRAY: -+ n = _zed_event_convert_uint32_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_INT64_ARRAY: -+ n = _zed_event_convert_int64_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_UINT64_ARRAY: -+ fmt = _zed_event_value_is_hex(name) ? 
"0x%.16llX " : "%llu "; -+ n = _zed_event_convert_uint64_array(p, buflen, nvp, fmt); -+ break; -+ case DATA_TYPE_STRING_ARRAY: -+ n = _zed_event_convert_string_array(p, buflen, nvp); -+ break; -+ case DATA_TYPE_NVLIST_ARRAY: -+ /* FIXME */ -+ n = snprintf(p, buflen, "%s", "_NOT_IMPLEMENTED_"); -+ break; -+ default: -+ zed_log_msg(LOG_WARNING, -+ "Failed to convert nvpair \"%s\" for eid=%llu: " -+ "Unrecognized type=%u", name, eid, (unsigned int) type); -+ return; -+ } -+ if ((n < 0) || (n >= sizeof (buf))) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to convert nvpair \"%s\" for eid=%llu: %s", -+ name, eid, "Exceeded buffer size"); -+ return; -+ } -+ if (zed_strings_add(zsp, buf) < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to convert nvpair \"%s\" for eid=%llu: %s", -+ name, eid, strerror(ENOMEM)); -+ return; -+ } -+} -+ -+/* -+ * Add the environment variable specified by the format string [fmt]. -+ */ -+static void -+_zed_event_add_var(uint64_t eid, zed_strings_t *zsp, const char *fmt, ...) -+{ -+ char buf[4096]; -+ va_list vargs; -+ int n; -+ const char *p; -+ size_t namelen; -+ -+ assert(zsp != NULL); -+ assert(fmt != NULL); -+ -+ va_start(vargs, fmt); -+ n = vsnprintf(buf, sizeof (buf), fmt, vargs); -+ va_end(vargs); -+ p = strchr(buf, '='); -+ namelen = (p) ? p - buf : strlen(buf); -+ -+ if ((n < 0) || (n >= sizeof (buf))) { -+ zed_log_msg(LOG_WARNING, "Failed to add %.*s for eid=%llu: %s", -+ namelen, buf, eid, "Exceeded buffer size"); -+ } else if (!p) { -+ zed_log_msg(LOG_WARNING, "Failed to add %.*s for eid=%llu: %s", -+ namelen, buf, eid, "Missing assignment"); -+ } else if (zed_strings_add(zsp, buf) < 0) { -+ zed_log_msg(LOG_WARNING, "Failed to add %.*s for eid=%llu: %s", -+ namelen, buf, eid, strerror(ENOMEM)); -+ } -+} -+ -+/* -+ * Restrict various environment variables to safe and sane values -+ * when constructing the environment for the child process. -+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1. -+ */ -+static void -+_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp) -+{ -+ const char *env_restrict[] = { -+ "IFS= \t\n", -+ "PATH=" _PATH_STDPATH, -+ "ZDB=" SBINDIR "/zdb", -+ "ZED=" SBINDIR "/zed", -+ "ZFS=" SBINDIR "/zfs", -+ "ZINJECT=" SBINDIR "/zinject", -+ "ZPOOL=" SBINDIR "/zpool", -+ "ZFS_ALIAS=" ZFS_META_ALIAS, -+ "ZFS_VERSION=" ZFS_META_VERSION, -+ "ZFS_RELEASE=" ZFS_META_RELEASE, -+ NULL -+ }; -+ const char **pp; -+ -+ assert(zsp != NULL); -+ -+ for (pp = env_restrict; *pp; pp++) { -+ _zed_event_add_var(eid, zsp, "%s", *pp); -+ } -+} -+ -+/* -+ * Preserve specified variables from the parent environment -+ * when constructing the environment for the child process. -+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1. -+ */ -+static void -+_zed_event_add_env_preserve(uint64_t eid, zed_strings_t *zsp) -+{ -+ const char *env_preserve[] = { -+ "TZ", -+ NULL -+ }; -+ const char **pp; -+ const char *p; -+ -+ assert(zsp != NULL); -+ -+ for (pp = env_preserve; *pp; pp++) { -+ if ((p = getenv(*pp))) -+ _zed_event_add_var(eid, zsp, "%s=%s", *pp, p); -+ } -+} -+ -+/* -+ * Compute the "subclass" by removing the first 3 components of [class] -+ * (which seem to always be either "ereport.fs.zfs" or "resource.fs.zfs"). -+ * Return a pointer inside the string [class], or NULL if insufficient -+ * components exist. 
-+ */ -+static const char * -+_zed_event_get_subclass(const char *class) -+{ -+ const char *p; -+ int i; -+ -+ if (!class) -+ return (NULL); -+ -+ p = class; -+ for (i = 0; i < 3; i++) { -+ p = strchr(p, '.'); -+ if (!p) -+ break; -+ p++; -+ } -+ return (p); -+} -+ -+/* -+ * Convert the zevent time from a 2-element array of 64b integers -+ * into a more convenient form: -+ * TIME_SECS is the second component of the time. -+ * TIME_NSECS is the nanosecond component of the time. -+ * TIME_STRING is an almost-RFC3339-compliant string representation. -+ */ -+static void -+_zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) -+{ -+ struct tm *stp; -+ char buf[32]; -+ -+ assert(zsp != NULL); -+ assert(etime != NULL); -+ -+ _zed_event_add_var(eid, zsp, "%s%s=%lld", -+ ZEVENT_VAR_PREFIX, "TIME_SECS", (long long int) etime[0]); -+ _zed_event_add_var(eid, zsp, "%s%s=%lld", -+ ZEVENT_VAR_PREFIX, "TIME_NSECS", (long long int) etime[1]); -+ -+ if (!(stp = localtime((const time_t *) &etime[0]))) { -+ zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", -+ ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "localtime error"); -+ } else if (!strftime(buf, sizeof (buf), "%Y-%m-%d %H:%M:%S%z", stp)) { -+ zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", -+ ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "strftime error"); -+ } else { -+ _zed_event_add_var(eid, zsp, "%s%s=%s", -+ ZEVENT_VAR_PREFIX, "TIME_STRING", buf); -+ } -+} -+ -+/* -+ * Service the next zevent, blocking until one is available. -+ */ -+void -+zed_event_service(struct zed_conf *zcp) -+{ -+ nvlist_t *nvl; -+ nvpair_t *nvp; -+ int n_dropped; -+ zed_strings_t *zsp; -+ uint64_t eid; -+ int64_t *etime; -+ uint_t nelem; -+ char *class; -+ const char *subclass; -+ int rv; -+ -+ if (!zcp) { -+ errno = EINVAL; -+ zed_log_msg(LOG_ERR, "Failed to service zevent: %s", -+ strerror(errno)); -+ return; -+ } -+ rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, ZEVENT_NONE, -+ zcp->zevent_fd); -+ -+ if ((rv != 0) || !nvl) -+ return; -+ -+ if (n_dropped > 0) { -+ zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); -+ /* -+ * FIXME: Increase max size of event nvlist in -+ * /sys/module/zfs/parameters/zfs_zevent_len_max ? -+ */ -+ } -+ if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { -+ zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); -+ } else if (nvlist_lookup_int64_array( -+ nvl, "time", &etime, &nelem) != 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent time (eid=%llu)", eid); -+ } else if (nelem != 2) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent time (eid=%llu, nelem=%u)", -+ eid, nelem); -+ } else if (nvlist_lookup_string(nvl, "class", &class) != 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to lookup zevent class (eid=%llu)", eid); -+ } else { -+ zsp = zed_strings_create(); -+ -+ nvp = NULL; -+ while ((nvp = nvlist_next_nvpair(nvl, nvp))) -+ _zed_event_add_nvpair(eid, zsp, nvp); -+ -+ _zed_event_add_env_restrict(eid, zsp); -+ _zed_event_add_env_preserve(eid, zsp); -+ -+ _zed_event_add_var(eid, zsp, "%s%s=%d", -+ ZED_VAR_PREFIX, "PID", (int) getpid()); -+ _zed_event_add_var(eid, zsp, "%s%s=%s", -+ ZED_VAR_PREFIX, "SCRIPT_DIR", zcp->script_dir); -+ -+ subclass = _zed_event_get_subclass(class); -+ _zed_event_add_var(eid, zsp, "%s%s=%s", -+ ZEVENT_VAR_PREFIX, "SUBCLASS", -+ (subclass ? 
subclass : class)); -+ _zed_event_add_time_strings(eid, zsp, etime); -+ -+ zed_exec_process(eid, class, subclass, -+ zcp->script_dir, zcp->scripts, zsp, zcp->zevent_fd); -+ -+ zed_conf_write_state(zcp, eid, etime); -+ -+ zed_strings_destroy(zsp); -+ } -+ nvlist_free(nvl); -+} -diff --git a/cmd/zed/zed_event.h b/cmd/zed/zed_event.h -new file mode 100644 -index 0000000..71b3a2b ---- /dev/null -+++ b/cmd/zed/zed_event.h -@@ -0,0 +1,41 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_EVENT_H -+#define ZED_EVENT_H -+ -+#include -+ -+void zed_event_init(struct zed_conf *zcp); -+ -+void zed_event_fini(struct zed_conf *zcp); -+ -+int zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, -+ int64_t saved_etime[]); -+ -+void zed_event_service(struct zed_conf *zcp); -+ -+#endif /* !ZED_EVENT_H */ -diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c -new file mode 100644 -index 0000000..f461b78 ---- /dev/null -+++ b/cmd/zed/zed_exec.c -@@ -0,0 +1,207 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed_file.h" -+#include "zed_log.h" -+#include "zed_strings.h" -+ -+#define ZEVENT_FILENO 3 -+ -+/* -+ * Create an environment string array for passing to execve() using the -+ * NAME=VALUE strings in container [zsp]. -+ * Return a newly-allocated environment, or NULL on error. 
-+ */ -+static char ** -+_zed_exec_create_env(zed_strings_t *zsp) -+{ -+ int num_ptrs; -+ int buflen; -+ char *buf; -+ char **pp; -+ char *p; -+ const char *q; -+ int i; -+ int len; -+ -+ num_ptrs = zed_strings_count(zsp) + 1; -+ buflen = num_ptrs * sizeof (char *); -+ for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) -+ buflen += strlen(q) + 1; -+ -+ buf = malloc(buflen); -+ if (!buf) -+ return (NULL); -+ -+ pp = (char **) buf; -+ p = buf + (num_ptrs * sizeof (char *)); -+ i = 0; -+ for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) { -+ pp[i] = p; -+ len = strlen(q) + 1; -+ memcpy(p, q, len); -+ p += len; -+ i++; -+ } -+ pp[i] = NULL; -+ assert(buf + buflen == p); -+ return ((char **) buf); -+} -+ -+/* -+ * Fork a child process to handle event [eid]. The program [prog] -+ * in directory [dir] is executed with the envionment [env]. -+ * The file descriptor [zfd] is the zevent_fd used to track the -+ * current cursor location within the zevent nvlist. -+ */ -+static void -+_zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog, -+ char *env[], int zfd) -+{ -+ char path[PATH_MAX]; -+ int n; -+ pid_t pid; -+ int fd; -+ pid_t wpid; -+ int status; -+ -+ assert(dir != NULL); -+ assert(prog != NULL); -+ assert(env != NULL); -+ assert(zfd >= 0); -+ -+ n = snprintf(path, sizeof (path), "%s/%s", dir, prog); -+ if ((n < 0) || (n >= sizeof (path))) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to fork \"%s\" for eid=%llu: %s", -+ prog, eid, strerror(ENAMETOOLONG)); -+ return; -+ } -+ pid = fork(); -+ if (pid < 0) { -+ zed_log_msg(LOG_WARNING, -+ "Failed to fork \"%s\" for eid=%llu: %s", -+ prog, eid, strerror(errno)); -+ return; -+ } else if (pid == 0) { -+ (void) umask(022); -+ fd = open("/dev/null", O_RDWR); -+ (void) dup2(fd, STDIN_FILENO); -+ (void) dup2(fd, STDOUT_FILENO); -+ (void) dup2(fd, STDERR_FILENO); -+ (void) dup2(zfd, ZEVENT_FILENO); -+ zed_file_close_from(ZEVENT_FILENO + 1); -+ execle(path, prog, NULL, env); -+ _exit(127); -+ } else { -+ zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d", -+ prog, eid, pid); -+ /* FIXME: Timeout rogue child processes with sigalarm? */ -+restart: -+ wpid = waitpid(pid, &status, 0); -+ if (wpid == (pid_t) -1) { -+ if (errno == EINTR) -+ goto restart; -+ zed_log_msg(LOG_WARNING, -+ "Failed to wait for \"%s\" eid=%llu pid=%d", -+ prog, eid, pid); -+ } else if (WIFEXITED(status)) { -+ zed_log_msg(LOG_INFO, -+ "Finished \"%s\" eid=%llu pid=%d exit=%d", -+ prog, eid, pid, WEXITSTATUS(status)); -+ } else if (WIFSIGNALED(status)) { -+ zed_log_msg(LOG_INFO, -+ "Finished \"%s\" eid=%llu pid=%d sig=%d/%s", -+ prog, eid, pid, WTERMSIG(status), -+ strsignal(WTERMSIG(status))); -+ } else { -+ zed_log_msg(LOG_INFO, -+ "Finished \"%s\" eid=%llu pid=%d status=0x%X", -+ prog, eid, (unsigned int) status); -+ } -+ } -+} -+ -+/* -+ * Process the event [eid] by synchronously invoking all scripts with a -+ * matching class prefix. -+ * Each executable in [scripts] from the directory [dir] is matched against -+ * the event's [class], [subclass], and the "all" class (which matches -+ * all events). Every script with a matching class prefix is invoked. -+ * The NAME=VALUE strings in [envs] will be passed to the script as -+ * environment variables. -+ * The file descriptor [zfd] is the zevent_fd used to track the -+ * current cursor location within the zevent nvlist. -+ * Return 0 on success, -1 on error. 
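Given the prefix-matching rule described here (a script runs when its filename starts with the event's class, its subclass, or "all", followed by a non-alphabetic character per the strncmp/isalpha check below), a minimal enabled script could look like the following sketch. The name "all-debug.sh" is hypothetical; the ZEVENT_*/ZED_* variables are those constructed by zed_event_service() above, and ZEVENT_POOL is only present when the event carries a pool name.

    #!/bin/sh
    # Hypothetical all-debug.sh: matches the "all" class, so it runs for
    # every zevent and logs a one-line summary via syslog.
    logger -t zed-debug \
        "eid=${ZEVENT_EID} class=${ZEVENT_CLASS} subclass=${ZEVENT_SUBCLASS}" \
        "pool=${ZEVENT_POOL} time=${ZEVENT_TIME_STRING} zed_pid=${ZED_PID}"
    exit 0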
-+ */ -+int -+zed_exec_process(uint64_t eid, const char *class, const char *subclass, -+ const char *dir, zed_strings_t *scripts, zed_strings_t *envs, int zfd) -+{ -+ const char *class_strings[4]; -+ const char *allclass = "all"; -+ const char **csp; -+ const char *s; -+ char **e; -+ int n; -+ -+ if (!dir || !scripts || !envs || zfd < 0) -+ return (-1); -+ -+ csp = class_strings; -+ -+ if (class) -+ *csp++ = class; -+ -+ if (subclass) -+ *csp++ = subclass; -+ -+ if (allclass) -+ *csp++ = allclass; -+ -+ *csp = NULL; -+ -+ e = _zed_exec_create_env(envs); -+ -+ for (s = zed_strings_first(scripts); s; s = zed_strings_next(scripts)) { -+ for (csp = class_strings; *csp; csp++) { -+ n = strlen(*csp); -+ if ((strncmp(s, *csp, n) == 0) && !isalpha(s[n])) -+ _zed_exec_fork_child(eid, dir, s, e, zfd); -+ } -+ } -+ free(e); -+ return (0); -+} -diff --git a/cmd/zed/zed_exec.h b/cmd/zed/zed_exec.h -new file mode 100644 -index 0000000..52bdc12 ---- /dev/null -+++ b/cmd/zed/zed_exec.h -@@ -0,0 +1,36 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_EXEC_H -+#define ZED_EXEC_H -+ -+#include -+ -+int zed_exec_process(uint64_t eid, const char *class, const char *subclass, -+ const char *dir, zed_strings_t *scripts, zed_strings_t *envs, -+ int zevent_fd); -+ -+#endif /* !ZED_EXEC_H */ -diff --git a/cmd/zed/zed_file.c b/cmd/zed/zed_file.c -new file mode 100644 -index 0000000..7b77345 ---- /dev/null -+++ b/cmd/zed/zed_file.c -@@ -0,0 +1,227 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed_log.h" -+ -+/* -+ * Read up to [n] bytes from [fd] into [buf]. -+ * Return the number of bytes read, 0 on EOF, or -1 on error. -+ */ -+ssize_t -+zed_file_read_n(int fd, void *buf, size_t n) -+{ -+ unsigned char *p; -+ size_t n_left; -+ ssize_t n_read; -+ -+ p = buf; -+ n_left = n; -+ while (n_left > 0) { -+ if ((n_read = read(fd, p, n_left)) < 0) { -+ if (errno == EINTR) -+ continue; -+ else -+ return (-1); -+ -+ } else if (n_read == 0) { -+ break; -+ } -+ n_left -= n_read; -+ p += n_read; -+ } -+ return (n - n_left); -+} -+ -+/* -+ * Write [n] bytes from [buf] out to [fd]. -+ * Return the number of bytes written, or -1 on error. -+ */ -+ssize_t -+zed_file_write_n(int fd, void *buf, size_t n) -+{ -+ const unsigned char *p; -+ size_t n_left; -+ ssize_t n_written; -+ -+ p = buf; -+ n_left = n; -+ while (n_left > 0) { -+ if ((n_written = write(fd, p, n_left)) < 0) { -+ if (errno == EINTR) -+ continue; -+ else -+ return (-1); -+ -+ } -+ n_left -= n_written; -+ p += n_written; -+ } -+ return (n); -+} -+ -+/* -+ * Set an exclusive advisory lock on the open file descriptor [fd]. -+ * Return 0 on success, 1 if a conflicting lock is held by another process, -+ * or -1 on error (with errno set). -+ */ -+int -+zed_file_lock(int fd) -+{ -+ struct flock lock; -+ -+ if (fd < 0) { -+ errno = EBADF; -+ return (-1); -+ } -+ lock.l_type = F_WRLCK; -+ lock.l_whence = SEEK_SET; -+ lock.l_start = 0; -+ lock.l_len = 0; -+ -+ if (fcntl(fd, F_SETLK, &lock) < 0) { -+ if ((errno == EACCES) || (errno == EAGAIN)) -+ return (1); -+ -+ return (-1); -+ } -+ return (0); -+} -+ -+/* -+ * Release an advisory lock held on the open file descriptor [fd]. -+ * Return 0 on success, or -1 on error (with errno set). -+ */ -+int -+zed_file_unlock(int fd) -+{ -+ struct flock lock; -+ -+ if (fd < 0) { -+ errno = EBADF; -+ return (-1); -+ } -+ lock.l_type = F_UNLCK; -+ lock.l_whence = SEEK_SET; -+ lock.l_start = 0; -+ lock.l_len = 0; -+ -+ if (fcntl(fd, F_SETLK, &lock) < 0) -+ return (-1); -+ -+ return (0); -+} -+ -+/* -+ * Test whether an exclusive advisory lock could be obtained for the open -+ * file descriptor [fd]. -+ * Return 0 if the file is not locked, >0 for the pid of another process -+ * holding a conflicting lock, or -1 on error (with errno set). -+ */ -+pid_t -+zed_file_is_locked(int fd) -+{ -+ struct flock lock; -+ -+ if (fd < 0) { -+ errno = EBADF; -+ return (-1); -+ } -+ lock.l_type = F_WRLCK; -+ lock.l_whence = SEEK_SET; -+ lock.l_start = 0; -+ lock.l_len = 0; -+ -+ if (fcntl(fd, F_GETLK, &lock) < 0) -+ return (-1); -+ -+ if (lock.l_type == F_UNLCK) -+ return (0); -+ -+ return (lock.l_pid); -+} -+ -+/* -+ * Close all open file descriptors greater than or equal to [lowfd]. -+ * Any errors encountered while closing file descriptors are ignored. -+ */ -+void -+zed_file_close_from(int lowfd) -+{ -+ const int maxfd_def = 256; -+ int errno_bak; -+ struct rlimit rl; -+ int maxfd; -+ int fd; -+ -+ errno_bak = errno; -+ -+ if (getrlimit(RLIMIT_NOFILE, &rl) < 0) { -+ maxfd = maxfd_def; -+ } else if (rl.rlim_max == RLIM_INFINITY) { -+ maxfd = maxfd_def; -+ } else { -+ maxfd = rl.rlim_max; -+ } -+ for (fd = lowfd; fd < maxfd; fd++) -+ (void) close(fd); -+ -+ errno = errno_bak; -+} -+ -+/* -+ * Set the CLOEXEC flag on file descriptor [fd] so it will be automatically -+ * closed upon successful execution of one of the exec functions. -+ * Return 0 on success, or -1 on error. -+ * FIXME: No longer needed? 
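The zed_file_lock(), zed_file_unlock(), and zed_file_is_locked() helpers above are thin wrappers around POSIX advisory record locks. A self-contained sketch of the same fcntl() calls, using a placeholder lock-file path:

/*
 * Sketch: take the same whole-file exclusive lock the helpers above set,
 * report a conflicting holder if there is one, then release the lock.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl = { 0 };
	int fd = open("/tmp/example.lock", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return (1);

	fl.l_type = F_WRLCK;	/* exclusive (write) lock */
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;		/* length 0 covers the whole file */

	if (fcntl(fd, F_SETLK, &fl) < 0) {
		/* Ask who holds the conflicting lock, as zed_file_is_locked() does. */
		if (fcntl(fd, F_GETLK, &fl) == 0 && fl.l_type != F_UNLCK)
			(void) fprintf(stderr, "locked by pid %d\n", (int)fl.l_pid);
		(void) close(fd);
		return (1);
	}

	/* ... critical section ... */

	fl.l_type = F_UNLCK;
	(void) fcntl(fd, F_SETLK, &fl);
	(void) close(fd);
	return (0);
}

An l_len of 0 extends the lock to end-of-file, so the whole file stays covered no matter how large it grows.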
-+ */ -+int -+zed_file_close_on_exec(int fd) -+{ -+ int flags; -+ -+ if (fd < 0) { -+ errno = EBADF; -+ return (-1); -+ } -+ flags = fcntl(fd, F_GETFD); -+ if (flags == -1) -+ return (-1); -+ -+ flags |= FD_CLOEXEC; -+ -+ if (fcntl(fd, F_SETFD, flags) == -1) -+ return (-1); -+ -+ return (0); -+} -diff --git a/cmd/zed/zed_file.h b/cmd/zed/zed_file.h -new file mode 100644 -index 0000000..df70201 ---- /dev/null -+++ b/cmd/zed/zed_file.h -@@ -0,0 +1,47 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#ifndef ZED_FILE_H -+#define ZED_FILE_H -+ -+#include -+#include -+ -+ssize_t zed_file_read_n(int fd, void *buf, size_t n); -+ -+ssize_t zed_file_write_n(int fd, void *buf, size_t n); -+ -+int zed_file_lock(int fd); -+ -+int zed_file_unlock(int fd); -+ -+pid_t zed_file_is_locked(int fd); -+ -+void zed_file_close_from(int fd); -+ -+int zed_file_close_on_exec(int fd); -+ -+#endif /* !ZED_FILE_H */ -diff --git a/cmd/zed/zed_log.c b/cmd/zed/zed_log.c -new file mode 100644 -index 0000000..bc432bc ---- /dev/null -+++ b/cmd/zed/zed_log.c -@@ -0,0 +1,171 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed_log.h" -+ -+#define ZED_LOG_MAX_ID_LEN 64 -+#define ZED_LOG_MAX_LOG_LEN 1024 -+ -+static struct { -+ unsigned do_stderr:1; -+ unsigned do_syslog:1; -+ int level; -+ char id[ZED_LOG_MAX_ID_LEN]; -+} _ctx; -+ -+void -+zed_log_init(const char *identity) -+{ -+ const char *p; -+ -+ if (identity) { -+ p = (p = strrchr(identity, '/')) ? 
p + 1 : identity; -+ strlcpy(_ctx.id, p, sizeof (_ctx.id)); -+ } else { -+ _ctx.id[0] = '\0'; -+ } -+} -+ -+void -+zed_log_fini() -+{ -+ if (_ctx.do_syslog) { -+ closelog(); -+ } -+} -+ -+void -+zed_log_stderr_open(int level) -+{ -+ _ctx.do_stderr = 1; -+ _ctx.level = level; -+} -+ -+void -+zed_log_stderr_close(void) -+{ -+ _ctx.do_stderr = 0; -+} -+ -+void -+zed_log_syslog_open(int facility) -+{ -+ const char *identity; -+ -+ _ctx.do_syslog = 1; -+ identity = (_ctx.id[0] == '\0') ? NULL : _ctx.id; -+ openlog(identity, LOG_NDELAY, facility); -+} -+ -+void -+zed_log_syslog_close(void) -+{ -+ _ctx.do_syslog = 0; -+ closelog(); -+} -+ -+static void -+_zed_log_aux(int priority, const char *fmt, va_list vargs) -+{ -+ char buf[ZED_LOG_MAX_LOG_LEN]; -+ char *syslogp; -+ char *p; -+ int len; -+ int n; -+ -+ assert(fmt != NULL); -+ -+ syslogp = NULL; -+ p = buf; -+ len = sizeof (buf); -+ -+ if (_ctx.id[0] != '\0') { -+ n = snprintf(p, len, "%s: ", _ctx.id); -+ if ((n < 0) || (n >= len)) { -+ p += len - 1; -+ len = 0; -+ } else { -+ p += n; -+ len -= n; -+ } -+ } -+ if ((len > 0) && fmt) { -+ syslogp = p; -+ n = vsnprintf(p, len, fmt, vargs); -+ if ((n < 0) || (n >= len)) { -+ p += len - 1; -+ len = 0; -+ } else { -+ p += n; -+ len -= n; -+ } -+ } -+ *p = '\0'; -+ -+ if (_ctx.do_syslog && syslogp) -+ syslog(priority, "%s", syslogp); -+ -+ if (_ctx.do_stderr && priority <= _ctx.level) -+ fprintf(stderr, "%s\n", buf); -+} -+ -+/* -+ * Log a message at the given [priority] level specified by the printf-style -+ * format string [fmt]. -+ */ -+void -+zed_log_msg(int priority, const char *fmt, ...) -+{ -+ va_list vargs; -+ -+ if (fmt) { -+ va_start(vargs, fmt); -+ _zed_log_aux(priority, fmt, vargs); -+ va_end(vargs); -+ } -+} -+ -+/* -+ * Log a fatal error message specified by the printf-style format string [fmt]. -+ */ -+void -+zed_log_die(const char *fmt, ...) -+{ -+ va_list vargs; -+ -+ if (fmt) { -+ va_start(vargs, fmt); -+ _zed_log_aux(LOG_ERR, fmt, vargs); -+ va_end(vargs); -+ } -+ exit(EXIT_FAILURE); -+} -diff --git a/cmd/zed/zed_log.h b/cmd/zed/zed_log.h -new file mode 100644 -index 0000000..7ae4549 ---- /dev/null -+++ b/cmd/zed/zed_log.h -@@ -0,0 +1,48 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. 
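_zed_log_aux() above assembles the log line with consecutive snprintf()/vsnprintf() calls and treats a return value that is negative or not less than the remaining space as truncation, pinning the write cursor to the last byte so the buffer stays NUL-terminated instead of overflowing. A standalone sketch of that append pattern:

/*
 * Sketch of the truncation-safe append used by _zed_log_aux():
 * once the buffer fills up, further appends become no-ops.
 */
#include <stdio.h>

static void
append(char **pp, int *lenp, const char *s)
{
	int n;

	if (*lenp <= 0)
		return;
	n = snprintf(*pp, *lenp, "%s", s);
	if ((n < 0) || (n >= *lenp)) {
		*pp += *lenp - 1;	/* point at the final (NUL) byte */
		*lenp = 0;
	} else {
		*pp += n;
		*lenp -= n;
	}
}

int
main(void)
{
	char buf[16];
	char *p = buf;
	int len = sizeof (buf);

	append(&p, &len, "zed: ");
	append(&p, &len, "a message that will not fit");
	*p = '\0';
	(void) printf("%s\n", buf);	/* prints the truncated line */
	return (0);
}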
-+ */ -+ -+#ifndef ZED_LOG_H -+#define ZED_LOG_H -+ -+#include -+ -+void zed_log_init(const char *identity); -+ -+void zed_log_fini(void); -+ -+void zed_log_stderr_open(int level); -+ -+void zed_log_stderr_close(void); -+ -+void zed_log_syslog_open(int facility); -+ -+void zed_log_syslog_close(void); -+ -+void zed_log_msg(int priority, const char *fmt, ...); -+ -+void zed_log_die(const char *fmt, ...); -+ -+#endif /* !ZED_LOG_H */ -diff --git a/cmd/zed/zed_strings.c b/cmd/zed/zed_strings.c -new file mode 100644 -index 0000000..05a3740 ---- /dev/null -+++ b/cmd/zed/zed_strings.c -@@ -0,0 +1,200 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "zed_strings.h" -+ -+struct zed_strings { -+ avl_tree_t tree; -+ avl_node_t *iteratorp; -+}; -+ -+struct zed_strings_node { -+ avl_node_t node; -+ char string[]; -+}; -+ -+typedef struct zed_strings_node zed_strings_node_t; -+ -+/* -+ * Compare zed_strings_node_t nodes [x1] and [x2]. -+ * As required for the AVL tree, return exactly -+ * -1 for <, 0 for ==, and +1 for >. -+ */ -+static int -+_zed_strings_node_compare(const void *x1, const void *x2) -+{ -+ const char *s1; -+ const char *s2; -+ int rv; -+ -+ assert(x1 != NULL); -+ assert(x2 != NULL); -+ -+ s1 = ((const zed_strings_node_t *) x1)->string; -+ assert(s1 != NULL); -+ s2 = ((const zed_strings_node_t *) x2)->string; -+ assert(s2 != NULL); -+ rv = strcmp(s1, s2); -+ -+ if (rv < 0) -+ return (-1); -+ -+ if (rv > 0) -+ return (1); -+ -+ return (0); -+} -+ -+/* -+ * Return a new string container, or NULL on error. -+ */ -+zed_strings_t * -+zed_strings_create(void) -+{ -+ zed_strings_t *zsp; -+ -+ zsp = malloc(sizeof (*zsp)); -+ if (!zsp) -+ return (NULL); -+ -+ memset(zsp, 0, sizeof (*zsp)); -+ avl_create(&zsp->tree, _zed_strings_node_compare, -+ sizeof (zed_strings_node_t), offsetof(zed_strings_node_t, node)); -+ -+ zsp->iteratorp = NULL; -+ return (zsp); -+} -+ -+/* -+ * Destroy the string container [zsp] and all strings within. -+ */ -+void -+zed_strings_destroy(zed_strings_t *zsp) -+{ -+ void *cookie; -+ zed_strings_node_t *np; -+ -+ if (!zsp) -+ return; -+ -+ cookie = NULL; -+ while ((np = avl_destroy_nodes(&zsp->tree, &cookie))) -+ free(np); -+ -+ avl_destroy(&zsp->tree); -+ free(zsp); -+} -+ -+/* -+ * Add a copy of the string [s] to the container [zsp]. -+ * Return 0 on success, or -1 on error. -+ * FIXME: Handle dup strings. 
-+ */ -+int -+zed_strings_add(zed_strings_t *zsp, const char *s) -+{ -+ size_t len; -+ zed_strings_node_t *np; -+ -+ if (!zsp || !s) { -+ errno = EINVAL; -+ return (-1); -+ } -+ len = sizeof (zed_strings_node_t) + strlen(s) + 1; -+ np = malloc(len); -+ if (!np) -+ return (-1); -+ -+ memset(np, 0, len); -+ assert((char *) np->string + strlen(s) < (char *) np + len); -+ (void) strcpy(np->string, s); -+ avl_add(&zsp->tree, np); -+ return (0); -+} -+ -+/* -+ * Return the first string in container [zsp]. -+ * Return NULL if there are no strings, or on error. -+ * This can be called multiple times to re-traverse [zsp]. -+ * XXX: Not thread-safe. -+ */ -+const char * -+zed_strings_first(zed_strings_t *zsp) -+{ -+ if (!zsp) { -+ errno = EINVAL; -+ return (NULL); -+ } -+ zsp->iteratorp = avl_first(&zsp->tree); -+ if (!zsp->iteratorp) -+ return (NULL); -+ -+ return (((zed_strings_node_t *) zsp->iteratorp)->string); -+ -+} -+ -+/* -+ * Return the next string in container [zsp]. -+ * Return NULL after the last string, or on error. -+ * This must be called after zed_strings_first(). -+ * XXX: Not thread-safe. -+ */ -+const char * -+zed_strings_next(zed_strings_t *zsp) -+{ -+ if (!zsp) { -+ errno = EINVAL; -+ return (NULL); -+ } -+ if (!zsp->iteratorp) -+ return (NULL); -+ -+ zsp->iteratorp = AVL_NEXT(&zsp->tree, zsp->iteratorp); -+ if (!zsp->iteratorp) -+ return (NULL); -+ -+ return (((zed_strings_node_t *)zsp->iteratorp)->string); -+} -+ -+/* -+ * Return the number of strings in container [zsp], or -1 on error. -+ */ -+int -+zed_strings_count(zed_strings_t *zsp) -+{ -+ if (!zsp) { -+ errno = EINVAL; -+ return (-1); -+ } -+ return (avl_numnodes(&zsp->tree)); -+} -diff --git a/cmd/zed/zed_strings.h b/cmd/zed/zed_strings.h -new file mode 100644 -index 0000000..c1ea804 ---- /dev/null -+++ b/cmd/zed/zed_strings.h -@@ -0,0 +1,44 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license from the top-level -+ * OPENSOLARIS.LICENSE or . -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each file -+ * and include the License file from the top-level OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. 
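zed_strings_add() above copies each string into a node whose text lives in a C99 flexible array member, so the AVL node and its string share a single allocation. A minimal sketch of that node layout, with the avl_node_t link replaced by a plain pointer to keep the example self-contained:

/*
 * Sketch of the one-allocation node layout used by zed_strings_add().
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct str_node {
	struct str_node *next;	/* stand-in for the avl_node_t link */
	char string[];		/* flexible array member holds the copy */
};

static struct str_node *
str_node_create(const char *s)
{
	size_t len = sizeof (struct str_node) + strlen(s) + 1;
	struct str_node *np = calloc(1, len);

	if (np != NULL)
		(void) strcpy(np->string, s);	/* fits by construction */
	return (np);
}

int
main(void)
{
	struct str_node *np = str_node_create("NAME=value");

	if (np != NULL) {
		(void) printf("%s\n", np->string);
		free(np);	/* releases node and string together */
	}
	return (0);
}

This is also why zed_strings_destroy() can simply free() each node handed back by avl_destroy_nodes(): the string goes away with its node.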
-+ */ -+ -+#ifndef ZED_STRINGS_H -+#define ZED_STRINGS_H -+ -+typedef struct zed_strings zed_strings_t; -+ -+zed_strings_t * zed_strings_create(void); -+ -+void zed_strings_destroy(zed_strings_t *zsp); -+ -+int zed_strings_add(zed_strings_t *zsp, const char *s); -+ -+const char * zed_strings_first(zed_strings_t *zsp); -+ -+const char * zed_strings_next(zed_strings_t *zsp); -+ -+int zed_strings_count(zed_strings_t *zsp); -+ -+#endif /* !ZED_STRINGS_H */ -diff --git a/cmd/zfs/Makefile.am b/cmd/zfs/Makefile.am -index 8f381f1..08580c9 100644 ---- a/cmd/zfs/Makefile.am -+++ b/cmd/zfs/Makefile.am -@@ -18,4 +18,6 @@ zfs_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --zfs_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+zfs_LDADD += $(ZLIB) -+zfs_LDFLAGS = -pthread -diff --git a/cmd/zfs/zfs_iter.c b/cmd/zfs/zfs_iter.c -index 6239a8f..8892d91 100644 ---- a/cmd/zfs/zfs_iter.c -+++ b/cmd/zfs/zfs_iter.c -@@ -20,2 +20,3 @@ - */ -+ - /* -@@ -23,2 +24,3 @@ - * Copyright (c) 2012 Pawel Jakub Dawidek . -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -110,3 +112,4 @@ zfs_callback(zfs_handle_t *zhp, void *data) - if (zfs_expand_proplist(zhp, cb->cb_proplist, -- (cb->cb_flags & ZFS_ITER_RECVD_PROPS)) -+ (cb->cb_flags & ZFS_ITER_RECVD_PROPS), -+ (cb->cb_flags & ZFS_ITER_LITERAL_PROPS)) - != 0) { -@@ -312,4 +315,4 @@ zfs_sort(const void *larg, const void *rarg, void *data) - -- (void) strlcpy(lbuf, zfs_get_name(l), sizeof(lbuf)); -- (void) strlcpy(rbuf, zfs_get_name(r), sizeof(rbuf)); -+ (void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf)); -+ (void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf)); - -diff --git a/cmd/zfs/zfs_iter.h b/cmd/zfs/zfs_iter.h -index 7f740e7..2697fbd 100644 ---- a/cmd/zfs/zfs_iter.h -+++ b/cmd/zfs/zfs_iter.h -@@ -20,2 +20,3 @@ - */ -+ - /* -@@ -23,2 +24,3 @@ - * Use is subject to license terms. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -45,3 +47,4 @@ typedef struct zfs_sort_column { - #define ZFS_ITER_RECVD_PROPS (1 << 4) --#define ZFS_ITER_SIMPLE (1 << 5) -+#define ZFS_ITER_LITERAL_PROPS (1 << 5) -+#define ZFS_ITER_SIMPLE (1 << 6) - -diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c -index 5753cce..d7c1a2a 100644 ---- a/cmd/zfs/zfs_main.c -+++ b/cmd/zfs/zfs_main.c -@@ -23,5 +23,6 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2012 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -57,2 +58,3 @@ - #include -+#include - #include -@@ -74,2 +76,3 @@ static FILE *mnttab_file; - static char history_str[HIS_MAX_RECORD_LEN]; -+static boolean_t log_history = B_TRUE; - -@@ -233,6 +236,5 @@ get_usage(zfs_help_t idx) - case HELP_LIST: -- return (gettext("\tlist [-rH][-d max] " -- "[-o property[,...]] [-t type[,...]] [-s property] ...\n" -- "\t [-S property] ... " -- "[filesystem|volume|snapshot|snap] ...\n")); -+ return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " -+ "[-s property]...\n\t [-S property]... 
[-t type[,...]] " -+ "[filesystem|volume|snapshot] ...\n")); - case HELP_MOUNT: -@@ -263,3 +265,3 @@ get_usage(zfs_help_t idx) - return (gettext("\tsnapshot|snap [-r] [-o property=value] ... " -- "\n")); -+ " ...\n")); - case HELP_UNMOUNT: -@@ -292,8 +294,8 @@ get_usage(zfs_help_t idx) - return (gettext("\tuserspace [-Hinp] [-o field[,...]] " -- "[-s field] ...\n\t[-S field] ... " -- "[-t type[,...]] \n")); -+ "[-s field]...\n\t [-S field]... [-t type[,...]] " -+ "\n")); - case HELP_GROUPSPACE: - return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " -- "[-s field] ...\n\t[-S field] ... " -- "[-t type[,...]] \n")); -+ "[-s field]...\n\t [-S field]... [-t type[,...]] " -+ "\n")); - case HELP_HOLD: -@@ -649,2 +651,7 @@ zfs_do_clone(int argc, char **argv) - -+ if (log_history) { -+ (void) zpool_log_history(g_zfs, history_str); -+ log_history = B_FALSE; -+ } -+ - clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET); -@@ -828,2 +835,7 @@ zfs_do_create(int argc, char **argv) - -+ if (log_history) { -+ (void) zpool_log_history(g_zfs, history_str); -+ log_history = B_FALSE; -+ } -+ - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) -@@ -892,7 +904,8 @@ typedef struct destroy_cbdata { - nvlist_t *cb_nvl; -+ nvlist_t *cb_batchedsnaps; - - /* first snap in contiguous run */ -- zfs_handle_t *cb_firstsnap; -+ char *cb_firstsnap; - /* previous snap in contiguous run */ -- zfs_handle_t *cb_prevsnap; -+ char *cb_prevsnap; - int64_t cb_snapused; -@@ -988,5 +1001,23 @@ destroy_callback(zfs_handle_t *zhp, void *data) - } -+ if (cb->cb_dryrun) { -+ zfs_close(zhp); -+ return (0); -+ } - -- if (!cb->cb_dryrun) { -- if (zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 || -+ /* -+ * We batch up all contiguous snapshots (even of different -+ * filesystems) and destroy them with one ioctl. We can't -+ * simply do all snap deletions and then all fs deletions, -+ * because we must delete a clone before its origin. -+ */ -+ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { -+ fnvlist_add_boolean(cb->cb_batchedsnaps, name); -+ } else { -+ int error = zfs_destroy_snaps_nvl(g_zfs, -+ cb->cb_batchedsnaps, B_FALSE); -+ fnvlist_free(cb->cb_batchedsnaps); -+ cb->cb_batchedsnaps = fnvlist_alloc(); -+ -+ if (error != 0 || -+ zfs_unmount(zhp, NULL, cb->cb_force ? 
MS_FORCE : 0) != 0 || - zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { -@@ -1010,7 +1041,9 @@ destroy_print_cb(zfs_handle_t *zhp, void *arg) - if (cb->cb_firstsnap == NULL) -- cb->cb_firstsnap = zfs_handle_dup(zhp); -+ cb->cb_firstsnap = strdup(name); - if (cb->cb_prevsnap != NULL) -- zfs_close(cb->cb_prevsnap); -+ free(cb->cb_prevsnap); - /* this snap continues the current range */ -- cb->cb_prevsnap = zfs_handle_dup(zhp); -+ cb->cb_prevsnap = strdup(name); -+ if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) -+ nomem(); - if (cb->cb_verbose) { -@@ -1029,8 +1062,8 @@ destroy_print_cb(zfs_handle_t *zhp, void *arg) - uint64_t used = 0; -- err = zfs_get_snapused_int(cb->cb_firstsnap, -+ err = lzc_snaprange_space(cb->cb_firstsnap, - cb->cb_prevsnap, &used); - cb->cb_snapused += used; -- zfs_close(cb->cb_firstsnap); -+ free(cb->cb_firstsnap); - cb->cb_firstsnap = NULL; -- zfs_close(cb->cb_prevsnap); -+ free(cb->cb_prevsnap); - cb->cb_prevsnap = NULL; -@@ -1051,3 +1084,3 @@ destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) - if (err == 0) { -- err = zfs_get_snapused_int(cb->cb_firstsnap, -+ err = lzc_snaprange_space(cb->cb_firstsnap, - cb->cb_prevsnap, &used); -@@ -1055,5 +1088,5 @@ destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) - cb->cb_snapused += used; -- zfs_close(cb->cb_firstsnap); -+ free(cb->cb_firstsnap); - cb->cb_firstsnap = NULL; -- zfs_close(cb->cb_prevsnap); -+ free(cb->cb_prevsnap); - cb->cb_prevsnap = NULL; -@@ -1144,4 +1177,6 @@ zfs_do_destroy(int argc, char **argv) - destroy_cbdata_t cb = { 0 }; -+ int rv = 0; -+ int err = 0; - int c; -- zfs_handle_t *zhp; -+ zfs_handle_t *zhp = NULL; - char *at; -@@ -1199,7 +1234,5 @@ zfs_do_destroy(int argc, char **argv) - if (at != NULL) { -- int err = 0; - - /* Build the list of snaps to destroy in cb_nvl. 
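The destroy_callback() changes above queue contiguous snapshots in cb_batchedsnaps and flush the batch whenever a filesystem is about to be destroyed, because a clone must be destroyed before its origin, so snapshot and filesystem deletions cannot simply be reordered into two passes. A standalone sketch of that flush-before-filesystem ordering, with hypothetical names and a printf in place of the real destroy requests:

/*
 * Sketch: queue snapshot destroys, but flush the queue before any
 * filesystem destroy so relative ordering is preserved.
 */
#include <stdio.h>
#include <string.h>

#define	BATCH_MAX	16

static const char *batch[BATCH_MAX];
static int batch_len;

static void
flush_batch(void)
{
	if (batch_len == 0)
		return;
	(void) printf("destroying %d snapshot(s) in one request\n", batch_len);
	batch_len = 0;
}

static void
destroy_one(const char *name)
{
	if (strchr(name, '@') != NULL) {	/* snapshot: queue it */
		if (batch_len == BATCH_MAX)
			flush_batch();
		batch[batch_len++] = name;
	} else {				/* filesystem: flush first */
		flush_batch();
		(void) printf("destroying filesystem %s\n", name);
	}
}

int
main(void)
{
	const char *items[] = {
		"pool/fs/clone@snap1", "pool/fs/clone@snap2",
		"pool/fs/clone", "pool/fs@old",
	};

	for (size_t i = 0; i < sizeof (items) / sizeof (items[0]); i++)
		destroy_one(items[i]);
	flush_batch();
	return (0);
}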
*/ -- if (nvlist_alloc(&cb.cb_nvl, NV_UNIQUE_NAME, 0) != 0) -- nomem(); -+ cb.cb_nvl = fnvlist_alloc(); - -@@ -1214,5 +1247,4 @@ zfs_do_destroy(int argc, char **argv) - cb.cb_error) { -- zfs_close(zhp); -- nvlist_free(cb.cb_nvl); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1222,5 +1254,4 @@ zfs_do_destroy(int argc, char **argv) - "snapshots to destroy; check snapshot names.\n")); -- zfs_close(zhp); -- nvlist_free(cb.cb_nvl); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1243,6 +1274,16 @@ zfs_do_destroy(int argc, char **argv) - if (!cb.cb_dryrun) { -- if (cb.cb_doclones) -+ if (cb.cb_doclones) { -+ cb.cb_batchedsnaps = fnvlist_alloc(); - err = destroy_clones(&cb); -+ if (err == 0) { -+ err = zfs_destroy_snaps_nvl(g_zfs, -+ cb.cb_batchedsnaps, B_FALSE); -+ } -+ if (err != 0) { -+ rv = 1; -+ goto out; -+ } -+ } - if (err == 0) { -- err = zfs_destroy_snaps_nvl(zhp, cb.cb_nvl, -+ err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, - cb.cb_defer_destroy); -@@ -1251,6 +1292,4 @@ zfs_do_destroy(int argc, char **argv) - -- zfs_close(zhp); -- nvlist_free(cb.cb_nvl); - if (err != 0) -- return (1); -+ rv = 1; - } else { -@@ -1275,4 +1314,4 @@ zfs_do_destroy(int argc, char **argv) - "to destroy the pool itself\n"), zfs_get_name(zhp)); -- zfs_close(zhp); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1286,4 +1325,4 @@ zfs_do_destroy(int argc, char **argv) - &cb) != 0) { -- zfs_close(zhp); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1291,10 +1330,11 @@ zfs_do_destroy(int argc, char **argv) - if (cb.cb_error) { -- zfs_close(zhp); -- return (1); -+ rv = 1; -+ goto out; - } - -+ cb.cb_batchedsnaps = fnvlist_alloc(); - if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, - &cb) != 0) { -- zfs_close(zhp); -- return (1); -+ rv = 1; -+ goto out; - } -@@ -1305,7 +1345,18 @@ zfs_do_destroy(int argc, char **argv) - */ -- if (destroy_callback(zhp, &cb) != 0) -- return (1); -+ err = destroy_callback(zhp, &cb); -+ zhp = NULL; -+ if (err == 0) { -+ err = zfs_destroy_snaps_nvl(g_zfs, -+ cb.cb_batchedsnaps, cb.cb_defer_destroy); -+ } -+ if (err != 0) -+ rv = 1; - } - -- return (0); -+out: -+ fnvlist_free(cb.cb_batchedsnaps); -+ fnvlist_free(cb.cb_nvl); -+ if (zhp != NULL) -+ zfs_close(zhp); -+ return (rv); - } -@@ -1910,5 +1961,7 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data) - * be doing ioctls to different pools. We need -- * to log this history once to each pool. -+ * to log this history once to each pool, and bypass -+ * the normal history logging that happens in main(). - */ -- verify(zpool_stage_history(g_zfs, history_str) == 0); -+ (void) zpool_log_history(g_zfs, history_str); -+ log_history = B_FALSE; - } -@@ -2056,3 +2109,3 @@ zfs_do_upgrade(int argc, char **argv) - * -o Control which fields to display. -- * -p Use exact (parseable) numeric output. -+ * -p Use exact (parsable) numeric output. - * -s Specify sort columns, descending order. -@@ -2090,3 +2143,3 @@ static int us_type_bits[] = { - }; --static char *us_type_names[] = { "posixgroup", "posxiuser", "smbgroup", -+static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup", - "smbuser", "all" }; -@@ -2746,15 +2799,15 @@ zfs_do_userspace(int argc, char **argv) - /* -- * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...] -- * [-s property [-s property]...] [-S property [-S property]...] -- * ... -+ * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] -+ * [-t type[,...]] [filesystem|volume|snapshot] ... 
- * -+ * -H Scripted mode; elide headers and separate columns by tabs -+ * -p Display values in parsable (literal) format. - * -r Recurse over all children - * -d Limit recursion by depth. -- * -H Scripted mode; elide headers and separate columns by tabs - * -o Control which fields to display. -- * -t Control which object types to display. - * -s Specify sort columns, descending order. - * -S Specify sort columns, ascending order. -+ * -t Control which object types to display. - * -- * When given no arguments, lists all filesystems in the system. -+ * When given no arguments, list all filesystems in the system. - * Otherwise, list the specified datasets, optionally recursing down them if -@@ -2764,2 +2817,3 @@ typedef struct list_cbdata { - boolean_t cb_first; -+ boolean_t cb_literal; - boolean_t cb_scripted; -@@ -2772,4 +2826,5 @@ typedef struct list_cbdata { - static void --print_header(zprop_list_t *pl) -+print_header(list_cbdata_t *cb) - { -+ zprop_list_t *pl = cb->cb_proplist; - char headerbuf[ZFS_MAXPROPLEN]; -@@ -2814,4 +2869,5 @@ print_header(zprop_list_t *pl) - static void --print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) -+print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) - { -+ zprop_list_t *pl = cb->cb_proplist; - boolean_t first = B_TRUE; -@@ -2822,3 +2878,2 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - boolean_t right_justify; -- int width; - -@@ -2826,3 +2881,3 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - if (!first) { -- if (scripted) -+ if (cb->cb_scripted) - (void) printf("\t"); -@@ -2836,3 +2891,3 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - (void) strlcpy(property, zfs_get_name(zhp), -- sizeof(property)); -+ sizeof (property)); - propstr = property; -@@ -2841,3 +2896,4 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - if (zfs_prop_get(zhp, pl->pl_prop, property, -- sizeof (property), NULL, NULL, 0, B_FALSE) != 0) -+ sizeof (property), NULL, NULL, 0, -+ cb->cb_literal) != 0) - propstr = "-"; -@@ -2845,3 +2901,2 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - propstr = property; -- - right_justify = zfs_prop_align_right(pl->pl_prop); -@@ -2849,3 +2904,3 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, -- property, sizeof (property), B_FALSE) != 0) -+ property, sizeof (property), cb->cb_literal) != 0) - propstr = "-"; -@@ -2856,3 +2911,3 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - if (zfs_prop_get_written(zhp, pl->pl_user_prop, -- property, sizeof (property), B_FALSE) != 0) -+ property, sizeof (property), cb->cb_literal) != 0) - propstr = "-"; -@@ -2871,4 +2926,2 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - -- width = pl->pl_width; -- - /* -@@ -2878,8 +2931,8 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) - */ -- if (scripted || (pl->pl_next == NULL && !right_justify)) -+ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) - (void) printf("%s", propstr); - else if (right_justify) -- (void) printf("%*s", width, propstr); -+ (void) printf("%*s", (int)pl->pl_width, propstr); - else -- (void) printf("%-*s", width, propstr); -+ (void) printf("%-*s", (int)pl->pl_width, propstr); - } -@@ -2899,3 +2952,3 @@ list_callback(zfs_handle_t *zhp, void *data) - if (!cbp->cb_scripted) -- print_header(cbp->cb_proplist); -+ print_header(cbp); - 
cbp->cb_first = B_FALSE; -@@ -2903,3 +2956,3 @@ list_callback(zfs_handle_t *zhp, void *data) - -- print_dataset(zhp, cbp->cb_proplist, cbp->cb_scripted); -+ print_dataset(zhp, cbp); - -@@ -2912,3 +2965,2 @@ zfs_do_list(int argc, char **argv) - int c; -- boolean_t scripted = B_FALSE; - static char default_fields[] = -@@ -2926,3 +2978,3 @@ zfs_do_list(int argc, char **argv) - /* check options */ -- while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) { -+ while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { - switch (c) { -@@ -2931,2 +2983,6 @@ zfs_do_list(int argc, char **argv) - break; -+ case 'p': -+ cb.cb_literal = B_TRUE; -+ flags |= ZFS_ITER_LITERAL_PROPS; -+ break; - case 'd': -@@ -2938,3 +2994,3 @@ zfs_do_list(int argc, char **argv) - case 'H': -- scripted = B_TRUE; -+ cb.cb_scripted = B_TRUE; - break; -@@ -3028,3 +3084,2 @@ zfs_do_list(int argc, char **argv) - -- cb.cb_scripted = scripted; - cb.cb_first = B_TRUE; -@@ -3424,2 +3479,34 @@ zfs_do_set(int argc, char **argv) - -+typedef struct snap_cbdata { -+ nvlist_t *sd_nvl; -+ boolean_t sd_recursive; -+ const char *sd_snapname; -+} snap_cbdata_t; -+ -+static int -+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) -+{ -+ snap_cbdata_t *sd = arg; -+ char *name; -+ int rv = 0; -+ int error; -+ -+ if (sd->sd_recursive && -+ zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { -+ zfs_close(zhp); -+ return (0); -+ } -+ -+ error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); -+ if (error == -1) -+ nomem(); -+ fnvlist_add_boolean(sd->sd_nvl, name); -+ free(name); -+ -+ if (sd->sd_recursive) -+ rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); -+ zfs_close(zhp); -+ return (rv); -+} -+ - /* -@@ -3433,3 +3520,2 @@ zfs_do_snapshot(int argc, char **argv) - { -- boolean_t recursive = B_FALSE; - int ret = 0; -@@ -3437,2 +3523,4 @@ zfs_do_snapshot(int argc, char **argv) - nvlist_t *props; -+ snap_cbdata_t sd = { 0 }; -+ boolean_t multiple_snaps = B_FALSE; - -@@ -3440,2 +3528,4 @@ zfs_do_snapshot(int argc, char **argv) - nomem(); -+ if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) -+ nomem(); - -@@ -3449,3 +3539,4 @@ zfs_do_snapshot(int argc, char **argv) - case 'r': -- recursive = B_TRUE; -+ sd.sd_recursive = B_TRUE; -+ multiple_snaps = B_TRUE; - break; -@@ -3466,10 +3557,26 @@ zfs_do_snapshot(int argc, char **argv) - } -- if (argc > 1) { -- (void) fprintf(stderr, gettext("too many arguments\n")); -- goto usage; -+ -+ if (argc > 1) -+ multiple_snaps = B_TRUE; -+ for (; argc > 0; argc--, argv++) { -+ char *atp; -+ zfs_handle_t *zhp; -+ -+ atp = strchr(argv[0], '@'); -+ if (atp == NULL) -+ goto usage; -+ *atp = '\0'; -+ sd.sd_snapname = atp + 1; -+ zhp = zfs_open(g_zfs, argv[0], -+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); -+ if (zhp == NULL) -+ goto usage; -+ if (zfs_snapshot_cb(zhp, &sd) != 0) -+ goto usage; - } - -- ret = zfs_snapshot(g_zfs, argv[0], recursive, props); -+ ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); -+ nvlist_free(sd.sd_nvl); - nvlist_free(props); -- if (ret && recursive) -+ if (ret != 0 && multiple_snaps) - (void) fprintf(stderr, gettext("no snapshots were created\n")); -@@ -3478,2 +3585,3 @@ zfs_do_snapshot(int argc, char **argv) - usage: -+ nvlist_free(sd.sd_nvl); - nvlist_free(props); -@@ -5030,10 +5138,2 @@ cleanup2: - --/* -- * zfs allow [-r] [-t] ... -- * -- * -r Recursively hold -- * -t Temporary hold (hidden option) -- * -- * Apply a user-hold with the given tag to the list of snapshots. 
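zfs_snapshot_cb() above builds one "dataset@snapname" string per visited dataset with asprintf() and adds it to the request nvlist, so several snapshot names can be handed to zfs_snapshot_nvl() at once. A small sketch of just that name construction (the dataset and snapshot names are invented):

/*
 * Sketch of the "<dataset>@<snapname>" construction done per dataset.
 */
#define	_GNU_SOURCE	/* for asprintf(), as in zfs_snapshot_cb() */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const char *datasets[] = { "tank/home", "tank/home/alice" };
	const char *snapname = "backup-20140411";

	for (size_t i = 0; i < sizeof (datasets) / sizeof (datasets[0]); i++) {
		char *name;

		if (asprintf(&name, "%s@%s", datasets[i], snapname) == -1)
			return (1);	/* the real code calls nomem() here */
		(void) printf("would request snapshot %s\n", name);
		free(name);
	}
	return (0);
}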
-- */ - static int -@@ -5044,10 +5144,2 @@ zfs_do_allow(int argc, char **argv) - --/* -- * zfs unallow [-r] [-t] ... -- * -- * -r Recursively hold -- * -t Temporary hold (hidden option) -- * -- * Apply a user-hold with the given tag to the list of snapshots. -- */ - static int -@@ -5065,3 +5157,2 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) - boolean_t recursive = B_FALSE; -- boolean_t temphold = B_FALSE; - const char *opts = holding ? "rt" : "r"; -@@ -5075,5 +5166,2 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) - break; -- case 't': -- temphold = B_TRUE; -- break; - case '?': -@@ -5125,4 +5213,3 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) - if (holding) { -- if (zfs_hold(zhp, delim+1, tag, recursive, -- temphold, B_FALSE, -1, 0, 0) != 0) -+ if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) - ++errors; -@@ -5142,3 +5229,2 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) - * -r Recursively hold -- * -t Temporary hold (hidden option) - * -@@ -5778,3 +5864,7 @@ share_mount(int op, int argc, char **argv) - */ -- rewind(mnttab_file); -+ -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", mnttab_file) == NULL) -+ return (ENOENT); -+ - while (getmntent(mnttab_file, &entry) == 0) { -@@ -5881,3 +5971,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) - */ -- rewind(mnttab_file); -+ -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", mnttab_file) == NULL) -+ return (ENOENT); -+ - while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) { -@@ -6035,3 +6129,6 @@ unshare_unmount(int op, int argc, char **argv) - -- rewind(mnttab_file); -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", mnttab_file) == NULL) -+ return (ENOENT); -+ - while (getmntent(mnttab_file, &entry) == 0) { -@@ -6386,4 +6483,3 @@ main(int argc, char **argv) - -- zpool_set_history_str("zfs", argc, argv, history_str); -- verify(zpool_stage_history(g_zfs, history_str) == 0); -+ zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); - -@@ -6394,3 +6490,3 @@ main(int argc, char **argv) - */ -- libzfs_mnttab_cache(g_zfs, B_FALSE); -+ libzfs_mnttab_cache(g_zfs, B_TRUE); - if (find_command_idx(cmdname, &i) == 0) { -@@ -6408,2 +6504,6 @@ main(int argc, char **argv) - } -+ -+ if (ret == 0 && log_history) -+ (void) zpool_log_history(g_zfs, history_str); -+ - libzfs_fini(g_zfs); -diff --git a/cmd/zhack/Makefile.am b/cmd/zhack/Makefile.am -index 47da245..922aef9 100644 ---- a/cmd/zhack/Makefile.am -+++ b/cmd/zhack/Makefile.am -@@ -15,4 +15,5 @@ zhack_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --zhack_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+zhack_LDADD += $(ZLIB) -diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c -index b2cf815..64ab8ed 100644 ---- a/cmd/zhack/zhack.c -+++ b/cmd/zhack/zhack.c -@@ -23,2 +23,3 @@ - * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. 
- */ -@@ -48,2 +49,3 @@ - #include -+#include - #undef ZFS_MAXNAMELEN -@@ -125,3 +127,3 @@ import_pool(const char *target, boolean_t readonly) - nvlist_t *props; -- const char *name; -+ char *name; - -@@ -153,3 +155,3 @@ import_pool(const char *target, boolean_t readonly) - -- if (pools == NULL || nvlist_next_nvpair(pools, NULL) == NULL) { -+ if (nvlist_empty(pools)) { - if (!g_importargs.can_be_active) { -@@ -275,8 +277,11 @@ zhack_do_feature_stat(int argc, char **argv) - static void --feature_enable_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+feature_enable_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - - spa_feature_enable(spa, feature, tx); -+ spa_history_log_internal(spa, "zhack enable feature", tx, -+ "name=%s can_readonly=%u", -+ feature->fi_guid, feature->fi_can_readonly); - } -@@ -343,4 +348,4 @@ zhack_do_feature_enable(int argc, char **argv) - -- VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL, -- feature_enable_sync, spa, &feature, 5)); -+ VERIFY0(dsl_sync_task(spa_name(spa), NULL, -+ feature_enable_sync, &feature, 5)); - -@@ -352,8 +357,10 @@ zhack_do_feature_enable(int argc, char **argv) - static void --feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+feature_incr_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - - spa_feature_incr(spa, feature, tx); -+ spa_history_log_internal(spa, "zhack feature incr", tx, -+ "name=%s", feature->fi_guid); - } -@@ -361,8 +368,10 @@ feature_incr_sync(void *arg1, void *arg2, dmu_tx_t *tx) - static void --feature_decr_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+feature_decr_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - - spa_feature_decr(spa, feature, tx); -+ spa_history_log_internal(spa, "zhack feature decr", tx, -+ "name=%s", feature->fi_guid); - } -@@ -437,4 +446,4 @@ zhack_do_feature_ref(int argc, char **argv) - -- VERIFY3U(0, ==, dsl_sync_task_do(spa->spa_dsl_pool, NULL, -- decr ? feature_decr_sync : feature_incr_sync, spa, &feature, 5)); -+ VERIFY0(dsl_sync_task(spa_name(spa), NULL, -+ decr ? 
feature_decr_sync : feature_incr_sync, &feature, 5)); - -diff --git a/cmd/zinject/Makefile.am b/cmd/zinject/Makefile.am -index d1d32d5..4adef11 100644 ---- a/cmd/zinject/Makefile.am -+++ b/cmd/zinject/Makefile.am -@@ -17,4 +17,3 @@ zinject_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -- --zinject_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la -diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c -index b2ccb67..5cc9d9f 100644 ---- a/cmd/zinject/translate.c -+++ b/cmd/zinject/translate.c -@@ -469,3 +469,3 @@ translate_device(const char *pool, const char *device, err_type_t label_type, - -- record->zi_guid = strtoull(device, &end, 16); -+ record->zi_guid = strtoull(device, &end, 0); - if (record->zi_guid == 0 || *end != '\0') { -diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c -index 13d067d..f6c8915 100644 ---- a/cmd/zinject/zinject.c -+++ b/cmd/zinject/zinject.c -@@ -297,7 +297,5 @@ iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - int ret; - -- zc.zc_guid = 0; -- - while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) -@@ -424,3 +422,3 @@ cancel_one_handler(int id, const char *pool, zinject_record_t *record, - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - -@@ -457,3 +455,3 @@ cancel_handler(int id) - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - -@@ -479,3 +477,3 @@ register_handler(const char *pool, int flags, zinject_record_t *record, - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - -@@ -536,3 +534,3 @@ perform_action(const char *pool, zinject_record_t *record, int cmd) - { -- zfs_cmd_t zc; -+ zfs_cmd_t zc = {"\0"}; - -diff --git a/cmd/zpios/zpios.h b/cmd/zpios/zpios.h -index 23c3237..92d96fc 100644 ---- a/cmd/zpios/zpios.h -+++ b/cmd/zpios/zpios.h -@@ -1,2 +1,2 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. -@@ -31,6 +31,6 @@ - * with ZPIOS. If not, see . --\*****************************************************************************/ -+ */ - - #ifndef _ZPIOS_H --#define _ZPIOS_H -+#define _ZPIOS_H - -@@ -38,27 +38,28 @@ - --#define VERSION_SIZE 64 -+#define VERSION_SIZE 64 - - /* Regular expressions */ --#define REGEX_NUMBERS "^[0-9]*[0-9]$" --#define REGEX_NUMBERS_COMMA "^([0-9]+,)*[0-9]+$" --#define REGEX_SIZE "^[0-9][0-9]*[kmgt]$" --#define REGEX_SIZE_COMMA "^([0-9][0-9]*[kmgt]+,)*[0-9][0-9]*[kmgt]$" -+#define REGEX_NUMBERS "^[0-9]*[0-9]$" -+#define REGEX_NUMBERS_COMMA "^([0-9]+,)*[0-9]+$" -+#define REGEX_SIZE "^[0-9][0-9]*[kmgt]$" -+#define REGEX_SIZE_COMMA "^([0-9][0-9]*[kmgt]+,)*[0-9][0-9]*[kmgt]$" - - /* Flags for low, high, incr */ --#define FLAG_SET 0x01 --#define FLAG_LOW 0x02 --#define FLAG_HIGH 0x04 --#define FLAG_INCR 0x08 -+#define FLAG_SET 0x01 -+#define FLAG_LOW 0x02 -+#define FLAG_HIGH 0x04 -+#define FLAG_INCR 0x08 - --#define TRUE 1 --#define FALSE 0 -+#define TRUE 1 -+#define FALSE 0 - --#define KB (1024) --#define MB (KB * 1024) --#define GB (MB * 1024) --#define TB (GB * 1024) -+#define KB (1024) -+#define MB (KB * 1024) -+#define GB (MB * 1024) -+#define TB (GB * 1024) - --#define KMGT_SIZE 16 -+#define KMGT_SIZE 16 - --/* All offsets, sizes and counts can be passed to the application in -+/* -+ * All offsets, sizes and counts can be passed to the application in - * multiple ways. 
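One small behavioral change above: translate_device() now parses the device GUID with a strtoull() base of 0 instead of 16, which (presumably the intent) lets both plain decimal and 0x-prefixed hexadecimal GUIDs be accepted while trailing garbage is still rejected via the end pointer. A quick sketch of that parsing behavior:

/*
 * Sketch: base 0 auto-detects the radix from the prefix.
 */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const char *inputs[] = { "123456789", "0x1a2b3c4d", "12zz" };

	for (size_t i = 0; i < sizeof (inputs) / sizeof (inputs[0]); i++) {
		char *end;
		unsigned long long guid = strtoull(inputs[i], &end, 0);

		if (guid == 0 || *end != '\0')
			(void) printf("%-10s rejected\n", inputs[i]);
		else
			(void) printf("%-10s -> %llu\n", inputs[i], guid);
	}
	return (0);
}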
-@@ -69,4 +70,4 @@ - typedef struct pios_range_repeat { -- uint64_t val[32]; /* Comma sep array, or low, high, inc */ -- uint64_t val_count; /* Num of values */ -+ uint64_t val[32]; /* Comma sep array, or low, high, inc */ -+ uint64_t val_count; /* Num of values */ - uint64_t val_low; -@@ -74,3 +75,3 @@ typedef struct pios_range_repeat { - uint64_t val_inc_perc; -- uint64_t next_val; /* Used for multiple runs in get_next() */ -+ uint64_t next_val; /* For multiple runs in get_next() */ - } range_repeat_t; -@@ -78,22 +79,22 @@ typedef struct pios_range_repeat { - typedef struct cmd_args { -- range_repeat_t T; /* Thread count */ -- range_repeat_t N; /* Region count */ -- range_repeat_t O; /* Offset count */ -- range_repeat_t C; /* Chunksize */ -- range_repeat_t S; /* Regionsize */ -- -- const char *pool; /* Pool */ -- const char *name; /* Name */ -- uint32_t flags; /* Flags */ -- uint32_t io_type; /* DMUIO only */ -- uint32_t verbose; /* Verbose */ -- uint32_t human_readable; /* Human readable output */ -- -- uint64_t regionnoise; /* Region noise */ -- uint64_t chunknoise; /* Chunk noise */ -- uint64_t thread_delay; /* Thread delay */ -- -- char pre[ZPIOS_PATH_SIZE]; /* Pre-exec hook */ -- char post[ZPIOS_PATH_SIZE]; /* Post-exec hook */ -- char log[ZPIOS_PATH_SIZE]; /* Requested log dir */ -+ range_repeat_t T; /* Thread count */ -+ range_repeat_t N; /* Region count */ -+ range_repeat_t O; /* Offset count */ -+ range_repeat_t C; /* Chunksize */ -+ range_repeat_t S; /* Regionsize */ -+ -+ const char *pool; /* Pool */ -+ const char *name; /* Name */ -+ uint32_t flags; /* Flags */ -+ uint32_t io_type; /* DMUIO only */ -+ uint32_t verbose; /* Verbose */ -+ uint32_t human_readable; /* Human readable output */ -+ -+ uint64_t regionnoise; /* Region noise */ -+ uint64_t chunknoise; /* Chunk noise */ -+ uint64_t thread_delay; /* Thread delay */ -+ -+ char pre[ZPIOS_PATH_SIZE]; /* Pre-exec hook */ -+ char post[ZPIOS_PATH_SIZE]; /* Post-exec hook */ -+ char log[ZPIOS_PATH_SIZE]; /* Requested log dir */ - -@@ -111,5 +112,5 @@ typedef struct cmd_args { - int set_count(char *pattern1, char *pattern2, range_repeat_t *range, -- char *optarg, uint32_t *flags, char *arg); -+ char *optarg, uint32_t *flags, char *arg); - int set_lhi(char *pattern, range_repeat_t *range, char *optarg, -- int flag, uint32_t *flag_thread, char *arg); -+ int flag, uint32_t *flag_thread, char *arg); - int set_noise(uint64_t *noise, char *optarg, char *arg); -diff --git a/cmd/zpios/zpios_main.c b/cmd/zpios/zpios_main.c -index 1c01d9a..971a886 100644 ---- a/cmd/zpios/zpios_main.c -+++ b/cmd/zpios/zpios_main.c -@@ -1,5 +1,5 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. - * It is designed to have the test code running in the Linux kernel -- * against ZFS while still being flexibly controled from user space. -+ * against ZFS while still being flexibly controlled from user space. - * -@@ -31,3 +31,3 @@ - * with ZPIOS. If not, see . 
--\*****************************************************************************/ -+ */ - -@@ -44,43 +44,44 @@ - --static const char short_opt[] = "t:l:h:e:n:i:j:k:o:m:q:r:c:a:b:g:s:A:B:C:" -- "L:p:M:xP:R:G:I:N:T:VzOfHv?"; -+static const char short_opt[] = -+ "t:l:h:e:n:i:j:k:o:m:q:r:c:a:b:g:s:A:B:C:" -+ "L:p:M:xP:R:G:I:N:T:VzOfHv?"; - static const struct option long_opt[] = { -- {"threadcount", required_argument, 0, 't' }, -- {"threadcount_low", required_argument, 0, 'l' }, -- {"threadcount_high", required_argument, 0, 'h' }, -- {"threadcount_incr", required_argument, 0, 'e' }, -- {"regioncount", required_argument, 0, 'n' }, -- {"regioncount_low", required_argument, 0, 'i' }, -- {"regioncount_high", required_argument, 0, 'j' }, -- {"regioncount_incr", required_argument, 0, 'k' }, -- {"offset", required_argument, 0, 'o' }, -- {"offset_low", required_argument, 0, 'm' }, -- {"offset_high", required_argument, 0, 'q' }, -- {"offset_incr", required_argument, 0, 'r' }, -- {"chunksize", required_argument, 0, 'c' }, -- {"chunksize_low", required_argument, 0, 'a' }, -- {"chunksize_high", required_argument, 0, 'b' }, -- {"chunksize_incr", required_argument, 0, 'g' }, -- {"regionsize", required_argument, 0, 's' }, -- {"regionsize_low", required_argument, 0, 'A' }, -- {"regionsize_high", required_argument, 0, 'B' }, -- {"regionsize_incr", required_argument, 0, 'C' }, -- {"load", required_argument, 0, 'L' }, -- {"pool", required_argument, 0, 'p' }, -- {"name", required_argument, 0, 'M' }, -- {"cleanup", no_argument, 0, 'x' }, -- {"prerun", required_argument, 0, 'P' }, -- {"postrun", required_argument, 0, 'R' }, -- {"log", required_argument, 0, 'G' }, -- {"regionnoise", required_argument, 0, 'I' }, -- {"chunknoise", required_argument, 0, 'N' }, -- {"threaddelay", required_argument, 0, 'T' }, -- {"verify", no_argument, 0, 'V' }, -- {"zerocopy", no_argument, 0, 'z' }, -- {"nowait", no_argument, 0, 'O' }, -- {"noprefetch", no_argument, 0, 'f' }, -- {"human-readable", no_argument, 0, 'H' }, -- {"verbose", no_argument, 0, 'v' }, -- {"help", no_argument, 0, '?' 
}, -- { 0, 0, 0, 0 }, -+ {"threadcount", required_argument, 0, 't' }, -+ {"threadcount_low", required_argument, 0, 'l' }, -+ {"threadcount_high", required_argument, 0, 'h' }, -+ {"threadcount_incr", required_argument, 0, 'e' }, -+ {"regioncount", required_argument, 0, 'n' }, -+ {"regioncount_low", required_argument, 0, 'i' }, -+ {"regioncount_high", required_argument, 0, 'j' }, -+ {"regioncount_incr", required_argument, 0, 'k' }, -+ {"offset", required_argument, 0, 'o' }, -+ {"offset_low", required_argument, 0, 'm' }, -+ {"offset_high", required_argument, 0, 'q' }, -+ {"offset_incr", required_argument, 0, 'r' }, -+ {"chunksize", required_argument, 0, 'c' }, -+ {"chunksize_low", required_argument, 0, 'a' }, -+ {"chunksize_high", required_argument, 0, 'b' }, -+ {"chunksize_incr", required_argument, 0, 'g' }, -+ {"regionsize", required_argument, 0, 's' }, -+ {"regionsize_low", required_argument, 0, 'A' }, -+ {"regionsize_high", required_argument, 0, 'B' }, -+ {"regionsize_incr", required_argument, 0, 'C' }, -+ {"load", required_argument, 0, 'L' }, -+ {"pool", required_argument, 0, 'p' }, -+ {"name", required_argument, 0, 'M' }, -+ {"cleanup", no_argument, 0, 'x' }, -+ {"prerun", required_argument, 0, 'P' }, -+ {"postrun", required_argument, 0, 'R' }, -+ {"log", required_argument, 0, 'G' }, -+ {"regionnoise", required_argument, 0, 'I' }, -+ {"chunknoise", required_argument, 0, 'N' }, -+ {"threaddelay", required_argument, 0, 'T' }, -+ {"verify", no_argument, 0, 'V' }, -+ {"zerocopy", no_argument, 0, 'z' }, -+ {"nowait", no_argument, 0, 'O' }, -+ {"noprefetch", no_argument, 0, 'f' }, -+ {"human-readable", no_argument, 0, 'H' }, -+ {"verbose", no_argument, 0, 'v' }, -+ {"help", no_argument, 0, '?' }, -+ { 0, 0, 0, 0 }, - }; -@@ -97,41 +98,41 @@ usage(void) - fprintf(stderr, -- " --threadcount -t =values\n" -- " --threadcount_low -l =value\n" -- " --threadcount_high -h =value\n" -- " --threadcount_incr -e =value\n" -- " --regioncount -n =values\n" -- " --regioncount_low -i =value\n" -- " --regioncount_high -j =value\n" -- " --regioncount_incr -k =value\n" -- " --offset -o =values\n" -- " --offset_low -m =value\n" -- " --offset_high -q =value\n" -- " --offset_incr -r =value\n" -- " --chunksize -c =values\n" -- " --chunksize_low -a =value\n" -- " --chunksize_high -b =value\n" -- " --chunksize_incr -g =value\n" -- " --regionsize -s =values\n" -- " --regionsize_low -A =value\n" -- " --regionsize_high -B =value\n" -- " --regionsize_incr -C =value\n" -- " --load -L =dmuio|ssf|fpp\n" -- " --pool -p =pool name\n" -+ " --threadcount -t =values\n" -+ " --threadcount_low -l =value\n" -+ " --threadcount_high -h =value\n" -+ " --threadcount_incr -e =value\n" -+ " --regioncount -n =values\n" -+ " --regioncount_low -i =value\n" -+ " --regioncount_high -j =value\n" -+ " --regioncount_incr -k =value\n" -+ " --offset -o =values\n" -+ " --offset_low -m =value\n" -+ " --offset_high -q =value\n" -+ " --offset_incr -r =value\n" -+ " --chunksize -c =values\n" -+ " --chunksize_low -a =value\n" -+ " --chunksize_high -b =value\n" -+ " --chunksize_incr -g =value\n" -+ " --regionsize -s =values\n" -+ " --regionsize_low -A =value\n" -+ " --regionsize_high -B =value\n" -+ " --regionsize_incr -C =value\n" -+ " --load -L =dmuio|ssf|fpp\n" -+ " --pool -p =pool name\n" - " --name -M =test name\n" -- " --cleanup -x\n" -- " --prerun -P =pre-command\n" -- " --postrun -R =post-command\n" -- " --log -G =log directory\n" -- " --regionnoise -I =shift\n" -- " --chunknoise -N =bytes\n" -- " --threaddelay -T =jiffies\n" -- " --verify -V\n" 
-- " --zerocopy -z\n" -- " --nowait -O\n" -+ " --cleanup -x\n" -+ " --prerun -P =pre-command\n" -+ " --postrun -R =post-command\n" -+ " --log -G =log directory\n" -+ " --regionnoise -I =shift\n" -+ " --chunknoise -N =bytes\n" -+ " --threaddelay -T =jiffies\n" -+ " --verify -V\n" -+ " --zerocopy -z\n" -+ " --nowait -O\n" - " --noprefetch -f\n" -- " --human-readable -H\n" -- " --verbose -v =increase verbosity\n" -- " --help -? =this help\n\n"); -+ " --human-readable -H\n" -+ " --verbose -v =increase verbosity\n" -+ " --help -? =this help\n\n"); - -- return 0; -+ return (0); - } -@@ -157,3 +158,3 @@ args_init(int argc, char **argv) - usage(); -- return (cmd_args_t *)NULL; -+ return ((cmd_args_t *)NULL); - } -@@ -161,9 +162,9 @@ args_init(int argc, char **argv) - /* Configure and populate the args structures */ -- args = malloc(sizeof(*args)); -+ args = malloc(sizeof (*args)); - if (args == NULL) -- return NULL; -+ return (NULL); - -- memset(args, 0, sizeof(*args)); -+ memset(args, 0, sizeof (*args)); - -- while ((c=getopt_long(argc, argv, short_opt, long_opt, NULL)) != -1) { -+ while ((c = getopt_long(argc, argv, short_opt, long_opt, NULL)) != -1) { - rc = 0; -@@ -172,4 +173,4 @@ args_init(int argc, char **argv) - case 't': /* --thread count */ -- rc = set_count(REGEX_NUMBERS, REGEX_NUMBERS_COMMA, -- &args->T, optarg, &fl_th, "threadcount"); -+ rc = set_count(REGEX_NUMBERS, REGEX_NUMBERS_COMMA, -+ &args->T, optarg, &fl_th, "threadcount"); - break; -@@ -177,3 +178,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->T, optarg, -- FLAG_LOW, &fl_th, "threadcount_low"); -+ FLAG_LOW, &fl_th, "threadcount_low"); - break; -@@ -181,3 +182,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->T, optarg, -- FLAG_HIGH, &fl_th, "threadcount_high"); -+ FLAG_HIGH, &fl_th, "threadcount_high"); - break; -@@ -185,3 +186,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->T, optarg, -- FLAG_INCR, &fl_th, "threadcount_incr"); -+ FLAG_INCR, &fl_th, "threadcount_incr"); - break; -@@ -189,3 +190,3 @@ args_init(int argc, char **argv) - rc = set_count(REGEX_NUMBERS, REGEX_NUMBERS_COMMA, -- &args->N, optarg, &fl_rc, "regioncount"); -+ &args->N, optarg, &fl_rc, "regioncount"); - break; -@@ -193,3 +194,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->N, optarg, -- FLAG_LOW, &fl_rc, "regioncount_low"); -+ FLAG_LOW, &fl_rc, "regioncount_low"); - break; -@@ -197,3 +198,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->N, optarg, -- FLAG_HIGH, &fl_rc, "regioncount_high"); -+ FLAG_HIGH, &fl_rc, "regioncount_high"); - break; -@@ -201,3 +202,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->N, optarg, -- FLAG_INCR, &fl_rc, "regioncount_incr"); -+ FLAG_INCR, &fl_rc, "regioncount_incr"); - break; -@@ -205,3 +206,3 @@ args_init(int argc, char **argv) - rc = set_count(REGEX_SIZE, REGEX_SIZE_COMMA, -- &args->O, optarg, &fl_of, "offset"); -+ &args->O, optarg, &fl_of, "offset"); - break; -@@ -209,3 +210,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->O, optarg, -- FLAG_LOW, &fl_of, "offset_low"); -+ FLAG_LOW, &fl_of, "offset_low"); - break; -@@ -213,3 +214,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->O, optarg, -- FLAG_HIGH, &fl_of, "offset_high"); -+ FLAG_HIGH, &fl_of, "offset_high"); - break; -@@ -217,3 +218,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->O, optarg, -- FLAG_INCR, &fl_of, "offset_incr"); -+ 
FLAG_INCR, &fl_of, "offset_incr"); - break; -@@ -221,3 +222,3 @@ args_init(int argc, char **argv) - rc = set_count(REGEX_SIZE, REGEX_SIZE_COMMA, -- &args->C, optarg, &fl_cs, "chunksize"); -+ &args->C, optarg, &fl_cs, "chunksize"); - break; -@@ -225,3 +226,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->C, optarg, -- FLAG_LOW, &fl_cs, "chunksize_low"); -+ FLAG_LOW, &fl_cs, "chunksize_low"); - break; -@@ -229,3 +230,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->C, optarg, -- FLAG_HIGH, &fl_cs, "chunksize_high"); -+ FLAG_HIGH, &fl_cs, "chunksize_high"); - break; -@@ -233,3 +234,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->C, optarg, -- FLAG_INCR, &fl_cs, "chunksize_incr"); -+ FLAG_INCR, &fl_cs, "chunksize_incr"); - break; -@@ -237,3 +238,3 @@ args_init(int argc, char **argv) - rc = set_count(REGEX_SIZE, REGEX_SIZE_COMMA, -- &args->S, optarg, &fl_rs, "regionsize"); -+ &args->S, optarg, &fl_rs, "regionsize"); - break; -@@ -241,3 +242,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->S, optarg, -- FLAG_LOW, &fl_rs, "regionsize_low"); -+ FLAG_LOW, &fl_rs, "regionsize_low"); - break; -@@ -245,3 +246,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_SIZE, &args->S, optarg, -- FLAG_HIGH, &fl_rs, "regionsize_high"); -+ FLAG_HIGH, &fl_rs, "regionsize_high"); - break; -@@ -249,3 +250,3 @@ args_init(int argc, char **argv) - rc = set_lhi(REGEX_NUMBERS, &args->S, optarg, -- FLAG_INCR, &fl_rs, "regionsize_incr"); -+ FLAG_INCR, &fl_rs, "regionsize_incr"); - break; -@@ -273,3 +274,4 @@ args_init(int argc, char **argv) - case 'I': /* --regionnoise */ -- rc = set_noise(&args->regionnoise, optarg, "regionnoise"); -+ rc = set_noise(&args->regionnoise, optarg, -+ "regionnoise"); - break; -@@ -279,3 +281,4 @@ args_init(int argc, char **argv) - case 'T': /* --threaddelay */ -- rc = set_noise(&args->thread_delay, optarg, "threaddelay"); -+ rc = set_noise(&args->thread_delay, optarg, -+ "threaddelay"); - break; -@@ -303,3 +306,4 @@ args_init(int argc, char **argv) - default: -- fprintf(stderr,"Unknown option '%s'\n",argv[optind-1]); -+ fprintf(stderr, "Unknown option '%s'\n", -+ argv[optind - 1]); - rc = EINVAL; -@@ -311,3 +315,3 @@ args_init(int argc, char **argv) - args_fini(args); -- return NULL; -+ return (NULL); - } -@@ -325,3 +329,3 @@ args_init(int argc, char **argv) - args_fini(args); -- return NULL; -+ return (NULL); - } -@@ -330,10 +334,10 @@ args_init(int argc, char **argv) - (args->flags & DMU_VERIFY)) { -- fprintf(stderr, "Error, --zerocopy incompatible --verify, " -- "used for performance analysis only\n"); -+ fprintf(stderr, "Error, --zerocopy incompatible --verify, " -+ "used for performance analysis only\n"); - usage(); - args_fini(args); -- return NULL; -+ return (NULL); - } - -- return args; -+ return (args); - } -@@ -346,5 +350,5 @@ dev_clear(void) - -- memset(&cfg, 0, sizeof(cfg)); -+ memset(&cfg, 0, sizeof (cfg)); - cfg.cfg_magic = ZPIOS_CFG_MAGIC; -- cfg.cfg_cmd = ZPIOS_CFG_BUFFER_CLEAR; -+ cfg.cfg_cmd = ZPIOS_CFG_BUFFER_CLEAR; - cfg.cfg_arg1 = 0; -@@ -354,3 +358,3 @@ dev_clear(void) - fprintf(stderr, "Ioctl() error %lu / %d: %d\n", -- (unsigned long) ZPIOS_CFG, cfg.cfg_cmd, errno); -+ (unsigned long) ZPIOS_CFG, cfg.cfg_cmd, errno); - -@@ -358,3 +362,3 @@ dev_clear(void) - -- return rc; -+ return (rc); - } -@@ -368,5 +372,5 @@ dev_size(int size) - -- memset(&cfg, 0, sizeof(cfg)); -+ memset(&cfg, 0, sizeof (cfg)); - cfg.cfg_magic = ZPIOS_CFG_MAGIC; -- cfg.cfg_cmd = 
ZPIOS_CFG_BUFFER_SIZE; -+ cfg.cfg_cmd = ZPIOS_CFG_BUFFER_SIZE; - cfg.cfg_arg1 = size; -@@ -376,7 +380,7 @@ dev_size(int size) - fprintf(stderr, "Ioctl() error %lu / %d: %d\n", -- (unsigned long) ZPIOS_CFG, cfg.cfg_cmd, errno); -- return rc; -+ (unsigned long) ZPIOS_CFG, cfg.cfg_cmd, errno); -+ return (rc); - } - -- return cfg.cfg_rc1; -+ return (cfg.cfg_rc1); - } -@@ -392,3 +396,3 @@ dev_fini(void) - fprintf(stderr, "Unable to close %s: %d\n", -- ZPIOS_DEV, errno); -+ ZPIOS_DEV, errno); - } -@@ -405,3 +409,3 @@ dev_init(void) - fprintf(stderr, "Unable to open %s: %d\n" -- "Is the zpios module loaded?\n", ZPIOS_DEV, errno); -+ "Is the zpios module loaded?\n", ZPIOS_DEV, errno); - rc = errno; -@@ -424,3 +428,3 @@ dev_init(void) - memset(zpios_buffer, 0, zpios_buffer_size); -- return 0; -+ return (0); - error: -@@ -429,3 +433,3 @@ error: - fprintf(stderr, "Unable to close %s: %d\n", -- ZPIOS_DEV, errno); -+ ZPIOS_DEV, errno); - } -@@ -433,3 +437,3 @@ error: - -- return rc; -+ return (rc); - } -@@ -442,6 +446,6 @@ get_next(uint64_t *val, range_repeat_t *range) - *val = (range->val_low) + -- (range->val_low * range->next_val / 100); -+ (range->val_low * range->next_val / 100); - - if (*val > range->val_high) -- return 0; /* No more values, limit exceeded */ -+ return (0); /* No more values, limit exceeded */ - -@@ -450,5 +454,5 @@ get_next(uint64_t *val, range_repeat_t *range) - else -- range->next_val = range->next_val+range->val_inc_perc; -+ range->next_val = range->next_val + range->val_inc_perc; - -- return 1; /* more values to come */ -+ return (1); /* more values to come */ - -@@ -457,3 +461,3 @@ get_next(uint64_t *val, range_repeat_t *range) - if (range->next_val) -- return 0; /* No more values, we only have one */ -+ return (0); /* No more values, we only have one */ - -@@ -461,3 +465,3 @@ get_next(uint64_t *val, range_repeat_t *range) - range->next_val = 1; -- return 1; /* more values to come */ -+ return (1); /* more values to come */ - -@@ -466,3 +470,3 @@ get_next(uint64_t *val, range_repeat_t *range) - if (range->next_val > range->val_count - 1) -- return 0; /* No more values, limit exceeded */ -+ return (0); /* No more values, limit exceeded */ - -@@ -470,6 +474,6 @@ get_next(uint64_t *val, range_repeat_t *range) - range->next_val++; -- return 1; /* more values to come */ -+ return (1); /* more values to come */ - } - -- return 0; -+ return (0); - } -@@ -478,16 +482,18 @@ static int - run_one(cmd_args_t *args, uint32_t id, uint32_t T, uint32_t N, -- uint64_t C, uint64_t S, uint64_t O) -+ uint64_t C, uint64_t S, uint64_t O) - { - zpios_cmd_t *cmd; -- int rc, rc2, cmd_size; -+ int rc, rc2, cmd_size; - -- dev_clear(); -+ dev_clear(); - -- cmd_size = sizeof(zpios_cmd_t) + ((T + N + 1) * sizeof(zpios_stats_t)); -- cmd = (zpios_cmd_t *)malloc(cmd_size); -- if (cmd == NULL) -- return ENOMEM; -+ cmd_size = -+ sizeof (zpios_cmd_t) -+ + ((T + N + 1) * sizeof (zpios_stats_t)); -+ cmd = (zpios_cmd_t *)malloc(cmd_size); -+ if (cmd == NULL) -+ return (ENOMEM); - -- memset(cmd, 0, cmd_size); -- cmd->cmd_magic = ZPIOS_CMD_MAGIC; -+ memset(cmd, 0, cmd_size); -+ cmd->cmd_magic = ZPIOS_CMD_MAGIC; - strncpy(cmd->cmd_pool, args->pool, ZPIOS_NAME_SIZE - 1); -@@ -496,15 +502,15 @@ run_one(cmd_args_t *args, uint32_t id, uint32_t T, uint32_t N, - strncpy(cmd->cmd_log, args->log, ZPIOS_PATH_SIZE - 1); -- cmd->cmd_id = id; -- cmd->cmd_chunk_size = C; -+ cmd->cmd_id = id; -+ cmd->cmd_chunk_size = C; - cmd->cmd_thread_count = T; - cmd->cmd_region_count = N; -- cmd->cmd_region_size = S; -- cmd->cmd_offset = O; 
-+ cmd->cmd_region_size = S; -+ cmd->cmd_offset = O; - cmd->cmd_region_noise = args->regionnoise; -- cmd->cmd_chunk_noise = args->chunknoise; -+ cmd->cmd_chunk_noise = args->chunknoise; - cmd->cmd_thread_delay = args->thread_delay; -- cmd->cmd_flags = args->flags; -- cmd->cmd_data_size = (T + N + 1) * sizeof(zpios_stats_t); -+ cmd->cmd_flags = args->flags; -+ cmd->cmd_data_size = (T + N + 1) * sizeof (zpios_stats_t); - -- rc = ioctl(zpiosctl_fd, ZPIOS_CMD, cmd); -+ rc = ioctl(zpiosctl_fd, ZPIOS_CMD, cmd); - if (rc) -@@ -514,15 +520,15 @@ run_one(cmd_args_t *args, uint32_t id, uint32_t T, uint32_t N, - -- if (args->verbose) { -- rc2 = read(zpiosctl_fd, zpios_buffer, zpios_buffer_size - 1); -- if (rc2 < 0) { -- fprintf(stdout, "Error reading results: %d\n", rc2); -- } else if ((rc2 > 0) && (strlen(zpios_buffer) > 0)) { -- fprintf(stdout, "\n%s\n", zpios_buffer); -- fflush(stdout); -- } -- } -+ if (args->verbose) { -+ rc2 = read(zpiosctl_fd, zpios_buffer, zpios_buffer_size - 1); -+ if (rc2 < 0) { -+ fprintf(stdout, "Error reading results: %d\n", rc2); -+ } else if ((rc2 > 0) && (strlen(zpios_buffer) > 0)) { -+ fprintf(stdout, "\n%s\n", zpios_buffer); -+ fflush(stdout); -+ } -+ } - -- free(cmd); -+ free(cmd); - -- return rc; -+ return (rc); - } -@@ -536,4 +542,4 @@ run_offsets(cmd_args_t *args) - rc = run_one(args, args->current_id, -- args->current_T, args->current_N, args->current_C, -- args->current_S, args->current_O); -+ args->current_T, args->current_N, args->current_C, -+ args->current_S, args->current_O); - args->current_id++; -@@ -542,3 +548,3 @@ run_offsets(cmd_args_t *args) - args->O.next_val = 0; -- return rc; -+ return (rc); - } -@@ -551,6 +557,6 @@ run_region_counts(cmd_args_t *args) - while (rc == 0 && get_next((uint64_t *)&args->current_N, &args->N)) -- rc = run_offsets(args); -+ rc = run_offsets(args); - - args->N.next_val = 0; -- return rc; -+ return (rc); - } -@@ -564,5 +570,5 @@ run_region_sizes(cmd_args_t *args) - if (args->current_S < args->current_C) { -- fprintf(stderr, "Error: in any run chunksize can " -- "not be smaller than regionsize.\n"); -- return EINVAL; -+ fprintf(stderr, "Error: in any run chunksize must " -+ "be strictly smaller than regionsize.\n"); -+ return (EINVAL); - } -@@ -573,3 +579,3 @@ run_region_sizes(cmd_args_t *args) - args->S.next_val = 0; -- return rc; -+ return (rc); - } -@@ -582,3 +588,3 @@ run_chunk_sizes(cmd_args_t *args) - while (rc == 0 && get_next(&args->current_C, &args->C)) { -- rc = run_region_sizes(args); -+ rc = run_region_sizes(args); - } -@@ -586,3 +592,3 @@ run_chunk_sizes(cmd_args_t *args) - args->C.next_val = 0; -- return rc; -+ return (rc); - } -@@ -597,3 +603,3 @@ run_thread_counts(cmd_args_t *args) - -- return rc; -+ return (rc); - } -@@ -627,3 +633,3 @@ out: - dev_fini(); -- return rc; -+ return (rc); - } -diff --git a/cmd/zpios/zpios_util.c b/cmd/zpios/zpios_util.c -index 9b06655..b226322 100644 ---- a/cmd/zpios/zpios_util.c -+++ b/cmd/zpios/zpios_util.c -@@ -1,2 +1,2 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. -@@ -31,3 +31,3 @@ - * with ZPIOS. If not, see . 
--\*****************************************************************************/ -+ */ - -@@ -51,3 +51,3 @@ kmgt_to_uint64(const char *str, uint64_t *val) - if ((str == endptr) && (*val == 0)) -- return EINVAL; -+ return (EINVAL); - -@@ -72,3 +72,3 @@ kmgt_to_uint64(const char *str, uint64_t *val) - -- return rc; -+ return (rc); - } -@@ -87,8 +87,8 @@ uint64_to_kmgt(char *str, uint64_t val) - if (i >= 4) -- (void)snprintf(str, KMGT_SIZE-1, "inf"); -+ (void) snprintf(str, KMGT_SIZE-1, "inf"); - else -- (void)snprintf(str, KMGT_SIZE-1, "%lu%c", (unsigned long)val, -- (i == -1) ? '\0' : postfix[i]); -+ (void) snprintf(str, KMGT_SIZE-1, "%lu%c", (unsigned long)val, -+ (i == -1) ? '\0' : postfix[i]); - -- return str; -+ return (str); - } -@@ -108,8 +108,8 @@ kmgt_per_sec(char *str, uint64_t v, double t) - if (i >= 4) -- (void)snprintf(str, KMGT_SIZE-1, "inf"); -+ (void) snprintf(str, KMGT_SIZE-1, "inf"); - else -- (void)snprintf(str, KMGT_SIZE-1, "%.2f%c", val, -- (i == -1) ? '\0' : postfix[i]); -+ (void) snprintf(str, KMGT_SIZE-1, "%.2f%c", val, -+ (i == -1) ? '\0' : postfix[i]); - -- return str; -+ return (str); - } -@@ -128,3 +128,3 @@ print_flags(char *str, uint32_t flags) - -- return str; -+ return (str); - } -@@ -140,3 +140,3 @@ regex_match(const char *string, char *pattern) - fprintf(stderr, "Error: Couldn't do regcomp, %d\n", rc); -- return rc; -+ return (rc); - } -@@ -146,3 +146,3 @@ regex_match(const char *string, char *pattern) - -- return rc; -+ return (rc); - } -@@ -158,3 +158,3 @@ split_string(const char *optarg, char *pattern, range_repeat_t *range) - if ((rc = regex_match(optarg, pattern))) -- return rc; -+ return (rc); - -@@ -162,6 +162,7 @@ split_string(const char *optarg, char *pattern, range_repeat_t *range) - if (cp == NULL) -- return ENOMEM; -+ return (ENOMEM); - - do { -- /* STRTOK(3) Each subsequent call, with a null pointer as the -+ /* -+ * STRTOK(3) Each subsequent call, with a null pointer as the - * value of the * first argument, starts searching from the -@@ -179,3 +180,3 @@ split_string(const char *optarg, char *pattern, range_repeat_t *range) - free(cp); -- return 0; -+ return (0); - } -@@ -184,3 +185,3 @@ int - set_count(char *pattern1, char *pattern2, range_repeat_t *range, -- char *optarg, uint32_t *flags, char *arg) -+ char *optarg, uint32_t *flags, char *arg) - { -@@ -196,14 +197,16 @@ set_count(char *pattern1, char *pattern2, range_repeat_t *range, - fprintf(stderr, "Error: Incorrect pattern for %s, '%s'\n", -- arg, optarg); -- return EINVAL; -+ arg, optarg); -+ return (EINVAL); - } - -- return 0; -+ return (0); - } - --/* validates the value with regular expression and sets low, high, incr -- * according to value at which flag will be set. Sets the flag after. */ -+/* -+ * Validates the value with regular expression and sets low, high, incr -+ * according to value at which flag will be set. Sets the flag after. 
-+ */ - int - set_lhi(char *pattern, range_repeat_t *range, char *optarg, -- int flag, uint32_t *flag_thread, char *arg) -+ int flag, uint32_t *flag_thread, char *arg) - { -@@ -214,3 +217,3 @@ set_lhi(char *pattern, range_repeat_t *range, char *optarg, - arg, optarg); -- return rc; -+ return (rc); - } -@@ -233,3 +236,3 @@ set_lhi(char *pattern, range_repeat_t *range, char *optarg, - -- return 0; -+ return (0); - } -@@ -243,6 +246,6 @@ set_noise(uint64_t *noise, char *optarg, char *arg) - fprintf(stderr, "Error: Incorrect pattern for %s\n", arg); -- return EINVAL; -+ return (EINVAL); - } - -- return 0; -+ return (0); - } -@@ -257,3 +260,3 @@ set_load_params(cmd_args_t *args, char *optarg) - if (search == NULL) -- return ENOMEM; -+ return (ENOMEM); - -@@ -277,3 +280,3 @@ set_load_params(cmd_args_t *args, char *optarg) - -- return rc; -+ return (rc); - } -@@ -281,5 +284,7 @@ set_load_params(cmd_args_t *args, char *optarg) - --/* checks the low, high, increment values against the single value for -+/* -+ * Checks the low, high, increment values against the single value for - * mutual exclusion, for e.g threadcount is mutually exclusive to -- * threadcount_low, ..._high, ..._incr */ -+ * threadcount_low, ..._high, ..._incr -+ */ - int -@@ -289,12 +294,12 @@ check_mutual_exclusive_command_lines(uint32_t flag, char *arg) - fprintf(stderr, "Error: --%s can not be given with --%s_low, " -- "--%s_high or --%s_incr.\n", arg, arg, arg, arg); -- return 0; -+ "--%s_high or --%s_incr.\n", arg, arg, arg, arg); -+ return (0); - } - -- if ((flag & (FLAG_LOW | FLAG_HIGH | FLAG_INCR)) && !(flag & FLAG_SET)){ -+ if ((flag & (FLAG_LOW | FLAG_HIGH | FLAG_INCR)) && !(flag & FLAG_SET)) { - if (flag != (FLAG_LOW | FLAG_HIGH | FLAG_INCR)) { - fprintf(stderr, "Error: One or more values missing " -- "from --%s_low, --%s_high, --%s_incr.\n", -- arg, arg, arg); -- return 0; -+ "from --%s_low, --%s_high, --%s_incr.\n", -+ arg, arg, arg); -+ return (0); - } -@@ -302,3 +307,3 @@ check_mutual_exclusive_command_lines(uint32_t flag, char *arg) - -- return 1; -+ return (1); - } -@@ -309,16 +314,20 @@ print_stats_header(cmd_args_t *args) - if (args->verbose) { -- printf("status name id\tth-cnt\trg-cnt\trg-sz\t" -- "ch-sz\toffset\trg-no\tch-no\tth-dly\tflags\ttime\t" -- "cr-time\trm-time\twr-time\trd-time\twr-data\twr-ch\t" -- "wr-bw\trd-data\trd-ch\trd-bw\n"); -- printf("------------------------------------------------" -- "------------------------------------------------" -- "------------------------------------------------" -- "----------------------------------------------\n"); -+ printf( -+ "status name id\tth-cnt\trg-cnt\trg-sz\t" -+ "ch-sz\toffset\trg-no\tch-no\tth-dly\tflags\ttime\t" -+ "cr-time\trm-time\twr-time\trd-time\twr-data\twr-ch\t" -+ "wr-bw\trd-data\trd-ch\trd-bw\n"); -+ printf( -+ "------------------------------------------------" -+ "------------------------------------------------" -+ "------------------------------------------------" -+ "----------------------------------------------\n"); - } else { -- printf("status name id\t" -- "wr-data\twr-ch\twr-bw\t" -- "rd-data\trd-ch\trd-bw\n"); -- printf("-----------------------------------------" -- "--------------------------------------\n"); -+ printf( -+ "status name id\t" -+ "wr-data\twr-ch\twr-bw\t" -+ "rd-data\trd-ch\trd-bw\n"); -+ printf( -+ "-----------------------------------------" -+ "--------------------------------------\n"); - } -@@ -339,13 +348,13 @@ print_stats_human_readable(cmd_args_t *args, zpios_cmd_t *cmd) - printf("%-12s", args->name ? 
args->name : ZPIOS_NAME); -- printf("%2u\t", cmd->cmd_id); -+ printf("%2u\t", cmd->cmd_id); - - if (args->verbose) { -- printf("%u\t", cmd->cmd_thread_count); -- printf("%u\t", cmd->cmd_region_count); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_size)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_size)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_offset)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_noise)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_noise)); -- printf("%s\t", uint64_to_kmgt(str, cmd->cmd_thread_delay)); -+ printf("%u\t", cmd->cmd_thread_count); -+ printf("%u\t", cmd->cmd_region_count); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_size)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_size)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_offset)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_region_noise)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_chunk_noise)); -+ printf("%s\t", uint64_to_kmgt(str, cmd->cmd_thread_delay)); - printf("%s\t", print_flags(str, cmd->cmd_flags)); -@@ -373,8 +382,8 @@ print_stats_human_readable(cmd_args_t *args, zpios_cmd_t *cmd) - -- printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_data)); -- printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_chunks)); -+ printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_data)); -+ printf("%s\t", uint64_to_kmgt(str, summary_stats->wr_chunks)); - printf("%s\t", kmgt_per_sec(str, summary_stats->wr_data, wr_time)); - -- printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_data)); -- printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_chunks)); -+ printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_data)); -+ printf("%s\t", uint64_to_kmgt(str, summary_stats->rd_chunks)); - printf("%s\n", kmgt_per_sec(str, summary_stats->rd_data, rd_time)); -@@ -395,13 +404,13 @@ print_stats_table(cmd_args_t *args, zpios_cmd_t *cmd) - printf("%-12s", args->name ? 
args->name : ZPIOS_NAME); -- printf("%2u\t", cmd->cmd_id); -+ printf("%2u\t", cmd->cmd_id); - - if (args->verbose) { -- printf("%u\t", cmd->cmd_thread_count); -- printf("%u\t", cmd->cmd_region_count); -- printf("%llu\t", (long long unsigned)cmd->cmd_region_size); -- printf("%llu\t", (long long unsigned)cmd->cmd_chunk_size); -- printf("%llu\t", (long long unsigned)cmd->cmd_offset); -- printf("%u\t", cmd->cmd_region_noise); -- printf("%u\t", cmd->cmd_chunk_noise); -- printf("%u\t", cmd->cmd_thread_delay); -+ printf("%u\t", cmd->cmd_thread_count); -+ printf("%u\t", cmd->cmd_region_count); -+ printf("%llu\t", (long long unsigned)cmd->cmd_region_size); -+ printf("%llu\t", (long long unsigned)cmd->cmd_chunk_size); -+ printf("%llu\t", (long long unsigned)cmd->cmd_offset); -+ printf("%u\t", cmd->cmd_region_noise); -+ printf("%u\t", cmd->cmd_chunk_noise); -+ printf("%u\t", cmd->cmd_thread_delay); - printf("0x%x\t", cmd->cmd_flags); -@@ -420,24 +429,24 @@ print_stats_table(cmd_args_t *args, zpios_cmd_t *cmd) - printf("%ld.%02ld\t", -- (long)summary_stats->total_time.delta.ts_sec, -- (long)summary_stats->total_time.delta.ts_nsec); -+ (long)summary_stats->total_time.delta.ts_sec, -+ (long)summary_stats->total_time.delta.ts_nsec); - printf("%ld.%02ld\t", -- (long)summary_stats->cr_time.delta.ts_sec, -- (long)summary_stats->cr_time.delta.ts_nsec); -+ (long)summary_stats->cr_time.delta.ts_sec, -+ (long)summary_stats->cr_time.delta.ts_nsec); - printf("%ld.%02ld\t", -- (long)summary_stats->rm_time.delta.ts_sec, -- (long)summary_stats->rm_time.delta.ts_nsec); -+ (long)summary_stats->rm_time.delta.ts_sec, -+ (long)summary_stats->rm_time.delta.ts_nsec); - printf("%ld.%02ld\t", -- (long)summary_stats->wr_time.delta.ts_sec, -- (long)summary_stats->wr_time.delta.ts_nsec); -+ (long)summary_stats->wr_time.delta.ts_sec, -+ (long)summary_stats->wr_time.delta.ts_nsec); - printf("%ld.%02ld\t", -- (long)summary_stats->rd_time.delta.ts_sec, -- (long)summary_stats->rd_time.delta.ts_nsec); -+ (long)summary_stats->rd_time.delta.ts_sec, -+ (long)summary_stats->rd_time.delta.ts_nsec); - } - -- printf("%lld\t", (long long unsigned)summary_stats->wr_data); -- printf("%lld\t", (long long unsigned)summary_stats->wr_chunks); -+ printf("%lld\t", (long long unsigned)summary_stats->wr_data); -+ printf("%lld\t", (long long unsigned)summary_stats->wr_chunks); - printf("%.4f\t", (double)summary_stats->wr_data / wr_time); - -- printf("%lld\t", (long long unsigned)summary_stats->rd_data); -- printf("%lld\t", (long long unsigned)summary_stats->rd_chunks); -+ printf("%lld\t", (long long unsigned)summary_stats->rd_data); -+ printf("%lld\t", (long long unsigned)summary_stats->rd_chunks); - printf("%.4f\n", (double)summary_stats->rd_data / rd_time); -diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am -index 2ce8efc..a39a240 100644 ---- a/cmd/zpool/Makefile.am -+++ b/cmd/zpool/Makefile.am -@@ -19,4 +19,4 @@ zpool_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -- --zpool_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la \ -+ $(LIBBLKID) -diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c -index b96fbe4..e38213c 100644 ---- a/cmd/zpool/zpool_main.c -+++ b/cmd/zpool/zpool_main.c -@@ -51,2 +51,3 @@ - #include -+#include - -@@ -198,5 +199,5 @@ static zpool_command_t command_table[] = { - --zpool_command_t *current_command; -+static zpool_command_t *current_command; - static char 
history_str[HIS_MAX_RECORD_LEN]; -- -+static boolean_t log_history = B_TRUE; - static uint_t timestamp_fmt = NODATE; -@@ -258,3 +259,3 @@ get_usage(zpool_help_t idx) { - case HELP_STATUS: -- return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval " -+ return (gettext("\tstatus [-vxD] [-T d|u] [pool] ... [interval " - "[count]]\n")); -@@ -267,3 +268,3 @@ get_usage(zpool_help_t idx) { - case HELP_GET: -- return (gettext("\tget <\"all\" | property[,...]> " -+ return (gettext("\tget [-p] <\"all\" | property[,...]> " - " ...\n")); -@@ -833,2 +834,3 @@ zpool_do_create(int argc, char **argv) - case 'm': -+ /* Equivalent to -O mountpoint=optarg */ - mountpoint = optarg; -@@ -871,4 +873,14 @@ zpool_do_create(int argc, char **argv) - -- if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) -+ /* -+ * Mountpoints are checked and then added later. -+ * Uniquely among properties, they can be specified -+ * more than once, to avoid conflict with -m. -+ */ -+ if (0 == strcmp(optarg, -+ zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { -+ mountpoint = propval; -+ } else if (add_prop_list(optarg, propval, &fsprops, -+ B_FALSE)) { - goto errout; -+ } - break; -@@ -989,2 +1001,14 @@ zpool_do_create(int argc, char **argv) - -+ /* -+ * Now that the mountpoint's validity has been checked, ensure that -+ * the property is set appropriately prior to creating the pool. -+ */ -+ if (mountpoint != NULL) { -+ ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), -+ mountpoint, &fsprops, B_FALSE); -+ if (ret != 0) -+ goto errout; -+ } -+ -+ ret = 1; - if (dryrun) { -@@ -1023,4 +1047,5 @@ zpool_do_create(int argc, char **argv) - -- if (add_prop_list(propname, ZFS_FEATURE_ENABLED, -- &props, B_TRUE) != 0) -+ ret = add_prop_list(propname, -+ ZFS_FEATURE_ENABLED, &props, B_TRUE); -+ if (ret != 0) - goto errout; -@@ -1028,2 +1053,4 @@ zpool_do_create(int argc, char **argv) - } -+ -+ ret = 1; - if (zpool_create(g_zfs, poolname, -@@ -1033,7 +1060,2 @@ zpool_do_create(int argc, char **argv) - if (pool != NULL) { -- if (mountpoint != NULL) -- verify(zfs_prop_set(pool, -- zfs_prop_to_name( -- ZFS_PROP_MOUNTPOINT), -- mountpoint) == 0); - if (zfs_mount(pool, NULL, 0) == 0) -@@ -1121,3 +1143,6 @@ zpool_do_destroy(int argc, char **argv) - -- ret = (zpool_destroy(zhp) != 0); -+ /* The history must be logged as part of the export */ -+ log_history = B_FALSE; -+ -+ ret = (zpool_destroy(zhp, history_str) != 0); - -@@ -1185,6 +1210,9 @@ zpool_do_export(int argc, char **argv) - -+ /* The history must be logged as part of the export */ -+ log_history = B_FALSE; -+ - if (hardforce) { -- if (zpool_export_force(zhp) != 0) -+ if (zpool_export_force(zhp, history_str) != 0) - ret = 1; -- } else if (zpool_export(zhp, force) != 0) { -+ } else if (zpool_export(zhp, force, history_str) != 0) { - ret = 1; -@@ -1583,3 +1611,4 @@ show_import(nvlist_t *config) - nvlist_t *nvroot; -- int reason; -+ zpool_status_t reason; -+ zpool_errata_t errata; - const char *health; -@@ -1602,3 +1631,3 @@ show_import(nvlist_t *config) - -- reason = zpool_import_status(config, &msgid); -+ reason = zpool_import_status(config, &msgid, &errata); - -@@ -1690,2 +1719,7 @@ show_import(nvlist_t *config) - -+ case ZPOOL_STATUS_ERRATA: -+ (void) printf(gettext(" status: Errata #%d detected.\n"), -+ errata); -+ break; -+ - default: -@@ -1711,2 +1745,30 @@ show_import(nvlist_t *config) - "identifier and\n\tthe '-f' flag.\n")); -+ } else if (reason == ZPOOL_STATUS_ERRATA) { -+ switch (errata) { -+ case ZPOOL_ERRATA_NONE: -+ break; -+ -+ case ZPOOL_ERRATA_ZOL_2094_SCRUB: -+ 
(void) printf(gettext(" action: The pool can " -+ "be imported using its name or numeric " -+ "identifier,\n\thowever there is a compat" -+ "ibility issue which should be corrected" -+ "\n\tby running 'zpool scrub'\n")); -+ break; -+ -+ case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: -+ (void) printf(gettext(" action: The pool can" -+ "not be imported with this version of ZFS " -+ "due to\n\tan active asynchronous destroy. " -+ "Revert to an earlier version\n\tand " -+ "allow the destroy to complete before " -+ "updating.\n")); -+ break; -+ -+ default: -+ /* -+ * All errata must contain an action message. -+ */ -+ assert(0); -+ } - } else { -@@ -1954,3 +2016,3 @@ zpool_do_import(int argc, char **argv) - /* check options */ -- while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:rR:T:VX")) != -1) { -+ while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:tT:VX")) != -1) { - switch (c) { -@@ -2016,2 +2078,6 @@ zpool_do_import(int argc, char **argv) - break; -+ case 't': -+ flags |= ZFS_IMPORT_TEMP_NAME; -+ break; -+ - case 'T': -@@ -2543,3 +2609,3 @@ get_columns(void) - -- return columns; -+ return (columns); - } -@@ -4100,3 +4166,4 @@ status_callback(zpool_handle_t *zhp, void *data) - char *msgid; -- int reason; -+ zpool_status_t reason; -+ zpool_errata_t errata; - const char *health; -@@ -4106,3 +4173,3 @@ status_callback(zpool_handle_t *zhp, void *data) - config = zpool_get_config(zhp, NULL); -- reason = zpool_get_status(zhp, &msgid); -+ reason = zpool_get_status(zhp, &msgid, &errata); - -@@ -4324,2 +4391,24 @@ status_callback(zpool_handle_t *zhp, void *data) - -+ case ZPOOL_STATUS_ERRATA: -+ (void) printf(gettext("status: Errata #%d detected.\n"), -+ errata); -+ -+ switch (errata) { -+ case ZPOOL_ERRATA_NONE: -+ break; -+ -+ case ZPOOL_ERRATA_ZOL_2094_SCRUB: -+ (void) printf(gettext("action: To correct the issue " -+ "run 'zpool scrub'.\n")); -+ break; -+ -+ default: -+ /* -+ * All errata which allow the pool to be imported -+ * must contain an action message. -+ */ -+ assert(0); -+ } -+ break; -+ - default: -@@ -4587,9 +4676,2 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) - --#if 0 -- /* -- * XXX: This code can be enabled when Illumos commit -- * 4445fffbbb1ea25fd0e9ea68b9380dd7a6709025 is merged. -- * It reworks the history logging among other things. -- */ -- - /* -@@ -4602,3 +4684,2 @@ upgrade_cb(zpool_handle_t *zhp, void *arg) - log_history = B_FALSE; --#endif - } -@@ -4702,2 +4783,10 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) - } -+ /* -+ * If they did "zpool upgrade -a", then we could -+ * be doing ioctls to different pools. We need -+ * to log this history once to each pool, and bypass -+ * the normal history logging that happens in main(). 
-+ */ -+ (void) zpool_log_history(g_zfs, history_str); -+ log_history = B_FALSE; - } -@@ -4957,4 +5046,4 @@ typedef struct hist_cbdata { - boolean_t first; -- int longfmt; -- int internal; -+ boolean_t longfmt; -+ boolean_t internal; - } hist_cbdata_t; -@@ -4970,17 +5059,4 @@ get_history_one(zpool_handle_t *zhp, void *data) - uint_t numrecords; -- char *cmdstr; -- char *pathstr; -- uint64_t dst_time; -- time_t tsec; -- struct tm t; -- char tbuf[30]; - int ret, i; -- uint64_t who; -- struct passwd *pwd; -- char *hostname; -- char *zonename; -- char internalstr[MAXPATHLEN]; - hist_cbdata_t *cb = (hist_cbdata_t *)data; -- uint64_t txg; -- uint64_t ievent; - -@@ -4996,32 +5072,71 @@ get_history_one(zpool_handle_t *zhp, void *data) - for (i = 0; i < numrecords; i++) { -- if (nvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME, -- &dst_time) != 0) -- continue; -+ nvlist_t *rec = records[i]; -+ char tbuf[30] = ""; - -- /* is it an internal event or a standard event? */ -- if (nvlist_lookup_string(records[i], ZPOOL_HIST_CMD, -- &cmdstr) != 0) { -- if (cb->internal == 0) -- continue; -+ if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { -+ time_t tsec; -+ struct tm t; -+ -+ tsec = fnvlist_lookup_uint64(records[i], -+ ZPOOL_HIST_TIME); -+ (void) localtime_r(&tsec, &t); -+ (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); -+ } - -- if (nvlist_lookup_uint64(records[i], -- ZPOOL_HIST_INT_EVENT, &ievent) != 0) -+ if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { -+ (void) printf("%s %s", tbuf, -+ fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); -+ } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { -+ int ievent = -+ fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); -+ if (!cb->internal) -+ continue; -+ if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { -+ (void) printf("%s unrecognized record:\n", -+ tbuf); -+ dump_nvlist(rec, 4); -+ continue; -+ } -+ (void) printf("%s [internal %s txg:%lld] %s", tbuf, -+ zfs_history_event_names[ievent], -+ (longlong_t) fnvlist_lookup_uint64( -+ rec, ZPOOL_HIST_TXG), -+ fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); -+ } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { -+ if (!cb->internal) - continue; -- verify(nvlist_lookup_uint64(records[i], -- ZPOOL_HIST_TXG, &txg) == 0); -- verify(nvlist_lookup_string(records[i], -- ZPOOL_HIST_INT_STR, &pathstr) == 0); -- if (ievent >= LOG_END) -+ (void) printf("%s [txg:%lld] %s", tbuf, -+ (longlong_t) fnvlist_lookup_uint64( -+ rec, ZPOOL_HIST_TXG), -+ fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); -+ if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { -+ (void) printf(" %s (%llu)", -+ fnvlist_lookup_string(rec, -+ ZPOOL_HIST_DSNAME), -+ (u_longlong_t)fnvlist_lookup_uint64(rec, -+ ZPOOL_HIST_DSID)); -+ } -+ (void) printf(" %s", fnvlist_lookup_string(rec, -+ ZPOOL_HIST_INT_STR)); -+ } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { -+ if (!cb->internal) -+ continue; -+ (void) printf("%s ioctl %s\n", tbuf, -+ fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); -+ if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { -+ (void) printf(" input:\n"); -+ dump_nvlist(fnvlist_lookup_nvlist(rec, -+ ZPOOL_HIST_INPUT_NVL), 8); -+ } -+ if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { -+ (void) printf(" output:\n"); -+ dump_nvlist(fnvlist_lookup_nvlist(rec, -+ ZPOOL_HIST_OUTPUT_NVL), 8); -+ } -+ } else { -+ if (!cb->internal) - continue; -- (void) snprintf(internalstr, -- sizeof (internalstr), -- "[internal %s txg:%llu] %s", -- zfs_history_event_names[ievent], (u_longlong_t)txg, -- pathstr); -- cmdstr = internalstr; -+ (void) printf("%s unrecognized record:\n", tbuf); -+ 
dump_nvlist(rec, 4); - } -- tsec = dst_time; -- (void) localtime_r(&tsec, &t); -- (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); -- (void) printf("%s %s", tbuf, cmdstr); - -@@ -5032,22 +5147,16 @@ get_history_one(zpool_handle_t *zhp, void *data) - (void) printf(" ["); -- if (nvlist_lookup_uint64(records[i], -- ZPOOL_HIST_WHO, &who) == 0) { -- pwd = getpwuid((uid_t)who); -- if (pwd) -- (void) printf("user %s on", -- pwd->pw_name); -- else -- (void) printf("user %d on", -- (int)who); -- } else { -- (void) printf(gettext("no info]\n")); -- continue; -+ if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { -+ uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); -+ struct passwd *pwd = getpwuid(who); -+ (void) printf("user %d ", (int)who); -+ if (pwd != NULL) -+ (void) printf("(%s) ", pwd->pw_name); - } -- if (nvlist_lookup_string(records[i], -- ZPOOL_HIST_HOST, &hostname) == 0) { -- (void) printf(" %s", hostname); -+ if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { -+ (void) printf("on %s", -+ fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); - } -- if (nvlist_lookup_string(records[i], -- ZPOOL_HIST_ZONE, &zonename) == 0) { -- (void) printf(":%s", zonename); -+ if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { -+ (void) printf(":%s", -+ fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); - } -@@ -5068,4 +5177,2 @@ get_history_one(zpool_handle_t *zhp, void *data) - */ -- -- - int -@@ -5082,6 +5189,6 @@ zpool_do_history(int argc, char **argv) - case 'l': -- cbdata.longfmt = 1; -+ cbdata.longfmt = B_TRUE; - break; - case 'i': -- cbdata.internal = 1; -+ cbdata.internal = B_TRUE; - break; -@@ -5124,6 +5231,6 @@ zpool_do_events_short(nvlist_t *nvl) - (void) ctime_r((const time_t *)&tv[0], ctime_str); -- (void) strncpy(str, ctime_str+4, 6); /* 'Jun 30' */ -- (void) strncpy(str+7, ctime_str+20, 4); /* '1993' */ -- (void) strncpy(str+12, ctime_str+11, 8); /* '21:49:08' */ -- (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]);/* '.123456789' */ -+ (void) strncpy(str, ctime_str+4, 6); /* 'Jun 30' */ -+ (void) strncpy(str+7, ctime_str+20, 4); /* '1993' */ -+ (void) strncpy(str+12, ctime_str+11, 8); /* '21:49:08' */ -+ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ - (void) printf(gettext("%s "), str); -@@ -5235,6 +5342,6 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) - printf(gettext("%*s%s[%d] = %s\n"), -- depth, "", name, i, "(embedded nvlist)"); -+ depth, "", name, i, "(embedded nvlist)"); - zpool_do_events_nvprint(val[i], depth + 8); - printf(gettext("%*s(end %s[%i])\n"), -- depth, "", name, i); -+ depth, "", name, i); - } -@@ -5316,3 +5423,4 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) - for (i = 0; i < nelem; i++) -- printf(gettext("0x%llx "), (u_longlong_t)val[i]); -+ printf(gettext("0x%llx "), -+ (u_longlong_t)val[i]); - -@@ -5327,3 +5435,16 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) - for (i = 0; i < nelem; i++) -- printf(gettext("0x%llx "), (u_longlong_t)val[i]); -+ printf(gettext("0x%llx "), -+ (u_longlong_t)val[i]); -+ -+ break; -+ } -+ -+ case DATA_TYPE_STRING_ARRAY: { -+ char **str; -+ uint_t i, nelem; -+ -+ (void) nvpair_value_string_array(nvp, &str, &nelem); -+ for (i = 0; i < nelem; i++) -+ printf(gettext("\"%s\" "), -+ str[i] ? 
str[i] : ""); - -@@ -5332,3 +5453,2 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) - -- case DATA_TYPE_STRING_ARRAY: - case DATA_TYPE_BOOLEAN_ARRAY: -@@ -5349,6 +5469,6 @@ zpool_do_events_next(ev_opts_t *opts) - nvlist_t *nvl; -- int cleanup_fd, ret, dropped; -+ int zevent_fd, ret, dropped; - -- cleanup_fd = open(ZFS_DEV, O_RDWR); -- VERIFY(cleanup_fd >= 0); -+ zevent_fd = open(ZFS_DEV, O_RDWR); -+ VERIFY(zevent_fd >= 0); - -@@ -5359,3 +5479,3 @@ zpool_do_events_next(ev_opts_t *opts) - ret = zpool_events_next(g_zfs, &nvl, &dropped, -- !!opts->follow, cleanup_fd); -+ (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); - if (ret || nvl == NULL) -@@ -5377,3 +5497,3 @@ zpool_do_events_next(ev_opts_t *opts) - -- VERIFY(0 == close(cleanup_fd)); -+ VERIFY(0 == close(zevent_fd)); - -@@ -5435,3 +5555,3 @@ zpool_do_events(int argc, char **argv) - -- return ret; -+ return (ret); - } -@@ -5468,4 +5588,4 @@ get_callback(zpool_handle_t *zhp, void *data) - } else { -- if (zpool_get_prop(zhp, pl->pl_prop, value, -- sizeof (value), &srctype) != 0) -+ if (zpool_get_prop_literal(zhp, pl->pl_prop, value, -+ sizeof (value), &srctype, cbp->cb_literal) != 0) - continue; -@@ -5485,5 +5605,22 @@ zpool_do_get(int argc, char **argv) - zprop_list_t fake_name = { 0 }; -- int ret; -+ int c, ret; - -- if (argc < 2) { -+ /* check options */ -+ while ((c = getopt(argc, argv, "p")) != -1) { -+ switch (c) { -+ case 'p': -+ cb.cb_literal = B_TRUE; -+ break; -+ -+ case '?': -+ (void) fprintf(stderr, gettext("invalid option '%c'\n"), -+ optopt); -+ usage(B_FALSE); -+ } -+ } -+ -+ argc -= optind; -+ argv += optind; -+ -+ if (argc < 1) { - (void) fprintf(stderr, gettext("missing property " -@@ -5501,6 +5638,8 @@ zpool_do_get(int argc, char **argv) - -- if (zprop_get_list(g_zfs, argv[1], &cb.cb_proplist, -- ZFS_TYPE_POOL) != 0) -+ if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) - usage(B_FALSE); - -+ argc--; -+ argv++; -+ - if (cb.cb_proplist != NULL) { -@@ -5512,3 +5651,3 @@ zpool_do_get(int argc, char **argv) - -- ret = for_each_pool(argc - 2, argv + 2, B_TRUE, &cb.cb_proplist, -+ ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, - get_callback, &cb); -@@ -5630,4 +5769,3 @@ main(int argc, char **argv) - */ -- if ((strcmp(cmdname, "-?") == 0) || -- strcmp(cmdname, "--help") == 0) -+ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) - usage(B_TRUE); -@@ -5639,4 +5777,3 @@ main(int argc, char **argv) - -- zpool_set_history_str("zpool", argc, argv, history_str); -- verify(zpool_stage_history(g_zfs, history_str) == 0); -+ zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); - -@@ -5668,2 +5805,5 @@ main(int argc, char **argv) - -+ if (ret == 0 && log_history) -+ (void) zpool_log_history(g_zfs, history_str); -+ - libzfs_fini(g_zfs); -diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h -index b67ff8b..1b4ce51 100644 ---- a/cmd/zpool/zpool_util.h -+++ b/cmd/zpool/zpool_util.h -@@ -46,3 +46,4 @@ uint_t num_logs(nvlist_t *nv); - nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, -- int check_rep, boolean_t replacing, boolean_t dryrun, int argc, char **argv); -+ int check_rep, boolean_t replacing, boolean_t dryrun, int argc, -+ char **argv); - nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, -diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c -index 723e10b..316e291 100644 ---- a/cmd/zpool/zpool_vdev.c -+++ b/cmd/zpool/zpool_vdev.c -@@ -82,3 +82,3 @@ - #else --#define blkid_cache void * -+#define 
blkid_cache void * - #endif /* HAVE_LIBBLKID */ -@@ -108,8 +108,47 @@ typedef struct vdev_disk_db_entry - static vdev_disk_db_entry_t vdev_disk_database[] = { -+ {"ATA ADATA SSD S396 3", 8192}, -+ {"ATA APPLE SSD SM128E", 8192}, -+ {"ATA APPLE SSD SM256E", 8192}, -+ {"ATA APPLE SSD SM512E", 8192}, -+ {"ATA APPLE SSD SM768E", 8192}, -+ {"ATA C400-MTFDDAC064M", 8192}, -+ {"ATA C400-MTFDDAC128M", 8192}, -+ {"ATA C400-MTFDDAC256M", 8192}, -+ {"ATA C400-MTFDDAC512M", 8192}, - {"ATA Corsair Force 3 ", 8192}, -+ {"ATA Corsair Force GS", 8192}, - {"ATA INTEL SSDSA2CT04", 8192}, -+ {"ATA INTEL SSDSA2BZ10", 8192}, -+ {"ATA INTEL SSDSA2BZ20", 8192}, -+ {"ATA INTEL SSDSA2BZ30", 8192}, -+ {"ATA INTEL SSDSA2CW04", 8192}, -+ {"ATA INTEL SSDSA2CW08", 8192}, -+ {"ATA INTEL SSDSA2CW12", 8192}, - {"ATA INTEL SSDSA2CW16", 8192}, -+ {"ATA INTEL SSDSA2CW30", 8192}, -+ {"ATA INTEL SSDSA2CW60", 8192}, -+ {"ATA INTEL SSDSC2BA10", 8192}, -+ {"ATA INTEL SSDSC2BA20", 8192}, -+ {"ATA INTEL SSDSC2BA40", 8192}, -+ {"ATA INTEL SSDSC2BA80", 8192}, -+ {"ATA INTEL SSDSC2BB08", 8192}, -+ {"ATA INTEL SSDSC2BB12", 8192}, -+ {"ATA INTEL SSDSC2BB16", 8192}, -+ {"ATA INTEL SSDSC2BB24", 8192}, -+ {"ATA INTEL SSDSC2BB30", 8192}, -+ {"ATA INTEL SSDSC2BB40", 8192}, -+ {"ATA INTEL SSDSC2BB48", 8192}, -+ {"ATA INTEL SSDSC2BB60", 8192}, -+ {"ATA INTEL SSDSC2BB80", 8192}, -+ {"ATA INTEL SSDSC2CT06", 8192}, -+ {"ATA INTEL SSDSC2CT12", 8192}, - {"ATA INTEL SSDSC2CT18", 8192}, -+ {"ATA INTEL SSDSC2CT24", 8192}, -+ {"ATA INTEL SSDSC2CW06", 8192}, - {"ATA INTEL SSDSC2CW12", 8192}, -+ {"ATA INTEL SSDSC2CW18", 8192}, -+ {"ATA INTEL SSDSC2CW24", 8192}, -+ {"ATA INTEL SSDSC2CW48", 8192}, - {"ATA KINGSTON SH100S3", 8192}, -+ {"ATA KINGSTON SH103S3", 8192}, - {"ATA M4-CT064M4SSD2 ", 8192}, -@@ -119,2 +158,3 @@ static vdev_disk_db_entry_t vdev_disk_database[] = { - {"ATA OCZ-AGILITY2 ", 8192}, -+ {"ATA OCZ-AGILITY3 ", 8192}, - {"ATA OCZ-VERTEX2 3.5 ", 8192}, -@@ -123,4 +163,18 @@ static vdev_disk_db_entry_t vdev_disk_database[] = { - {"ATA OCZ-VERTEX3 MI ", 8192}, -+ {"ATA OCZ-VERTEX4 ", 8192}, -+ {"ATA SAMSUNG MZ7WD120", 8192}, -+ {"ATA SAMSUNG MZ7WD240", 8192}, -+ {"ATA SAMSUNG MZ7WD480", 8192}, -+ {"ATA SAMSUNG MZ7WD960", 8192}, - {"ATA SAMSUNG SSD 830 ", 8192}, - {"ATA Samsung SSD 840 ", 8192}, -+ {"ATA SanDisk SSD U100", 8192}, -+ {"ATA TOSHIBA THNSNH06", 8192}, -+ {"ATA TOSHIBA THNSNH12", 8192}, -+ {"ATA TOSHIBA THNSNH25", 8192}, -+ {"ATA TOSHIBA THNSNH51", 8192}, -+ {"ATA APPLE SSD TS064C", 4096}, -+ {"ATA APPLE SSD TS128C", 4096}, -+ {"ATA APPLE SSD TS256C", 4096}, -+ {"ATA APPLE SSD TS512C", 4096}, - {"ATA INTEL SSDSA2M040", 4096}, -@@ -128,3 +182,10 @@ static vdev_disk_db_entry_t vdev_disk_database[] = { - {"ATA INTEL SSDSA2M160", 4096}, -- /* Imported from Open Solaris*/ -+ {"ATA INTEL SSDSC2MH12", 4096}, -+ {"ATA INTEL SSDSC2MH25", 4096}, -+ {"ATA OCZ CORE_SSD ", 4096}, -+ {"ATA OCZ-VERTEX ", 4096}, -+ {"ATA SAMSUNG MCCOE32G", 4096}, -+ {"ATA SAMSUNG MCCOE64G", 4096}, -+ {"ATA SAMSUNG SSD PM80", 4096}, -+ /* Imported from Open Solaris */ - {"ATA MARVELL SD88SA02", 4096}, -@@ -148,2 +209,4 @@ static vdev_disk_db_entry_t vdev_disk_database[] = { - {"OI COMSTAR ", 8192}, -+ {"SUN COMSTAR ", 8192}, -+ {"NETAPP LUN ", 8192}, - #endif -@@ -170,6 +233,6 @@ check_sector_size_database(char *path, int *sector_size) - /* Prepare INQUIRY command */ -- memset(&io_hdr, 0, sizeof(sg_io_hdr_t)); -+ memset(&io_hdr, 0, sizeof (sg_io_hdr_t)); - io_hdr.interface_id = 'S'; -- io_hdr.cmd_len = sizeof(inq_cmd_blk); -- io_hdr.mx_sb_len = sizeof(sense_buffer); 
-+ io_hdr.cmd_len = sizeof (inq_cmd_blk); -+ io_hdr.mx_sb_len = sizeof (sense_buffer); - io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; -@@ -179,3 +242,3 @@ check_sector_size_database(char *path, int *sector_size) - io_hdr.sbp = sense_buffer; -- io_hdr.timeout = 10; /* 10 milliseconds is ample time */ -+ io_hdr.timeout = 10; /* 10 milliseconds is ample time */ - -@@ -324,3 +387,3 @@ check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare) - vdev_error(gettext("%s contains a filesystem of " -- "type '%s'\n"), path, value); -+ "type '%s'\n"), path, value); - } -@@ -342,3 +405,3 @@ static int - check_disk(const char *path, blkid_cache cache, int force, -- boolean_t isspare, boolean_t iswholedisk) -+ boolean_t isspare, boolean_t iswholedisk) - { -@@ -351,3 +414,3 @@ check_disk(const char *path, blkid_cache cache, int force, - if (!iswholedisk) -- return check_slice(path, cache, force, isspare); -+ return (check_slice(path, cache, force, isspare)); - -@@ -363,3 +426,3 @@ check_disk(const char *path, blkid_cache cache, int force, - check_error(errno); -- return -1; -+ return (-1); - } -@@ -370,3 +433,3 @@ check_disk(const char *path, blkid_cache cache, int force, - if (force) { -- return 0; -+ return (0); - } else { -@@ -375,3 +438,3 @@ check_disk(const char *path, blkid_cache cache, int force, - "information in the MBR.\n"), path); -- return -1; -+ return (-1); - } -@@ -390,3 +453,3 @@ check_disk(const char *path, blkid_cache cache, int force, - /* Partitions will no be created using the backup */ -- return 0; -+ return (0); - } else { -@@ -394,3 +457,3 @@ check_disk(const char *path, blkid_cache cache, int force, - "EFI label.\n"), path); -- return -1; -+ return (-1); - } -@@ -425,3 +488,3 @@ static int - check_device(const char *path, boolean_t force, -- boolean_t isspare, boolean_t iswholedisk) -+ boolean_t isspare, boolean_t iswholedisk) - { -@@ -439,3 +502,3 @@ check_device(const char *path, boolean_t force, - check_error(err); -- return -1; -+ return (-1); - } -@@ -445,3 +508,3 @@ check_device(const char *path, boolean_t force, - check_error(err); -- return -1; -+ return (-1); - } -@@ -450,3 +513,3 @@ check_device(const char *path, boolean_t force, - -- return check_disk(path, cache, force, isspare, iswholedisk); -+ return (check_disk(path, cache, force, isspare, iswholedisk)); - } -@@ -465,3 +528,3 @@ is_whole_disk(const char *path) - struct dk_gpt *label; -- int fd; -+ int fd; - -@@ -486,3 +549,3 @@ static int - is_shorthand_path(const char *arg, char *path, -- struct stat64 *statbuf, boolean_t *wholedisk) -+ struct stat64 *statbuf, boolean_t *wholedisk) - { -@@ -497,4 +560,4 @@ is_shorthand_path(const char *arg, char *path, - -- strlcpy(path, arg, sizeof(path)); -- memset(statbuf, 0, sizeof(*statbuf)); -+ strlcpy(path, arg, sizeof (path)); -+ memset(statbuf, 0, sizeof (*statbuf)); - *wholedisk = B_FALSE; -@@ -1075,3 +1138,3 @@ zero_label(char *path) - -- return 0; -+ return (0); - } -@@ -1164,3 +1227,3 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) - if (!is_exclusive || !is_spare(NULL, udevpath)) { -- ret = strncmp(udevpath,UDISK_ROOT,strlen(UDISK_ROOT)); -+ ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); - if (ret == 0) { -@@ -1175,3 +1238,3 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) - -- ret = zpool_label_disk_wait(udevpath, 1000); -+ ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); - if (ret) { -@@ -1238,3 +1301,3 @@ check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, - verify(!nvlist_lookup_uint64(nv, -- ZPOOL_CONFIG_WHOLE_DISK, 
&wholedisk)); -+ ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); - -@@ -1441,4 +1504,4 @@ construct_spec(nvlist_t *props, int argc, char **argv) - zpool_no_memory(); -- if ((nv = make_leaf_vdev(props, argv[c], B_FALSE)) -- == NULL) -+ if ((nv = make_leaf_vdev(props, argv[c], -+ B_FALSE)) == NULL) - return (NULL); -@@ -1497,3 +1560,4 @@ construct_spec(nvlist_t *props, int argc, char **argv) - */ -- if ((nv = make_leaf_vdev(props, argv[0], is_log)) == NULL) -+ if ((nv = make_leaf_vdev(props, argv[0], -+ is_log)) == NULL) - return (NULL); -diff --git a/cmd/zstreamdump/Makefile.am b/cmd/zstreamdump/Makefile.am -index 3d7ec41..d6c64f5 100644 ---- a/cmd/zstreamdump/Makefile.am -+++ b/cmd/zstreamdump/Makefile.am -@@ -15,4 +15,5 @@ zstreamdump_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --zstreamdump_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+zstreamdump_LDADD += $(ZLIB) -diff --git a/cmd/ztest/Makefile.am b/cmd/ztest/Makefile.am -index 3989201..a3cd834 100644 ---- a/cmd/ztest/Makefile.am -+++ b/cmd/ztest/Makefile.am -@@ -17,4 +17,5 @@ ztest_LDADD = \ - $(top_builddir)/lib/libzpool/libzpool.la \ -- $(top_builddir)/lib/libzfs/libzfs.la -+ $(top_builddir)/lib/libzfs/libzfs.la \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la - --ztest_LDFLAGS = -pthread -lm $(ZLIB) -lrt -ldl $(LIBUUID) $(LIBBLKID) -+ztest_LDADD += -lm -ldl -diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c -index 93a5f1e..d392a62 100644 ---- a/cmd/ztest/ztest.c -+++ b/cmd/ztest/ztest.c -@@ -22,4 +22,5 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ -@@ -108,2 +109,3 @@ - #include -+#include - #include -@@ -112,2 +114,3 @@ - #include -+#include - #include -@@ -207,2 +210,3 @@ enum ztest_io_type { - ZTEST_IO_SETATTR, -+ ZTEST_IO_REWRITE, - ZTEST_IO_TYPES -@@ -364,3 +368,3 @@ ztest_info_t ztest_info[] = { - { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, -- { ztest_reguid, 1, &zopt_sometimes }, -+ { ztest_reguid, 1, &zopt_rarely }, - { ztest_spa_rename, 1, &zopt_rarely }, -@@ -369,3 +373,3 @@ ztest_info_t ztest_info[] = { - { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, -- { ztest_vdev_attach_detach, 1, &zopt_rarely }, -+ { ztest_vdev_attach_detach, 1, &zopt_sometimes }, - { ztest_vdev_LUN_growth, 1, &zopt_rarely }, -@@ -787,2 +791,14 @@ ztest_kill(ztest_shared_t *zs) - zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); -+ -+ /* -+ * Before we kill off ztest, make sure that the config is updated. -+ * See comment above spa_config_sync(). -+ */ -+ mutex_enter(&spa_namespace_lock); -+ spa_config_sync(ztest_spa, B_FALSE, B_FALSE); -+ mutex_exit(&spa_namespace_lock); -+ -+ if (ztest_opts.zo_verbose >= 3) -+ zfs_dbgmsg_print(FTAG); -+ - (void) kill(getpid(), SIGKILL); -@@ -1033,5 +1049,4 @@ ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, - -- error = dsl_prop_set(osname, propname, -- (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), -- sizeof (value), 1, &value); -+ error = dsl_prop_set_int(osname, propname, -+ (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); - -@@ -1044,4 +1059,3 @@ ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, - setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); -- VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval), -- 1, &curval, setpoint), ==, 0); -+ VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); - -@@ -1901,2 +1915,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (error == 0) { -+ blkptr_t *obp = dmu_buf_get_blkptr(db); -+ if (obp) { -+ ASSERT(BP_IS_HOLE(bp)); -+ *bp = *obp; -+ } -+ - zgd->zgd_db = db; -@@ -2049,2 +2069,5 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) - -+ /* -+ * No object was found. -+ */ - if (od->od_object == 0) -@@ -2164,2 +2187,3 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) - { -+ int err; - ztest_block_tag_t wbt; -@@ -2218,2 +2242,21 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) - break; -+ -+ case ZTEST_IO_REWRITE: -+ (void) rw_enter(&ztest_name_lock, RW_READER); -+ err = ztest_dsl_prop_set_uint64(zd->zd_name, -+ ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), -+ B_FALSE); -+ VERIFY(err == 0 || err == ENOSPC); -+ err = ztest_dsl_prop_set_uint64(zd->zd_name, -+ ZFS_PROP_COMPRESSION, -+ ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), -+ B_FALSE); -+ VERIFY(err == 0 || err == ENOSPC); -+ (void) rw_exit(&ztest_name_lock); -+ -+ VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, -+ DMU_READ_NO_PREFETCH)); -+ -+ (void) ztest_write(zd, object, offset, blocksize, data); -+ break; - } -@@ -2305,2 +2348,7 @@ ztest_zil_remount(ztest_ds_t *zd, uint64_t id) - -+ /* -+ * We grab the zd_dirobj_lock to ensure that no other thread is -+ * updating the zil (i.e. adding in-memory log records) and the -+ * zd_zilog_lock to block any I/O. -+ */ - mutex_enter(&zd->zd_dirobj_lock); -@@ -2336,3 +2384,3 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) - VERIFY3U(ENOENT, ==, -- spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); -+ spa_create("ztest_bad_file", nvroot, NULL, NULL)); - nvlist_free(nvroot); -@@ -2344,3 +2392,3 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) - VERIFY3U(ENOENT, ==, -- spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); -+ spa_create("ztest_bad_mirror", nvroot, NULL, NULL)); - nvlist_free(nvroot); -@@ -2353,3 +2401,3 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) - nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); -- VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); -+ VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL)); - nvlist_free(nvroot); -@@ -2411,3 +2459,3 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) - zpool_prop_to_name(ZPOOL_PROP_VERSION), version); -- VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0); -+ VERIFY3S(spa_create(name, nvroot, props, NULL), ==, 0); - fnvlist_free(nvroot); -@@ -2486,4 +2534,3 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) - mutex_enter(&ztest_vdev_lock); -- leaves = -- MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; -+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; - -@@ -2509,3 +2556,3 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) - * grab a reference on the dataset which may cause -- * dmu_objset_destroy() to fail with EBUSY thus -+ * dsl_destroy_head() to fail with EBUSY thus - * leaving the dataset in an inconsistent state. 
-@@ -2741,3 +2788,3 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) - uint64_t oldguid, pguid; -- size_t oldsize, newsize; -+ uint64_t oldsize, newsize; - char *oldpath, *newpath; -@@ -2902,4 +2949,4 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) - "returned %d, expected %d", -- oldpath, (longlong_t)oldsize, newpath, -- (longlong_t)newsize, replacing, error, expected_error); -+ oldpath, oldsize, newpath, -+ newsize, replacing, error, expected_error); - } -@@ -3198,3 +3245,3 @@ ztest_objset_destroy_cb(const char *name, void *arg) - */ -- VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os)); -+ VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os)); - error = dmu_object_info(os, ZTEST_DIROBJ, &doi); -@@ -3206,3 +3253,3 @@ ztest_objset_destroy_cb(const char *name, void *arg) - } -- dmu_objset_rele(os, FTAG); -+ dmu_objset_disown(os, FTAG); - -@@ -3211,3 +3258,7 @@ ztest_objset_destroy_cb(const char *name, void *arg) - */ -- VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE)); -+ if (strchr(name, '@') != NULL) { -+ VERIFY0(dsl_destroy_snapshot(name, B_FALSE)); -+ } else { -+ VERIFY0(dsl_destroy_head(name)); -+ } - return (0); -@@ -3221,7 +3272,5 @@ ztest_snapshot_create(char *osname, uint64_t id) - -- (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, -- (u_longlong_t)id); -+ (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); - -- error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, -- NULL, NULL, B_FALSE, B_FALSE, -1); -+ error = dmu_objset_snapshot_one(osname, snapname); - if (error == ENOSPC) { -@@ -3230,4 +3279,6 @@ ztest_snapshot_create(char *osname, uint64_t id) - } -- if (error != 0 && error != EEXIST) -- fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error); -+ if (error != 0 && error != EEXIST) { -+ fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, -+ snapname, error); -+ } - return (B_TRUE); -@@ -3244,3 +3295,3 @@ ztest_snapshot_destroy(char *osname, uint64_t id) - -- error = dmu_objset_destroy(snapname, B_FALSE); -+ error = dsl_destroy_snapshot(snapname, B_FALSE); - if (error != 0 && error != ENOENT) -@@ -3272,3 +3323,3 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) - * If this dataset exists from a previous run, process its replay log -- * half of the time. If we don't replay it, then dmu_objset_destroy() -+ * half of the time. If we don't replay it, then dsl_destroy_head() - * (invoked from ztest_objset_destroy_cb()) should just throw it away. 
-@@ -3294,3 +3345,4 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) - */ -- VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os)); -+ VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, -+ FTAG, &os)); - -@@ -3308,4 +3360,3 @@ ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) - -- VERIFY3U(0, ==, -- dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); -+ VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); - -@@ -3399,17 +3450,17 @@ ztest_dsl_dataset_cleanup(char *osname, uint64_t id) - -- error = dmu_objset_destroy(clone2name, B_FALSE); -+ error = dsl_destroy_head(clone2name); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error); -- error = dmu_objset_destroy(snap3name, B_FALSE); -+ fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); -+ error = dsl_destroy_snapshot(snap3name, B_FALSE); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error); -- error = dmu_objset_destroy(snap2name, B_FALSE); -+ fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); -+ error = dsl_destroy_snapshot(snap2name, B_FALSE); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error); -- error = dmu_objset_destroy(clone1name, B_FALSE); -+ fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); -+ error = dsl_destroy_head(clone1name); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error); -- error = dmu_objset_destroy(snap1name, B_FALSE); -+ fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); -+ error = dsl_destroy_snapshot(snap1name, B_FALSE); - if (error && error != ENOENT) -- fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error); -+ fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); - -@@ -3428,4 +3479,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - { -- objset_t *clone; -- dsl_dataset_t *ds; -+ objset_t *os; - char *snap1name; -@@ -3459,4 +3509,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1, -- NULL, NULL, B_FALSE, B_FALSE, -1); -+ error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); - if (error && error != EEXIST) { -@@ -3469,8 +3518,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_hold(snap1name, FTAG, &clone); -- if (error) -- fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error); -- -- error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0); -- dmu_objset_rele(clone, FTAG); -+ error = dmu_objset_clone(clone1name, snap1name); - if (error) { -@@ -3483,4 +3527,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1, -- NULL, NULL, B_FALSE, B_FALSE, -1); -+ error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); - if (error && error != EEXIST) { -@@ -3493,4 +3536,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1, -- NULL, NULL, B_FALSE, B_FALSE, -1); -+ error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); - if (error && error != EEXIST) { -@@ -3503,8 +3545,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_hold(snap3name, FTAG, &clone); -- if (error) -- fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); -- -- error = dmu_objset_clone(clone2name, 
dmu_objset_ds(clone), 0); -- dmu_objset_rele(clone, FTAG); -+ error = dmu_objset_clone(clone2name, snap3name); - if (error) { -@@ -3517,5 +3554,5 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - -- error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds); -+ error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os); - if (error) -- fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error); -+ fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); - error = dsl_dataset_promote(clone2name, NULL); -@@ -3524,3 +3561,3 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) - error); -- dsl_dataset_disown(ds, FTAG); -+ dmu_objset_disown(os, FTAG); - -@@ -3539,3 +3576,3 @@ out: - #undef OD_ARRAY_SIZE --#define OD_ARRAY_SIZE 4 -+#define OD_ARRAY_SIZE 4 - -@@ -3552,3 +3589,3 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) - -- size = sizeof(ztest_od_t) * OD_ARRAY_SIZE; -+ size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; - od = umem_alloc(size, UMEM_NOFAIL); -@@ -3574,3 +3611,3 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) - #undef OD_ARRAY_SIZE --#define OD_ARRAY_SIZE 2 -+#define OD_ARRAY_SIZE 2 - -@@ -3586,3 +3623,3 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) - objset_t *os = zd->zd_os; -- size = sizeof(ztest_od_t) * OD_ARRAY_SIZE; -+ size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; - od = umem_alloc(size, UMEM_NOFAIL); -@@ -3692,2 +3729,5 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) - -+ /* This accounts for setting the checksum/compression. */ -+ dmu_tx_hold_bonus(tx, bigobj); -+ - txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); -@@ -3850,3 +3890,3 @@ compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, - #undef OD_ARRAY_SIZE --#define OD_ARRAY_SIZE 2 -+#define OD_ARRAY_SIZE 2 - -@@ -3873,3 +3913,3 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) - -- size = sizeof(ztest_od_t) * OD_ARRAY_SIZE; -+ size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; - od = umem_alloc(size, UMEM_NOFAIL); -@@ -4094,3 +4134,3 @@ ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - uint64_t offset = (1ULL << (ztest_random(20) + 43)) + -@@ -4111,3 +4151,3 @@ ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) - -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4124,3 +4164,3 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - -@@ -4128,4 +4168,5 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) - -- if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ if (ztest_object_init(zd, od, sizeof (ztest_od_t), -+ !ztest_random(2)) != 0) { -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4134,3 +4175,3 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) - if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4152,3 +4193,3 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) - umem_free(data, blocksize); -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4177,3 +4218,3 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); -@@ 
-4300,3 +4341,3 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) - out: -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4314,3 +4355,3 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); -@@ -4347,3 +4388,3 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id) - out: -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4363,3 +4404,3 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0); -@@ -4367,3 +4408,3 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) - if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4398,3 +4439,3 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) - count = -1ULL; -- VERIFY(zap_count(os, object, &count) == 0); -+ VERIFY0(zap_count(os, object, &count)); - ASSERT(count != -1ULL); -@@ -4461,3 +4502,3 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) - -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4552,3 +4593,3 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); -@@ -4556,3 +4597,3 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) - if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4599,3 +4640,3 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) - -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -4671,3 +4712,3 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) - -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -4716,2 +4757,18 @@ ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) - -+static int -+user_release_one(const char *snapname, const char *holdname) -+{ -+ nvlist_t *snaps, *holds; -+ int error; -+ -+ snaps = fnvlist_alloc(); -+ holds = fnvlist_alloc(); -+ fnvlist_add_boolean(holds, holdname); -+ fnvlist_add_nvlist(snaps, snapname, holds); -+ fnvlist_free(holds); -+ error = dsl_dataset_user_release(snaps, NULL); -+ fnvlist_free(snaps); -+ return (error); -+} -+ - /* -@@ -4730,2 +4787,3 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - char osname[MAXNAMELEN]; -+ nvlist_t *holds; - -@@ -4735,6 +4793,8 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- (void) snprintf(snapname, 100, "sh1_%llu", (u_longlong_t)id); -- (void) snprintf(fullname, 100, "%s@%s", osname, snapname); -- (void) snprintf(clonename, 100, "%s/ch1_%llu",osname,(u_longlong_t)id); -- (void) snprintf(tag, 100, "tag_%llu", (u_longlong_t)id); -+ (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", -+ (u_longlong_t)id); -+ (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); -+ (void) snprintf(clonename, sizeof (clonename), -+ "%s/ch1_%llu", osname, (u_longlong_t)id); -+ (void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id); - -@@ -4743,5 +4803,11 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - */ -- (void) 
dmu_objset_destroy(clonename, B_FALSE); -- (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE); -- (void) dmu_objset_destroy(fullname, B_FALSE); -+ error = dsl_destroy_head(clonename); -+ if (error != ENOENT) -+ ASSERT0(error); -+ error = user_release_one(fullname, tag); -+ if (error != ESRCH && error != ENOENT) -+ ASSERT0(error); -+ error = dsl_destroy_snapshot(fullname, B_FALSE); -+ if (error != ENOENT) -+ ASSERT0(error); - -@@ -4751,4 +4817,3 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - */ -- error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, -- FALSE, -1); -+ error = dmu_objset_snapshot_one(osname, snapname); - if (error) { -@@ -4761,8 +4826,3 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_hold(fullname, FTAG, &origin); -- if (error) -- fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); -- -- error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0); -- dmu_objset_rele(origin, FTAG); -+ error = dmu_objset_clone(clonename, fullname); - if (error) { -@@ -4775,5 +4835,5 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_destroy(fullname, B_TRUE); -+ error = dsl_destroy_snapshot(fullname, B_TRUE); - if (error) { -- fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", -+ fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", - fullname, error); -@@ -4781,5 +4841,5 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_destroy(clonename, B_FALSE); -+ error = dsl_destroy_head(clonename); - if (error) -- fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error); -+ fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); - -@@ -4794,4 +4854,3 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - */ -- error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE, -- FALSE, -1); -+ error = dmu_objset_snapshot_one(osname, snapname); - if (error) { -@@ -4804,4 +4863,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, -- B_TRUE, -1); -+ holds = fnvlist_alloc(); -+ fnvlist_add_string(holds, fullname, tag); -+ error = dsl_dataset_user_hold(holds, 0, NULL); -+ fnvlist_free(holds); -+ - if (error) -@@ -4809,5 +4871,5 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_destroy(fullname, B_FALSE); -+ error = dsl_destroy_snapshot(fullname, B_FALSE); - if (error != EBUSY) { -- fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d", -+ fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", - fullname, error); -@@ -4815,5 +4877,5 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dmu_objset_destroy(fullname, B_TRUE); -+ error = dsl_destroy_snapshot(fullname, B_TRUE); - if (error) { -- fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", -+ fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", - fullname, error); -@@ -4821,7 +4883,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) - -- error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE); -+ error = user_release_one(fullname, tag); - if (error) -- fatal(0, "dsl_dataset_user_release(%s)", fullname, tag); -+ fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); - -- VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT); -+ VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); - -@@ -4868,2 +4930,10 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - /* -+ * Grab the name lock as reader. 
There are some operations -+ * which don't like to have their vdevs changed while -+ * they are in progress (i.e. spa_change_guid). Those -+ * operations will have grabbed the name lock as writer. -+ */ -+ (void) rw_enter(&ztest_name_lock, RW_READER); -+ -+ /* - * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. -@@ -4896,3 +4966,10 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - -- if (vd0 != NULL && maxfaults != 1) { -+ /* -+ * If the top-level vdev needs to be resilvered -+ * then we only allow faults on the device that is -+ * resilvering. -+ */ -+ if (vd0 != NULL && maxfaults != 1 && -+ (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || -+ vd0->vdev_resilver_txg != 0)) { - /* -@@ -4927,2 +5004,3 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - spa_config_exit(spa, SCL_STATE, FTAG); -+ (void) rw_exit(&ztest_name_lock); - goto out; -@@ -4940,2 +5018,3 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - spa_config_exit(spa, SCL_STATE, FTAG); -+ (void) rw_exit(&ztest_name_lock); - -@@ -4955,3 +5034,3 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) - * grab a reference on the dataset which may cause -- * dmu_objset_destroy() to fail with EBUSY thus -+ * dsl_destroy_head() to fail with EBUSY thus - * leaving the dataset in an inconsistent state. -@@ -5049,3 +5128,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - -- od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL); -+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); - ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); -@@ -5053,3 +5132,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -5068,3 +5147,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - (void) rw_exit(&ztest_name_lock); -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -5083,3 +5162,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - (void) rw_exit(&ztest_name_lock); -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - return; -@@ -5092,4 +5171,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - uint64_t offset = i * blocksize; -- VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db, -- DMU_READ_NO_PREFETCH) == 0); -+ int error = dmu_buf_hold(os, object, offset, FTAG, &db, -+ DMU_READ_NO_PREFETCH); -+ if (error != 0) { -+ fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u", -+ os, (long long)object, (long long) offset, error); -+ } - ASSERT(db->db_offset == offset); -@@ -5109,4 +5192,4 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - */ -- VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db, -- DMU_READ_NO_PREFETCH) == 0); -+ VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, -+ DMU_READ_NO_PREFETCH)); - blk = *((dmu_buf_impl_t *)db)->db_blkptr; -@@ -5128,3 +5211,3 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) - (void) rw_exit(&ztest_name_lock); -- umem_free(od, sizeof(ztest_od_t)); -+ umem_free(od, sizeof (ztest_od_t)); - } -@@ -5308,2 +5391,3 @@ ztest_spa_import_export(char *oldname, char *newname) - spa_t *spa; -+ int error; - -@@ -5352,3 +5436,8 @@ ztest_spa_import_export(char *oldname, char *newname) - */ -- VERIFY3U(0, ==, spa_import(newname, config, NULL, 0)); -+ error = spa_import(newname, config, NULL, 0); -+ if (error != 0) { -+ dump_nvlist(config, 0); -+ fatal(B_FALSE, "couldn't import pool %s as %s: error %u", -+ oldname, newname, error); -+ } - -@@ -5408,3 +5497,3 @@ ztest_resume_thread(void 
*arg) - --#define GRACE 300 -+#define GRACE 300 - -@@ -5559,3 +5648,3 @@ ztest_dataset_open(int d) - -- VERIFY0(dmu_objset_hold(name, zd, &os)); -+ VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os)); - (void) rw_exit(&ztest_name_lock); -@@ -5600,3 +5689,3 @@ ztest_dataset_close(int d) - zil_close(zd->zd_zilog); -- dmu_objset_rele(zd->zd_os, zd); -+ dmu_objset_disown(zd->zd_os, zd); - -@@ -5646,3 +5735,3 @@ ztest_run(ztest_shared_t *zs) - kernel_init(FREAD | FWRITE); -- VERIFY(spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0); -+ VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); - spa->spa_debug = B_TRUE; -@@ -5650,5 +5739,6 @@ ztest_run(ztest_shared_t *zs) - -- VERIFY3U(0, ==, dmu_objset_hold(ztest_opts.zo_pool, FTAG, &os)); -+ VERIFY0(dmu_objset_own(ztest_opts.zo_pool, -+ DMU_OST_ANY, B_TRUE, FTAG, &os)); - zs->zs_guid = dmu_objset_fsid_guid(os); -- dmu_objset_rele(os, FTAG); -+ dmu_objset_disown(os, FTAG); - -@@ -5743,2 +5833,5 @@ ztest_run(ztest_shared_t *zs) - -+ if (ztest_opts.zo_verbose >= 3) -+ zfs_dbgmsg_print(FTAG); -+ - umem_free(tid, ztest_opts.zo_threads * sizeof (kt_did_t)); -@@ -5805,2 +5898,4 @@ ztest_freeze(void) - VERIFY3U(0, ==, ztest_dataset_open(0)); -+ spa->spa_debug = B_TRUE; -+ ztest_spa = spa; - -@@ -5940,5 +6035,5 @@ ztest_init(ztest_shared_t *zs) - } -- VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, -- NULL, NULL)); -+ VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL)); - nvlist_free(nvroot); -+ nvlist_free(props); - -@@ -5993,3 +6088,3 @@ setup_hdr(void) - PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); -- ASSERT(hdr != MAP_FAILED); -+ VERIFY3P(hdr, !=, MAP_FAILED); - -@@ -6020,3 +6115,3 @@ setup_data(void) - PROT_READ, MAP_SHARED, ztest_fd_data, 0); -- ASSERT(hdr != MAP_FAILED); -+ VERIFY3P(hdr, !=, MAP_FAILED); - -@@ -6027,3 +6122,3 @@ setup_data(void) - PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); -- ASSERT(hdr != MAP_FAILED); -+ VERIFY3P(hdr, !=, MAP_FAILED); - buf = (uint8_t *)hdr; -@@ -6155,2 +6250,4 @@ main(int argc, char **argv) - -+ dprintf_setup(&argc, argv); -+ - ztest_fd_rand = open("/dev/urandom", O_RDONLY); -@@ -6159,3 +6256,2 @@ main(int argc, char **argv) - if (!fd_data_str) { -- dprintf_setup(&argc, argv); - process_options(argc, argv); -diff --git a/cmd/zvol_id/zvol_id_main.c b/cmd/zvol_id/zvol_id_main.c -index 018bb66..d9c80b3 100644 ---- a/cmd/zvol_id/zvol_id_main.c -+++ b/cmd/zvol_id/zvol_id_main.c -@@ -35,3 +35,4 @@ - --int ioctl_get_msg(char *var, int fd) -+static int -+ioctl_get_msg(char *var, int fd) - { -@@ -49,3 +50,4 @@ int ioctl_get_msg(char *var, int fd) - --int main(int argc, char **argv) -+int -+main(int argc, char **argv) - { -diff --git a/config/Rules.am b/config/Rules.am -index e3fa5b5..4fb40c4 100644 ---- a/config/Rules.am -+++ b/config/Rules.am -@@ -3,4 +3,6 @@ DEFAULT_INCLUDES = -include ${top_builddir}/zfs_config.h - AM_LIBTOOLFLAGS = --silent --AM_CFLAGS = -Wall -Wstrict-prototypes --AM_CFLAGS += -fno-strict-aliasing ${NO_UNUSED_BUT_SET_VARIABLE} ${DEBUG_CFLAGS} -+AM_CFLAGS = ${DEBUG_CFLAGS} -Wall -Wstrict-prototypes -+AM_CFLAGS += ${NO_UNUSED_BUT_SET_VARIABLE} -+AM_CFLAGS += ${NO_AGGRESSIVE_LOOP_OPTIMIZATIONS} -+AM_CFLAGS += -fno-strict-aliasing - AM_CPPFLAGS = -D_GNU_SOURCE -D__EXTENSIONS__ -D_REENTRANT -@@ -8 +10,5 @@ AM_CPPFLAGS += -D_POSIX_PTHREAD_SEMANTICS -D_FILE_OFFSET_BITS=64 - AM_CPPFLAGS += -D_LARGEFILE64_SOURCE -DTEXT_DOMAIN=\"zfs-linux-user\" -+AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" -+AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" -+AM_CPPFLAGS 
+= -DSBINDIR=\"$(sbindir)\" -+AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" -diff --git a/config/always-no-aggressive-loop-optimizations.m4 b/config/always-no-aggressive-loop-optimizations.m4 -new file mode 100644 -index 0000000..0a5576d ---- /dev/null -+++ b/config/always-no-aggressive-loop-optimizations.m4 -@@ -0,0 +1,20 @@ -+dnl # -+dnl # Check if gcc supports -fno-aggressive-loop-optimizations -+dnl # -+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_NO_AGGRESSIVE_LOOP_OPTIMIZATIONS], [ -+ AC_MSG_CHECKING([for -fno-aggressive-loop-optimizations support]) -+ -+ saved_flags="$CFLAGS" -+ CFLAGS="$CFLAGS -fno-aggressive-loop-optimizations" -+ -+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ -+ NO_AGGRESSIVE_LOOP_OPTIMIZATIONS=-fno-aggressive-loop-optimizations -+ AC_MSG_RESULT([yes]) -+ ], [ -+ NO_AGGRESSIVE_LOOP_OPTIMIZATIONS= -+ AC_MSG_RESULT([no]) -+ ]) -+ -+ CFLAGS="$saved_flags" -+ AC_SUBST([NO_AGGRESSIVE_LOOP_OPTIMIZATIONS]) -+]) -diff --git a/config/always-no-unused-but-set-variable.m4 b/config/always-no-unused-but-set-variable.m4 -index 4a3ceb6..863c90a 100644 ---- a/config/always-no-unused-but-set-variable.m4 -+++ b/config/always-no-unused-but-set-variable.m4 -@@ -14,3 +14,3 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_NO_UNUSED_BUT_SET_VARIABLE], [ - -- AC_RUN_IFELSE([AC_LANG_PROGRAM([], [])], -+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], - [ -diff --git a/config/kernel-acl.m4 b/config/kernel-acl.m4 -new file mode 100644 -index 0000000..a03ee5b ---- /dev/null -+++ b/config/kernel-acl.m4 -@@ -0,0 +1,265 @@ -+dnl # -+dnl # Check if posix_acl_release can be used from a CDDL module, -+dnl # The is_owner_or_cap macro was replaced by -+dnl # inode_owner_or_capable -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_RELEASE], [ -+ AC_MSG_CHECKING([whether posix_acl_release() is available]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ #include -+ ],[ -+ struct posix_acl* tmp = posix_acl_alloc(1, 0); -+ posix_acl_release(tmp); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_RELEASE, 1, -+ [posix_acl_release() is available]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+ -+ AC_MSG_CHECKING([whether posix_acl_release() is GPL-only]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ #include -+ -+ MODULE_LICENSE("CDDL"); -+ ],[ -+ struct posix_acl* tmp = posix_acl_alloc(1, 0); -+ posix_acl_release(tmp); -+ ],[ -+ AC_MSG_RESULT(no) -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_RELEASE_GPL_ONLY, 1, -+ [posix_acl_release() is GPL-only]) -+ ]) -+]) -+ -+dnl # -+dnl # 3.1 API change, -+dnl # posix_acl_chmod_masq() is not exported anymore and posix_acl_chmod() -+dnl # was introduced to replace it. -+dnl # -+dnl # 3.14 API change, -+dnl # posix_acl_chmod() is changed to __posix_acl_chmod() -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_CHMOD], [ -+ AC_MSG_CHECKING([whether posix_acl_chmod exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ ],[ -+ posix_acl_chmod(NULL, 0, 0) -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_CHMOD, 1, [posix_acl_chmod() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+ -+ AC_MSG_CHECKING([whether __posix_acl_chmod exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ ],[ -+ __posix_acl_chmod(NULL, 0, 0) -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE___POSIX_ACL_CHMOD, 1, [__posix_acl_chmod() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.30 API change, -+dnl # caching of ACL into the inode was added in this version. 
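For reference on the ZFS_AC_KERNEL_POSIX_ACL_RELEASE check above: it builds the same test body twice, the second time inside a module declaring a CDDL license, and treats a failed second build as proof that the symbol is exported GPL-only. Roughly, that second probe module looks like the sketch below; the #include arguments are elided in this copy of the patch, so the header names here are guesses rather than quotes.

/*
 * Approximation of the conftest module built by the GPL-only probe.
 * Header names are assumed, not taken from the patch.
 */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/posix_acl.h>

MODULE_LICENSE("CDDL");	/* a non-GPL license leaves GPL-only exports unresolvable */

static int __init
acl_probe_init(void)
{
	struct posix_acl *tmp = posix_acl_alloc(1, 0);

	posix_acl_release(tmp);
	return (0);
}

static void __exit
acl_probe_exit(void)
{
}

module_init(acl_probe_init);
module_exit(acl_probe_exit);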
-+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_CACHING], [ -+ AC_MSG_CHECKING([whether inode has i_acl and i_default_acl]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ struct inode ino; -+ ino.i_acl = NULL; -+ ino.i_default_acl = NULL; -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_CACHING, 1, -+ [inode contains i_acl and i_default_acl]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 3.1 API change, -+dnl # posix_acl_equiv_mode now wants an umode_t* instead of a mode_t* -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T], [ -+ AC_MSG_CHECKING([whether posix_acl_equiv_mode() wants umode_t]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ ],[ -+ umode_t tmp; -+ posix_acl_equiv_mode(NULL,&tmp); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T, 1, -+ [ posix_acl_equiv_mode wants umode_t*]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.27 API change, -+dnl # Check if inode_operations contains the function permission -+dnl # and expects the nameidata structure to have been removed. -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION], [ -+ AC_MSG_CHECKING([whether iops->permission() exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ int permission_fn(struct inode *inode, int mask) { return 0; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .permission = permission_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_PERMISSION, 1, [iops->permission() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.26 API change, -+dnl # Check if inode_operations contains the function permission -+dnl # and expects the nameidata structure to be passed. -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION_WITH_NAMEIDATA], [ -+ AC_MSG_CHECKING([whether iops->permission() wants nameidata]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ int permission_fn(struct inode *inode, int mask, -+ struct nameidata *nd) { return 0; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .permission = permission_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_PERMISSION, 1, [iops->permission() exists]) -+ AC_DEFINE(HAVE_PERMISSION_WITH_NAMEIDATA, 1, -+ [iops->permission() with nameidata exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.32 API change, -+dnl # Check if inode_operations contains the function check_acl -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL], [ -+ AC_MSG_CHECKING([whether iops->check_acl() exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ int check_acl_fn(struct inode *inode, int mask) { return 0; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .check_acl = check_acl_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_CHECK_ACL, 1, [iops->check_acl() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.38 API change, -+dnl # The function check_acl gained a new parameter: flags -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL_WITH_FLAGS], [ -+ AC_MSG_CHECKING([whether iops->check_acl() wants flags]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ int check_acl_fn(struct inode *inode, int mask, -+ unsigned int flags) { return 0; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .check_acl = check_acl_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_CHECK_ACL, 1, [iops->check_acl() exists]) -+ 
AC_DEFINE(HAVE_CHECK_ACL_WITH_FLAGS, 1, -+ [iops->check_acl() wants flags]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 3.1 API change, -+dnl # Check if inode_operations contains the function get_acl -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL], [ -+ AC_MSG_CHECKING([whether iops->get_acl() exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ struct posix_acl *get_acl_fn(struct inode *inode, int type) -+ { return NULL; } -+ -+ static const struct inode_operations -+ iops __attribute__ ((unused)) = { -+ .get_acl = get_acl_fn, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_GET_ACL, 1, [iops->get_acl() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.30 API change, -+dnl # current_umask exists only since this version. -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_CURRENT_UMASK], [ -+ AC_MSG_CHECKING([whether current_umask exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ current_umask(); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_CURRENT_UMASK, 1, [current_umask() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -diff --git a/config/kernel-bdi-setup-and-register.m4 b/config/kernel-bdi-setup-and-register.m4 -index 4196091..6369409 100644 ---- a/config/kernel-bdi-setup-and-register.m4 -+++ b/config/kernel-bdi-setup-and-register.m4 -@@ -2,3 +2,3 @@ dnl # - dnl # 2.6.34 API change --dnl # The bdi_setup_and_register() helper function is avilable and -+dnl # The bdi_setup_and_register() helper function is avaliable and - dnl # exported by the kernel. This is a trivial helper function but -@@ -12,3 +12,4 @@ AC_DEFUN([ZFS_AC_KERNEL_BDI_SETUP_AND_REGISTER], - ], [ -- bdi_setup_and_register(NULL, NULL, 0); -+ int r = bdi_setup_and_register(NULL, NULL, 0); -+ r = *(&r); - ], [bdi_setup_and_register], [mm/backing-dev.c], [ -diff --git a/config/kernel-bdi.m4 b/config/kernel-bdi.m4 -index 34ffaab..00bd375 100644 ---- a/config/kernel-bdi.m4 -+++ b/config/kernel-bdi.m4 -@@ -8,5 +8,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BDI], [ - #include -+ -+ static const struct super_block -+ sb __attribute__ ((unused)) = { -+ .s_bdi = NULL, -+ }; - ],[ -- struct super_block sb __attribute__ ((unused)); -- sb.s_bdi = NULL; - ],[ -diff --git a/config/kernel-bio-bvec-iter.m4 b/config/kernel-bio-bvec-iter.m4 -new file mode 100644 -index 0000000..64c9893 ---- /dev/null -+++ b/config/kernel-bio-bvec-iter.m4 -@@ -0,0 +1,20 @@ -+dnl # -+dnl # 3.14 API change, -+dnl # Immutable biovecs. A number of fields of struct bio are moved to -+dnl # struct bvec_iter. 
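The HAVE_BIO_BVEC_ITER result from the check above feeds the accessor macros this patch adds to include/linux/blkdev_compat.h further down (BIO_BI_SECTOR() and friends). A short usage sketch, repeating those two macro definitions so it reads on its own; bio_tail_sector_sketch() is an invented helper, not something from the patch.

#include <linux/bio.h>

/* Mirrors the compat definitions added later in this patch. */
#ifdef HAVE_BIO_BVEC_ITER
#define	BIO_BI_SECTOR(bio)	(bio)->bi_iter.bi_sector
#define	BIO_BI_SIZE(bio)	(bio)->bi_iter.bi_size
#else
#define	BIO_BI_SECTOR(bio)	(bio)->bi_sector
#define	BIO_BI_SIZE(bio)	(bio)->bi_size
#endif

static inline sector_t
bio_tail_sector_sketch(struct bio *bio)
{
	/* First 512-byte sector past the I/O, on either side of the 3.14 change. */
	return (BIO_BI_SECTOR(bio) + (BIO_BI_SIZE(bio) >> 9));
}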
-+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_BIO_BVEC_ITER], [ -+ AC_MSG_CHECKING([whether bio has bi_iter]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ struct bio bio; -+ bio.bi_iter.bi_sector = 0; -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_BIO_BVEC_ITER, 1, [bio has bi_iter]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -diff --git a/config/kernel-rq-for-each_segment.m4 b/config/kernel-rq-for-each_segment.m4 -index 449168d..84ce7d1 100644 ---- a/config/kernel-rq-for-each_segment.m4 -+++ b/config/kernel-rq-for-each_segment.m4 -@@ -3,6 +3,9 @@ dnl # 2.6.x API change - dnl # -+dnl # 3.14 API change -+dnl # - AC_DEFUN([ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT], [ -- AC_MSG_CHECKING([whether rq_for_each_segment() is available]) - tmp_flags="$EXTRA_KCFLAGS" - EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}" -+ -+ AC_MSG_CHECKING([whether rq_for_each_segment() wants bio_vec *]) - ZFS_LINUX_TRY_COMPILE([ -@@ -18,2 +21,22 @@ AC_DEFUN([ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT], [ - [rq_for_each_segment() is available]) -+ AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT_BVP, 1, -+ [rq_for_each_segment() wants bio_vec *]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+ -+ AC_MSG_CHECKING([whether rq_for_each_segment() wants bio_vec]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ struct bio_vec bv; -+ struct req_iterator iter; -+ struct request *req = NULL; -+ rq_for_each_segment(bv, req, iter) { } -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT, 1, -+ [rq_for_each_segment() is available]) -+ AC_DEFINE(HAVE_RQ_FOR_EACH_SEGMENT_BV, 1, -+ [rq_for_each_segment() wants bio_vec]) - ],[ -@@ -21,2 +44,3 @@ AC_DEFUN([ZFS_AC_KERNEL_RQ_FOR_EACH_SEGMENT], [ - ]) -+ - EXTRA_KCFLAGS="$tmp_flags" -diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 -index 325c960..2ba2fcb 100644 ---- a/config/kernel-xattr-handler.m4 -+++ b/config/kernel-xattr-handler.m4 -@@ -20,6 +20,7 @@ AC_DEFUN([ZFS_AC_KERNEL_CONST_XATTR_HANDLER], - }; -- ],[ -- struct super_block sb __attribute__ ((unused)); - -- sb.s_xattr = xattr_handlers; -+ const struct super_block sb __attribute__ ((unused)) = { -+ .s_xattr = xattr_handlers, -+ }; -+ ],[ - ],[ -@@ -42,8 +43,10 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ - #include -- ],[ -- int (*get)(struct dentry *dentry, const char *name, -- void *buffer, size_t size, int handler_flags) = NULL; -- struct xattr_handler xops __attribute__ ((unused)); - -- xops.get = get; -+ int get(struct dentry *dentry, const char *name, -+ void *buffer, size_t size, int handler_flags) { return 0; } -+ static const struct xattr_handler -+ xops __attribute__ ((unused)) = { -+ .get = get, -+ }; -+ ],[ - ],[ -@@ -66,9 +69,11 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ - #include -- ],[ -- int (*set)(struct dentry *dentry, const char *name, -- const void *buffer, size_t size, int flags, -- int handler_flags) = NULL; -- struct xattr_handler xops __attribute__ ((unused)); - -- xops.set = set; -+ int set(struct dentry *dentry, const char *name, -+ const void *buffer, size_t size, int flags, -+ int handler_flags) { return 0; } -+ static const struct xattr_handler -+ xops __attribute__ ((unused)) = { -+ .set = set, -+ }; -+ ],[ - ],[ -@@ -81 +86,70 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ - ]) -+ -+dnl # -+dnl # 2.6.33 API change, -+dnl # The xattr_hander->list() callback was changed to take a dentry -+dnl # instead of an inode, and a handler_flags argument was added. 
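A pattern note on the rewritten bdi and xattr handler probes above: instead of assigning a callback through a local variable inside the test body (which trips -Wunused-but-set-variable under -Werror), they declare a real function and drop it into a static const designated initializer, so a changed callback signature fails the compile directly. Reduced to a self-contained illustration with an invented struct; the real checks use struct xattr_handler, struct super_block, and so on.

/* Illustration only: example_ops and probe_get are invented names. */
struct example_ops {
	int (*get)(const char *name, void *buf, unsigned long len);
};

static int
probe_get(const char *name, void *buf, unsigned long len)
{
	return (0);
}

/*
 * If the kernel's callback prototype changed, this initializer stops
 * compiling under -Werror, which is the only signal configure needs.
 */
static const struct example_ops probe_ops __attribute__((unused)) = {
	.get = probe_get,
};

int
main(void)
{
	return (0);
}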
-+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [ -+ AC_MSG_CHECKING([whether xattr_handler->list() wants dentry]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ -+ size_t list(struct dentry *dentry, char *list, size_t list_size, -+ const char *name, size_t name_len, int handler_flags) -+ { return 0; } -+ static const struct xattr_handler -+ xops __attribute__ ((unused)) = { -+ .list = list, -+ }; -+ ],[ -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_DENTRY_XATTR_LIST, 1, -+ [xattr_handler->list() wants dentry]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 3.7 API change, -+dnl # The posix_acl_{from,to}_xattr functions gained a new -+dnl # parameter: user_ns -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS], [ -+ AC_MSG_CHECKING([whether posix_acl_from_xattr() needs user_ns]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ #include -+ #include -+ ],[ -+ posix_acl_from_xattr(&init_user_ns, NULL, 0); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_POSIX_ACL_FROM_XATTR_USERNS, 1, -+ [posix_acl_from_xattr() needs user_ns]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -+ -+dnl # -+dnl # 2.6.39 API change, -+dnl # The is_owner_or_cap() macro was replaced by inode_owner_or_capable(), -+dnl # this is used for permission checks in the xattr call paths. -+dnl # -+AC_DEFUN([ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE], [ -+ AC_MSG_CHECKING([whether inode_owner_or_capable() exists]) -+ ZFS_LINUX_TRY_COMPILE([ -+ #include -+ ],[ -+ inode_owner_or_capable(NULL); -+ ],[ -+ AC_MSG_RESULT(yes) -+ AC_DEFINE(HAVE_INODE_OWNER_OR_CAPABLE, 1, -+ [inode_owner_or_capable() exists]) -+ ],[ -+ AC_MSG_RESULT(no) -+ ]) -+]) -diff --git a/config/kernel.m4 b/config/kernel.m4 -index 74ce22c..2557033 100644 ---- a/config/kernel.m4 -+++ b/config/kernel.m4 -@@ -19,2 +19,3 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ - ZFS_AC_KERNEL_BDEV_PHYSICAL_BLOCK_SIZE -+ ZFS_AC_KERNEL_BIO_BVEC_ITER - ZFS_AC_KERNEL_BIO_FAILFAST -@@ -47,2 +48,15 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ - ZFS_AC_KERNEL_XATTR_HANDLER_SET -+ ZFS_AC_KERNEL_XATTR_HANDLER_LIST -+ ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE -+ ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS -+ ZFS_AC_KERNEL_POSIX_ACL_RELEASE -+ ZFS_AC_KERNEL_POSIX_ACL_CHMOD -+ ZFS_AC_KERNEL_POSIX_ACL_CACHING -+ ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T -+ ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION -+ ZFS_AC_KERNEL_INODE_OPERATIONS_PERMISSION_WITH_NAMEIDATA -+ ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL -+ ZFS_AC_KERNEL_INODE_OPERATIONS_CHECK_ACL_WITH_FLAGS -+ ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL -+ ZFS_AC_KERNEL_CURRENT_UMASK - ZFS_AC_KERNEL_SHOW_OPTIONS -@@ -93,2 +107,3 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ - KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_UNUSED_BUT_SET_VARIABLE" -+ KERNELCPPFLAGS="$KERNELCPPFLAGS $NO_AGGRESSIVE_LOOP_OPTIMIZATIONS" - KERNELCPPFLAGS="$KERNELCPPFLAGS -DHAVE_SPL -D_KERNEL" -diff --git a/config/user-frame-larger-than.m4 b/config/user-frame-larger-than.m4 -index 7ad8622..e0828ec 100644 ---- a/config/user-frame-larger-than.m4 -+++ b/config/user-frame-larger-than.m4 -@@ -9,3 +9,3 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_FRAME_LARGER_THAN], [ - -- AC_RUN_IFELSE([AC_LANG_PROGRAM([], [])], -+ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], - [ -diff --git a/config/user-libblkid.m4 b/config/user-libblkid.m4 -index 276587f..2dd2623 100644 ---- a/config/user-libblkid.m4 -+++ b/config/user-libblkid.m4 -@@ -24,3 +24,10 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - LIBBLKID= -- AS_IF([test "x$with_blkid" != xno], -+ AS_IF([test "x$with_blkid" = xyes], -+ [ -+ AC_SUBST([LIBBLKID], ["-lblkid"]) -+ 
AC_DEFINE([HAVE_LIBBLKID], 1, -+ [Define if you have libblkid]) -+ ]) -+ -+ AS_IF([test "x$with_blkid" = xcheck], - [ -@@ -31,3 +38,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - ZFS_DEV=`mktemp` -- dd if=/dev/zero of=$ZFS_DEV bs=1024k count=8 \ -+ truncate -s 64M $ZFS_DEV -+ echo -en "\x0c\xb1\xba\0\0\0\0\0" | \ -+ dd of=$ZFS_DEV bs=1k count=8 \ -+ seek=128 conv=notrunc &>/dev/null \ - >/dev/null 2>/dev/null -@@ -37,5 +47,13 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - >/dev/null 2>/dev/null -+ echo -en "\x0c\xb1\xba\0\0\0\0\0" | \ -+ dd of=$ZFS_DEV bs=1k count=8 \ -+ seek=136 conv=notrunc &>/dev/null \ -+ >/dev/null 2>/dev/null -+ echo -en "\x0c\xb1\xba\0\0\0\0\0" | \ -+ dd of=$ZFS_DEV bs=1k count=8 \ -+ seek=140 conv=notrunc &>/dev/null \ -+ >/dev/null 2>/dev/null - -- saved_LDFLAGS="$LDFLAGS" -- LDFLAGS="-lblkid" -+ saved_LIBS="$LIBS" -+ LIBS="-lblkid" - -@@ -44,2 +62,3 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - #include -+ #include - #include -@@ -60,6 +79,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - -- if (strcmp(value, "zfs")) { -+ if (strcmp(value, "zfs_member")) { - free(value); - blkid_put_cache(cache); -- return 3; -+ return 0; - } -@@ -84,3 +103,3 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ - -- LDFLAGS="$saved_LDFLAGS" -+ LIBS="$saved_LIBS" - ], -diff --git a/config/user-runstatedir.m4 b/config/user-runstatedir.m4 -new file mode 100644 -index 0000000..ded1362 ---- /dev/null -+++ b/config/user-runstatedir.m4 -@@ -0,0 +1,6 @@ -+dnl For backwards compatibility; runstatedir added in autoconf 2.70. -+AC_DEFUN([ZFS_AC_CONFIG_USER_RUNSTATEDIR], [ -+ if test "x$runstatedir" = x; then -+ AC_SUBST([runstatedir], ['${localstatedir}/run']) -+ fi -+]) -diff --git a/config/user-selinux.m4 b/config/user-selinux.m4 -deleted file mode 100644 -index 84df6ce..0000000 ---- a/config/user-selinux.m4 -+++ /dev/null -@@ -1,36 +0,0 @@ --dnl # --dnl # Check to see if the selinux libraries are available. If they --dnl # are then they will be consulted during mount to determine if --dnl # selinux is enabled or disabled. 
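The reworked libblkid probe above writes ZFS member labels into a sparse test file and now expects blkid to report the TYPE tag as "zfs_member" rather than "zfs". The userland side of that check boils down to something like the sketch below; the device path is a placeholder and error handling is abbreviated.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <blkid/blkid.h>

int
main(int argc, char **argv)
{
	blkid_cache cache;
	char *value;
	const char *dev = (argc > 1) ? argv[1] : "/dev/sda1";	/* placeholder */

	if (blkid_get_cache(&cache, NULL) != 0)
		return (1);

	/* Newer blkid reports ZFS vdevs as "zfs_member", not "zfs". */
	value = blkid_get_tag_value(cache, "TYPE", dev);
	if (value != NULL && strcmp(value, "zfs_member") == 0)
		printf("%s looks like a ZFS member device\n", dev);

	free(value);
	blkid_put_cache(cache);
	return (0);
}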
--dnl # --AC_DEFUN([ZFS_AC_CONFIG_USER_LIBSELINUX], [ -- AC_ARG_WITH([selinux], -- [AS_HELP_STRING([--with-selinux], -- [support selinux @<:@default=check@:>@])], -- [], -- [with_selinux=check]) -- -- LIBSELINUX= -- AS_IF([test "x$with_selinux" != xno], [ -- AC_CHECK_HEADER([selinux/selinux.h], [ -- AC_CHECK_LIB([selinux], [is_selinux_enabled], [ -- AC_SUBST([LIBSELINUX], ["-lselinux"]) -- AC_DEFINE([HAVE_LIBSELINUX], 1, -- [Define if you have selinux]) -- ], [ -- AS_IF([test "x$with_selinux" != xcheck], -- [AC_MSG_FAILURE( -- [--with-selinux given but unavailable]) -- ]) -- ]) -- ], [ -- AS_IF([test "x$with_selinux" != xcheck], -- [AC_MSG_FAILURE( -- [--with-selinux given but unavailable]) -- ]) -- ]) -- ], [ -- AC_MSG_CHECKING([for selinux support]) -- AC_MSG_RESULT([no]) -- ]) --]) -diff --git a/config/user-systemd.m4 b/config/user-systemd.m4 -new file mode 100644 -index 0000000..5988945 ---- /dev/null -+++ b/config/user-systemd.m4 -@@ -0,0 +1,29 @@ -+AC_DEFUN([ZFS_AC_CONFIG_USER_SYSTEMD], [ -+ AC_ARG_ENABLE(systemd, -+ AC_HELP_STRING([--enable-systemd], -+ [install systemd unit/preset files [[default: yes]]]), -+ [],enable_systemd=yes) -+ -+ AC_ARG_WITH(systemdunitdir, -+ AC_HELP_STRING([--with-systemdunitdir=DIR], -+ [install systemd unit files in dir [[/usr/lib/systemd/system]]]), -+ systemdunitdir=$withval,systemdunitdir=/usr/lib/systemd/system) -+ -+ AC_ARG_WITH(systemdpresetdir, -+ AC_HELP_STRING([--with-systemdpresetdir=DIR], -+ [install systemd preset files in dir [[/usr/lib/systemd/system-preset]]]), -+ systemdpresetdir=$withval,systemdpresetdir=/usr/lib/systemd/system-preset) -+ -+ AS_IF([test "x$enable_systemd" = xyes], -+ [ -+ ZFS_INIT_SYSTEMD=systemd -+ ZFS_MODULE_LOAD=modules-load.d -+ modulesloaddir=/usr/lib/modules-load.d -+ ]) -+ -+ AC_SUBST(ZFS_INIT_SYSTEMD) -+ AC_SUBST(ZFS_MODULE_LOAD) -+ AC_SUBST(systemdunitdir) -+ AC_SUBST(systemdpresetdir) -+ AC_SUBST(modulesloaddir) -+]) -diff --git a/config/user-sysvinit.m4 b/config/user-sysvinit.m4 -new file mode 100644 -index 0000000..65dcc38 ---- /dev/null -+++ b/config/user-sysvinit.m4 -@@ -0,0 +1,11 @@ -+AC_DEFUN([ZFS_AC_CONFIG_USER_SYSVINIT], [ -+ AC_ARG_ENABLE(sysvinit, -+ AC_HELP_STRING([--enable-sysvinit], -+ [install SysV init scripts [default: yes]]), -+ [],enable_sysvinit=yes) -+ -+ AS_IF([test "x$enable_sysvinit" = xyes], -+ [ZFS_INIT_SYSV=init.d]) -+ -+ AC_SUBST(ZFS_INIT_SYSV) -+]) -diff --git a/config/user.m4 b/config/user.m4 -index 6925e56..3802437 100644 ---- a/config/user.m4 -+++ b/config/user.m4 -@@ -5,2 +5,4 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ - ZFS_AC_CONFIG_USER_UDEV -+ ZFS_AC_CONFIG_USER_SYSTEMD -+ ZFS_AC_CONFIG_USER_SYSVINIT - ZFS_AC_CONFIG_USER_DRACUT -@@ -11,4 +13,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ - ZFS_AC_CONFIG_USER_LIBBLKID -- ZFS_AC_CONFIG_USER_LIBSELINUX - ZFS_AC_CONFIG_USER_FRAME_LARGER_THAN -+ ZFS_AC_CONFIG_USER_RUNSTATEDIR -+dnl # -+dnl # Checks for library functions -+ AC_CHECK_FUNCS([mlockall]) - ]) -diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 -index 005185b..477b916 100644 ---- a/config/zfs-build.m4 -+++ b/config/zfs-build.m4 -@@ -64,2 +64,3 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ - ZFS_AC_CONFIG_ALWAYS_NO_UNUSED_BUT_SET_VARIABLE -+ ZFS_AC_CONFIG_ALWAYS_NO_AGGRESSIVE_LOOP_OPTIMIZATIONS - ]) -diff --git a/configure.ac b/configure.ac -index 58e2158..66272fd 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -66,2 +66,5 @@ AC_CONFIG_FILES([ - etc/zfs/Makefile -+ etc/systemd/Makefile -+ etc/systemd/system/Makefile -+ etc/modules-load.d/Makefile - man/Makefile -@@ -90,2 +93,3 @@ 
AC_CONFIG_FILES([ - lib/libzfs/Makefile -+ lib/libzfs_core/Makefile - lib/libshare/Makefile -@@ -105,2 +109,4 @@ AC_CONFIG_FILES([ - cmd/arcstat/Makefile -+ cmd/dbufstat/Makefile -+ cmd/zed/Makefile - module/Makefile -diff --git a/etc/Makefile.am b/etc/Makefile.am -index 65882b5..a62678b 100644 ---- a/etc/Makefile.am -+++ b/etc/Makefile.am -@@ -1 +1,2 @@ --SUBDIRS = init.d zfs -+SUBDIRS = zfs $(ZFS_INIT_SYSTEMD) $(ZFS_INIT_SYSV) $(ZFS_MODULE_LOAD) -+DIST_SUBDIRS = init.d zfs systemd modules-load.d -diff --git a/etc/init.d/zfs.fedora.in b/etc/init.d/zfs.fedora.in -index 3cece9b..1786378 100644 ---- a/etc/init.d/zfs.fedora.in -+++ b/etc/init.d/zfs.fedora.in -@@ -29,3 +29,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -diff --git a/etc/init.d/zfs.gentoo.in b/etc/init.d/zfs.gentoo.in -index 0034e02..07fce01 100644 ---- a/etc/init.d/zfs.gentoo.in -+++ b/etc/init.d/zfs.gentoo.in -@@ -7,3 +7,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -@@ -22,2 +22,3 @@ depend() - before bootmisc logger -+ use mtab - keyword -lxc -openvz -prefix -vserver -diff --git a/etc/init.d/zfs.lsb.in b/etc/init.d/zfs.lsb.in -index 0d0ffb4..e626f79 100644 ---- a/etc/init.d/zfs.lsb.in -+++ b/etc/init.d/zfs.lsb.in -@@ -31,2 +31,6 @@ ZPOOL="@sbindir@/zpool" - ZPOOL_CACHE="@sysconfdir@/zfs/zpool.cache" -+USE_DISK_BY_ID=0 -+VERBOSE_MOUNT=0 -+DO_OVERLAY_MOUNTS=0 -+MOUNT_EXTRA_OPTIONS="" - -@@ -40,3 +44,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -@@ -71,9 +75,27 @@ start() - # all filesystem based on their properties. -- if [ -f "$ZPOOL_CACHE" ] ; then -+ if [ "$USE_DISK_BY_ID" -eq 1 ]; then -+ log_begin_msg "Importing ZFS pools" -+ "$ZPOOL" import -d /dev/disk/by-id -aN 2>/dev/null -+ ret=$? -+ log_end_msg $ret -+ [ "$ret" -eq 0 ] && POOL_IMPORTED=1 -+ elif [ -f "$ZPOOL_CACHE" ] ; then - log_begin_msg "Importing ZFS pools" - "$ZPOOL" import -c "$ZPOOL_CACHE" -aN 2>/dev/null -- log_end_msg $? -+ ret=$? -+ log_end_msg $ret -+ [ "$ret" -eq 0 ] && POOL_IMPORTED=1 -+ fi -+ -+ if [ -n "$POOL_IMPORTED" ]; then -+ if [ "$VERBOSE_MOUNT" -eq 1 ]; then -+ verbose=v -+ fi -+ -+ if [ "$DO_OVERLAY_MOUNTS" -eq 1 ]; then -+ overlay=O -+ fi - - log_begin_msg "Mounting ZFS filesystems" -- "$ZFS" mount -a -+ "$ZFS" mount -a$verbose$overlay$MOUNT_EXTRA_OPTIONS - log_end_msg $? -@@ -92,2 +114,6 @@ stop() - -+ log_begin_msg "Unsharing ZFS filesystems" -+ "$ZFS" unshare -a -+ log_end_msg $? -+ - log_begin_msg "Unmounting ZFS filesystems" -@@ -96,2 +122,9 @@ stop() - -+ log_begin_msg "Exporting ZFS pools" -+ "$ZPOOL" list -H -o name | \ -+ while read pool; do -+ "$ZPOOL" export $pool -+ done -+ log_end_msg $? 
-+ - rm -f "$LOCKFILE" -diff --git a/etc/init.d/zfs.lunar.in b/etc/init.d/zfs.lunar.in -index 3cf79ce..7a51104 100644 ---- a/etc/init.d/zfs.lunar.in -+++ b/etc/init.d/zfs.lunar.in -@@ -18,3 +18,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -diff --git a/etc/init.d/zfs.redhat.in b/etc/init.d/zfs.redhat.in -index fb5187f..227787d 100644 ---- a/etc/init.d/zfs.redhat.in -+++ b/etc/init.d/zfs.redhat.in -@@ -29,3 +29,3 @@ if [ -z "$init" ]; then - # Not interactive -- grep -Eqi 'zfs=off|zfs=no' /proc/cmdline && exit 3 -+ grep -qE '(^|[^\\](\\\\)* )zfs=(off|no)( |$)' /proc/cmdline && exit 3 - fi -diff --git a/etc/modules-load.d/.gitignore b/etc/modules-load.d/.gitignore -new file mode 100644 -index 0000000..fee9217 ---- /dev/null -+++ b/etc/modules-load.d/.gitignore -@@ -0,0 +1 @@ -+*.conf -diff --git a/etc/modules-load.d/Makefile.am b/etc/modules-load.d/Makefile.am -new file mode 100644 -index 0000000..980cb85 ---- /dev/null -+++ b/etc/modules-load.d/Makefile.am -@@ -0,0 +1,13 @@ -+modulesload_DATA = \ -+ $(top_srcdir)/etc/modules-load.d/zfs.conf -+ -+EXTRA_DIST = \ -+ $(top_srcdir)/etc/modules-load.d/zfs.conf.in -+ -+$(modulesload_DATA): -+ -$(SED) \ -+ -e '' \ -+ '$@.in' >'$@' -+ -+distclean-local:: -+ -$(RM) $(modulesload_DATA) -diff --git a/etc/modules-load.d/zfs.conf.in b/etc/modules-load.d/zfs.conf.in -new file mode 100644 -index 0000000..73304bc ---- /dev/null -+++ b/etc/modules-load.d/zfs.conf.in -@@ -0,0 +1 @@ -+zfs -diff --git a/etc/systemd/Makefile.am b/etc/systemd/Makefile.am -new file mode 100644 -index 0000000..d4008c0 ---- /dev/null -+++ b/etc/systemd/Makefile.am -@@ -0,0 +1 @@ -+SUBDIRS = system -diff --git a/etc/systemd/system/.gitignore b/etc/systemd/system/.gitignore -new file mode 100644 -index 0000000..efada54 ---- /dev/null -+++ b/etc/systemd/system/.gitignore -@@ -0,0 +1,3 @@ -+*.service -+*.target -+*.preset -diff --git a/etc/systemd/system/50-zfs.preset.in b/etc/systemd/system/50-zfs.preset.in -new file mode 100644 -index 0000000..4efdd72 ---- /dev/null -+++ b/etc/systemd/system/50-zfs.preset.in -@@ -0,0 +1,2 @@ -+# ZFS is enabled by default -+enable zfs.* -diff --git a/etc/systemd/system/Makefile.am b/etc/systemd/system/Makefile.am -new file mode 100644 -index 0000000..b7a8db2 ---- /dev/null -+++ b/etc/systemd/system/Makefile.am -@@ -0,0 +1,35 @@ -+systemdpreset_DATA = \ -+ $(top_srcdir)/etc/systemd/system/50-zfs.preset -+systemdunit_DATA = \ -+ $(top_srcdir)/etc/systemd/system/zed.service \ -+ $(top_srcdir)/etc/systemd/system/zfs-import-cache.service \ -+ $(top_srcdir)/etc/systemd/system/zfs-import-scan.service \ -+ $(top_srcdir)/etc/systemd/system/zfs-mount.service \ -+ $(top_srcdir)/etc/systemd/system/zfs-share.service \ -+ $(top_srcdir)/etc/systemd/system/zfs.target -+ -+EXTRA_DIST = \ -+ $(top_srcdir)/etc/systemd/system/zed.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs-import-cache.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs-import-scan.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs-mount.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs-share.service.in \ -+ $(top_srcdir)/etc/systemd/system/zfs.target.in \ -+ $(top_srcdir)/etc/systemd/system/50-zfs.preset.in -+ -+$(systemdunit_DATA): -+ -$(SED) -e 's,@bindir\@,$(bindir),g' \ -+ -e 's,@runstatedir\@,$(runstatedir),g' \ -+ -e 's,@sbindir\@,$(sbindir),g' \ -+ -e 's,@sysconfdir\@,$(sysconfdir),g' \ -+ '$@.in' >'$@' -+ -+$(systemdpreset_DATA): -+ -$(SED) -e 
's,@bindir\@,$(bindir),g' \ -+ -e 's,@runstatedir\@,$(runstatedir),g' \ -+ -e 's,@sbindir\@,$(sbindir),g' \ -+ -e 's,@sysconfdir\@,$(sysconfdir),g' \ -+ '$@.in' >'$@' -+ -+distclean-local:: -+ -$(RM) $(systemdunit_DATA) $(systemdpreset_DATA) -diff --git a/etc/systemd/system/zed.service.in b/etc/systemd/system/zed.service.in -new file mode 100644 -index 0000000..78988ab ---- /dev/null -+++ b/etc/systemd/system/zed.service.in -@@ -0,0 +1,13 @@ -+[Unit] -+Description=ZFS Event Daemon (zed) -+Documentation=man:zed(8) -+After=zfs-import-cache.service -+After=zfs-import-scan.service -+ -+[Service] -+Type=forking -+ExecStart=@sbindir@/zed -+PIDFile=@runstatedir@/zed.pid -+User=root -+Group=root -+Restart=on-abort -diff --git a/etc/systemd/system/zfs-import-cache.service.in b/etc/systemd/system/zfs-import-cache.service.in -new file mode 100644 -index 0000000..918a258 ---- /dev/null -+++ b/etc/systemd/system/zfs-import-cache.service.in -@@ -0,0 +1,11 @@ -+[Unit] -+Description=Import ZFS pools by cache file -+DefaultDependencies=no -+Requires=systemd-udev-settle.service -+After=systemd-udev-settle.service -+ConditionPathExists=@sysconfdir@/zfs/zpool.cache -+ -+[Service] -+Type=oneshot -+RemainAfterExit=yes -+ExecStart=@sbindir@/zpool import -c @sysconfdir@/zfs/zpool.cache -aN -diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in -new file mode 100644 -index 0000000..ab1b0f6 ---- /dev/null -+++ b/etc/systemd/system/zfs-import-scan.service.in -@@ -0,0 +1,11 @@ -+[Unit] -+Description=Import ZFS pools by device scanning -+DefaultDependencies=no -+Requires=systemd-udev-settle.service -+After=systemd-udev-settle.service -+ConditionPathExists=!@sysconfdir@/zfs/zpool.cache -+ -+[Service] -+Type=oneshot -+RemainAfterExit=yes -+ExecStart=@sbindir@/zpool import -d /dev/disk/by-id -aN -diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in -new file mode 100644 -index 0000000..f1056af ---- /dev/null -+++ b/etc/systemd/system/zfs-mount.service.in -@@ -0,0 +1,15 @@ -+[Unit] -+Description=Mount ZFS filesystems -+DefaultDependencies=no -+Wants=zfs-import-cache.service -+Wants=zfs-import-scan.service -+Requires=systemd-udev-settle.service -+After=systemd-udev-settle.service -+After=zfs-import-cache.service -+After=zfs-import-scan.service -+Before=local-fs.target -+ -+[Service] -+Type=oneshot -+RemainAfterExit=yes -+ExecStart=@sbindir@/zfs mount -a -diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in -new file mode 100644 -index 0000000..a21c9c6 ---- /dev/null -+++ b/etc/systemd/system/zfs-share.service.in -@@ -0,0 +1,11 @@ -+[Unit] -+Description=ZFS file system shares -+After=nfs-server.service -+After=smb.service -+PartOf=nfs-server.service -+PartOf=smb.service -+ -+[Service] -+Type=oneshot -+RemainAfterExit=yes -+ExecStart=@sbindir@/zfs share -a -diff --git a/etc/systemd/system/zfs.target.in b/etc/systemd/system/zfs.target.in -new file mode 100644 -index 0000000..3541533 ---- /dev/null -+++ b/etc/systemd/system/zfs.target.in -@@ -0,0 +1,8 @@ -+[Unit] -+Description=ZFS startup target -+Requires=zfs-mount.service -+Requires=zfs-share.service -+Wants=zed.service -+ -+[Install] -+WantedBy=multi-user.target -diff --git a/etc/zfs/vdev_id.conf.sas_direct.example b/etc/zfs/vdev_id.conf.sas_direct.example -index a0c43a7..115ebd8 100644 ---- a/etc/zfs/vdev_id.conf.sas_direct.example -+++ b/etc/zfs/vdev_id.conf.sas_direct.example -@@ -10,13 +10,16 @@ channel 86:00.0 0 D - -+ -+# Custom 
mapping for Channel A -+ - # Linux Mapped --# Slot Slot --slot 1 7 --slot 2 10 --slot 3 3 --slot 4 6 --slot 5 2 --slot 6 8 --slot 7 1 --slot 8 4 --slot 9 9 --slot 10 5 -+# Slot Slot Channel -+slot 1 7 A -+slot 2 10 A -+slot 3 3 A -+slot 4 6 A -+ -+# Default mapping for B, C, and D -+slot 1 4 -+slot 2 2 -+slot 3 1 -+slot 4 3 -diff --git a/include/Makefile.am b/include/Makefile.am -index 64141d9..2e1c31a 100644 ---- a/include/Makefile.am -+++ b/include/Makefile.am -@@ -20,2 +20,3 @@ USER_H = \ - $(top_srcdir)/include/libzfs.h \ -+ $(top_srcdir)/include/libzfs_core.h \ - $(top_srcdir)/include/libzfs_impl.h -diff --git a/include/libzfs.h b/include/libzfs.h -index 3472b76..5bc8b03 100644 ---- a/include/libzfs.h -+++ b/include/libzfs.h -@@ -23,5 +23,6 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -59,2 +60,7 @@ extern "C" { - -+/* -+ * Default wait time for a device name to be created. -+ */ -+#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ -+ - #define DEFAULT_IMPORT_PATH_SIZE 7 -@@ -65,3 +71,4 @@ extern char *zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE]; - */ --enum { -+typedef enum zfs_error { -+ EZFS_SUCCESS = 0, /* no error -- success */ - EZFS_NOMEM = 2000, /* out of memory */ -@@ -137,3 +144,3 @@ enum { - EZFS_UNKNOWN --}; -+} zfs_error_t; - -@@ -193,2 +200,5 @@ extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); - -+extern void zfs_save_arguments(int argc, char **, char *, int); -+extern int zpool_log_history(libzfs_handle_t *, const char *); -+ - extern int libzfs_errno(libzfs_handle_t *); -@@ -228,3 +238,3 @@ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, - nvlist_t *, nvlist_t *); --extern int zpool_destroy(zpool_handle_t *); -+extern int zpool_destroy(zpool_handle_t *, const char *); - extern int zpool_add(zpool_handle_t *, nvlist_t *); -@@ -274,2 +284,4 @@ extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, - size_t proplen, zprop_source_t *); -+extern int zpool_get_prop_literal(zpool_handle_t *, zpool_prop_t, char *, -+ size_t proplen, zprop_source_t *, boolean_t literal); - extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, -@@ -302,2 +314,3 @@ typedef enum { - ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ -+ ZPOOL_STATUS_ERRATA, /* informational errata available */ - -@@ -337,4 +350,6 @@ typedef enum { - --extern zpool_status_t zpool_get_status(zpool_handle_t *, char **); --extern zpool_status_t zpool_import_status(nvlist_t *, char **); -+extern zpool_status_t zpool_get_status(zpool_handle_t *, char **, -+ zpool_errata_t *); -+extern zpool_status_t zpool_import_status(nvlist_t *, char **, -+ zpool_errata_t *); - extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); -@@ -352,4 +367,4 @@ extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); - */ --extern int zpool_export(zpool_handle_t *, boolean_t); --extern int zpool_export_force(zpool_handle_t *); -+extern int zpool_export(zpool_handle_t *, boolean_t, const char *); -+extern int zpool_export_force(zpool_handle_t *, const char *); - extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, -@@ -387,3 +402,3 @@ struct zfs_cmd; - --extern const char 
*zfs_history_event_names[LOG_END]; -+extern const char *zfs_history_event_names[]; - -@@ -395,7 +410,6 @@ extern int zpool_history_unpack(char *, uint64_t, uint64_t *, - nvlist_t ***, uint_t *); --extern void zpool_set_history_str(const char *subcommand, int argc, -- char **argv, char *history_str); --extern int zpool_stage_history(libzfs_handle_t *, const char *); --extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, int, int); -+extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, -+ int); - extern int zpool_events_clear(libzfs_handle_t *, int *); -+extern int zpool_events_seek(libzfs_handle_t *, uint64_t, int); - extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, -@@ -452,4 +466,2 @@ extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, - char *buf, size_t len); --extern int zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, -- uint64_t *usedp); - extern uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **); -@@ -473,3 +485,4 @@ typedef struct zprop_list { - --extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t); -+extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, -+ boolean_t); - extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); -@@ -566,5 +579,7 @@ extern int zfs_destroy(zfs_handle_t *, boolean_t); - extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); --extern int zfs_destroy_snaps_nvl(zfs_handle_t *, nvlist_t *, boolean_t); -+extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); - extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); - extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); -+extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, -+ nvlist_t *props); - extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); -@@ -607,4 +622,5 @@ extern int zfs_send(zfs_handle_t *, const char *, const char *, - extern int zfs_promote(zfs_handle_t *); --extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, -- boolean_t, boolean_t, int, uint64_t, uint64_t); -+extern int zfs_hold(zfs_handle_t *, const char *, const char *, -+ boolean_t, int); -+extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); - extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); -diff --git a/include/libzfs_core.h b/include/libzfs_core.h -new file mode 100644 -index 0000000..3642dc7 ---- /dev/null -+++ b/include/libzfs_core.h -@@ -0,0 +1,67 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
-+ */ -+ -+#ifndef _LIBZFS_CORE_H -+#define _LIBZFS_CORE_H -+ -+#include -+#include -+#include -+#include -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+int libzfs_core_init(void); -+void libzfs_core_fini(void); -+ -+int lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist); -+int lzc_create(const char *fsname, dmu_objset_type_t type, nvlist_t *props); -+int lzc_clone(const char *fsname, const char *origin, nvlist_t *props); -+int lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist); -+ -+int lzc_snaprange_space(const char *firstsnap, const char *lastsnap, -+ uint64_t *usedp); -+ -+int lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist); -+int lzc_release(nvlist_t *holds, nvlist_t **errlist); -+int lzc_get_holds(const char *snapname, nvlist_t **holdsp); -+ -+int lzc_send(const char *snapname, const char *fromsnap, int fd); -+int lzc_receive(const char *snapname, nvlist_t *props, const char *origin, -+ boolean_t force, int fd); -+int lzc_send_space(const char *snapname, const char *fromsnap, -+ uint64_t *result); -+ -+boolean_t lzc_exists(const char *dataset); -+ -+int lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* _LIBZFS_CORE_H */ -diff --git a/include/libzfs_impl.h b/include/libzfs_impl.h -index fabcb11..5502455 100644 ---- a/include/libzfs_impl.h -+++ b/include/libzfs_impl.h -@@ -23,7 +23,7 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ - --#ifndef _LIBFS_IMPL_H --#define _LIBFS_IMPL_H -+#ifndef _LIBZFS_IMPL_H -+#define _LIBZFS_IMPL_H - -@@ -38,2 +38,3 @@ - #include -+#include - -@@ -71,3 +72,2 @@ struct libzfs_handle { - char libzfs_desc[1024]; -- char *libzfs_log_str; - int libzfs_printerr; -@@ -195,4 +195,2 @@ int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); - --int zvol_create_link(libzfs_handle_t *, const char *); --int zvol_remove_link(libzfs_handle_t *, const char *); - boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); -@@ -221,2 +219,2 @@ extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t); - --#endif /* _LIBFS_IMPL_H */ -+#endif /* _LIBZFS_IMPL_H */ -diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h -index ec9926f..8566033 100644 ---- a/include/linux/blkdev_compat.h -+++ b/include/linux/blkdev_compat.h -@@ -29,3 +29,3 @@ - #ifndef _ZFS_BLKDEV_H --#define _ZFS_BLKDEV_H -+#define _ZFS_BLKDEV_H - -@@ -48,3 +48,3 @@ blk_fetch_request(struct request_queue *q) - -- return req; -+ return (req); - } -@@ -81,3 +81,3 @@ __blk_end_request(struct request *req, int error, unsigned int nr_bytes) - -- return 0; -+ return (0); - } -@@ -94,6 +94,6 @@ blk_end_request(struct request *req, int error, unsigned int nr_bytes) - -- return rc; -+ return (rc); - } - #else --# ifdef HAVE_BLK_END_REQUEST_GPL_ONLY -+#ifdef HAVE_BLK_END_REQUEST_GPL_ONLY - /* -@@ -103,4 +103,4 @@ blk_end_request(struct request *req, int error, unsigned int nr_bytes) - */ --# define __blk_end_request __blk_end_request_x --# define blk_end_request blk_end_request_x -+#define __blk_end_request __blk_end_request_x -+#define blk_end_request blk_end_request_x - -@@ -117,3 +117,3 @@ __blk_end_request_x(struct request *req, int error, unsigned int nr_bytes) - -- return 0; -+ return (0); - } -@@ -129,5 +129,5 @@ blk_end_request_x(struct request *req, int error, unsigned int 
nr_bytes) - -- return rc; -+ return (rc); - } --# endif /* HAVE_BLK_END_REQUEST_GPL_ONLY */ -+#endif /* HAVE_BLK_END_REQUEST_GPL_ONLY */ - #endif /* HAVE_BLK_END_REQUEST */ -@@ -143,3 +143,3 @@ blk_end_request_x(struct request *req, int error, unsigned int nr_bytes) - #if defined(HAVE_BLK_QUEUE_FLUSH) && defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY) --#define blk_queue_flush __blk_queue_flush -+#define blk_queue_flush __blk_queue_flush - static inline void -@@ -155,3 +155,3 @@ blk_rq_pos(struct request *req) - { -- return req->sector; -+ return (req->sector); - } -@@ -163,3 +163,3 @@ blk_rq_sectors(struct request *req) - { -- return req->nr_sectors; -+ return (req->nr_sectors); - } -@@ -173,3 +173,3 @@ blk_rq_sectors(struct request *req) - */ --#define blk_rq_bytes __blk_rq_bytes -+#define blk_rq_bytes __blk_rq_bytes - static inline unsigned int -@@ -177,3 +177,3 @@ __blk_rq_bytes(struct request *req) - { -- return blk_rq_sectors(req) << 9; -+ return (blk_rq_sectors(req) << 9); - } -@@ -188,3 +188,3 @@ __blk_rq_bytes(struct request *req) - #ifndef blk_fs_request --#define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) -+#define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) - #endif -@@ -199,3 +199,3 @@ __blk_rq_bytes(struct request *req) - #ifndef blk_queue_stackable --#define blk_queue_stackable(q) ((q)->request_fn == NULL) -+#define blk_queue_stackable(q) ((q)->request_fn == NULL) - #endif -@@ -207,3 +207,3 @@ __blk_rq_bytes(struct request *req) - #ifndef HAVE_BLK_QUEUE_MAX_HW_SECTORS --#define blk_queue_max_hw_sectors __blk_queue_max_hw_sectors -+#define blk_queue_max_hw_sectors __blk_queue_max_hw_sectors - static inline void -@@ -221,3 +221,3 @@ __blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) - #ifndef HAVE_BLK_QUEUE_MAX_SEGMENTS --#define blk_queue_max_segments __blk_queue_max_segments -+#define blk_queue_max_segments __blk_queue_max_segments - static inline void -@@ -237,3 +237,3 @@ __blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) - #ifndef HAVE_BLK_QUEUE_PHYSICAL_BLOCK_SIZE --#define blk_queue_physical_block_size(q, x) ((void)(0)) -+#define blk_queue_physical_block_size(q, x) ((void)(0)) - #endif -@@ -246,3 +246,3 @@ __blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) - #ifndef HAVE_BLK_QUEUE_IO_OPT --#define blk_queue_io_opt(q, x) ((void)(0)) -+#define blk_queue_io_opt(q, x) ((void)(0)) - #endif -@@ -258,3 +258,3 @@ get_disk_ro(struct gendisk *disk) - -- return policy; -+ return (policy); - } -@@ -276,6 +276,6 @@ struct req_iterator { - --# define for_each_bio(_bio) \ -+#define for_each_bio(_bio) \ - for (; _bio; _bio = _bio->bi_next) - --# define __rq_for_each_bio(_bio, rq) \ -+#define __rq_for_each_bio(_bio, rq) \ - if ((rq->bio)) \ -@@ -283,5 +283,7 @@ struct req_iterator { - --# define rq_for_each_segment(bvl, _rq, _iter) \ -+#define rq_for_each_segment(bvl, _rq, _iter) \ - __rq_for_each_bio(_iter.bio, _rq) \ - bio_for_each_segment(bvl, _iter.bio, _iter.i) -+ -+#define HAVE_RQ_FOR_EACH_SEGMENT_BVP 1 - #endif /* HAVE_RQ_FOR_EACH_SEGMENT */ -@@ -289,2 +291,34 @@ struct req_iterator { - /* -+ * 3.14 API change -+ * rq_for_each_segment changed from taking bio_vec * to taking bio_vec. -+ * We provide rq_for_each_segment4 which takes both. -+ * You should not modify the fields in @bv and @bvp. -+ * -+ * Note: the if-else is just to inject the assignment before the loop body. 
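A caller-side sketch (not part of the patch) of the rq_for_each_segment4() wrapper defined in the rest of this hunk; the helper name and the byte counting are made up. The point is the declaration pattern: the caller provides both a bio_vec value and a bio_vec pointer, and the macro keeps the two coherent on either side of the 3.14 change.

#include <linux/blkdev.h>

static unsigned int
my_request_bytes(struct request *req)	/* hypothetical helper */
{
	struct bio_vec bv;	/* always a valid value copy */
	struct bio_vec *bvp;	/* always points at a valid bio_vec */
	struct req_iterator iter;
	unsigned int bytes = 0;

	rq_for_each_segment4(bv, bvp, req, iter)
		bytes += bv.bv_len;	/* bvp->bv_len works equally well */

	return (bytes);
}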
-+ */ -+#ifdef HAVE_RQ_FOR_EACH_SEGMENT_BVP -+#define rq_for_each_segment4(bv, bvp, rq, iter) \ -+ rq_for_each_segment(bvp, rq, iter) \ -+ if ((bv = *bvp), 0) \ -+ ; \ -+ else -+#else -+#define rq_for_each_segment4(bv, bvp, rq, iter) \ -+ rq_for_each_segment(bv, rq, iter) \ -+ if ((bvp = &bv), 0) \ -+ ; \ -+ else -+#endif -+ -+#ifdef HAVE_BIO_BVEC_ITER -+#define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector -+#define BIO_BI_SIZE(bio) (bio)->bi_iter.bi_size -+#define BIO_BI_IDX(bio) (bio)->bi_iter.bi_idx -+#else -+#define BIO_BI_SECTOR(bio) (bio)->bi_sector -+#define BIO_BI_SIZE(bio) (bio)->bi_size -+#define BIO_BI_IDX(bio) (bio)->bi_idx -+#endif -+ -+/* - * Portable helper for correctly setting the FAILFAST flags. The -@@ -317,17 +351,19 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - /* BIO_RW_FAILFAST_* preferred interface from 2.6.28 - 2.6.35 */ -- *flags |= -- ((1 << BIO_RW_FAILFAST_DEV) | -- (1 << BIO_RW_FAILFAST_TRANSPORT) | -- (1 << BIO_RW_FAILFAST_DRIVER)); -+ *flags |= ( -+ (1 << BIO_RW_FAILFAST_DEV) | -+ (1 << BIO_RW_FAILFAST_TRANSPORT) | -+ (1 << BIO_RW_FAILFAST_DRIVER)); - #else --# ifdef HAVE_BIO_RW_FAILFAST -+#ifdef HAVE_BIO_RW_FAILFAST - /* BIO_RW_FAILFAST preferred interface from 2.6.12 - 2.6.27 */ - *flags |= (1 << BIO_RW_FAILFAST); --# else --# ifdef HAVE_REQ_FAILFAST_MASK -- /* REQ_FAILFAST_* preferred interface from 2.6.36 - 2.6.xx, -- * the BIO_* and REQ_* flags were unified under REQ_* flags. */ -+#else -+#ifdef HAVE_REQ_FAILFAST_MASK -+ /* -+ * REQ_FAILFAST_* preferred interface from 2.6.36 - 2.6.xx, -+ * the BIO_* and REQ_* flags were unified under REQ_* flags. -+ */ - *flags |= REQ_FAILFAST_MASK; --# endif /* HAVE_REQ_FAILFAST_MASK */ --# endif /* HAVE_BIO_RW_FAILFAST */ -+#endif /* HAVE_REQ_FAILFAST_MASK */ -+#endif /* HAVE_BIO_RW_FAILFAST */ - #endif /* HAVE_BIO_RW_FAILFAST_DTD */ -@@ -339,3 +375,3 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifndef DISK_NAME_LEN --#define DISK_NAME_LEN 32 -+#define DISK_NAME_LEN 32 - #endif /* DISK_NAME_LEN */ -@@ -348,8 +384,10 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef HAVE_2ARGS_BIO_END_IO_T --# define BIO_END_IO_PROTO(fn, x, y, z) static void fn(struct bio *x, int z) --# define BIO_END_IO_RETURN(rc) return -+#define BIO_END_IO_PROTO(fn, x, y, z) static void fn(struct bio *x, int z) -+#define BIO_END_IO_RETURN(rc) return - #else --# define BIO_END_IO_PROTO(fn, x, y, z) static int fn(struct bio *x, \ -- unsigned int y, int z) --# define BIO_END_IO_RETURN(rc) return rc -+#define BIO_END_IO_PROTO(fn, x, y, z) static int fn( \ -+ struct bio *x, \ -+ unsigned int y, \ -+ int z) -+#define BIO_END_IO_RETURN(rc) return rc - #endif /* HAVE_2ARGS_BIO_END_IO_T */ -@@ -372,11 +410,11 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #if defined(HAVE_BLKDEV_GET_BY_PATH) --# define vdev_bdev_open(path, md, hld) blkdev_get_by_path(path, \ -+#define vdev_bdev_open(path, md, hld) blkdev_get_by_path(path, \ - (md) | FMODE_EXCL, hld) --# define vdev_bdev_close(bdev, md) blkdev_put(bdev, (md) | FMODE_EXCL) -+#define vdev_bdev_close(bdev, md) blkdev_put(bdev, (md) | FMODE_EXCL) - #elif defined(HAVE_OPEN_BDEV_EXCLUSIVE) --# define vdev_bdev_open(path, md, hld) open_bdev_exclusive(path, md, hld) --# define vdev_bdev_close(bdev, md) close_bdev_exclusive(bdev, md) -+#define vdev_bdev_open(path, md, hld) open_bdev_exclusive(path, md, hld) -+#define vdev_bdev_close(bdev, md) close_bdev_exclusive(bdev, md) - #else --# define vdev_bdev_open(path, md, hld) 
open_bdev_excl(path, md, hld) --# define vdev_bdev_close(bdev, md) close_bdev_excl(bdev) -+#define vdev_bdev_open(path, md, hld) open_bdev_excl(path, md, hld) -+#define vdev_bdev_close(bdev, md) close_bdev_excl(bdev) - #endif /* HAVE_BLKDEV_GET_BY_PATH | HAVE_OPEN_BDEV_EXCLUSIVE */ -@@ -389,5 +427,5 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef HAVE_1ARG_INVALIDATE_BDEV --# define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev) -+#define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev) - #else --# define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev, 1) -+#define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev, 1) - #endif /* HAVE_1ARG_INVALIDATE_BDEV */ -@@ -400,3 +438,3 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifndef HAVE_LOOKUP_BDEV --# define lookup_bdev(path) ERR_PTR(-ENOTSUP) -+#define lookup_bdev(path) ERR_PTR(-ENOTSUP) - #endif -@@ -418,9 +456,9 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef HAVE_BDEV_PHYSICAL_BLOCK_SIZE --# define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev) -+#define vdev_bdev_block_size(bdev) bdev_physical_block_size(bdev) -+#else -+#ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE -+#define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) - #else --# ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE --# define vdev_bdev_block_size(bdev) bdev_logical_block_size(bdev) --# else --# define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) --# endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */ -+#define vdev_bdev_block_size(bdev) bdev_hardsect_size(bdev) -+#endif /* HAVE_BDEV_LOGICAL_BLOCK_SIZE */ - #endif /* HAVE_BDEV_PHYSICAL_BLOCK_SIZE */ -@@ -440,9 +478,9 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef WRITE_FLUSH_FUA --# define VDEV_WRITE_FLUSH_FUA WRITE_FLUSH_FUA --# define VDEV_REQ_FLUSH REQ_FLUSH --# define VDEV_REQ_FUA REQ_FUA -+#define VDEV_WRITE_FLUSH_FUA WRITE_FLUSH_FUA -+#define VDEV_REQ_FLUSH REQ_FLUSH -+#define VDEV_REQ_FUA REQ_FUA - #else --# define VDEV_WRITE_FLUSH_FUA WRITE_BARRIER --# define VDEV_REQ_FLUSH REQ_HARDBARRIER --# define VDEV_REQ_FUA REQ_HARDBARRIER -+#define VDEV_WRITE_FLUSH_FUA WRITE_BARRIER -+#define VDEV_REQ_FLUSH REQ_HARDBARRIER -+#define VDEV_REQ_FUA REQ_HARDBARRIER - #endif -@@ -454,3 +492,3 @@ bio_set_flags_failfast(struct block_device *bdev, int *flags) - #ifdef REQ_DISCARD --# define VDEV_REQ_DISCARD REQ_DISCARD -+#define VDEV_REQ_DISCARD REQ_DISCARD - #endif -@@ -469,3 +507,3 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) - #else --#define blk_queue_discard_granularity(x, dg) ((void)0) -+#define blk_queue_discard_granularity(x, dg) ((void)0) - #endif /* HAVE_DISCARD_GRANULARITY */ -@@ -487,3 +525,3 @@ blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) - */ --#define VDEV_HOLDER ((void *)0x2401de7) -+#define VDEV_HOLDER ((void *)0x2401de7) - -diff --git a/include/linux/dcache_compat.h b/include/linux/dcache_compat.h -index 2b9e5c1..bdaa5db 100644 ---- a/include/linux/dcache_compat.h -+++ b/include/linux/dcache_compat.h -@@ -26,3 +26,3 @@ - #ifndef _ZFS_DCACHE_H --#define _ZFS_DCACHE_H -+#define _ZFS_DCACHE_H - -@@ -30,7 +30,7 @@ - --#define dname(dentry) ((char *)((dentry)->d_name.name)) --#define dlen(dentry) ((int)((dentry)->d_name.len)) -+#define dname(dentry) ((char *)((dentry)->d_name.name)) -+#define dlen(dentry) ((int)((dentry)->d_name.len)) - - #ifndef HAVE_D_MAKE_ROOT --#define d_make_root(inode) d_alloc_root(inode) -+#define d_make_root(inode) d_alloc_root(inode) - #endif /* 
HAVE_D_MAKE_ROOT */ -@@ -76,5 +76,5 @@ d_clear_d_op(struct dentry *dentry) - dentry->d_op = NULL; -- dentry->d_flags &= -- ~(DCACHE_OP_HASH | DCACHE_OP_COMPARE | -- DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); -+ dentry->d_flags &= ~( -+ DCACHE_OP_HASH | DCACHE_OP_COMPARE | -+ DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); - #endif /* HAVE_D_SET_D_OP */ -diff --git a/include/linux/vfs_compat.h b/include/linux/vfs_compat.h -index 17fa3ff..4358cd2 100644 ---- a/include/linux/vfs_compat.h -+++ b/include/linux/vfs_compat.h -@@ -26,3 +26,3 @@ - #ifndef _ZFS_VFS_H --#define _ZFS_VFS_H -+#define _ZFS_VFS_H - -@@ -73,3 +73,6 @@ extern atomic_long_t zfs_bdi_seq; - static inline int --bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap) -+bdi_setup_and_register( -+ struct backing_dev_info *bdi, -+ char *name, -+ unsigned int cap) - { -@@ -101,3 +104,3 @@ bdi_setup_and_register(struct backing_dev_info *bdi,char *name,unsigned int cap) - #ifndef LOOKUP_RCU --#define LOOKUP_RCU 0x0 -+#define LOOKUP_RCU 0x0 - #endif /* LOOKUP_RCU */ -@@ -138,3 +141,3 @@ typedef int zpl_umode_t; - #if defined(HAVE_EVICT_INODE) && !defined(HAVE_CLEAR_INODE) --#define clear_inode(ip) end_writeback(ip) -+#define clear_inode(ip) end_writeback(ip) - #endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */ -@@ -146,9 +149,9 @@ typedef int zpl_umode_t; - #ifdef HAVE_5ARG_SGET --#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, fl, mtd) -+#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, fl, mtd) - #else --#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, mtd) -+#define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, mtd) - #endif /* HAVE_5ARG_SGET */ - --#define ZFS_IOC_GETFLAGS FS_IOC_GETFLAGS --#define ZFS_IOC_SETFLAGS FS_IOC_SETFLAGS -+#define ZFS_IOC_GETFLAGS FS_IOC_GETFLAGS -+#define ZFS_IOC_SETFLAGS FS_IOC_SETFLAGS - -@@ -156,4 +159,7 @@ typedef int zpl_umode_t; - static inline loff_t --lseek_execute(struct file *filp, struct inode *inode, -- loff_t offset, loff_t maxsize) -+lseek_execute( -+ struct file *filp, -+ struct inode *inode, -+ loff_t offset, -+ loff_t maxsize) - { -@@ -176,2 +182,151 @@ lseek_execute(struct file *filp, struct inode *inode, - -+#if defined(CONFIG_FS_POSIX_ACL) -+/* -+ * These functions safely approximates the behavior of posix_acl_release() -+ * which cannot be used because it calls the GPL-only symbol kfree_rcu(). -+ * The in-kernel version, which can access the RCU, frees the ACLs after -+ * the grace period expires. Because we're unsure how long that grace -+ * period may be this implementation conservatively delays for 60 seconds. -+ * This is several orders of magnitude larger than expected grace period. -+ * At 60 seconds the kernel will also begin issuing RCU stall warnings. 
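A caller-side sketch (not part of the patch; the function is hypothetical) of the deferred-release wrappers that follow. The usual dup/release reference handling is unchanged; only the final put differs, handing the ACL to the delayed taskq free described above instead of relying on an RCU grace period.

#include <linux/posix_acl.h>

static int
my_acl_entry_count(struct posix_acl *acl)	/* hypothetical */
{
	int count;

	posix_acl_dup(acl);		/* take a private reference */
	count = acl->a_count;
	zpl_posix_acl_release(acl);	/* deferred kfree() on the last put */

	return (count);
}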
-+ */ -+#include -+#ifndef HAVE_POSIX_ACL_CACHING -+#define ACL_NOT_CACHED ((void *)(-1)) -+#endif /* HAVE_POSIX_ACL_CACHING */ -+ -+#if defined(HAVE_POSIX_ACL_RELEASE) && !defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) -+ -+#define zpl_posix_acl_release(arg) posix_acl_release(arg) -+#define zpl_set_cached_acl(ip, ty, n) set_cached_acl(ip, ty, n) -+#define zpl_forget_cached_acl(ip, ty) forget_cached_acl(ip, ty) -+ -+#else -+ -+static inline void -+zpl_posix_acl_free(void *arg) { -+ kfree(arg); -+} -+ -+static inline void -+zpl_posix_acl_release(struct posix_acl *acl) -+{ -+ if ((acl == NULL) || (acl == ACL_NOT_CACHED)) -+ return; -+ -+ if (atomic_dec_and_test(&acl->a_refcount)) { -+ taskq_dispatch_delay(system_taskq, zpl_posix_acl_free, acl, -+ TQ_SLEEP, ddi_get_lbolt() + 60*HZ); -+ } -+} -+ -+static inline void -+zpl_set_cached_acl(struct inode *ip, int type, struct posix_acl *newer) { -+#ifdef HAVE_POSIX_ACL_CACHING -+ struct posix_acl *older = NULL; -+ -+ spin_lock(&ip->i_lock); -+ -+ if ((newer != ACL_NOT_CACHED) && (newer != NULL)) -+ posix_acl_dup(newer); -+ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ older = ip->i_acl; -+ rcu_assign_pointer(ip->i_acl, newer); -+ break; -+ case ACL_TYPE_DEFAULT: -+ older = ip->i_default_acl; -+ rcu_assign_pointer(ip->i_default_acl, newer); -+ break; -+ } -+ -+ spin_unlock(&ip->i_lock); -+ -+ zpl_posix_acl_release(older); -+#endif /* HAVE_POSIX_ACL_CACHING */ -+} -+ -+static inline void -+zpl_forget_cached_acl(struct inode *ip, int type) { -+ zpl_set_cached_acl(ip, type, (struct posix_acl *)ACL_NOT_CACHED); -+} -+#endif /* HAVE_POSIX_ACL_RELEASE */ -+ -+/* -+ * 2.6.38 API change, -+ * The is_owner_or_cap() function was renamed to inode_owner_or_capable(). -+ */ -+#ifdef HAVE_INODE_OWNER_OR_CAPABLE -+#define zpl_inode_owner_or_capable(ip) inode_owner_or_capable(ip) -+#else -+#define zpl_inode_owner_or_capable(ip) is_owner_or_cap(ip) -+#endif /* HAVE_INODE_OWNER_OR_CAPABLE */ -+ -+#ifndef HAVE___POSIX_ACL_CHMOD -+#ifdef HAVE_POSIX_ACL_CHMOD -+#define __posix_acl_chmod(acl, gfp, mode) posix_acl_chmod(acl, gfp, mode) -+#define __posix_acl_create(acl, gfp, mode) posix_acl_create(acl, gfp, mode) -+#else -+static inline int -+__posix_acl_chmod(struct posix_acl **acl, int flags, umode_t umode) { -+ struct posix_acl *oldacl = *acl; -+ mode_t mode = umode; -+ int error; -+ -+ *acl = posix_acl_clone(*acl, flags); -+ zpl_posix_acl_release(oldacl); -+ -+ if (!(*acl)) -+ return (-ENOMEM); -+ -+ error = posix_acl_chmod_masq(*acl, mode); -+ if (error) { -+ zpl_posix_acl_release(*acl); -+ *acl = NULL; -+ } -+ -+ return (error); -+} -+ -+static inline int -+__posix_acl_create(struct posix_acl **acl, int flags, umode_t *umodep) { -+ struct posix_acl *oldacl = *acl; -+ mode_t mode = *umodep; -+ int error; -+ -+ *acl = posix_acl_clone(*acl, flags); -+ zpl_posix_acl_release(oldacl); -+ -+ if (!(*acl)) -+ return (-ENOMEM); -+ -+ error = posix_acl_create_masq(*acl, &mode); -+ *umodep = mode; -+ -+ if (error < 0) { -+ zpl_posix_acl_release(*acl); -+ *acl = NULL; -+ } -+ -+ return (error); -+} -+#endif /* HAVE_POSIX_ACL_CHMOD */ -+#endif /* HAVE___POSIX_ACL_CHMOD */ -+ -+#ifndef HAVE_CURRENT_UMASK -+static inline int -+current_umask(void) -+{ -+ return (current->fs->umask); -+} -+#endif /* HAVE_CURRENT_UMASK */ -+ -+#ifdef HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T -+typedef umode_t zpl_equivmode_t; -+#else -+typedef mode_t zpl_equivmode_t; -+#endif /* HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T */ -+#endif /* CONFIG_FS_POSIX_ACL */ -+ - #endif /* _ZFS_VFS_H */ -diff --git 
a/include/linux/xattr_compat.h b/include/linux/xattr_compat.h -index 84d8fde..a7371f9 100644 ---- a/include/linux/xattr_compat.h -+++ b/include/linux/xattr_compat.h -@@ -26,3 +26,5 @@ - #ifndef _ZFS_XATTR_H --#define _ZFS_XATTR_H -+#define _ZFS_XATTR_H -+ -+#include - -@@ -47,3 +49,3 @@ typedef struct xattr_handler xattr_handler_t; - #ifdef HAVE_DENTRY_XATTR_GET --#define ZPL_XATTR_GET_WRAPPER(fn) \ -+#define ZPL_XATTR_GET_WRAPPER(fn) \ - static int \ -@@ -52,6 +54,6 @@ fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \ - { \ -- return __ ## fn(dentry->d_inode, name, buffer, size); \ -+ return (__ ## fn(dentry->d_inode, name, buffer, size)); \ - } - #else --#define ZPL_XATTR_GET_WRAPPER(fn) \ -+#define ZPL_XATTR_GET_WRAPPER(fn) \ - static int \ -@@ -59,3 +61,3 @@ fn(struct inode *ip, const char *name, void *buffer, size_t size) \ - { \ -- return __ ## fn(ip, name, buffer, size); \ -+ return (__ ## fn(ip, name, buffer, size)); \ - } -@@ -69,3 +71,3 @@ fn(struct inode *ip, const char *name, void *buffer, size_t size) \ - #ifdef HAVE_DENTRY_XATTR_SET --#define ZPL_XATTR_SET_WRAPPER(fn) \ -+#define ZPL_XATTR_SET_WRAPPER(fn) \ - static int \ -@@ -74,6 +76,6 @@ fn(struct dentry *dentry, const char *name, const void *buffer, \ - { \ -- return __ ## fn(dentry->d_inode, name, buffer, size, flags); \ -+ return (__ ## fn(dentry->d_inode, name, buffer, size, flags)); \ - } - #else --#define ZPL_XATTR_SET_WRAPPER(fn) \ -+#define ZPL_XATTR_SET_WRAPPER(fn) \ - static int \ -@@ -82,3 +84,3 @@ fn(struct inode *ip, const char *name, const void *buffer, \ - { \ -- return __ ## fn(ip, name, buffer, size, flags); \ -+ return (__ ## fn(ip, name, buffer, size, flags)); \ - } -@@ -87,6 +89,6 @@ fn(struct inode *ip, const char *name, const void *buffer, \ - #ifdef HAVE_6ARGS_SECURITY_INODE_INIT_SECURITY --#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ -+#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ - security_inode_init_security(ip, dip, qstr, nm, val, len) - #else --#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ -+#define zpl_security_inode_init_security(ip, dip, qstr, nm, val, len) \ - security_inode_init_security(ip, dip, nm, val, len) -@@ -94,2 +96,35 @@ fn(struct inode *ip, const char *name, const void *buffer, \ - -+/* -+ * Linux 3.7 API change. posix_acl_{from,to}_xattr gained the user_ns -+ * parameter. For the HAVE_POSIX_ACL_FROM_XATTR_USERNS version the -+ * userns _may_ not be correct because it's used outside the RCU. 
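A hedged usage sketch for the zpl_acl_{from,to}_xattr() wrappers defined in the remainder of this hunk (not part of the patch; the buffer names and the function itself are placeholders): decode a raw ACL xattr, re-encode it, and drop the reference without caring which posix_acl_*_xattr() signature the running kernel has.

#include <linux/err.h>
#include <linux/posix_acl_xattr.h>

static int
my_reencode_acl(const void *in, int insize, void *out, int outsize)	/* hypothetical */
{
	struct posix_acl *acl;
	int size;

	acl = zpl_acl_from_xattr(in, insize);
	if (IS_ERR(acl))
		return (PTR_ERR(acl));
	if (acl == NULL)
		return (0);	/* empty ACL, nothing to encode */

	size = zpl_acl_to_xattr(acl, out, outsize);	/* bytes written or -errno */
	zpl_posix_acl_release(acl);

	return (size);
}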
-+ */ -+#ifdef HAVE_POSIX_ACL_FROM_XATTR_USERNS -+static inline struct posix_acl * -+zpl_acl_from_xattr(const void *value, int size) -+{ -+ return (posix_acl_from_xattr(CRED()->user_ns, value, size)); -+} -+ -+static inline int -+zpl_acl_to_xattr(struct posix_acl *acl, void *value, int size) -+{ -+ return (posix_acl_to_xattr(CRED()->user_ns, acl, value, size)); -+} -+ -+#else -+ -+static inline struct posix_acl * -+zpl_acl_from_xattr(const void *value, int size) -+{ -+ return (posix_acl_from_xattr(value, size)); -+} -+ -+static inline int -+zpl_acl_to_xattr(struct posix_acl *acl, void *value, int size) -+{ -+ return (posix_acl_to_xattr(acl, value, size)); -+} -+#endif /* HAVE_POSIX_ACL_FROM_XATTR_USERNS */ -+ - #endif /* _ZFS_XATTR_H */ -diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am -index 2245ff4..9d77566 100644 ---- a/include/sys/Makefile.am -+++ b/include/sys/Makefile.am -@@ -14,2 +14,3 @@ COMMON_H = \ - $(top_srcdir)/include/sys/dmu_objset.h \ -+ $(top_srcdir)/include/sys/dmu_send.h \ - $(top_srcdir)/include/sys/dmu_traverse.h \ -@@ -21,2 +22,3 @@ COMMON_H = \ - $(top_srcdir)/include/sys/dsl_deleg.h \ -+ $(top_srcdir)/include/sys/dsl_destroy.h \ - $(top_srcdir)/include/sys/dsl_dir.h \ -@@ -26,2 +28,3 @@ COMMON_H = \ - $(top_srcdir)/include/sys/dsl_synctask.h \ -+ $(top_srcdir)/include/sys/dsl_userhold.h \ - $(top_srcdir)/include/sys/efi_partition.h \ -@@ -61,2 +64,3 @@ COMMON_H = \ - $(top_srcdir)/include/sys/zfs_debug.h \ -+ $(top_srcdir)/include/sys/zfs_delay.h \ - $(top_srcdir)/include/sys/zfs_dir.h \ -@@ -67,4 +71,4 @@ COMMON_H = \ - $(top_srcdir)/include/sys/zfs_vfsops.h \ -- $(top_srcdir)/include/sys/zfs_znode.h \ - $(top_srcdir)/include/sys/zfs_vnops.h \ -+ $(top_srcdir)/include/sys/zfs_znode.h \ - $(top_srcdir)/include/sys/zil.h \ -diff --git a/include/sys/arc.h b/include/sys/arc.h -index 6788219..005d071 100644 ---- a/include/sys/arc.h -+++ b/include/sys/arc.h -@@ -88,2 +88,3 @@ typedef enum arc_space_type { - ARC_SPACE_DATA, -+ ARC_SPACE_META, - ARC_SPACE_HDRS, -@@ -94,2 +95,32 @@ typedef enum arc_space_type { - -+typedef enum arc_state_type { -+ ARC_STATE_ANON, -+ ARC_STATE_MRU, -+ ARC_STATE_MRU_GHOST, -+ ARC_STATE_MFU, -+ ARC_STATE_MFU_GHOST, -+ ARC_STATE_L2C_ONLY, -+ ARC_STATE_NUMTYPES -+} arc_state_type_t; -+ -+typedef struct arc_buf_info { -+ arc_state_type_t abi_state_type; -+ arc_buf_contents_t abi_state_contents; -+ uint64_t abi_state_index; -+ uint32_t abi_flags; -+ uint32_t abi_datacnt; -+ uint64_t abi_size; -+ uint64_t abi_spa; -+ uint64_t abi_access; -+ uint32_t abi_mru_hits; -+ uint32_t abi_mru_ghost_hits; -+ uint32_t abi_mfu_hits; -+ uint32_t abi_mfu_ghost_hits; -+ uint32_t abi_l2arc_hits; -+ uint32_t abi_holds; -+ uint64_t abi_l2arc_dattr; -+ uint64_t abi_l2arc_asize; -+ enum zio_compress abi_l2arc_compress; -+} arc_buf_info_t; -+ - void arc_space_consume(uint64_t space, arc_space_type_t type); -@@ -102,3 +133,4 @@ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); - void arc_buf_add_ref(arc_buf_t *buf, void *tag); --int arc_buf_remove_ref(arc_buf_t *buf, void *tag); -+boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); -+void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); - int arc_buf_size(arc_buf_t *buf); -@@ -107,2 +139,3 @@ int arc_released(arc_buf_t *buf); - int arc_has_callback(arc_buf_t *buf); -+void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); - void arc_buf_freeze(arc_buf_t *buf); -@@ -115,3 +148,3 @@ int arc_referenced(arc_buf_t *buf); - int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, -- 
arc_done_func_t *done, void *private, int priority, int flags, -+ arc_done_func_t *done, void *private, zio_priority_t priority, int flags, - uint32_t *arc_flags, const zbookmark_t *zb); -@@ -119,4 +152,5 @@ zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, -- const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, -- void *private, int priority, int zio_flags, const zbookmark_t *zb); -+ const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, -+ arc_done_func_t *done, void *private, zio_priority_t priority, -+ int zio_flags, const zbookmark_t *zb); - -@@ -129,3 +163,2 @@ int arc_buf_evict(arc_buf_t *buf); - --void arc_adjust_meta(int64_t adjustment, boolean_t may_prune); - void arc_flush(spa_t *spa); -@@ -149,6 +182,5 @@ void l2arc_stop(void); - --/* Global tunings */ --extern int zfs_write_limit_shift; --extern unsigned long zfs_write_limit_max; --extern kmutex_t zfs_write_limit_lock; -+#ifndef _KERNEL -+extern boolean_t arc_watch; -+#endif - -diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h -index 394fdfb..23b919b 100644 ---- a/include/sys/dbuf.h -+++ b/include/sys/dbuf.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -113,2 +114,5 @@ typedef struct dbuf_dirty_record { - -+ /* How much space was changed to dsl_pool_dirty_space() for this? */ -+ unsigned int dr_accounted; -+ - union dirty_types { -@@ -133,2 +137,3 @@ typedef struct dbuf_dirty_record { - uint8_t dr_copies; -+ boolean_t dr_nopwrite; - } dl; -@@ -252,3 +257,3 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, - --void dbuf_prefetch(struct dnode *dn, uint64_t blkid); -+void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio); - -@@ -284,2 +289,5 @@ void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); - -+void dbuf_stats_init(dbuf_hash_table_t *hash); -+void dbuf_stats_destroy(void); -+ - #define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) -@@ -309,7 +317,4 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); - --#define DBUF_IS_METADATA(_db) \ -- (dbuf_is_metadata(_db)) -- - #define DBUF_GET_BUFC_TYPE(_db) \ -- (DBUF_IS_METADATA(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) -+ (dbuf_is_metadata(_db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) - -@@ -317,3 +322,3 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); - ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ -- (DBUF_IS_METADATA(_db) && \ -+ (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -@@ -322,3 +327,3 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ -- (DBUF_IS_METADATA(_db) && \ -+ (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) -diff --git a/include/sys/ddt.h b/include/sys/ddt.h -index 6943259..3befcb8 100644 ---- a/include/sys/ddt.h -+++ b/include/sys/ddt.h -@@ -65,12 +65,11 @@ typedef struct ddt_key { - zio_cksum_t ddk_cksum; /* 256-bit block checksum */ -- uint64_t ddk_prop; /* LSIZE, PSIZE, compression */ -+ /* -+ * Encoded with logical & physical size, and compression, as follows: -+ * +-------+-------+-------+-------+-------+-------+-------+-------+ -+ * | 0 | 0 | 0 | comp | PSIZE | LSIZE | -+ * +-------+-------+-------+-------+-------+-------+-------+-------+ -+ */ -+ uint64_t ddk_prop; - } ddt_key_t; - --/* -- * ddk_prop layout: -- * -- * +-------+-------+-------+-------+-------+-------+-------+-------+ -- * | 0 | 0 | 0 | comp | PSIZE | LSIZE | -- * +-------+-------+-------+-------+-------+-------+-------+-------+ -- */ - #define DDK_GET_LSIZE(ddk) \ -@@ -219,2 +218,4 @@ extern void ddt_enter(ddt_t *ddt); - extern void ddt_exit(ddt_t *ddt); -+extern void ddt_init(void); -+extern void ddt_fini(void); - extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); -diff --git a/include/sys/dmu.h b/include/sys/dmu.h -index adaab4c..1314c1e 100644 ---- a/include/sys/dmu.h -+++ b/include/sys/dmu.h -@@ -22,3 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -@@ -44,2 +45,3 @@ - #include -+#include - #include -@@ -215,11 +217,7 @@ typedef enum dmu_object_type { - --typedef enum dmu_objset_type { -- DMU_OST_NONE, -- DMU_OST_META, -- DMU_OST_ZFS, -- DMU_OST_ZVOL, -- DMU_OST_OTHER, /* For testing only! */ -- DMU_OST_ANY, /* Be careful! 
*/ -- DMU_OST_NUMTYPES --} dmu_objset_type_t; -+typedef enum txg_how { -+ TXG_WAIT = 1, -+ TXG_NOWAIT, -+ TXG_WAITED, -+} txg_how_t; - -@@ -263,13 +261,10 @@ int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); - --int dmu_objset_evict_dbufs(objset_t *os); -+void dmu_objset_evict_dbufs(objset_t *os); - int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); --int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, -- uint64_t flags); --int dmu_objset_destroy(const char *name, boolean_t defer); --int dmu_snapshots_destroy_nvl(struct nvlist *snaps, boolean_t defer, char *); --int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, -- struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); --int dmu_objset_rename(const char *name, const char *newname, -- boolean_t recursive); -+int dmu_objset_clone(const char *name, const char *origin); -+int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, -+ struct nvlist *errlist); -+int dmu_objset_snapshot_one(const char *fsname, const char *snapname); -+int dmu_objset_snapshot_tmp(const char *, const char *, int); - int dmu_objset_find(char *name, int func(const char *, void *), void *arg, -@@ -277,2 +272,4 @@ int dmu_objset_find(char *name, int func(const char *, void *), void *arg, - void dmu_objset_byteswap(void *buf, size_t size); -+int dsl_dataset_rename_snapshot(const char *fsname, -+ const char *oldsnapname, const char *newsnapname, boolean_t recursive); - -@@ -413,2 +410,4 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, - * buffer as well. You must release what you hold with dmu_buf_rele(). -+ * -+ * Returns ENOENT, EIO, or 0. - */ -@@ -502,2 +501,7 @@ void *dmu_buf_get_user(dmu_buf_t *db); - /* -+ * Returns the blkptr associated with this dbuf, or NULL if not set. -+ */ -+struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); -+ -+/* - * Indicate that you are going to modify the buffer's data (db_data). -@@ -546,3 +550,3 @@ void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); - void dmu_tx_abort(dmu_tx_t *tx); --int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -+int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); - void dmu_tx_wait(dmu_tx_t *tx); -@@ -578,3 +582,3 @@ int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size); --int dmu_free_object(objset_t *os, uint64_t object); -+int dmu_free_long_object(objset_t *os, uint64_t object); - -@@ -667,4 +671,11 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; - int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); -+void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); -+/* Like dmu_object_info, but faster if you have a held dnode in hand. */ - void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); -+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */ - void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); -+/* -+ * Like dmu_object_info_from_db, but faster still when you only care about -+ * the size. This is specifically optimized for zfs_getattr(). 
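Since this hunk replaces the numeric txg_how argument of dmu_tx_assign() with the new txg_how_t, a sketch of the assign/retry convention the enum implies may help (not part of the patch; the object hold and the function name are placeholders): TXG_NOWAIT on the first attempt, then dmu_tx_wait() plus TXG_WAITED on the retry so the throttle is only charged once.

static int
my_modify_object(objset_t *os, uint64_t object)	/* hypothetical */
{
	dmu_tx_t *tx;
	boolean_t waited = B_FALSE;
	int error;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, object);
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error != 0) {
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (error);
	}

	/* ...dirty the held buffers under 'tx'... */

	dmu_tx_commit(tx);
	return (0);
}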
-+ */ - void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, -@@ -794,33 +805,4 @@ void dmu_traverse_objset(objset_t *os, uint64_t txg_start, - --int dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, -- int outfd, struct vnode *vp, offset_t *off); --int dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorign, -- uint64_t *sizep); -- --typedef struct dmu_recv_cookie { -- /* -- * This structure is opaque! -- * -- * If logical and real are different, we are recving the stream -- * into the "real" temporary clone, and then switching it with -- * the "logical" target. -- */ -- struct dsl_dataset *drc_logical_ds; -- struct dsl_dataset *drc_real_ds; -- struct drr_begin *drc_drrb; -- char *drc_tosnap; -- char *drc_top_ds; -- boolean_t drc_newfs; -- boolean_t drc_force; -- struct avl_tree *drc_guid_to_ds_map; --} dmu_recv_cookie_t; -- --int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, -- boolean_t force, objset_t *origin, dmu_recv_cookie_t *); --int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, -- int cleanup_fd, uint64_t *action_handlep); --int dmu_recv_end(dmu_recv_cookie_t *drc); -- --int dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, -- offset_t *off); -+int dmu_diff(const char *tosnap_name, const char *fromsnap_name, -+ struct vnode *vp, offset_t *offp); - -diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h -index f13a2a3..bbff15d 100644 ---- a/include/sys/dmu_impl.h -+++ b/include/sys/dmu_impl.h -@@ -23,3 +23,6 @@ - * Use is subject to license terms. -+ */ -+/* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -267,2 +270,5 @@ typedef struct dmu_sendarg { - dmu_pendop_t dsa_pending_op; -+ boolean_t dsa_incremental; -+ uint64_t dsa_last_data_object; -+ uint64_t dsa_last_data_offset; - } dmu_sendarg_t; -diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h -index 507f732..edf362f 100644 ---- a/include/sys/dmu_objset.h -+++ b/include/sys/dmu_objset.h -@@ -23,2 +23,3 @@ - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. 
- */ -@@ -45,2 +46,3 @@ extern krwlock_t os_lock; - -+struct dsl_pool; - struct dsl_dataset; -@@ -116,4 +118,2 @@ struct objset { - void *os_user_ptr; -- -- /* SA layout/attribute registration */ - sa_os_t *os_sa; -@@ -138,2 +138,3 @@ int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp); -+void dmu_objset_refresh_ownership(objset_t *os, void *tag); - void dmu_objset_rele(objset_t *os, void *tag); -@@ -142,9 +143,2 @@ int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); - --int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, -- void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); --int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, -- uint64_t flags); --int dmu_objset_destroy(const char *name, boolean_t defer); --int dmu_objset_snapshot(char *fsname, char *snapname, char *tag, -- struct nvlist *props, boolean_t recursive, boolean_t temporary, int fd); - void dmu_objset_stats(objset_t *os, nvlist_t *nv); -@@ -154,9 +148,6 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, - uint64_t dmu_objset_fsid_guid(objset_t *os); --int dmu_objset_find(char *name, int func(const char *, void *), void *arg, -- int flags); --int dmu_objset_find_spa(spa_t *spa, const char *name, -- int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); --int dmu_objset_prefetch(const char *name, void *arg); --void dmu_objset_byteswap(void *buf, size_t size); --int dmu_objset_evict_dbufs(objset_t *os); -+int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, -+ int func(struct dsl_pool *, struct dsl_dataset *, void *), -+ void *arg, int flags); -+void dmu_objset_evict_dbufs(objset_t *os); - timestruc_t dmu_objset_snap_cmtime(objset_t *os); -@@ -176,2 +167,3 @@ int dmu_objset_userspace_upgrade(objset_t *os); - boolean_t dmu_objset_userspace_present(objset_t *os); -+int dmu_fsname(const char *snapname, char *buf); - -diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h -new file mode 100644 -index 0000000..65514b7 ---- /dev/null -+++ b/include/sys/dmu_send.h -@@ -0,0 +1,68 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -+ * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
-+ */ -+ -+#ifndef _DMU_SEND_H -+#define _DMU_SEND_H -+ -+#include -+#include -+ -+struct vnode; -+struct dsl_dataset; -+struct drr_begin; -+struct avl_tree; -+ -+int dmu_send(const char *tosnap, const char *fromsnap, int outfd, -+ struct vnode *vp, offset_t *off); -+int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, -+ uint64_t *sizep); -+int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, -+ int outfd, struct vnode *vp, offset_t *off); -+ -+typedef struct dmu_recv_cookie { -+ struct dsl_dataset *drc_ds; -+ struct drr_begin *drc_drrb; -+ const char *drc_tofs; -+ const char *drc_tosnap; -+ boolean_t drc_newfs; -+ boolean_t drc_byteswap; -+ boolean_t drc_force; -+ struct avl_tree *drc_guid_to_ds_map; -+ zio_cksum_t drc_cksum; -+ uint64_t drc_newsnapobj; -+ void *drc_owner; -+} dmu_recv_cookie_t; -+ -+int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, -+ boolean_t force, char *origin, dmu_recv_cookie_t *drc); -+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, -+ int cleanup_fd, uint64_t *action_handlep); -+int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); -+boolean_t dmu_objset_is_receiving(objset_t *os); -+ -+#endif /* _DMU_SEND_H */ -diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h -index 40c1ded..c70c97d 100644 ---- a/include/sys/dmu_tx.h -+++ b/include/sys/dmu_tx.h -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ */ - -@@ -59,4 +62,18 @@ struct dmu_tx { - struct dmu_tx_hold *tx_needassign_txh; -- list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */ -- uint8_t tx_anyobj; -+ -+ /* list of dmu_tx_callback_t on this dmu_tx */ -+ list_t tx_callbacks; -+ -+ /* placeholder for syncing context, doesn't need specific holds */ -+ boolean_t tx_anyobj; -+ -+ /* has this transaction already been delayed? 
*/ -+ boolean_t tx_waited; -+ -+ /* time this transaction was created */ -+ hrtime_t tx_start; -+ -+ /* need to wait for sufficient dirty space */ -+ boolean_t tx_wait_dirty; -+ - int tx_err; -@@ -115,8 +132,7 @@ typedef struct dmu_tx_stats { - kstat_named_t dmu_tx_group; -- kstat_named_t dmu_tx_how; - kstat_named_t dmu_tx_memory_reserve; - kstat_named_t dmu_tx_memory_reclaim; -- kstat_named_t dmu_tx_memory_inflight; - kstat_named_t dmu_tx_dirty_throttle; -- kstat_named_t dmu_tx_write_limit; -+ kstat_named_t dmu_tx_dirty_delay; -+ kstat_named_t dmu_tx_dirty_over_max; - kstat_named_t dmu_tx_quota; -@@ -126,5 +142,5 @@ extern dmu_tx_stats_t dmu_tx_stats; - --#define DMU_TX_STAT_INCR(stat, val) \ -+#define DMU_TX_STAT_INCR(stat, val) \ - atomic_add_64(&dmu_tx_stats.stat.value.ui64, (val)); --#define DMU_TX_STAT_BUMP(stat) \ -+#define DMU_TX_STAT_BUMP(stat) \ - DMU_TX_STAT_INCR(stat, 1); -@@ -135,3 +151,3 @@ extern dmu_tx_stats_t dmu_tx_stats; - dmu_tx_t *dmu_tx_create(objset_t *dd); --int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); -+int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how); - void dmu_tx_commit(dmu_tx_t *tx); -@@ -139,2 +155,3 @@ void dmu_tx_abort(dmu_tx_t *tx); - uint64_t dmu_tx_get_txg(dmu_tx_t *tx); -+struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx); - void dmu_tx_wait(dmu_tx_t *tx); -diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h -index 442ab15..38ed1d8 100644 ---- a/include/sys/dmu_zfetch.h -+++ b/include/sys/dmu_zfetch.h -@@ -52,3 +52,3 @@ typedef struct zstream { - clock_t zst_last; /* lbolt of last prefetch */ -- avl_node_t zst_node; /* embed avl node here */ -+ list_node_t zst_node; /* next zstream here */ - } zstream_t; -diff --git a/include/sys/dnode.h b/include/sys/dnode.h -index 9f9134d..55b87bc 100644 ---- a/include/sys/dnode.h -+++ b/include/sys/dnode.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -147,5 +147,4 @@ typedef struct dnode { - /* -- * dn_struct_rwlock protects the structure of the dnode, -- * including the number of levels of indirection (dn_nlevels), -- * dn_maxblkid, and dn_next_* -+ * Protects the structure of the dnode, including the number of levels -+ * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* - */ -@@ -191,2 +190,4 @@ typedef struct dnode { - uint32_t dn_dbufs_count; /* count of dn_dbufs */ -+ /* There are no level-0 blocks of this blkid or higher in dn_dbufs */ -+ uint64_t dn_unlisted_l0_blkid; - -diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h -index afcf2b7..f6449c6 100644 ---- a/include/sys/dsl_dataset.h -+++ b/include/sys/dsl_dataset.h -@@ -22,4 +22,5 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ -@@ -37,2 +38,3 @@ - #include -+#include - -@@ -50,6 +52,4 @@ struct dsl_pool; - /* -- * NB: nopromote can not yet be set, but we want support for it in this -- * on-disk version, so that we don't need to upgrade for it later. It -- * will be needed when we implement 'zfs split' (where the split off -- * clone should not be promoted). 
-+ * Note: nopromote can not yet be set, but we want support for it in this -+ * on-disk version, so that we don't need to upgrade for it later. - */ -@@ -78,2 +78,4 @@ struct dsl_pool; - -+#define DS_CREATE_FLAG_NODIRTY (1ULL<<24) -+ - typedef struct dsl_dataset_phys { -@@ -127,5 +129,2 @@ typedef struct dsl_dataset { - -- /* to protect against multiple concurrent incremental recv */ -- kmutex_t ds_recvlock; -- - /* protected by lock on pool's dp_dirty_datasets list */ -@@ -141,9 +140,11 @@ typedef struct dsl_dataset { - uint64_t ds_userrefs; -+ void *ds_owner; - - /* -- * ds_owner is protected by the ds_rwlock and the ds_lock -+ * Long holds prevent the ds from being destroyed; they allow the -+ * ds to remain held even after dropping the dp_config_rwlock. -+ * Owning counts as a long hold. See the comments above -+ * dsl_pool_hold() for details. - */ -- krwlock_t ds_rwlock; -- kcondvar_t ds_exclusive_cv; -- void *ds_owner; -+ refcount_t ds_longholds; - -@@ -165,11 +166,2 @@ typedef struct dsl_dataset { - --struct dsl_ds_destroyarg { -- dsl_dataset_t *ds; /* ds to destroy */ -- dsl_dataset_t *rm_origin; /* also remove our origin? */ -- boolean_t is_origin_rm; /* set if removing origin snap */ -- boolean_t defer; /* destroy -d requested? */ -- boolean_t releasing; /* destroying due to release? */ -- boolean_t need_prep; /* do we need to retry due to EBUSY? */ --}; -- - /* -@@ -180,12 +172,2 @@ struct dsl_ds_destroyarg { - --struct dsl_ds_holdarg { -- dsl_sync_task_group_t *dstg; -- char *htag; -- char *snapname; -- boolean_t recursive; -- boolean_t gotone; -- boolean_t temphold; -- char failed[MAXPATHLEN]; --}; -- - #define dsl_dataset_is_snapshot(ds) \ -@@ -196,18 +178,14 @@ struct dsl_ds_holdarg { - --int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); --int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, -- void *tag, dsl_dataset_t **); --int dsl_dataset_own(const char *name, boolean_t inconsistentok, -+int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, -+ dsl_dataset_t **dsp); -+int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, -+ dsl_dataset_t **); -+void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -+int dsl_dataset_own(struct dsl_pool *dp, const char *name, - void *tag, dsl_dataset_t **dsp); - int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, -- boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp); --void dsl_dataset_name(dsl_dataset_t *ds, char *name); --void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -+ void *tag, dsl_dataset_t **dsp); - void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); --void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); --boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, -- void *tag); --void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag); --void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, -- minor_t minor); -+void dsl_dataset_name(dsl_dataset_t *ds, char *name); -+boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); - uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, -@@ -216,22 +194,8 @@ uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - uint64_t flags, dmu_tx_t *tx); --int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer); --int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); --dsl_checkfunc_t dsl_dataset_destroy_check; --dsl_syncfunc_t dsl_dataset_destroy_sync; --dsl_checkfunc_t 
dsl_dataset_snapshot_check; --dsl_syncfunc_t dsl_dataset_snapshot_sync; --dsl_syncfunc_t dsl_dataset_user_hold_sync; --int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); -+int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); - int dsl_dataset_promote(const char *name, char *conflsnap); --int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, -- boolean_t force); --int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, -- boolean_t recursive, boolean_t temphold, int cleanup_fd); --int dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, -- boolean_t temphold); --int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, -- boolean_t recursive); --int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, -- char *htag, boolean_t retry); --int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); -+int dsl_dataset_rename_snapshot(const char *fsname, -+ const char *oldsnapname, const char *newsnapname, boolean_t recursive); -+int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, -+ minor_t cleanup_minor, const char *htag); - -@@ -242,3 +206,4 @@ spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); - --boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); -+boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, -+ dsl_dataset_t *snap); - -@@ -274,9 +239,31 @@ int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - uint64_t *ref_rsrv); --int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, -+int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, - uint64_t quota); --dsl_syncfunc_t dsl_dataset_set_quota_sync; --int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, -+int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, - uint64_t reservation); - --int dsl_destroy_inconsistent(const char *dsname, void *arg); -+boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier); -+void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag); -+void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); -+boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); -+ -+int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, -+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); -+void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, -+ dsl_dataset_t *origin_head, dmu_tx_t *tx); -+int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, -+ dmu_tx_t *tx, boolean_t recv); -+void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, -+ dmu_tx_t *tx); -+ -+void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, -+ dmu_tx_t *tx); -+void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); -+int dsl_dataset_get_snapname(dsl_dataset_t *ds); -+int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, -+ uint64_t *value); -+int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx); -+void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, -+ zprop_source_t source, uint64_t value, dmu_tx_t *tx); -+int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result); - -diff --git a/include/sys/dsl_deleg.h b/include/sys/dsl_deleg.h -index 9db6d07..5842639 100644 ---- a/include/sys/dsl_deleg.h -+++ b/include/sys/dsl_deleg.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 
-- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -67,4 +67,3 @@ int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); - int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); --int dsl_deleg_access_impl(struct dsl_dataset *ds, boolean_t descendent, -- const char *perm, cred_t *cr); -+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr); - void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); -diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h -new file mode 100644 -index 0000000..3f63864 ---- /dev/null -+++ b/include/sys/dsl_destroy.h -@@ -0,0 +1,53 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ */ -+ -+#ifndef _SYS_DSL_DESTROY_H -+#define _SYS_DSL_DESTROY_H -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+struct nvlist; -+struct dsl_dataset; -+struct dmu_tx; -+ -+int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t, -+ struct nvlist *); -+int dsl_destroy_snapshot(const char *, boolean_t); -+int dsl_destroy_head(const char *); -+int dsl_destroy_head_check_impl(struct dsl_dataset *, int); -+void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *); -+int dsl_destroy_inconsistent(const char *, void *); -+int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t); -+void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *, -+ boolean_t, struct dmu_tx *); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* _SYS_DSL_DESTROY_H */ -diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h -index 65ad202..d69d476 100644 ---- a/include/sys/dsl_dir.h -+++ b/include/sys/dsl_dir.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -103,7 +104,6 @@ struct dsl_dir { - --void dsl_dir_close(dsl_dir_t *dd, void *tag); --int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); --int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, -- const char **tailp); --int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, -+void dsl_dir_rele(dsl_dir_t *dd, void *tag); -+int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, -+ dsl_dir_t **, const char **tail); -+int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **); -@@ -113,4 +113,2 @@ uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, - const char *name, dmu_tx_t *tx); --dsl_checkfunc_t dsl_dir_destroy_check; --dsl_syncfunc_t dsl_dir_destroy_sync; - void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); -@@ -133,5 +131,4 @@ int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, - uint64_t reservation); --int dsl_dir_rename(dsl_dir_t *dd, const char *newname); -+int dsl_dir_rename(const char *oldname, const char *newname); - int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); --int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); - boolean_t dsl_dir_is_clone(dsl_dir_t *dd); -@@ -141,2 +138,4 @@ void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); - timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); -+void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, -+ dmu_tx_t *tx); - -diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h -index 4a4bf76..d5bad8d 100644 ---- a/include/sys/dsl_pool.h -+++ b/include/sys/dsl_pool.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -38,2 +38,3 @@ - #include -+#include - -@@ -52,2 +53,10 @@ struct dsl_scan; - -+extern unsigned long zfs_dirty_data_max; -+extern unsigned long zfs_dirty_data_max_max; -+extern unsigned long zfs_dirty_data_sync; -+extern int zfs_dirty_data_max_percent; -+extern int zfs_dirty_data_max_max_percent; -+extern int zfs_delay_min_dirty_percent; -+extern unsigned long zfs_delay_scale; -+ - /* These macros are for indexing into the zfs_all_blkstats_t. */ -@@ -72,9 +81,2 @@ typedef struct zfs_all_blkstats { - --typedef struct txg_history { -- kstat_txg_t th_kstat; -- vdev_stat_t th_vs1; -- vdev_stat_t th_vs2; -- kmutex_t th_lock; -- list_node_t th_link; --} txg_history_t; - -@@ -90,4 +92,2 @@ typedef struct dsl_pool { - struct taskq *dp_iput_taskq; -- kstat_t *dp_txg_kstat; -- kstat_t *dp_tx_assign_kstat; - -@@ -95,5 +95,2 @@ typedef struct dsl_pool { - blkptr_t dp_meta_rootbp; -- hrtime_t dp_read_overhead; -- uint64_t dp_throughput; /* bytes per millisec */ -- uint64_t dp_write_limit; - uint64_t dp_tmp_userrefs_obj; -@@ -107,4 +104,5 @@ typedef struct dsl_pool { - kmutex_t dp_lock; -- uint64_t dp_space_towrite[TXG_SIZE]; -- uint64_t dp_tempreserved[TXG_SIZE]; -+ kcondvar_t dp_spaceavail_cv; -+ uint64_t dp_dirty_pertxg[TXG_SIZE]; -+ uint64_t dp_dirty_total; - uint64_t dp_mos_used_delta; -@@ -112,7 +110,8 @@ typedef struct dsl_pool { - uint64_t dp_mos_uncompressed_delta; -- uint64_t dp_txg_history_size; -- list_t dp_txg_history; -- uint64_t dp_tx_assign_size; -- kstat_named_t *dp_tx_assign_buckets; - -+ /* -+ * Time of most recently scheduled (furthest in the future) -+ * wakeup for delayed transactions. 
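The dirty-data tunables and per-txg accounting added in this dsl_pool.h hunk are what the tx_waited/tx_wait_dirty fields added to dmu_tx (earlier hunk) key off, via dsl_pool_need_dirty_delay() declared later in this hunk. A rough sketch of the intended interaction, as an assumption-laden illustration rather than a copy of the implementation:

static int
my_maybe_delay(dmu_tx_t *tx)	/* hypothetical helper */
{
	/* Only the first assignment attempt of a given tx is throttled. */
	if (!tx->tx_waited && dsl_pool_need_dirty_delay(dmu_tx_pool(tx))) {
		tx->tx_wait_dirty = B_TRUE;	/* dmu_tx_wait() then sleeps on dp_spaceavail_cv */
		return (ERESTART);		/* caller retries with TXG_WAITED */
	}
	return (0);
}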
-+ */ -+ hrtime_t dp_last_wakeup; - -@@ -127,2 +126,3 @@ typedef struct dsl_pool { - * Protects administrative changes (properties, namespace) -+ * - * It is only held for write in syncing context. Therefore -@@ -131,3 +131,3 @@ typedef struct dsl_pool { - */ -- krwlock_t dp_config_rwlock; -+ rrwlock_t dp_config_rwlock; - -@@ -145,6 +145,4 @@ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); - uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree); --int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); --void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); --void dsl_pool_memory_pressure(dsl_pool_t *dp); --void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -+void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -+void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); - void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); -@@ -157,2 +155,6 @@ void dsl_pool_mos_diduse_space(dsl_pool_t *dp, - int64_t used, int64_t comp, int64_t uncomp); -+boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp); -+void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); -+void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); -+boolean_t dsl_pool_config_held(dsl_pool_t *dp); - -@@ -160,14 +162,10 @@ taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp); - --extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, -- const char *tag, uint64_t *now, dmu_tx_t *tx); --extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, -+int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, -+ const char *tag, uint64_t now, dmu_tx_t *tx); -+int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, dmu_tx_t *tx); --extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); -+void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); - int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); -- --void dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs); -- --txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg); --txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg); --void dsl_pool_txg_history_put(txg_history_t *th); -+int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); -+void dsl_pool_rele(dsl_pool_t *dp, void *tag); - -diff --git a/include/sys/dsl_prop.h b/include/sys/dsl_prop.h -index a636ad3..5fe18d6 100644 ---- a/include/sys/dsl_prop.h -+++ b/include/sys/dsl_prop.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -55,16 +56,2 @@ typedef struct dsl_props_arg { - --typedef struct dsl_prop_set_arg { -- const char *psa_name; -- zprop_source_t psa_source; -- int psa_intsz; -- int psa_numints; -- const void *psa_value; -- -- /* -- * Used to handle the special requirements of the quota and reservation -- * properties. 
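With dsl_dataset_hold()/dsl_dataset_own() now taking a dsl_pool_t (dsl_dataset.h hunk above) and dsl_pool_hold()/dsl_pool_rele() exported just above, the expected calling pattern looks roughly like the sketch below (not part of the patch; FTAG is the usual kernel-side tag macro and the function is hypothetical): hold the pool, hold the dataset, and release both without blocking in between.

static int
my_inspect_dataset(const char *name)	/* hypothetical */
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	error = dsl_pool_hold(name, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold(dp, name, FTAG, &ds);
	if (error != 0) {
		dsl_pool_rele(dp, FTAG);
		return (error);
	}

	/* ...read-only inspection; do not block while the pool is held... */

	dsl_dataset_rele(ds, FTAG);
	dsl_pool_rele(dp, FTAG);
	return (0);
}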
-- */ -- uint64_t psa_effective_value; --} dsl_prop_setarg_t; -- - int dsl_prop_register(struct dsl_dataset *ds, const char *propname, -@@ -73,3 +60,4 @@ int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg); --int dsl_prop_numcb(struct dsl_dataset *ds); -+void dsl_prop_notify_all(struct dsl_dir *dd); -+boolean_t dsl_prop_hascb(struct dsl_dataset *ds); - -@@ -80,5 +68,7 @@ int dsl_prop_get_integer(const char *ddname, const char *propname, - int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); --int dsl_prop_get_received(objset_t *os, nvlist_t **nvp); -+int dsl_prop_get_received(const char *dsname, nvlist_t **nvp); - int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, - int intsz, int numints, void *buf, char *setpoint); -+int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname, -+ uint64_t *valuep); - int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, -@@ -87,24 +77,22 @@ int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, - --dsl_syncfunc_t dsl_props_set_sync; --int dsl_prop_set(const char *ddname, const char *propname, -- zprop_source_t source, int intsz, int numints, const void *buf); --int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); --void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, -+void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source, -+ nvlist_t *props, dmu_tx_t *tx); -+void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname, -+ zprop_source_t source, int intsz, int numints, const void *value, - dmu_tx_t *tx); -+int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); -+int dsl_prop_set_int(const char *dsname, const char *propname, -+ zprop_source_t source, uint64_t value); -+int dsl_prop_set_string(const char *dsname, const char *propname, -+ zprop_source_t source, const char *value); -+int dsl_prop_inherit(const char *dsname, const char *propname, -+ zprop_source_t source); - --void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, -- zprop_source_t source, uint64_t *value); --int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa); --#ifdef ZFS_DEBUG --void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa); --#define DSL_PROP_CHECK_PREDICTION(dd, psa) \ -- dsl_prop_check_prediction((dd), (psa)) --#else --#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */ --#endif -+int dsl_prop_predict(dsl_dir_t *dd, const char *propname, -+ zprop_source_t source, uint64_t value, uint64_t *newvalp); - - /* flag first receive on or after SPA_VERSION_RECVD_PROPS */ --boolean_t dsl_prop_get_hasrecvd(objset_t *os); --void dsl_prop_set_hasrecvd(objset_t *os); --void dsl_prop_unset_hasrecvd(objset_t *os); -+boolean_t dsl_prop_get_hasrecvd(const char *dsname); -+int dsl_prop_set_hasrecvd(const char *dsname); -+void dsl_prop_unset_hasrecvd(const char *dsname); - -diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h -index 5691f4d..bcb85d6 100644 ---- a/include/sys/dsl_scan.h -+++ b/include/sys/dsl_scan.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -74,2 +74,34 @@ typedef enum dsl_scan_flags { - -+#define DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN) -+ -+/* -+ * Every pool will have one dsl_scan_t and this structure will contain -+ * in-memory information about the scan and a pointer to the on-disk -+ * representation (i.e. dsl_scan_phys_t). Most of the state of the scan -+ * is contained on-disk to allow the scan to resume in the event of a reboot -+ * or panic. This structure maintains information about the behavior of a -+ * running scan, some caching information, and how it should traverse the pool. -+ * -+ * The following members of this structure direct the behavior of the scan: -+ * -+ * scn_pausing - a scan that cannot be completed in a single txg or -+ * has exceeded its allotted time will need to pause. -+ * When this flag is set the scanner will stop traversing -+ * the pool and write out the current state to disk. -+ * -+ * scn_restart_txg - directs the scanner to either restart or start a -+ * a scan at the specified txg value. -+ * -+ * scn_done_txg - when a scan completes its traversal it will set -+ * the completion txg to the next txg. This is necessary -+ * to ensure that any blocks that were freed during -+ * the scan but have not yet been processed (i.e deferred -+ * frees) are accounted for. -+ * -+ * This structure also maintains information about deferred frees which are -+ * a special kind of traversal. Deferred free can exist in either a bptree or -+ * a bpobj structure. The scn_is_bptree flag will indicate the type of -+ * deferred free that is in progress. If the deferred free is part of an -+ * asynchronous destroy then the scn_async_destroying flag will be set. -+ */ - typedef struct dsl_scan { -@@ -79,2 +111,3 @@ typedef struct dsl_scan { - uint64_t scn_restart_txg; -+ uint64_t scn_done_txg; - uint64_t scn_sync_start_time; -@@ -84,2 +117,3 @@ typedef struct dsl_scan { - boolean_t scn_is_bptree; -+ boolean_t scn_async_destroying; - -diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h -index 9126290..ef86fb6 100644 ---- a/include/sys/dsl_synctask.h -+++ b/include/sys/dsl_synctask.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. 
- */ -@@ -36,39 +37,22 @@ struct dsl_pool; - --typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *); --typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *); -+typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *); -+typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *); - - typedef struct dsl_sync_task { -- list_node_t dst_node; -+ txg_node_t dst_node; -+ struct dsl_pool *dst_pool; -+ uint64_t dst_txg; -+ int dst_space; - dsl_checkfunc_t *dst_checkfunc; - dsl_syncfunc_t *dst_syncfunc; -- void *dst_arg1; -- void *dst_arg2; -- int dst_err; -+ void *dst_arg; -+ int dst_error; -+ boolean_t dst_nowaiter; - } dsl_sync_task_t; - --typedef struct dsl_sync_task_group { -- txg_node_t dstg_node; -- list_t dstg_tasks; -- struct dsl_pool *dstg_pool; -- uint64_t dstg_txg; -- int dstg_err; -- int dstg_space; -- boolean_t dstg_nowaiter; --} dsl_sync_task_group_t; -- --dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp); --void dsl_sync_task_create(dsl_sync_task_group_t *dstg, -- dsl_checkfunc_t *, dsl_syncfunc_t *, -- void *arg1, void *arg2, int blocks_modified); --int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg); --void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); --void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg); --void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx); -- --int dsl_sync_task_do(struct dsl_pool *dp, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified); --void dsl_sync_task_do_nowait(struct dsl_pool *dp, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx); -+void dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx); -+int dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, -+ dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified); -+void dsl_sync_task_nowait(struct dsl_pool *dp, dsl_syncfunc_t *syncfunc, -+ void *arg, int blocks_modified, dmu_tx_t *tx); - -diff --git a/include/sys/dsl_userhold.h b/include/sys/dsl_userhold.h -new file mode 100644 -index 0000000..071aeb8 ---- /dev/null -+++ b/include/sys/dsl_userhold.h -@@ -0,0 +1,57 @@ -+ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. 
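The dsl_synctask.h hunk above retires the dsl_sync_task_group machinery; check and sync callbacks now take only (void *arg, dmu_tx_t *tx) and are dispatched through dsl_sync_task(pool, check, sync, arg, blocks_modified). Below is a hedged sketch of the new calling convention: the argument struct, callback bodies, and the blocks_modified value of 3 are invented for illustration, while the typedefs and the dsl_sync_task() prototype match the hunk above.

    /* Hypothetical argument block carried through the single arg pointer. */
    typedef struct my_task_arg {
            const char *mta_dsname;
            uint64_t mta_value;
    } my_task_arg_t;

    static int
    my_task_check(void *arg, dmu_tx_t *tx)          /* dsl_checkfunc_t */
    {
            my_task_arg_t *mta = arg;

            /* Validate in open context; returning nonzero aborts the task. */
            if (mta->mta_dsname == NULL)
                    return (EINVAL);
            return (0);
    }

    static void
    my_task_sync(void *arg, dmu_tx_t *tx)           /* dsl_syncfunc_t */
    {
            my_task_arg_t *mta = arg;

            /* Apply mta->mta_value in syncing context using tx. */
            (void) mta;
            (void) tx;
    }

    static int
    my_task(const char *pool, const char *dsname, uint64_t value)
    {
            my_task_arg_t mta = { dsname, value };

            /* Final argument is the blocks_modified hint, as before. */
            return (dsl_sync_task(pool, my_task_check, my_task_sync, &mta, 3));
    }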
-+ */ -+ -+#ifndef _SYS_DSL_USERHOLD_H -+#define _SYS_DSL_USERHOLD_H -+ -+#include -+#include -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+struct dsl_pool; -+struct dsl_dataset; -+struct dmu_tx; -+ -+int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, -+ nvlist_t *errlist); -+int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist); -+int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl); -+void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds); -+int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag, -+ boolean_t temphold, struct dmu_tx *tx); -+void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag, -+ minor_t minor, uint64_t now, struct dmu_tx *tx); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* _SYS_DSL_USERHOLD_H */ -diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h -index 741b99e..d541b07 100644 ---- a/include/sys/fm/fs/zfs.h -+++ b/include/sys/fm/fs/zfs.h -@@ -41,3 +41,3 @@ extern "C" { - #define FM_EREPORT_ZFS_POOL_DESTROY "zpool.destroy" --#define FM_EREPORT_ZFS_POOL_REGUID "zpool.reguid" -+#define FM_EREPORT_ZFS_POOL_REGUID "zpool.reguid" - #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" -@@ -77,2 +77,7 @@ extern "C" { - #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS "vdev_spare_paths" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS "vdev_spare_guids" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors" -+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors" - #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" -diff --git a/include/sys/fm/protocol.h b/include/sys/fm/protocol.h -index 1ee2212..de05bb2 100644 ---- a/include/sys/fm/protocol.h -+++ b/include/sys/fm/protocol.h -@@ -72,2 +72,3 @@ extern "C" { - #define FM_EREPORT_TIME "time" -+#define FM_EREPORT_EID "eid" - -diff --git a/include/sys/fm/util.h b/include/sys/fm/util.h -index a3a8c3f..18fe490 100644 ---- a/include/sys/fm/util.h -+++ b/include/sys/fm/util.h -@@ -73,3 +73,3 @@ typedef struct erpt_dump { - --#define ZEVENT_SHUTDOWN 0x1 -+#define ZEVENT_SHUTDOWN 0x1 - -@@ -78,7 +78,8 @@ typedef void zevent_cb_t(nvlist_t *, nvlist_t *); - typedef struct zevent_s { -- nvlist_t *ev_nvl; /* protected by the zevent_lock */ -- nvlist_t *ev_detector; /* " */ -- list_t ev_ze_list; /* " */ -- list_node_t ev_node; /* " */ -- zevent_cb_t *ev_cb; /* " */ -+ nvlist_t *ev_nvl; /* protected by the zevent_lock */ -+ nvlist_t *ev_detector; /* " */ -+ list_t ev_ze_list; /* " */ -+ list_node_t ev_node; /* " */ -+ zevent_cb_t *ev_cb; /* " */ -+ uint64_t ev_eid; - } zevent_t; -@@ -86,5 +87,5 @@ typedef struct zevent_s { - typedef struct zfs_zevent { -- zevent_t *ze_zevent; /* protected by the zevent_lock */ -- list_node_t ze_node; /* " */ -- uint64_t ze_dropped; /* " */ -+ zevent_t *ze_zevent; /* protected by the zevent_lock */ -+ list_node_t ze_node; /* " */ -+ uint64_t ze_dropped; /* " */ - } zfs_zevent_t; -@@ -100,2 +101,3 @@ extern int zfs_zevent_next(zfs_zevent_t *, nvlist_t **, uint64_t *, uint64_t *); - extern int zfs_zevent_wait(zfs_zevent_t *); -+extern int zfs_zevent_seek(zfs_zevent_t *, uint64_t); - extern void zfs_zevent_init(zfs_zevent_t **); -diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h -index 26c24fc..ae72f83 100644 ---- a/include/sys/fs/zfs.h -+++ b/include/sys/fs/zfs.h -@@ -23,3 +23,3 @@ - * Copyright (c) 2005, 2010, Oracle 
and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -@@ -54,2 +54,12 @@ typedef enum { - -+typedef enum dmu_objset_type { -+ DMU_OST_NONE, -+ DMU_OST_META, -+ DMU_OST_ZFS, -+ DMU_OST_ZVOL, -+ DMU_OST_OTHER, /* For testing only! */ -+ DMU_OST_ANY, /* Be careful! */ -+ DMU_OST_NUMTYPES -+} dmu_objset_type_t; -+ - #define ZFS_TYPE_DATASET \ -@@ -130,3 +140,12 @@ typedef enum { - ZFS_PROP_CLONES, -+ ZFS_PROP_LOGICALUSED, -+ ZFS_PROP_LOGICALREFERENCED, -+ ZFS_PROP_INCONSISTENT, /* not exposed to the user */ - ZFS_PROP_SNAPDEV, -+ ZFS_PROP_ACLTYPE, -+ ZFS_PROP_SELINUX_CONTEXT, -+ ZFS_PROP_SELINUX_FSCONTEXT, -+ ZFS_PROP_SELINUX_DEFCONTEXT, -+ ZFS_PROP_SELINUX_ROOTCONTEXT, -+ ZFS_PROP_RELATIME, - ZFS_NUM_PROPS -@@ -518,3 +537,3 @@ typedef struct zpool_rewind_policy { - #define ZPOOL_CONFIG_REMOVING "removing" --#define ZPOOL_CONFIG_RESILVERING "resilvering" -+#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" - #define ZPOOL_CONFIG_COMMENT "comment" -@@ -531,2 +550,3 @@ typedef struct zpool_rewind_policy { - #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ -+#define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ - /* -@@ -687,2 +707,13 @@ typedef enum dsl_scan_state { - -+/* -+ * Errata described by http://zfsonlinux.org/msg/ZFS-8000-ER. The ordering -+ * of this enum must be maintained to ensure the errata identifiers map to -+ * the correct documentation. New errata may only be appended to the list -+ * and must contain corresponding documentation at the above link. -+ */ -+typedef enum zpool_errata { -+ ZPOOL_ERRATA_NONE, -+ ZPOOL_ERRATA_ZOL_2094_SCRUB, -+ ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, -+} zpool_errata_t; - -@@ -755,6 +786,9 @@ typedef struct ddt_histogram { - */ --#define ZFS_IOC ('Z' << 8) -- - typedef enum zfs_ioc { -- ZFS_IOC_POOL_CREATE = ZFS_IOC, -+ /* -+ * Illumos - 69/128 numbers reserved. -+ */ -+ ZFS_IOC_FIRST = ('Z' << 8), -+ ZFS_IOC = ZFS_IOC_FIRST, -+ ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, - ZFS_IOC_POOL_DESTROY, -@@ -781,4 +815,2 @@ typedef enum zfs_ioc { - ZFS_IOC_SET_PROP, -- ZFS_IOC_CREATE_MINOR, -- ZFS_IOC_REMOVE_MINOR, - ZFS_IOC_CREATE, -@@ -795,3 +827,2 @@ typedef enum zfs_ioc { - ZFS_IOC_PROMOTE, -- ZFS_IOC_DESTROY_SNAPS_NVL, - ZFS_IOC_SNAPSHOT, -@@ -818,9 +849,27 @@ typedef enum zfs_ioc { - ZFS_IOC_OBJ_TO_STATS, -- ZFS_IOC_EVENTS_NEXT, -- ZFS_IOC_EVENTS_CLEAR, -- ZFS_IOC_POOL_REGUID, - ZFS_IOC_SPACE_WRITTEN, - ZFS_IOC_SPACE_SNAPS, -+ ZFS_IOC_DESTROY_SNAPS, -+ ZFS_IOC_POOL_REGUID, - ZFS_IOC_POOL_REOPEN, - ZFS_IOC_SEND_PROGRESS, -+ ZFS_IOC_LOG_HISTORY, -+ ZFS_IOC_SEND_NEW, -+ ZFS_IOC_SEND_SPACE, -+ ZFS_IOC_CLONE, -+ -+ /* -+ * Linux - 3/64 numbers reserved. -+ */ -+ ZFS_IOC_LINUX = ('Z' << 8) + 0x80, -+ ZFS_IOC_EVENTS_NEXT, -+ ZFS_IOC_EVENTS_CLEAR, -+ ZFS_IOC_EVENTS_SEEK, -+ -+ /* -+ * FreeBSD - 1/64 numbers reserved. 
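The zfs_ioc_t rework above anchors the command space at ZFS_IOC_FIRST = ('Z' << 8) and reserves platform ranges at offsets 0x80 (Linux) and 0xC0 (FreeBSD). The standalone program below merely prints the resulting base values so the layout is easy to check; it restates the constants from the enum and nothing more.

    #include <stdio.h>

    int
    main(void)
    {
            unsigned zfs_ioc_first = 'Z' << 8;              /* 0x5a00 */
            unsigned zfs_ioc_linux = ('Z' << 8) + 0x80;     /* 0x5a80 */
            unsigned zfs_ioc_freebsd = ('Z' << 8) + 0xC0;   /* 0x5ac0 */

            printf("ZFS_IOC_FIRST   = 0x%04x\n", zfs_ioc_first);
            printf("ZFS_IOC_LINUX   = 0x%04x\n", zfs_ioc_linux);
            printf("ZFS_IOC_FREEBSD = 0x%04x\n", zfs_ioc_freebsd);
            return (0);
    }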
-+ */ -+ ZFS_IOC_FREEBSD = ('Z' << 8) + 0xC0, -+ -+ ZFS_IOC_LAST - } zfs_ioc_t; -@@ -830,3 +879,3 @@ typedef enum zfs_ioc { - */ --#define BLKZNAME _IOR(0x12,125,char[ZFS_MAXNAMELEN]) -+#define BLKZNAME _IOR(0x12, 125, char[ZFS_MAXNAMELEN]) - -@@ -866,2 +915,8 @@ typedef enum { - #define ZPOOL_HIST_INT_STR "history internal str" -+#define ZPOOL_HIST_INT_NAME "internal_name" -+#define ZPOOL_HIST_IOCTL "ioctl" -+#define ZPOOL_HIST_INPUT_NVL "in_nvl" -+#define ZPOOL_HIST_OUTPUT_NVL "out_nvl" -+#define ZPOOL_HIST_DSNAME "dsname" -+#define ZPOOL_HIST_DSID "dsid" - -@@ -884,2 +939,3 @@ typedef enum { - #define ZFS_IMPORT_ONLY 0x8 -+#define ZFS_IMPORT_TEMP_NAME 0x10 - -@@ -911,53 +967,2 @@ typedef enum { - --/* -- * Note: This is encoded on-disk, so new events must be added to the -- * end, and unused events can not be removed. Be sure to edit -- * libzfs_pool.c: hist_event_table[]. -- */ --typedef enum history_internal_events { -- LOG_NO_EVENT = 0, -- LOG_POOL_CREATE, -- LOG_POOL_VDEV_ADD, -- LOG_POOL_REMOVE, -- LOG_POOL_DESTROY, -- LOG_POOL_EXPORT, -- LOG_POOL_IMPORT, -- LOG_POOL_VDEV_ATTACH, -- LOG_POOL_VDEV_REPLACE, -- LOG_POOL_VDEV_DETACH, -- LOG_POOL_VDEV_ONLINE, -- LOG_POOL_VDEV_OFFLINE, -- LOG_POOL_UPGRADE, -- LOG_POOL_CLEAR, -- LOG_POOL_SCAN, -- LOG_POOL_PROPSET, -- LOG_DS_CREATE, -- LOG_DS_CLONE, -- LOG_DS_DESTROY, -- LOG_DS_DESTROY_BEGIN, -- LOG_DS_INHERIT, -- LOG_DS_PROPSET, -- LOG_DS_QUOTA, -- LOG_DS_PERM_UPDATE, -- LOG_DS_PERM_REMOVE, -- LOG_DS_PERM_WHO_REMOVE, -- LOG_DS_PROMOTE, -- LOG_DS_RECEIVE, -- LOG_DS_RENAME, -- LOG_DS_RESERVATION, -- LOG_DS_REPLAY_INC_SYNC, -- LOG_DS_REPLAY_FULL_SYNC, -- LOG_DS_ROLLBACK, -- LOG_DS_SNAPSHOT, -- LOG_DS_UPGRADE, -- LOG_DS_REFQUOTA, -- LOG_DS_REFRESERV, -- LOG_POOL_SCAN_DONE, -- LOG_DS_USER_HOLD, -- LOG_DS_USER_RELEASE, -- LOG_POOL_SPLIT, -- LOG_POOL_GUID_CHANGE, -- LOG_END --} history_internal_events_t; -- - #ifdef __cplusplus -diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h -index 9991242..70f7af0 100644 ---- a/include/sys/metaslab.h -+++ b/include/sys/metaslab.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -59,2 +59,3 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, - extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -+extern void metaslab_check_free(spa_t *spa, const blkptr_t *bp); - extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp); -diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h -index a36baed..36aa60d 100644 ---- a/include/sys/metaslab_impl.h -+++ b/include/sys/metaslab_impl.h -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -47,2 +47,3 @@ struct metaslab_class { - uint64_t mc_aliquot; -+ uint64_t mc_alloc_groups; /* # of allocatable groups */ - uint64_t mc_alloc; /* total allocated space */ -@@ -60,2 +61,4 @@ struct metaslab_group { - uint64_t mg_alloc_failures; -+ boolean_t mg_allocatable; /* can we allocate? 
*/ -+ uint64_t mg_free_capacity; /* percentage free */ - int64_t mg_bias; -diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h -index c502568..7a67870 100644 ---- a/include/sys/nvpair.h -+++ b/include/sys/nvpair.h -@@ -287,2 +287,3 @@ nvlist_t *fnvlist_dup(nvlist_t *); - void fnvlist_merge(nvlist_t *, nvlist_t *); -+size_t fnvlist_num_pairs(nvlist_t *); - -diff --git a/include/sys/refcount.h b/include/sys/refcount.h -index 1752c64..e767a23 100644 ---- a/include/sys/refcount.h -+++ b/include/sys/refcount.h -@@ -52,2 +52,3 @@ typedef struct refcount { - kmutex_t rc_mtx; -+ boolean_t rc_tracked; - list_t rc_list; -@@ -58,5 +59,6 @@ typedef struct refcount { - --/* Note: refcount_t must be initialized with refcount_create() */ -+/* Note: refcount_t must be initialized with refcount_create[_untracked]() */ - - void refcount_create(refcount_t *rc); -+void refcount_create_untracked(refcount_t *rc); - void refcount_destroy(refcount_t *rc); -@@ -81,2 +83,3 @@ typedef struct refcount { - #define refcount_create(rc) ((rc)->rc_count = 0) -+#define refcount_create_untracked(rc) ((rc)->rc_count = 0) - #define refcount_destroy(rc) ((rc)->rc_count = 0) -diff --git a/include/sys/rrwlock.h b/include/sys/rrwlock.h -index 798a015..25c8a52 100644 ---- a/include/sys/rrwlock.h -+++ b/include/sys/rrwlock.h -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ */ - -@@ -59,2 +62,3 @@ typedef struct rrwlock { - boolean_t rr_writer_wanted; -+ boolean_t rr_track_all; - } rrwlock_t; -@@ -66,7 +70,10 @@ typedef struct rrwlock { - */ --void rrw_init(rrwlock_t *rrl); -+void rrw_init(rrwlock_t *rrl, boolean_t track_all); - void rrw_destroy(rrwlock_t *rrl); - void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); -+void rrw_enter_read(rrwlock_t *rrl, void *tag); -+void rrw_enter_write(rrwlock_t *rrl); - void rrw_exit(rrwlock_t *rrl, void *tag); - boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); -+void rrw_tsd_destroy(void *arg); - -@@ -74,2 +81,4 @@ boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); - #define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) -+#define RRW_LOCK_HELD(x) \ -+ (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER)) - -diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h -index 8ae05ce..fcbd8eb 100644 ---- a/include/sys/sa_impl.h -+++ b/include/sys/sa_impl.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -152,5 +152,6 @@ struct sa_os { - * header for all bonus and spill buffers. -+ * - * The header has a fixed portion with a variable number - * of "lengths" depending on the number of variable sized -- * attribues which are determined by the "layout number" -+ * attributes which are determined by the "layout number" - */ -@@ -160,3 +161,19 @@ typedef struct sa_hdr_phys { - uint32_t sa_magic; -- uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */ -+ /* -+ * Encoded with hdrsize and layout number as follows: -+ * 16 10 0 -+ * +--------+-------+ -+ * | hdrsz |layout | -+ * +--------+-------+ -+ * -+ * Bits 0-10 are the layout number -+ * Bits 11-16 are the size of the header. -+ * The hdrsize is the number * 8 -+ * -+ * For example. 
-+ * hdrsz of 1 ==> 8 byte header -+ * 2 ==> 16 byte header -+ * -+ */ -+ uint16_t sa_layout_info; - uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ -@@ -165,20 +182,2 @@ typedef struct sa_hdr_phys { - --/* -- * sa_hdr_phys -> sa_layout_info -- * -- * 16 10 0 -- * +--------+-------+ -- * | hdrsz |layout | -- * +--------+-------+ -- * -- * Bits 0-10 are the layout number -- * Bits 11-16 are the size of the header. -- * The hdrsize is the number * 8 -- * -- * For example. -- * hdrsz of 1 ==> 8 byte header -- * 2 ==> 16 byte header -- * -- */ -- - #define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) -diff --git a/include/sys/spa.h b/include/sys/spa.h -index 8f2af8a..d5a91c2 100644 ---- a/include/sys/spa.h -+++ b/include/sys/spa.h -@@ -53,3 +53,6 @@ typedef struct ddt ddt_t; - typedef struct ddt_entry ddt_entry_t; -+typedef struct zbookmark zbookmark_t; -+ - struct dsl_pool; -+struct dsl_dataset; - -@@ -421,5 +424,5 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, - extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, -- const char *history_str, nvlist_t *zplprops); -+ nvlist_t *zplprops); - extern int spa_import_rootpool(char *devpath, char *devid); --extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, -+extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, - uint64_t flags); -@@ -535,2 +538,39 @@ extern boolean_t spa_refcount_zero(spa_t *spa); - -+/* Historical pool statistics */ -+typedef struct spa_stats_history { -+ kmutex_t lock; -+ uint64_t count; -+ uint64_t size; -+ kstat_t *kstat; -+ void *private; -+ list_t list; -+} spa_stats_history_t; -+ -+typedef struct spa_stats { -+ spa_stats_history_t read_history; -+ spa_stats_history_t txg_history; -+ spa_stats_history_t tx_assign_histogram; -+ spa_stats_history_t io_history; -+} spa_stats_t; -+ -+typedef enum txg_state { -+ TXG_STATE_BIRTH = 0, -+ TXG_STATE_OPEN = 1, -+ TXG_STATE_QUIESCED = 2, -+ TXG_STATE_WAIT_FOR_SYNC = 3, -+ TXG_STATE_SYNCED = 4, -+ TXG_STATE_COMMITTED = 5, -+} txg_state_t; -+ -+extern void spa_stats_init(spa_t *spa); -+extern void spa_stats_destroy(spa_t *spa); -+extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, -+ uint32_t aflags); -+extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); -+extern int spa_txg_history_set(spa_t *spa, uint64_t txg, -+ txg_state_t completed_state, hrtime_t completed_time); -+extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, -+ uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty); -+extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); -+ - /* Pool configuration locks */ -@@ -631,17 +671,2 @@ extern uint64_t strtonum(const char *str, char **nptr); - --/* history logging */ --typedef enum history_log_type { -- LOG_CMD_POOL_CREATE, -- LOG_CMD_NORMAL, -- LOG_INTERNAL --} history_log_type_t; -- --typedef struct history_arg { -- char *ha_history_str; -- history_log_type_t ha_log_type; -- history_internal_events_t ha_event; -- char *ha_zone; -- uid_t ha_uid; --} history_arg_t; -- - extern char *spa_his_ievent_table[]; -@@ -651,7 +676,11 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, - char *his_buf); --extern int spa_history_log(spa_t *spa, const char *his_buf, -- history_log_type_t what); --extern void spa_history_log_internal(history_internal_events_t event, -- spa_t *spa, dmu_tx_t *tx, const char *fmt, ...); --extern void 
spa_history_log_version(spa_t *spa, history_internal_events_t evt); -+extern int spa_history_log(spa_t *spa, const char *his_buf); -+extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); -+extern void spa_history_log_version(spa_t *spa, const char *operation); -+extern void spa_history_log_internal(spa_t *spa, const char *operation, -+ dmu_tx_t *tx, const char *fmt, ...); -+extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, -+ dmu_tx_t *tx, const char *fmt, ...); -+extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, -+ dmu_tx_t *tx, const char *fmt, ...); - -diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h -index 47dfe43..90a32d3 100644 ---- a/include/sys/spa_impl.h -+++ b/include/sys/spa_impl.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -@@ -236,6 +236,9 @@ struct spa { - uint64_t spa_deadman_calls; /* number of deadman calls */ -- uint64_t spa_sync_starttime; /* starting time fo spa_sync */ -+ hrtime_t spa_sync_starttime; /* starting time of spa_sync */ - uint64_t spa_deadman_synctime; /* deadman expiration timer */ -+ uint64_t spa_errata; /* errata issues detected */ -+ spa_stats_t spa_stats; /* assorted spa statistics */ -+ - /* -- * spa_refcnt & spa_config_lock must be the last elements -+ * spa_refcount & spa_config_lock must be the last elements - * because refcount_t changes size based on compilation options. -diff --git a/include/sys/space_map.h b/include/sys/space_map.h -index 2da80d2..588feb8 100644 ---- a/include/sys/space_map.h -+++ b/include/sys/space_map.h -@@ -96,3 +96,2 @@ struct space_map_ops { - * -- * - * non-debug entry -@@ -151,2 +150,4 @@ extern boolean_t space_map_contains(space_map_t *sm, - uint64_t start, uint64_t size); -+extern space_seg_t *space_map_find(space_map_t *sm, uint64_t start, -+ uint64_t size, avl_index_t *wherep); - extern void space_map_swap(space_map_t **msrc, space_map_t **mdest); -diff --git a/include/sys/txg.h b/include/sys/txg.h -index f9d6dd4..1bb6bac 100644 ---- a/include/sys/txg.h -+++ b/include/sys/txg.h -@@ -25,3 +25,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -47,5 +47,2 @@ extern "C" { - --#define TXG_WAIT 1ULL --#define TXG_NOWAIT 2ULL -- - typedef struct tx_cpu tx_cpu_t; -@@ -79,9 +76,5 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); - --/* -- * Delay the caller by the specified number of ticks or until -- * the txg closes (whichever comes first). This is intended -- * to be used to throttle writers when the system nears its -- * capacity. 
-- */ --extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks); -+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, -+ hrtime_t resolution); -+extern void txg_kick(struct dsl_pool *dp); - -@@ -127,7 +120,7 @@ extern void txg_list_destroy(txg_list_t *tl); - extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); --extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg); --extern int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); -+extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg); -+extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); - extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); - extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); --extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg); -+extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg); - extern void *txg_list_head(txg_list_t *tl, uint64_t txg); -diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h -index 7b356ea..e583d61 100644 ---- a/include/sys/txg_impl.h -+++ b/include/sys/txg_impl.h -@@ -20,2 +20,3 @@ - */ -+ - /* -@@ -25,2 +26,6 @@ - -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ */ -+ - #ifndef _SYS_TXG_IMPL_H -@@ -35,13 +40,55 @@ extern "C" { - -+/* -+ * The tx_cpu structure is a per-cpu structure that is used to track -+ * the number of active transaction holds (tc_count). As transactions -+ * are assigned into a transaction group the appropriate tc_count is -+ * incremented to indicate that there are pending changes that have yet -+ * to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement -+ * the tc_count. A transaction group is not considered quiesced until all -+ * tx_cpu structures have reached a tc_count of zero. -+ * -+ * This structure is a per-cpu structure by design. Updates to this structure -+ * are frequent and concurrent. Having a single structure would result in -+ * heavy lock contention so a per-cpu design was implemented. With the fanned -+ * out mutex design, consumers only need to lock the mutex associated with -+ * thread's cpu. -+ * -+ * The tx_cpu contains two locks, the tc_lock and tc_open_lock. -+ * The tc_lock is used to protect all members of the tx_cpu structure with -+ * the exception of the tc_open_lock. This lock should only be held for a -+ * short period of time, typically when updating the value of tc_count. -+ * -+ * The tc_open_lock protects the tx_open_txg member of the tx_state structure. -+ * This lock is used to ensure that transactions are only assigned into -+ * the current open transaction group. In order to move the current open -+ * transaction group to the quiesce phase, the txg_quiesce thread must -+ * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. -+ * The tc_open_lock is held until the transaction is assigned into the -+ * transaction group. Typically, this is a short operation but if throttling -+ * is occuring it may be held for longer periods of time. 
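The txg_impl.h comment above explains how per-cpu tc_count holds gate the quiesce of a transaction group. The fragment below is a single-lock userland model of that bookkeeping, intended only to make the hold/release/quiesce cycle concrete; it deliberately drops the per-cpu fan-out and the tc_open_lock handshake, and every name in it is a stand-in rather than the kernel implementation.

    #include <pthread.h>
    #include <stdint.h>

    #define TXG_SIZE 4
    #define TXG_MASK (TXG_SIZE - 1)

    /* One lock instead of one per cpu; purely illustrative. */
    struct txg_model {
            pthread_mutex_t lock;
            uint64_t tc_count[TXG_SIZE];    /* open holds per txg */
            uint64_t tx_open_txg;           /* currently open txg */
    };

    /* Take a hold on the currently open txg (cf. txg_hold_open()). */
    static uint64_t
    model_hold_open(struct txg_model *tm)
    {
            uint64_t txg;

            pthread_mutex_lock(&tm->lock);
            txg = tm->tx_open_txg;
            tm->tc_count[txg & TXG_MASK]++;
            pthread_mutex_unlock(&tm->lock);
            return (txg);
    }

    /* Drop a hold; a txg may quiesce only when every count is zero. */
    static void
    model_rele_to_sync(struct txg_model *tm, uint64_t txg)
    {
            pthread_mutex_lock(&tm->lock);
            tm->tc_count[txg & TXG_MASK]--;
            pthread_mutex_unlock(&tm->lock);
    }

    static int
    model_quiesced(struct txg_model *tm, uint64_t txg)
    {
            int quiesced;

            pthread_mutex_lock(&tm->lock);
            quiesced = (tm->tc_count[txg & TXG_MASK] == 0);
            pthread_mutex_unlock(&tm->lock);
            return (quiesced);
    }

    int
    main(void)
    {
            struct txg_model tm = { PTHREAD_MUTEX_INITIALIZER, { 0 }, 10 };
            uint64_t txg = model_hold_open(&tm);

            model_rele_to_sync(&tm, txg);
            return (!model_quiesced(&tm, txg));     /* 0 == quiesced */
    }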
-+ */ - struct tx_cpu { -- kmutex_t tc_lock; -+ kmutex_t tc_open_lock; /* protects tx_open_txg */ -+ kmutex_t tc_lock; /* protects the rest of this struct */ - kcondvar_t tc_cv[TXG_SIZE]; -- uint64_t tc_count[TXG_SIZE]; -+ uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ - list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ -- char tc_pad[16]; -+ char tc_pad[8]; /* pad to fill 3 cache lines */ - }; - -+/* -+ * The tx_state structure maintains the state information about the different -+ * stages of the pool's transcation groups. A per pool tx_state structure -+ * is used to track this information. The tx_state structure also points to -+ * an array of tx_cpu structures (described above). Although the tx_sync_lock -+ * is used to protect the members of this structure, it is not used to -+ * protect the tx_open_txg. Instead a special lock in the tx_cpu structure -+ * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock. -+ * Any thread wishing to update tx_open_txg must grab the tc_open_lock on -+ * every cpu (see txg_quiesce()). -+ */ - typedef struct tx_state { -- tx_cpu_t *tx_cpu; /* protects right to enter txg */ -- kmutex_t tx_sync_lock; /* protects tx_state_t */ -+ tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ -+ kmutex_t tx_sync_lock; /* protects the rest of this struct */ -+ - uint64_t tx_open_txg; /* currently open txg id */ -@@ -51,2 +98,4 @@ typedef struct tx_state { - -+ hrtime_t tx_open_time; /* start time of tx_open_txg */ -+ - uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ -diff --git a/include/sys/unique.h b/include/sys/unique.h -index d971752..d4ba32e 100644 ---- a/include/sys/unique.h -+++ b/include/sys/unique.h -@@ -28,4 +28,2 @@ - -- -- - #include -@@ -44,3 +42,3 @@ void unique_fini(void); - * Return a new unique value (which will not be uniquified against until -- * it is unique_insert()-ed. -+ * it is unique_insert()-ed). - */ -diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h -index daefed7..d5a1889 100644 ---- a/include/sys/vdev_disk.h -+++ b/include/sys/vdev_disk.h -@@ -28,3 +28,3 @@ - #ifndef _SYS_VDEV_DISK_H --#define _SYS_VDEV_DISK_H -+#define _SYS_VDEV_DISK_H - -diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h -index e0669cc..4b465d2 100644 ---- a/include/sys/vdev_impl.h -+++ b/include/sys/vdev_impl.h -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -102,8 +102,18 @@ struct vdev_cache { - -+typedef struct vdev_queue_class { -+ uint32_t vqc_active; -+ -+ /* -+ * Sorted by offset or timestamp, depending on if the queue is -+ * LBA-ordered vs FIFO. 
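The new vdev_queue_class above keeps one AVL tree per zio priority, sorted either by offset (LBA order) or by timestamp (FIFO), with the choice made per queue elsewhere in the code. As a toy illustration of those two orderings, the program below sorts an invented I/O record both ways with qsort; the struct and field names are assumptions for the example only.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Invented stand-in for the fields an I/O would be ordered on. */
    typedef struct io_rec {
            uint64_t io_offset;     /* device LBA */
            uint64_t io_timestamp;  /* submission time, arbitrary units */
    } io_rec_t;

    /* LBA order: I/Os that are close together on disk sort together. */
    static int
    io_offset_compare(const void *a, const void *b)
    {
            const io_rec_t *x = a, *y = b;

            if (x->io_offset < y->io_offset)
                    return (-1);
            return (x->io_offset > y->io_offset);
    }

    /* FIFO order: oldest submission first. */
    static int
    io_timestamp_compare(const void *a, const void *b)
    {
            const io_rec_t *x = a, *y = b;

            if (x->io_timestamp < y->io_timestamp)
                    return (-1);
            return (x->io_timestamp > y->io_timestamp);
    }

    int
    main(void)
    {
            io_rec_t q[3] = { { 900, 1 }, { 100, 3 }, { 500, 2 } };

            qsort(q, 3, sizeof (io_rec_t), io_offset_compare);
            printf("lowest LBA first: offset %llu\n",
                (unsigned long long)q[0].io_offset);

            qsort(q, 3, sizeof (io_rec_t), io_timestamp_compare);
            printf("oldest I/O first: offset %llu\n",
                (unsigned long long)q[0].io_offset);
            return (0);
    }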
-+ */ -+ avl_tree_t vqc_queued_tree; -+} vdev_queue_class_t; -+ - struct vdev_queue { -- avl_tree_t vq_deadline_tree; -- avl_tree_t vq_read_tree; -- avl_tree_t vq_write_tree; -- avl_tree_t vq_pending_tree; -- hrtime_t vq_io_complete_ts; -+ vdev_t *vq_vdev; -+ vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; -+ avl_tree_t vq_active_tree; -+ uint64_t vq_last_offset; -+ hrtime_t vq_io_complete_ts; /* time last i/o completed */ - hrtime_t vq_io_delta_ts; -@@ -184,3 +194,3 @@ struct vdev { - uint64_t vdev_removed; /* persistent removed state */ -- uint64_t vdev_resilvering; /* persistent resilvering state */ -+ uint64_t vdev_resilver_txg; /* persistent resilvering state */ - uint64_t vdev_nparity; /* number of parity devices for raidz */ -@@ -256,8 +266,9 @@ typedef struct vdev_label { - -+/* Offset of embedded boot loader region on each label */ -+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) - /* -- * Size and offset of embedded boot loader region on each label. -+ * Size of embedded boot loader region on each label. - * The total size of the first two labels plus the boot area is 4MB. - */ --#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) --#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ -+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ - -@@ -328,4 +339,5 @@ extern void vdev_set_min_asize(vdev_t *vd); - /* -- * zdb uses this tunable, so it must be declared here to make lint happy. -+ * Global variables - */ -+/* zdb uses this tunable, so it must be declared here to make lint happy. */ - extern int zfs_vdev_cache_size; -diff --git a/include/sys/zap.h b/include/sys/zap.h -index 092669c..aabfca7 100644 ---- a/include/sys/zap.h -+++ b/include/sys/zap.h -@@ -88,9 +88,3 @@ extern "C" { - /* -- * The matchtype specifies which entry will be accessed. -- * MT_EXACT: only find an exact match (non-normalized) -- * MT_FIRST: find the "first" normalized (case and Unicode -- * form) match; the designated "first" match will not change as long -- * as the set of entries with this normalization doesn't change -- * MT_BEST: if there is an exact match, find that, otherwise find the -- * first normalized match -+ * Specifies matching criteria for ZAP lookups. - */ -@@ -98,4 +92,14 @@ typedef enum matchtype - { -+ /* Only find an exact match (non-normalized) */ - MT_EXACT, -+ /* -+ * If there is an exact match, find that, otherwise find the -+ * first normalized match. -+ */ - MT_BEST, -+ /* -+ * Find the "first" normalized (case and Unicode form) match; -+ * the designated "first" match will not change as long as the -+ * set of entries with this normalization doesn't change. -+ */ - MT_FIRST -@@ -176,5 +180,6 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); - * If 'integer_size' is equal to or larger than the attribute's integer -- * size, the call will succeed and return 0. * When converting to a -- * larger integer size, the integers will be treated as unsigned (ie. no -- * sign-extension will be performed). -+ * size, the call will succeed and return 0. -+ * -+ * When converting to a larger integer size, the integers will be treated as -+ * unsigned (ie. no sign-extension will be performed). - * -@@ -185,3 +190,7 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); - * transferred, the call will return EOVERFLOW. 
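The reshuffled zap.h comment above spells out zap_lookup()'s width rules: widening to a larger integer_size succeeds and is unsigned, while a buffer narrower than the stored integers fails with EOVERFLOW. A minimal call site is sketched below using only the prototype from the hunk; the wrapper, object set, ZAP object, and attribute name are placeholders, and the snippet will not build outside the ZFS tree.

    /* Hypothetical wrapper; the zap_lookup() prototype is the one above. */
    static int
    read_count_attr(objset_t *os, uint64_t zapobj, uint64_t *countp)
    {
            /*
             * Asking for 1 integer of sizeof (uint64_t): a narrower
             * stored width is widened unsigned; a wider stored width
             * would make this call fail with EOVERFLOW.
             */
            return (zap_lookup(os, zapobj, "count", sizeof (uint64_t),
                1, countp));
    }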
-- * -+ */ -+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, -+ uint64_t integer_size, uint64_t num_integers, void *buf); -+ -+/* - * If rn_len is nonzero, realname will be set to the name of the found -@@ -193,4 +202,2 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); - */ --int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, -- uint64_t integer_size, uint64_t num_integers, void *buf); - int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, -diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h -index 3a33636..f6947a7 100644 ---- a/include/sys/zap_leaf.h -+++ b/include/sys/zap_leaf.h -@@ -103,2 +103,3 @@ typedef struct zap_leaf_phys { - struct zap_leaf_header { -+ /* Public to ZAP */ - uint64_t lh_block_type; /* ZBT_LEAF */ -@@ -111,4 +112,3 @@ typedef struct zap_leaf_phys { - --/* above is accessable to zap, below is zap_leaf private */ -- -+ /* Private to zap_leaf */ - uint16_t lh_freelist; /* chunk head of free list */ -@@ -163,3 +163,3 @@ typedef struct zap_leaf { - typedef struct zap_entry_handle { -- /* below is set by zap_leaf.c and is public to zap.c */ -+ /* Set by zap_leaf and public to ZAP */ - uint64_t zeh_num_integers; -@@ -169,3 +169,3 @@ typedef struct zap_entry_handle { - -- /* below is private to zap_leaf.c */ -+ /* Private to zap_leaf */ - uint16_t zeh_fakechunk; -@@ -204,3 +204,3 @@ extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, - * -- * zap_entry_update may fail if it runs out of space (ENOSPC). -+ * May fail if it runs out of space (ENOSPC). - */ -@@ -223,6 +223,3 @@ extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, - --/* -- * Return true if there are additional entries with the same normalized -- * form. -- */ -+/* Determine whether there is another entry with the same normalized form. */ - extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, -diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h -index 481e85b..1a081e4 100644 ---- a/include/sys/zfeature.h -+++ b/include/sys/zfeature.h -@@ -28,3 +28,2 @@ - --#include - #include -@@ -36,10 +35,14 @@ extern "C" { - --extern boolean_t feature_is_supported(objset_t *os, uint64_t obj, -+struct spa; -+struct dmu_tx; -+struct objset; -+ -+extern boolean_t feature_is_supported(struct objset *os, uint64_t obj, - uint64_t desc_obj, nvlist_t *unsup_feat, nvlist_t *enabled_feat); - --struct spa; --extern void spa_feature_create_zap_objects(struct spa *, dmu_tx_t *); --extern void spa_feature_enable(struct spa *, zfeature_info_t *, dmu_tx_t *); --extern void spa_feature_incr(struct spa *, zfeature_info_t *, dmu_tx_t *); --extern void spa_feature_decr(struct spa *, zfeature_info_t *, dmu_tx_t *); -+extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *); -+extern void spa_feature_enable(struct spa *, zfeature_info_t *, -+ struct dmu_tx *); -+extern void spa_feature_incr(struct spa *, zfeature_info_t *, struct dmu_tx *); -+extern void spa_feature_decr(struct spa *, zfeature_info_t *, struct dmu_tx *); - extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *); -diff --git a/include/sys/zfs_acl.h b/include/sys/zfs_acl.h -index 11fc335..2c51f09 100644 ---- a/include/sys/zfs_acl.h -+++ b/include/sys/zfs_acl.h -@@ -49,3 +49,4 @@ struct znode_phys; - /* -- * ZFS ACLs are store in various forms. -+ * ZFS ACLs (Access Control Lists) are stored in various forms. 
-+ * - * Files created with ACL version ZFS_ACL_VERSION_INITIAL -@@ -139,4 +140,4 @@ typedef struct acl_ops { - int (*ace_mask_off)(void); /* off of access mask in ace */ -+ /* ptr to data if any */ - int (*ace_data)(void *acep, void **datap); -- /* ptr to data if any */ - } acl_ops_t; -diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h -index 4b34259..fa12cea 100644 ---- a/include/sys/zfs_context.h -+++ b/include/sys/zfs_context.h -@@ -27,3 +27,3 @@ - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -61,3 +61,5 @@ - #include -+#include - #include -+#include - #include -@@ -98,2 +100,4 @@ - #include -+#include -+#include - #include -@@ -119,3 +123,3 @@ - --#define noinline __attribute__((noinline)) -+#define noinline __attribute__((noinline)) - -@@ -151,2 +155,3 @@ extern void vpanic(const char *, __va_list); - -+#ifdef __sun - /* -@@ -159,4 +164,5 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE --#define DTRACE_PROBE(a) ((void)0) - #endif /* DTRACE_PROBE */ -+#define DTRACE_PROBE(a) \ -+ ZFS_PROBE0(#a) - -@@ -164,4 +170,5 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE1 --#define DTRACE_PROBE1(a, b, c) ((void)0) - #endif /* DTRACE_PROBE1 */ -+#define DTRACE_PROBE1(a, b, c) \ -+ ZFS_PROBE1(#a, (unsigned long)c) - -@@ -169,4 +176,5 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE2 --#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) - #endif /* DTRACE_PROBE2 */ -+#define DTRACE_PROBE2(a, b, c, d, e) \ -+ ZFS_PROBE2(#a, (unsigned long)c, (unsigned long)e) - -@@ -174,4 +182,5 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE3 --#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) - #endif /* DTRACE_PROBE3 */ -+#define DTRACE_PROBE3(a, b, c, d, e, f, g) \ -+ ZFS_PROBE3(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g) - -@@ -179,6 +188,19 @@ extern void vpanic(const char *, __va_list); - #undef DTRACE_PROBE4 --#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) - #endif /* DTRACE_PROBE4 */ -+#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) \ -+ ZFS_PROBE4(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g, \ -+ (unsigned long)i) - - /* -+ * We use the comma operator so that this macro can be used without much -+ * additional code. For example, "return (EINVAL);" becomes -+ * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated -+ * twice, so it should not have side effects (e.g. something like: -+ * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice). 
-+ */ -+#define SET_ERROR(err) (ZFS_SET_ERROR(err), err) -+#else -+#define SET_ERROR(err) (err) -+#endif -+/* - * Threads -@@ -205,2 +227,4 @@ typedef pthread_t kt_did_t; - -+#define kpreempt(x) ((void)0) -+ - typedef struct kthread { -@@ -211,5 +235,4 @@ typedef struct kthread { - --#define tsd_get(key) pthread_getspecific(key) --#define tsd_set(key, val) pthread_setspecific(key, val) - #define curthread zk_thread_current() -+#define getcomm() "unknown" - #define thread_exit zk_thread_exit -@@ -217,5 +240,5 @@ typedef struct kthread { - zk_thread_create(stk, stksize, (thread_func_t)func, arg, \ -- len, NULL, state, pri, PTHREAD_CREATE_DETACHED) -+ len, NULL, state, pri, PTHREAD_CREATE_DETACHED) - #define thread_join(t) zk_thread_join(t) --#define newproc(f,a,cid,pri,ctp,pid) (ENOSYS) -+#define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) - -@@ -250,3 +273,3 @@ typedef struct kmutex { - #define MUTEX_DEFAULT 0 --#define MUTEX_HELD(m) ((m)->m_owner == curthread) -+#define MUTEX_HELD(m) ((m)->m_owner == curthread) - #define MUTEX_NOT_HELD(m) (!MUTEX_HELD(m)) -@@ -280,3 +303,3 @@ typedef int krw_t; - #define RW_WRITER 1 --#define RW_DEFAULT RW_READER -+#define RW_DEFAULT RW_READER - -@@ -286,2 +309,8 @@ typedef int krw_t; - -+#undef RW_LOCK_HELD -+#define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) -+ -+#undef RW_LOCK_HELD -+#define RW_LOCK_HELD(x) (RW_READ_HELD(x) || RW_WRITE_HELD(x)) -+ - extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); -@@ -295,2 +324,3 @@ extern void rw_exit(krwlock_t *rwlp); - extern uid_t crgetuid(cred_t *cr); -+extern uid_t crgetruid(cred_t *cr); - extern gid_t crgetgid(cred_t *cr); -@@ -302,3 +332,3 @@ extern gid_t *crgetgroups(cred_t *cr); - */ --#define CV_MAGIC 0xd31ea9a83b1b30c4ull -+#define CV_MAGIC 0xd31ea9a83b1b30c4ull - -@@ -315,7 +345,25 @@ extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); - extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); -+extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, -+ hrtime_t res, int flag); - extern void cv_signal(kcondvar_t *cv); - extern void cv_broadcast(kcondvar_t *cv); --#define cv_timedwait_interruptible(cv, mp, at) cv_timedwait(cv, mp, at) --#define cv_wait_interruptible(cv, mp) cv_wait(cv, mp) --#define cv_wait_io(cv, mp) cv_wait(cv, mp) -+#define cv_timedwait_interruptible(cv, mp, at) cv_timedwait(cv, mp, at) -+#define cv_wait_interruptible(cv, mp) cv_wait(cv, mp) -+#define cv_wait_io(cv, mp) cv_wait(cv, mp) -+ -+/* -+ * Thread-specific data -+ */ -+#define tsd_get(k) pthread_getspecific(k) -+#define tsd_set(k, v) pthread_setspecific(k, v) -+#define tsd_create(kp, d) pthread_key_create(kp, d) -+#define tsd_destroy(kp) /* nothing */ -+ -+/* -+ * Thread-specific data -+ */ -+#define tsd_get(k) pthread_getspecific(k) -+#define tsd_set(k, v) pthread_setspecific(k, v) -+#define tsd_create(kp, d) pthread_key_create(kp, d) -+#define tsd_destroy(kp) /* nothing */ - -@@ -324,6 +372,16 @@ extern void cv_broadcast(kcondvar_t *cv); - */ --extern kstat_t *kstat_create(char *, int, -- char *, char *, uchar_t, ulong_t, uchar_t); -+extern kstat_t *kstat_create(const char *, int, -+ const char *, const char *, uchar_t, ulong_t, uchar_t); - extern void kstat_install(kstat_t *); - extern void kstat_delete(kstat_t *); -+extern void kstat_waitq_enter(kstat_io_t *); -+extern void kstat_waitq_exit(kstat_io_t *); -+extern void kstat_runq_enter(kstat_io_t *); -+extern void kstat_runq_exit(kstat_io_t *); -+extern void kstat_waitq_to_runq(kstat_io_t *); -+extern 
void kstat_runq_back_to_waitq(kstat_io_t *); -+extern void kstat_set_raw_ops(kstat_t *ksp, -+ int (*headers)(char *buf, size_t size), -+ int (*data)(char *buf, size_t size, void *data), -+ void *(*addr)(kstat_t *ksp, loff_t index)); - -@@ -593,3 +651,3 @@ extern char *kmem_vasprintf(const char *fmt, va_list adx); - extern char *kmem_asprintf(const char *fmt, ...); --#define strfree(str) kmem_free((str), strlen(str)+1) -+#define strfree(str) kmem_free((str), strlen(str) + 1) - -@@ -655,2 +713,11 @@ void ksiddomain_rele(ksiddomain_t *); - -+#define zfs_sleep_until(wakeup) \ -+ do { \ -+ hrtime_t delta = wakeup - gethrtime(); \ -+ struct timespec ts; \ -+ ts.tv_sec = delta / NANOSEC; \ -+ ts.tv_nsec = delta % NANOSEC; \ -+ (void) nanosleep(&ts, NULL); \ -+ } while (0) -+ - #endif /* _KERNEL */ -diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h -index 591d0df..e512079 100644 ---- a/include/sys/zfs_debug.h -+++ b/include/sys/zfs_debug.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -27,2 +28,6 @@ - -+#ifdef __cplusplus -+extern "C" { -+#endif -+ - #ifndef TRUE -@@ -38,2 +43,3 @@ - */ -+ - #if !defined(ZFS_DEBUG) && !defined(_KERNEL) -@@ -45,7 +51,9 @@ extern int zfs_recover; - --#define ZFS_DEBUG_DPRINTF 0x0001 --#define ZFS_DEBUG_DBUF_VERIFY 0x0002 --#define ZFS_DEBUG_DNODE_VERIFY 0x0004 --#define ZFS_DEBUG_SNAPNAMES 0x0008 --#define ZFS_DEBUG_MODIFY 0x0010 -+#define ZFS_DEBUG_DPRINTF (1<<0) -+#define ZFS_DEBUG_DBUF_VERIFY (1<<1) -+#define ZFS_DEBUG_DNODE_VERIFY (1<<2) -+#define ZFS_DEBUG_SNAPNAMES (1<<3) -+#define ZFS_DEBUG_MODIFY (1<<4) -+#define ZFS_DEBUG_SPA (1<<5) -+#define ZFS_DEBUG_ZIO_FREE (1<<6) - -@@ -69,3 +77,3 @@ extern int zfs_recover; - #else --#define dprintf(...) \ -+#define dprintf(...) \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) \ -@@ -75,6 +83,26 @@ extern int zfs_recover; - --void zfs_panic_recover(const char *fmt, ...); --#define zfs_dbgmsg(...) dprintf(__VA_ARGS__) --void zfs_dbgmsg_init(void); --void zfs_dbgmsg_fini(void); -+extern void zfs_panic_recover(const char *fmt, ...); -+ -+typedef struct zfs_dbgmsg { -+ list_node_t zdm_node; -+ time_t zdm_timestamp; -+ char zdm_msg[1]; /* variable length allocation */ -+} zfs_dbgmsg_t; -+ -+extern void zfs_dbgmsg_init(void); -+extern void zfs_dbgmsg_fini(void); -+#if defined(_KERNEL) && defined(__linux__) -+#define zfs_dbgmsg(...) dprintf(__VA_ARGS__) -+#else -+extern void zfs_dbgmsg(const char *fmt, ...); -+extern void zfs_dbgmsg_print(const char *tag); -+#endif -+ -+#ifndef _KERNEL -+extern int dprintf_find_string(const char *string); -+#endif -+ -+#ifdef __cplusplus -+} -+#endif - -diff --git a/include/sys/zfs_delay.h b/include/sys/zfs_delay.h -new file mode 100644 -index 0000000..b56a7da ---- /dev/null -+++ b/include/sys/zfs_delay.h -@@ -0,0 +1,41 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+#ifndef _SYS_FS_ZFS_DELAY_H -+#define _SYS_FS_ZFS_DELAY_H -+ -+#include -+ -+/* -+ * Generic wrapper to sleep until a given time. -+ */ -+#define zfs_sleep_until(wakeup) \ -+ do { \ -+ hrtime_t delta = wakeup - gethrtime(); \ -+ \ -+ if (delta > 0) { \ -+ unsigned long delta_us; \ -+ delta_us = delta / (NANOSEC / MICROSEC); \ -+ usleep_range(delta_us, delta_us + 100); \ -+ } \ -+ } while (0) -+ -+#endif /* _SYS_FS_ZFS_DELAY_H */ -diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h -index c0cb470..0ab095c 100644 ---- a/include/sys/zfs_ioctl.h -+++ b/include/sys/zfs_ioctl.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -43,2 +44,11 @@ extern "C" { - /* -+ * The structures in this file are passed between userland and the -+ * kernel. Userland may be running a 32-bit process, while the kernel -+ * is 64-bit. Therefore, these structures need to compile the same in -+ * 32-bit and 64-bit. This means not using type "long", and adding -+ * explicit padding so that the 32-bit structure will not be packed more -+ * tightly than the 64-bit structure (which requires 64-bit alignment). -+ */ -+ -+/* - * Property values for snapdir -@@ -53,2 +63,7 @@ extern "C" { - #define ZFS_SNAPDEV_VISIBLE 1 -+/* -+ * Property values for acltype -+ */ -+#define ZFS_ACLTYPE_OFF 0 -+#define ZFS_ACLTYPE_POSIXACL 1 - -@@ -246,2 +261,3 @@ typedef struct zinject_record { - -+#define ZEVENT_NONE 0x0 - #define ZEVENT_NONBLOCK 0x1 -@@ -249,2 +265,5 @@ typedef struct zinject_record { - -+#define ZEVENT_SEEK_START 0 -+#define ZEVENT_SEEK_END UINT64_MAX -+ - typedef enum zinject_type { -@@ -279,6 +298,17 @@ typedef enum zfs_case { - typedef struct zfs_cmd { -- char zc_name[MAXPATHLEN]; -+ char zc_name[MAXPATHLEN]; /* name of pool or dataset */ -+ uint64_t zc_nvlist_src; /* really (char *) */ -+ uint64_t zc_nvlist_src_size; -+ uint64_t zc_nvlist_dst; /* really (char *) */ -+ uint64_t zc_nvlist_dst_size; -+ boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ -+ int zc_pad2; -+ -+ /* -+ * The following members are for legacy ioctls which haven't been -+ * converted to the new method. 
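The zfs_ioctl.h comment above explains the 32/64-bit compatibility rules for structures crossing the user/kernel boundary: no long, fixed-width members, explicit padding, and userland pointers carried in uint64_t fields (the "really (char *)" idiom). The standalone fragment below demonstrates that pattern with an invented structure; only the technique reflects the comment, and all field names are hypothetical.

    #include <stdint.h>

    /* Invented example of a 32/64-bit clean ioctl argument block. */
    typedef struct demo_cmd {
            char            dc_name[256];
            uint64_t        dc_nvlist_src;          /* really (char *) */
            uint64_t        dc_nvlist_src_size;
            uint32_t        dc_flags;
            uint32_t        dc_pad;                 /* keep 64-bit alignment */
            uint64_t        dc_cookie;
    } demo_cmd_t;

    /* Same size whether built for a 32-bit or a 64-bit userland. */
    _Static_assert(sizeof (demo_cmd_t) == 256 + 4 * 8,
        "demo_cmd_t layout must not depend on the ABI");

    static void
    demo_cmd_set_src(demo_cmd_t *dc, void *user_buf, uint64_t len)
    {
            dc->dc_nvlist_src = (uint64_t)(uintptr_t)user_buf;
            dc->dc_nvlist_src_size = len;
    }

    int
    main(void)
    {
            demo_cmd_t dc = { { 0 } };
            char buf[16];

            demo_cmd_set_src(&dc, buf, sizeof (buf));
            return (dc.dc_nvlist_src_size != sizeof (buf));
    }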
-+ */ -+ uint64_t zc_history; /* really (char *) */ - char zc_value[MAXPATHLEN * 2]; - char zc_string[MAXNAMELEN]; -- char zc_top_ds[MAXPATHLEN]; - uint64_t zc_guid; -@@ -286,6 +316,2 @@ typedef struct zfs_cmd { - uint64_t zc_nvlist_conf_size; -- uint64_t zc_nvlist_src; /* really (char *) */ -- uint64_t zc_nvlist_src_size; -- uint64_t zc_nvlist_dst; /* really (char *) */ -- uint64_t zc_nvlist_dst_size; - uint64_t zc_cookie; -@@ -293,4 +319,3 @@ typedef struct zfs_cmd { - uint64_t zc_perm_action; -- uint64_t zc_history; /* really (char *) */ -- uint64_t zc_history_len; -+ uint64_t zc_history_len; - uint64_t zc_history_offset; -@@ -337,3 +362,6 @@ extern int zfs_secpolicy_rename_perms(const char *from, - extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); --extern int zfs_unmount_snap(const char *, void *); -+extern int zfs_unmount_snap(const char *); -+extern void zfs_destroy_unmount_origin(const char *); -+ -+extern boolean_t dataset_name_hidden(const char *name); - -@@ -346,3 +374,3 @@ enum zfsdev_state_type { - typedef struct zfsdev_state { -- list_node_t zs_next; /* next zfsdev_state_t link */ -+ list_node_t zs_next; /* next zfsdev_state_t link */ - struct file *zs_file; /* associated file struct */ -diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h -index da18b1f..ea5e403 100644 ---- a/include/sys/zfs_rlock.h -+++ b/include/sys/zfs_rlock.h -@@ -28,4 +28,2 @@ - -- -- - #ifdef __cplusplus -@@ -60,6 +58,6 @@ typedef struct rl { - /* -- * Lock a range (offset, length) as either shared (READER) -- * or exclusive (WRITER or APPEND). APPEND is a special type that -- * is converted to WRITER that specified to lock from the start of the -- * end of file. zfs_range_lock() returns the range lock structure. -+ * Lock a range (offset, length) as either shared (RL_READER) -+ * or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that -+ * is converted to RL_WRITER that specified to lock from the start of the -+ * end of file. Returns the range lock structure. - */ -@@ -67,5 +65,3 @@ rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); - --/* -- * Unlock range and destroy range lock structure. -- */ -+/* Unlock range and destroy range lock structure. */ - void zfs_range_unlock(rl_t *rl); -@@ -79,3 +75,4 @@ void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); - /* -- * AVL comparison function used to compare range locks -+ * AVL comparison function used to order range locks -+ * Locks are ordered on the start offset of the range. 
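zfs_rlock.h above tidies the range-lock comments: ranges are locked shared (RL_READER) or exclusive (RL_WRITER/RL_APPEND) and ordered in the AVL tree by start offset. A small usage sketch of the lock/unlock pair follows, using the prototypes exactly as declared above; the helper around them is hypothetical and the real work in the middle is elided.

    /* Hypothetical helper; zfs_range_lock()/zfs_range_unlock() are as above. */
    static void
    overwrite_range(znode_t *zp, uint64_t off, uint64_t len)
    {
            rl_t *rl;

            /* Exclusive lock over just the byte range being rewritten. */
            rl = zfs_range_lock(zp, off, len, RL_WRITER);

            /* ... dirty and update the blocks backing [off, off + len) ... */

            zfs_range_unlock(rl);
    }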
- */ -diff --git a/include/sys/zfs_sa.h b/include/sys/zfs_sa.h -index 0bac780..735d4b3 100644 ---- a/include/sys/zfs_sa.h -+++ b/include/sys/zfs_sa.h -@@ -130,4 +130,4 @@ typedef struct znode_phys { - --#define DXATTR_MAX_ENTRY_SIZE (32768) --#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1) -+#define DXATTR_MAX_ENTRY_SIZE (32768) -+#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1) - -diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h -index f685c12..eeeffbe 100644 ---- a/include/sys/zfs_vfsops.h -+++ b/include/sys/zfs_vfsops.h -@@ -62,2 +62,3 @@ typedef struct zfs_sb { - uint_t z_acl_inherit; /* acl inheritance behavior */ -+ uint_t z_acl_type; /* type of ACL usable on this FS */ - zfs_case_t z_case; /* case-sense */ -@@ -66,2 +67,3 @@ typedef struct zfs_sb { - boolean_t z_atime; /* enable atimes mount option */ -+ boolean_t z_relatime; /* enable relatime mount option */ - boolean_t z_unmounted; /* unmounted */ -@@ -71,3 +73,3 @@ typedef struct zfs_sb { - uint64_t z_nr_znodes; /* number of znodes in the fs */ -- unsigned long z_rollback_time;/* last online rollback time */ -+ unsigned long z_rollback_time; /* last online rollback time */ - kmutex_t z_znodes_lock; /* lock for z_all_znodes */ -@@ -103,3 +105,3 @@ typedef struct zfs_sb { - */ --#define ZFS_LINK_MAX ((1U << 31) - 1U) -+#define ZFS_LINK_MAX ((1U << 31) - 1U) - -diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h -index bdddcc3..a020068 100644 ---- a/include/sys/zfs_znode.h -+++ b/include/sys/zfs_znode.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -140,4 +141,5 @@ extern "C" { - --/* Path component length */ - /* -+ * Path component length -+ * - * The generic fs code uses MAXNAMELEN to represent -@@ -205,4 +207,4 @@ typedef struct znode { - uint64_t z_gid; /* gid fuid (cached) */ -- mode_t z_mode; /* mode (cached) */ - uint32_t z_sync_cnt; /* synchronous open count */ -+ mode_t z_mode; /* mode (cached) */ - kmutex_t z_acl_lock; /* acl data lock */ -@@ -210,4 +212,4 @@ typedef struct znode { - krwlock_t z_xattr_lock; /* xattr data lock */ -- nvlist_t *z_xattr_cached;/* cached xattrs */ -- struct znode *z_xattr_parent;/* xattr parent znode */ -+ nvlist_t *z_xattr_cached; /* cached xattrs */ -+ struct znode *z_xattr_parent; /* xattr parent znode */ - list_node_t z_link_node; /* all znodes in fs link */ -@@ -249,10 +251,6 @@ typedef struct znode { - --/* -- * ZFS_ENTER() is called on entry to each ZFS inode and vfs operation. -- * ZFS_EXIT() must be called before exitting the vop. -- * ZFS_VERIFY_ZP() verifies the znode is valid. 
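The znode hunk above re-documents the ZFS_ENTER/ZFS_EXIT/ZFS_VERIFY_ZP bracketing and switches ZFS_ENTER to rrw_enter_read() on the teardown lock. The skeleton below shows the conventional bracketing inside an operation; the operation itself is invented, and the ITOZ()/ITOZSB() conversions are assumed to be the usual Linux-port helpers rather than taken from this hunk.

    /* Hypothetical operation showing the macro bracketing. */
    static int
    zfs_demo_getsize(struct inode *ip, uint64_t *sizep)
    {
            znode_t *zp = ITOZ(ip);
            zfs_sb_t *zsb = ITOZSB(ip);

            ZFS_ENTER(zsb);         /* blocks teardown; fails if unmounted */
            ZFS_VERIFY_ZP(zp);      /* the znode must still be valid */

            *sizep = zp->z_size;

            ZFS_EXIT(zsb);
            return (0);
    }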
-- */ -+/* Called on entry to each ZFS vnode and vfs operation */ - #define ZFS_ENTER(zsb) \ - { \ -- rrw_enter(&(zsb)->z_teardown_lock, RW_READER, FTAG); \ -+ rrw_enter_read(&(zsb)->z_teardown_lock, FTAG); \ - if ((zsb)->z_unmounted) { \ -@@ -263,2 +261,3 @@ typedef struct znode { - -+/* Must be called before exiting the vop */ - #define ZFS_EXIT(zsb) \ -@@ -269,2 +268,3 @@ typedef struct znode { - -+/* Verifies the znode is valid */ - #define ZFS_VERIFY_ZP(zp) \ -@@ -290,5 +290,3 @@ typedef struct znode { - --/* -- * Macros to encode/decode ZFS stored time values from/to struct timespec -- */ -+/* Encode ZFS stored time values from a struct timespec */ - #define ZFS_TIME_ENCODE(tp, stmp) \ -@@ -299,2 +297,3 @@ typedef struct znode { - -+/* Decode ZFS stored time values to a struct timespec */ - #define ZFS_TIME_DECODE(tp, stmp) \ -@@ -356,3 +355,4 @@ extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, -- znode_t *zp, offset_t off, ssize_t len, int ioflag); -+ znode_t *zp, offset_t off, ssize_t len, int ioflag, -+ zil_callback_t callback, void *callback_data); - extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, -diff --git a/include/sys/zil.h b/include/sys/zil.h -index 589e28f..4000742 100644 ---- a/include/sys/zil.h -+++ b/include/sys/zil.h -@@ -244,2 +244,8 @@ typedef struct { - * would be zero. -+ * -+ * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's. -+ * If create is also setting xvattr's, then acl data follows xvattr. -+ * If ACE FUIDs are needed then they will follow the xvattr_t. Following -+ * the FUIDs will be the domain table information. The FUIDs for the owner -+ * and group will be in lr_create. Name follows ACL data. - */ -@@ -252,9 +258,2 @@ typedef struct { - uint64_t lr_acl_flags; /* ACL flags */ -- /* lr_acl_bytes number of variable sized ace's follows */ -- /* if create is also setting xvattr's, then acl data follows xvattr */ -- /* if ACE FUIDs are needed then they will follow the xvattr_t */ -- /* Following the FUIDs will be the domain table information. */ -- /* The FUIDs for the owner and group will be in the lr_create */ -- /* portion of the record. */ -- /* name follows ACL data */ - } lr_acl_create_t; -@@ -364,2 +363,4 @@ typedef enum { - -+typedef void (*zil_callback_t)(void *data); -+ - typedef struct itx { -@@ -369,2 +370,4 @@ typedef struct itx { - uint8_t itx_sync; /* synchronous transaction */ -+ zil_callback_t itx_callback; /* Called when the itx is persistent */ -+ void *itx_callback_data; /* User data for the callback */ - uint64_t itx_sod; /* record size on disk */ -@@ -429,5 +432,5 @@ extern zil_stats_t zil_stats; - --#define ZIL_STAT_INCR(stat, val) \ -+#define ZIL_STAT_INCR(stat, val) \ - atomic_add_64(&zil_stats.stat.value.ui64, (val)); --#define ZIL_STAT_BUMP(stat) \ -+#define ZIL_STAT_BUMP(stat) \ - ZIL_STAT_INCR(stat, 1); -@@ -472,4 +475,4 @@ extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); - --extern int zil_suspend(zilog_t *zilog); --extern void zil_resume(zilog_t *zilog); -+extern int zil_suspend(const char *osname, void **cookiep); -+extern void zil_resume(void *cookie); - -diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h -index f5b69b7..0db4b52 100644 ---- a/include/sys/zil_impl.h -+++ b/include/sys/zil_impl.h -@@ -43,3 +43,3 @@ typedef struct lwb { - blkptr_t lwb_blk; /* on disk address of this log blk */ -- boolean_t lwb_fastwrite; /* is blk marked for fastwrite? 
*/ -+ boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ - int lwb_nused; /* # used bytes in buffer */ -diff --git a/include/sys/zio.h b/include/sys/zio.h -index 189966b..129e2bc 100644 ---- a/include/sys/zio.h -+++ b/include/sys/zio.h -@@ -24,3 +24,3 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -126,3 +126,3 @@ enum zio_compress { - */ --#define ZIO_DELAY_MAX (30 * MILLISEC) -+#define ZIO_DELAY_MAX (30 * MILLISEC) - -@@ -132,15 +132,12 @@ enum zio_compress { - --#define ZIO_PRIORITY_NOW (zio_priority_table[0]) --#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) --#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) --#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3]) --#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4]) --#define ZIO_PRIORITY_AGG (zio_priority_table[5]) --#define ZIO_PRIORITY_FREE (zio_priority_table[6]) --#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7]) --#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8]) --#define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) --#define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) --#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11]) --#define ZIO_PRIORITY_TABLE_SIZE 12 -+typedef enum zio_priority { -+ ZIO_PRIORITY_SYNC_READ, -+ ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ -+ ZIO_PRIORITY_ASYNC_READ, /* prefetch */ -+ ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ -+ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ -+ ZIO_PRIORITY_NUM_QUEUEABLE, -+ -+ ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ -+} zio_priority_t; - -@@ -198,3 +195,6 @@ enum zio_flag { - ZIO_FLAG_GODFATHER = 1 << 24, -- ZIO_FLAG_FASTWRITE = 1 << 25 -+ ZIO_FLAG_NOPWRITE = 1 << 25, -+ ZIO_FLAG_REEXECUTED = 1 << 26, -+ ZIO_FLAG_DELEGATED = 1 << 27, -+ ZIO_FLAG_FASTWRITE = 1 << 28 - }; -@@ -238,4 +238,3 @@ typedef void zio_done_func_t(zio_t *zio); - --extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; --extern char *zio_type_name[ZIO_TYPES]; -+extern const char *zio_type_name[ZIO_TYPES]; - -@@ -258,3 +257,3 @@ extern char *zio_type_name[ZIO_TYPES]; - */ --typedef struct zbookmark { -+struct zbookmark { - uint64_t zb_objset; -@@ -263,3 +262,3 @@ typedef struct zbookmark { - uint64_t zb_blkid; --} zbookmark_t; -+}; - -@@ -296,4 +295,5 @@ typedef struct zio_prop { - uint8_t zp_copies; -- uint8_t zp_dedup; -- uint8_t zp_dedup_verify; -+ boolean_t zp_dedup; -+ boolean_t zp_dedup_verify; -+ boolean_t zp_nopwrite; - } zio_prop_t; -@@ -378,3 +378,3 @@ struct zio { - int io_cmd; -- uint8_t io_priority; -+ zio_priority_t io_priority; - uint8_t io_reexecute; -@@ -393,3 +393,4 @@ struct zio { - /* Callback info */ -- zio_done_func_t *io_ready; -+ zio_done_func_t *io_ready; -+ zio_done_func_t *io_physdone; - zio_done_func_t *io_done; -@@ -411,3 +412,2 @@ struct zio { - uint64_t io_offset; -- uint64_t io_deadline; /* expires at timestamp + deadline */ - hrtime_t io_timestamp; /* submitted at */ -@@ -415,5 +415,3 @@ struct zio { - uint64_t io_delay; /* vdev disk service delta (ticks) */ -- avl_node_t io_offset_node; -- avl_node_t io_deadline_node; -- avl_tree_t *io_vdev_tree; -+ avl_node_t io_queue_node; - -@@ -430,2 +428,3 @@ struct zio { - uint64_t io_child_count; -+ uint64_t io_phys_children; - uint64_t io_parent_count; -@@ -455,3 +454,3 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, - uint64_t size, 
zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, const zbookmark_t *zb); -+ zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb); - -@@ -459,4 +458,5 @@ extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, -- zio_done_func_t *ready, zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, const zbookmark_t *zb); -+ zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, -+ void *private, -+ zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb); - -@@ -464,5 +464,6 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, zbookmark_t *zb); -+ zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb); - --extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies); -+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, -+ boolean_t nopwrite); - -@@ -475,3 +476,3 @@ extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, - extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, -- zio_done_func_t *done, void *private, int priority, enum zio_flag flags); -+ zio_done_func_t *done, void *private, enum zio_flag flags); - -@@ -479,4 +480,4 @@ extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, -- zio_done_func_t *done, void *private, int priority, enum zio_flag flags, -- boolean_t labels); -+ zio_done_func_t *done, void *private, zio_priority_t priority, -+ enum zio_flag flags, boolean_t labels); - -@@ -484,4 +485,4 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, -- zio_done_func_t *done, void *private, int priority, enum zio_flag flags, -- boolean_t labels); -+ zio_done_func_t *done, void *private, zio_priority_t priority, -+ enum zio_flag flags, boolean_t labels); - -@@ -516,7 +517,8 @@ extern void zio_resubmit_stage_async(void *); - extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, -- uint64_t offset, void *data, uint64_t size, int type, int priority, -- enum zio_flag flags, zio_done_func_t *done, void *private); -+ uint64_t offset, void *data, uint64_t size, int type, -+ zio_priority_t priority, enum zio_flag flags, -+ zio_done_func_t *done, void *private); - - extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, -- void *data, uint64_t size, int type, int priority, -+ void *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private); -diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h -index bd051f1..63863c7 100644 ---- a/include/sys/zio_compress.h -+++ b/include/sys/zio_compress.h -@@ -35,7 +35,6 @@ extern "C" { - --/* -- * Common signature for all zio compress/decompress functions. -- */ -+/* Common signature for all zio compress functions. */ - typedef size_t zio_compress_func_t(void *src, void *dst, - size_t s_len, size_t d_len, int); -+/* Common signature for all zio decompress functions. 
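The compress/decompress typedefs above are what let ZFS keep its compressors behind a uniform table; a hypothetical table entry built from the lz4 functions declared in this header could look like the following (the struct and field names are illustrative, not the actual table type used by ZFS).

/* Illustrative only; assumes sys/zio_compress.h. */
typedef struct compress_ops {
    zio_compress_func_t   *co_compress;
    zio_decompress_func_t *co_decompress;
    int                    co_level;
    const char            *co_name;
} compress_ops_t;

static const compress_ops_t lz4_ops = {
    .co_compress   = lz4_compress_zfs,
    .co_decompress = lz4_decompress_zfs,
    .co_level      = 0,
    .co_name       = "lz4",
};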
*/ - typedef int zio_decompress_func_t(void *src, void *dst, -@@ -76,5 +75,5 @@ extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); --extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len, -+extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, - int level); --extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len, -+extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, - int level); -diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h -index 2d062d0..08f8201 100644 ---- a/include/sys/zio_impl.h -+++ b/include/sys/zio_impl.h -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -40,2 +40,66 @@ extern "C" { - /* -+ * XXX -- Describe ZFS I/O pipeline here. Fill in as needed. -+ * -+ * The ZFS I/O pipeline is comprised of various stages which are defined -+ * in the zio_stage enum below. The individual stages are used to construct -+ * these basic I/O operations: Read, Write, Free, Claim, and Ioctl. -+ * -+ * I/O operations: (XXX - provide detail for each of the operations) -+ * -+ * Read: -+ * Write: -+ * Free: -+ * Claim: -+ * Ioctl: -+ * -+ * Although the most common pipeline are used by the basic I/O operations -+ * above, there are some helper pipelines (one could consider them -+ * sub-pipelines) which are used internally by the ZIO module and are -+ * explained below: -+ * -+ * Interlock Pipeline: -+ * The interlock pipeline is the most basic pipeline and is used by all -+ * of the I/O operations. The interlock pipeline does not perform any I/O -+ * and is used to coordinate the dependencies between I/Os that are being -+ * issued (i.e. the parent/child relationship). -+ * -+ * Vdev child Pipeline: -+ * The vdev child pipeline is responsible for performing the physical I/O. -+ * It is in this pipeline where the I/O are queued and possibly cached. -+ * -+ * In addition to performing I/O, the pipeline is also responsible for -+ * data transformations. The transformations performed are based on the -+ * specific properties that user may have selected and modify the -+ * behavior of the pipeline. Examples of supported transformations are -+ * compression, dedup, and nop writes. Transformations will either modify -+ * the data or the pipeline. This list below further describes each of -+ * the supported transformations: -+ * -+ * Compression: -+ * ZFS supports three different flavors of compression -- gzip, lzjb, and -+ * zle. Compression occurs as part of the write pipeline and is performed -+ * in the ZIO_STAGE_WRITE_BP_INIT stage. -+ * -+ * Dedup: -+ * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and -+ * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing -+ * read pipeline if the dedup bit is set on the block pointer. -+ * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage -+ * and added to a write pipeline if a user has enabled dedup on that -+ * particular dataset. -+ * -+ * NOP Write: -+ * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage -+ * and is added to an existing write pipeline if a crypographically -+ * secure checksum (i.e. SHA256) is enabled and compression is turned on. -+ * The NOP write stage will compare the checksums of the current data -+ * on-disk (level-0 blocks only) and the data that is currently being written. 
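A heavily simplified sketch of the nop-write test described above; the real ZIO_STAGE_NOP_WRITE handler performs further checks (matching checksum/compression settings, no dedup, level-0 blocks only), and checksum_is_strong() is a placeholder name.

/* Sketch only; assumes sys/spa.h and sys/zio.h. */
static boolean_t checksum_is_strong(enum zio_checksum c); /* placeholder */

static boolean_t
nop_write_possible(const blkptr_t *bp_orig, const blkptr_t *bp)
{
    /* Only safe with a cryptographically strong checksum, never with dedup. */
    if (!checksum_is_strong(BP_GET_CHECKSUM(bp)) || BP_GET_DEDUP(bp))
        return (B_FALSE);

    /* Identical checksums mean the copy already on disk can be reused. */
    return (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum) ?
        B_TRUE : B_FALSE);
}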
-+ * If the checksum values are identical then the pipeline is converted to -+ * an interlock pipeline skipping block allocation and bypassing the -+ * physical I/O. The nop write feature can handle writes in either -+ * syncing or open context (i.e. zil writes) and as a result is mutually -+ * exclusive with dedup. -+ */ -+ -+/* - * zio pipeline stage definitions -@@ -52,23 +116,25 @@ enum zio_stage { - -- ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */ -- ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */ -- ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */ -- ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */ -+ ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */ -+ -+ ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */ -+ ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */ -+ ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */ -+ ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */ - -- ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */ -- ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */ -+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */ -+ ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */ - -- ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */ -- ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */ -- ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */ -+ ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */ -+ ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */ -+ ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */ - -- ZIO_STAGE_READY = 1 << 15, /* RWFCI */ -+ ZIO_STAGE_READY = 1 << 16, /* RWFCI */ - -- ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */ -- ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */ -- ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */ -+ ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RW--I */ -+ ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RW--I */ -+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RW--I */ - -- ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ -+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */ - -- ZIO_STAGE_DONE = 1 << 20 /* RWFCI */ -+ ZIO_STAGE_DONE = 1 << 21 /* RWFCI */ - }; -@@ -149,3 +215,2 @@ enum zio_stage { - ZIO_STAGE_FREE_BP_INIT | \ -- ZIO_STAGE_ISSUE_ASYNC | \ - ZIO_STAGE_DVA_FREE) -diff --git a/include/sys/zpl.h b/include/sys/zpl.h -index 89cf824..56bd9ae 100644 ---- a/include/sys/zpl.h -+++ b/include/sys/zpl.h -@@ -34,2 +34,3 @@ - #include -+#include - -@@ -73,2 +74,32 @@ extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip, - const struct qstr *qstr); -+#if defined(CONFIG_FS_POSIX_ACL) -+extern int zpl_set_acl(struct inode *ip, int type, struct posix_acl *acl); -+extern struct posix_acl *zpl_get_acl(struct inode *ip, int type); -+#if !defined(HAVE_GET_ACL) -+#if defined(HAVE_CHECK_ACL_WITH_FLAGS) -+extern int zpl_check_acl(struct inode *inode, int mask, unsigned int flags); -+#elif defined(HAVE_CHECK_ACL) -+extern int zpl_check_acl(struct inode *inode, int mask); -+#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA) -+extern int zpl_permission(struct inode *ip, int mask, struct nameidata *nd); -+#elif defined(HAVE_PERMISSION) -+extern int zpl_permission(struct inode *ip, int mask); -+#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */ -+#endif /* HAVE_GET_ACL */ -+ -+extern int zpl_init_acl(struct inode *ip, struct inode *dir); -+extern int zpl_chmod_acl(struct inode *ip); -+#else -+static inline int -+zpl_init_acl(struct inode *ip, struct inode *dir) -+{ -+ return (0); -+} -+ -+static inline int -+zpl_chmod_acl(struct inode *ip) -+{ -+ return (0); -+} -+#endif /* CONFIG_FS_POSIX_ACL */ - -@@ -93,3 +124,3 @@ extern const struct inode_operations zpl_ops_shares; - --#define DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ -+#define DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ - .actor = _actor, 
\ -@@ -106,3 +137,3 @@ typedef struct dir_context { - --#define DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ -+#define DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ - .dirent = _dirent, \ -@@ -116,3 +147,4 @@ dir_emit(struct dir_context *ctx, const char *name, int namelen, - { -- return ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type) == 0; -+ return (ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type) -+ == 0); - } -@@ -122,4 +154,4 @@ dir_emit_dot(struct file *file, struct dir_context *ctx) - { -- return ctx->actor(ctx->dirent, ".", 1, ctx->pos, -- file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0; -+ return (ctx->actor(ctx->dirent, ".", 1, ctx->pos, -+ file->f_path.dentry->d_inode->i_ino, DT_DIR) == 0); - } -@@ -129,4 +161,4 @@ dir_emit_dotdot(struct file *file, struct dir_context *ctx) - { -- return ctx->actor(ctx->dirent, "..", 2, ctx->pos, -- parent_ino(file->f_path.dentry), DT_DIR) == 0; -+ return (ctx->actor(ctx->dirent, "..", 2, ctx->pos, -+ parent_ino(file->f_path.dentry), DT_DIR) == 0); - } -@@ -138,3 +170,3 @@ dir_emit_dots(struct file *file, struct dir_context *ctx) - if (!dir_emit_dot(file, ctx)) -- return false; -+ return (false); - ctx->pos = 1; -@@ -143,6 +175,6 @@ dir_emit_dots(struct file *file, struct dir_context *ctx) - if (!dir_emit_dotdot(file, ctx)) -- return false; -+ return (false); - ctx->pos = 2; - } -- return true; -+ return (true); - } -diff --git a/include/sys/zvol.h b/include/sys/zvol.h -index c05f81a..04e0996 100644 ---- a/include/sys/zvol.h -+++ b/include/sys/zvol.h -@@ -40,6 +40,7 @@ extern boolean_t zvol_is_zvol(const char *); - extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); --extern int zvol_create_minor(const char *); --extern int zvol_create_minors(const char *); --extern int zvol_remove_minor(const char *); --extern void zvol_remove_minors(const char *); -+extern int zvol_create_minor(const char *name); -+extern int zvol_create_minors(const char *name); -+extern int zvol_remove_minor(const char *name); -+extern void zvol_remove_minors(const char *name); -+extern void zvol_rename_minors(const char *oldname, const char *newname); - extern int zvol_set_volsize(const char *, uint64_t); -diff --git a/include/zfs_comutil.h b/include/zfs_comutil.h -index 61327f9..f890543 100644 ---- a/include/zfs_comutil.h -+++ b/include/zfs_comutil.h -@@ -22,2 +22,3 @@ - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -39,3 +40,4 @@ extern int zfs_zpl_version_map(int spa_version); - extern int zfs_spa_version_map(int zpl_version); --extern const char *zfs_history_event_names[LOG_END]; -+#define ZFS_NUM_LEGACY_HISTORY_EVENTS 41 -+extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS]; - -diff --git a/include/zpios-ctl.h b/include/zpios-ctl.h -index 6744ae6..82a7fdf 100644 ---- a/include/zpios-ctl.h -+++ b/include/zpios-ctl.h -@@ -1,2 +1,2 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. -@@ -31,8 +31,9 @@ - * with ZPIOS. If not, see . --\*****************************************************************************/ -+ */ - - #ifndef _ZPIOS_CTL_H --#define _ZPIOS_CTL_H -+#define _ZPIOS_CTL_H - --/* Contains shared definitions which both the userspace -+/* -+ * Contains shared definitions which both the userspace - * and kernelspace portions of zpios must agree on. 
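The dir_context/dir_emit compatibility shims above let one readdir implementation serve kernels with and without the newer ->iterate()/dir_context API; a sketch of the caller-side pattern, where struct my_dirent and my_next_entry() are hypothetical placeholders.

/* Sketch only; assumes the dir_context definitions above. */
struct my_dirent {
    const char *name;
    int         namelen;
    uint64_t    ino;
    unsigned    type;
};

static int my_next_entry(struct file *filp, loff_t pos,
    struct my_dirent *de);  /* placeholder: returns 0 when no entries remain */

static int
my_iterate(struct file *filp, struct dir_context *ctx)
{
    struct my_dirent de;

    /* Emit "." and ".." first; stop early if the caller's buffer is full. */
    if (!dir_emit_dots(filp, ctx))
        return (0);

    while (my_next_entry(filp, ctx->pos, &de)) {
        if (!dir_emit(ctx, de.name, de.namelen, de.ino, de.type))
            break;
        ctx->pos++;
    }

    return (0);
}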
-@@ -43,32 +44,32 @@ - --#define ZPIOS_MAJOR 232 /* XXX - Arbitrary */ --#define ZPIOS_MINORS 1 --#define ZPIOS_NAME "zpios" --#define ZPIOS_DEV "/dev/zpios" -- --#define DMU_IO 0x01 -- --#define DMU_WRITE 0x0001 --#define DMU_READ 0x0002 --#define DMU_VERIFY 0x0004 --#define DMU_REMOVE 0x0008 --#define DMU_FPP 0x0010 --#define DMU_WRITE_ZC 0x0020 /* Incompatible w/DMU_VERIFY */ --#define DMU_READ_ZC 0x0040 /* Incompatible w/DMU_VERIFY */ --#define DMU_WRITE_NOWAIT 0x0080 --#define DMU_READ_NOPF 0x0100 -- --#define ZPIOS_NAME_SIZE 16 --#define ZPIOS_PATH_SIZE 128 -- --#define PHASE_PRE_RUN "pre-run" --#define PHASE_PRE_CREATE "pre-create" --#define PHASE_PRE_WRITE "pre-write" --#define PHASE_PRE_READ "pre-read" --#define PHASE_PRE_REMOVE "pre-remove" --#define PHASE_POST_RUN "post-run" --#define PHASE_POST_CREATE "post-create" --#define PHASE_POST_WRITE "post-write" --#define PHASE_POST_READ "post-read" --#define PHASE_POST_REMOVE "post-remove" -+#define ZPIOS_MAJOR 232 /* XXX - Arbitrary */ -+#define ZPIOS_MINORS 1 -+#define ZPIOS_NAME "zpios" -+#define ZPIOS_DEV "/dev/zpios" -+ -+#define DMU_IO 0x01 -+ -+#define DMU_WRITE 0x0001 -+#define DMU_READ 0x0002 -+#define DMU_VERIFY 0x0004 -+#define DMU_REMOVE 0x0008 -+#define DMU_FPP 0x0010 -+#define DMU_WRITE_ZC 0x0020 /* Incompatible w/DMU_VERIFY */ -+#define DMU_READ_ZC 0x0040 /* Incompatible w/DMU_VERIFY */ -+#define DMU_WRITE_NOWAIT 0x0080 -+#define DMU_READ_NOPF 0x0100 -+ -+#define ZPIOS_NAME_SIZE 16 -+#define ZPIOS_PATH_SIZE 128 -+ -+#define PHASE_PRE_RUN "pre-run" -+#define PHASE_PRE_CREATE "pre-create" -+#define PHASE_PRE_WRITE "pre-write" -+#define PHASE_PRE_READ "pre-read" -+#define PHASE_PRE_REMOVE "pre-remove" -+#define PHASE_POST_RUN "post-run" -+#define PHASE_POST_CREATE "post-create" -+#define PHASE_POST_WRITE "post-write" -+#define PHASE_POST_READ "post-read" -+#define PHASE_POST_REMOVE "post-remove" - -@@ -119,4 +120,4 @@ typedef struct zpios_cmd { - uint32_t cmd_flags; /* Test flags */ -- char cmd_pre[ZPIOS_PATH_SIZE]; /* Pre-exec hook */ -- char cmd_post[ZPIOS_PATH_SIZE]; /* Post-exec hook */ -+ char cmd_pre[ZPIOS_PATH_SIZE]; /* Pre-exec hook */ -+ char cmd_post[ZPIOS_PATH_SIZE]; /* Post-exec hook */ - char cmd_log[ZPIOS_PATH_SIZE]; /* Requested log dir */ -@@ -127,11 +128,11 @@ typedef struct zpios_cmd { - /* Valid ioctls */ --#define ZPIOS_CFG _IOWR('f', 101, zpios_cfg_t) --#define ZPIOS_CMD _IOWR('f', 102, zpios_cmd_t) -+#define ZPIOS_CFG _IOWR('f', 101, zpios_cfg_t) -+#define ZPIOS_CMD _IOWR('f', 102, zpios_cmd_t) - - /* Valid configuration commands */ --#define ZPIOS_CFG_BUFFER_CLEAR 0x001 /* Clear text buffer */ --#define ZPIOS_CFG_BUFFER_SIZE 0x002 /* Resize text buffer */ -+#define ZPIOS_CFG_BUFFER_CLEAR 0x001 /* Clear text buffer */ -+#define ZPIOS_CFG_BUFFER_SIZE 0x002 /* Resize text buffer */ - - #ifndef NSEC_PER_SEC --#define NSEC_PER_SEC 1000000000L -+#define NSEC_PER_SEC 1000000000L - #endif -@@ -139,3 +140,4 @@ typedef struct zpios_cmd { - static inline --void zpios_timespec_normalize(zpios_timespec_t *ts, uint32_t sec, uint32_t nsec) -+void -+zpios_timespec_normalize(zpios_timespec_t *ts, uint32_t sec, uint32_t nsec) - { -@@ -154,3 +156,4 @@ void zpios_timespec_normalize(zpios_timespec_t *ts, uint32_t sec, uint32_t nsec) - static inline --zpios_timespec_t zpios_timespec_add(zpios_timespec_t lhs, zpios_timespec_t rhs) -+zpios_timespec_t -+zpios_timespec_add(zpios_timespec_t lhs, zpios_timespec_t rhs) - { -@@ -158,4 +161,4 @@ zpios_timespec_t zpios_timespec_add(zpios_timespec_t lhs, zpios_timespec_t rhs) - 
zpios_timespec_normalize(&ts_delta, lhs.ts_sec + rhs.ts_sec, -- lhs.ts_nsec + rhs.ts_nsec); -- return ts_delta; -+ lhs.ts_nsec + rhs.ts_nsec); -+ return (ts_delta); - } -@@ -163,3 +166,4 @@ zpios_timespec_t zpios_timespec_add(zpios_timespec_t lhs, zpios_timespec_t rhs) - static inline --zpios_timespec_t zpios_timespec_sub(zpios_timespec_t lhs, zpios_timespec_t rhs) -+zpios_timespec_t -+zpios_timespec_sub(zpios_timespec_t lhs, zpios_timespec_t rhs) - { -@@ -167,4 +171,4 @@ zpios_timespec_t zpios_timespec_sub(zpios_timespec_t lhs, zpios_timespec_t rhs) - zpios_timespec_normalize(&ts_delta, lhs.ts_sec - rhs.ts_sec, -- lhs.ts_nsec - rhs.ts_nsec); -- return ts_delta; -+ lhs.ts_nsec - rhs.ts_nsec); -+ return (ts_delta); - } -@@ -174,3 +178,4 @@ zpios_timespec_t zpios_timespec_sub(zpios_timespec_t lhs, zpios_timespec_t rhs) - static inline --zpios_timespec_t zpios_timespec_now(void) -+zpios_timespec_t -+zpios_timespec_now(void) - { -@@ -183,3 +188,3 @@ zpios_timespec_t zpios_timespec_now(void) - -- return zts_now; -+ return (zts_now); - } -@@ -189,6 +194,8 @@ zpios_timespec_t zpios_timespec_now(void) - static inline --double zpios_timespec_to_double(zpios_timespec_t ts) -+double -+zpios_timespec_to_double(zpios_timespec_t ts) - { -- return ((double)(ts.ts_sec) + -- ((double)(ts.ts_nsec) / (double)(NSEC_PER_SEC))); -+ return -+ ((double)(ts.ts_sec) + -+ ((double)(ts.ts_nsec) / (double)(NSEC_PER_SEC))); - } -diff --git a/include/zpios-internal.h b/include/zpios-internal.h -index 24a2feb..4b99b4c 100644 ---- a/include/zpios-internal.h -+++ b/include/zpios-internal.h -@@ -1,2 +1,2 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. -@@ -31,6 +31,6 @@ - * with ZPIOS. If not, see . --\*****************************************************************************/ -+ */ - - #ifndef _ZPIOS_INTERNAL_H --#define _ZPIOS_INTERNAL_H -+#define _ZPIOS_INTERNAL_H - -@@ -38,3 +38,3 @@ - --#define OBJ_SIZE 64 -+#define OBJ_SIZE 64 - -@@ -53,3 +53,3 @@ typedef struct thread_data { - zpios_stats_t stats; -- kmutex_t lock; -+ kmutex_t lock; - } thread_data_t; -@@ -64,3 +64,3 @@ typedef struct zpios_region { - zpios_stats_t stats; -- kmutex_t lock; -+ kmutex_t lock; - } zpios_region_t; -@@ -87,5 +87,5 @@ typedef struct run_args { - objset_t *os; -- wait_queue_head_t waitq; -+ wait_queue_head_t waitq; - volatile uint64_t threads_done; -- kmutex_t lock_work; -+ kmutex_t lock_work; - kmutex_t lock_ctl; -@@ -101,38 +101,12 @@ typedef struct run_args { - --#define ZPIOS_INFO_BUFFER_SIZE 65536 --#define ZPIOS_INFO_BUFFER_REDZONE 1024 -+#define ZPIOS_INFO_BUFFER_SIZE 65536 -+#define ZPIOS_INFO_BUFFER_REDZONE 1024 - - typedef struct zpios_info { -- spinlock_t info_lock; -- int info_size; -- char *info_buffer; -- char *info_head; /* Internal kernel use only */ -+ spinlock_t info_lock; -+ int info_size; -+ char *info_buffer; -+ char *info_head; /* Internal kernel use only */ - } zpios_info_t; - --#define zpios_print(file, format, args...) 
\ --({ zpios_info_t *_info_ = (zpios_info_t *)file->private_data; \ -- int _rc_; \ -- \ -- ASSERT(_info_); \ -- ASSERT(_info_->info_buffer); \ -- \ -- spin_lock(&_info_->info_lock); \ -- \ -- /* Don't allow the kernel to start a write in the red zone */ \ -- if ((int)(_info_->info_head - _info_->info_buffer) > \ -- (_info_->info_size - ZPIOS_INFO_BUFFER_REDZONE)) { \ -- _rc_ = -EOVERFLOW; \ -- } else { \ -- _rc_ = sprintf(_info_->info_head, format, args); \ -- if (_rc_ >= 0) \ -- _info_->info_head += _rc_; \ -- } \ -- \ -- spin_unlock(&_info_->info_lock); \ -- _rc_; \ --}) -- --#define zpios_vprint(file, test, format, args...) \ -- zpios_print(file, "%*s: " format, ZPIOS_NAME_SIZE, test, args) -- - #endif /* _ZPIOS_INTERNAL_H */ -diff --git a/lib/Makefile.am b/lib/Makefile.am -index 09139d5..8e7caf2 100644 ---- a/lib/Makefile.am -+++ b/lib/Makefile.am -@@ -6,2 +6,2 @@ SUBDIRS = libspl libavl libefi libshare libunicode - # incorporate the five convenience libraries given above. --SUBDIRS += libuutil libnvpair libzpool libzfs -+SUBDIRS += libuutil libnvpair libzpool libzfs_core libzfs -diff --git a/lib/libefi/Makefile.am b/lib/libefi/Makefile.am -index aa57dba..55f7b11 100644 ---- a/lib/libefi/Makefile.am -+++ b/lib/libefi/Makefile.am -@@ -12 +12,3 @@ libefi_la_SOURCES = \ - $(top_srcdir)/lib/libefi/rdwr_efi.c -+ -+libefi_la_LIBADD = $(LIBUUID) $(ZLIB) -diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c -index f4cf417..19a573c 100644 ---- a/lib/libefi/rdwr_efi.c -+++ b/lib/libefi/rdwr_efi.c -@@ -89,3 +89,4 @@ struct dk_map2 default_vtoc_map[NDKMAP] = { - --#if defined(i386) || defined(__amd64) || defined(__arm) || defined(__powerpc) -+#if defined(i386) || defined(__amd64) || defined(__arm) || \ -+ defined(__powerpc) || defined(__sparc) - { V_BOOT, V_UNMNT }, /* i - 8 */ -@@ -134,4 +135,4 @@ read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize) - -- if (ioctl(fd, BLKSSZGET, §or_size) < 0) -- return (-1); -+ if (ioctl(fd, BLKSSZGET, §or_size) < 0) -+ return (-1); - -@@ -154,3 +155,3 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - -- memset(dki_info, 0, sizeof(*dki_info)); -+ memset(dki_info, 0, sizeof (*dki_info)); - -@@ -184,4 +185,4 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ dki_info->dki_dname, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/hd", 7) == 0)) { -@@ -190,4 +191,4 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ dki_info->dki_dname, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/md", 7) == 0)) { -@@ -195,5 +196,6 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - dki_info->dki_ctype = DKC_MD; -- rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ strcpy(dki_info->dki_dname, "md"); -+ rval = sscanf(dev_path, "/dev/md%[0-9]p%hu", -+ dki_info->dki_dname + 2, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/vd", 7) == 0)) { -@@ -202,4 +204,4 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ dki_info->dki_dname, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/dm-", 8) == 0)) { -@@ -207,5 +209,6 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - dki_info->dki_ctype = DKC_VBD; -- rval = sscanf(dev_path, 
"/dev/%[a-zA-Z0-9-]p%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ strcpy(dki_info->dki_dname, "dm-"); -+ rval = sscanf(dev_path, "/dev/dm-%[0-9]p%hu", -+ dki_info->dki_dname + 3, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/ram", 8) == 0)) { -@@ -213,5 +216,6 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - dki_info->dki_ctype = DKC_PCMCIA_MEM; -- rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ strcpy(dki_info->dki_dname, "ram"); -+ rval = sscanf(dev_path, "/dev/ram%[0-9]p%hu", -+ dki_info->dki_dname + 3, -+ &dki_info->dki_partition); - } else if ((strncmp(dev_path, "/dev/loop", 9) == 0)) { -@@ -219,5 +223,6 @@ efi_get_info(int fd, struct dk_cinfo *dki_info) - dki_info->dki_ctype = DKC_VBD; -- rval = sscanf(dev_path, "/dev/%[a-zA-Z0-9]p%hu", -- dki_info->dki_dname, -- &dki_info->dki_partition); -+ strcpy(dki_info->dki_dname, "loop"); -+ rval = sscanf(dev_path, "/dev/loop%[0-9]p%hu", -+ dki_info->dki_dname + 4, -+ &dki_info->dki_partition); - } else { -@@ -397,6 +402,6 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - if (efi_debug) -- fprintf(stderr,"unable to read disk info: %d",errno); -+ fprintf(stderr, "unable to read disk info: %d", errno); - - errno = EIO; -- return -1; -+ return (-1); - } -@@ -408,3 +413,3 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCGETEFI assuming " -- "LBA %d bytes\n", DEV_BSIZE); -+ "LBA %d bytes\n", DEV_BSIZE); - -@@ -417,4 +422,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCGETEFI lseek " -- "error: %d\n", errno); -- return error; -+ "error: %d\n", errno); -+ return (error); - } -@@ -425,4 +430,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCGETEFI read " -- "error: %d\n", errno); -- return error; -+ "error: %d\n", errno); -+ return (error); - } -@@ -432,5 +437,5 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCGETEFI short " -- "read of %d bytes\n", error); -+ "read of %d bytes\n", error); - errno = EIO; -- return -1; -+ return (-1); - } -@@ -443,5 +448,5 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCSETEFI unknown " -- "LBA size\n"); -+ "LBA size\n"); - errno = EIO; -- return -1; -+ return (-1); - } -@@ -452,4 +457,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCSETEFI lseek " -- "error: %d\n", errno); -- return error; -+ "error: %d\n", errno); -+ return (error); - } -@@ -460,4 +465,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCSETEFI write " -- "error: %d\n", errno); -- return error; -+ "error: %d\n", errno); -+ return (error); - } -@@ -467,5 +472,5 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - (void) fprintf(stderr, "DKIOCSETEFI short " -- "write of %d bytes\n", error); -+ "write of %d bytes\n", error); - errno = EIO; -- return -1; -+ return (-1); - } -@@ -475,3 +480,3 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - if (error == -1) -- return error; -+ return (error); - -@@ -479,3 +484,3 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - if (ioctl(fd, BLKFLSBUF, 0) == -1) -- return error; -+ return (error); - -@@ -489,3 +494,3 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - errno = EIO; -- return -1; -+ return (-1); - } -@@ -499,3 +504,4 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) - --int efi_rescan(int fd) -+int -+efi_rescan(int fd) - { -@@ -509,3 +515,3 @@ int efi_rescan(int fd) - (void) fprintf(stderr, "the kernel 
failed to rescan " -- "the partition table: %d\n", errno); -+ "the partition table: %d\n", errno); - return (-1); -@@ -550,3 +556,3 @@ check_label(int fd, dk_efi_t *dk_ioc) - -- if(headerSize < EFI_MIN_LABEL_SIZE || headerSize > EFI_LABEL_SIZE) { -+ if (headerSize < EFI_MIN_LABEL_SIZE || headerSize > EFI_LABEL_SIZE) { - if (efi_debug) -@@ -592,3 +598,3 @@ efi_read(int fd, struct dk_gpt *vtoc) - if ((rval = efi_get_info(fd, &dki_info)) != 0) -- return rval; -+ return (rval); - -@@ -610,4 +616,4 @@ efi_read(int fd, struct dk_gpt *vtoc) - (void) fprintf(stderr, -- "unable to read disk info: %d", -- errno); -+ "unable to read disk info: %d", -+ errno); - } -@@ -644,3 +650,3 @@ efi_read(int fd, struct dk_gpt *vtoc) - if (posix_memalign((void **)&dk_ioc.dki_data, -- disk_info.dki_lbsize, label_len)) -+ disk_info.dki_lbsize, label_len)) - return (VT_ERROR); -@@ -1119,3 +1125,3 @@ efi_write(int fd, struct dk_gpt *vtoc) - if ((rval = efi_get_info(fd, &dki_info)) != 0) -- return rval; -+ return (rval); - -@@ -1158,3 +1164,3 @@ efi_write(int fd, struct dk_gpt *vtoc) - if (posix_memalign((void **)&dk_ioc.dki_data, -- vtoc->efi_lbasize, dk_ioc.dki_length)) -+ vtoc->efi_lbasize, dk_ioc.dki_length)) - return (VT_ERROR); -diff --git a/lib/libshare/libshare.c b/lib/libshare/libshare.c -index 6b39ba8..ea59dcd 100644 ---- a/lib/libshare/libshare.c -+++ b/lib/libshare/libshare.c -@@ -66,3 +66,3 @@ register_fstype(const char *name, const sa_share_ops_t *ops) - if (fstype == NULL) -- return NULL; -+ return (NULL); - -@@ -77,3 +77,3 @@ register_fstype(const char *name, const sa_share_ops_t *ops) - -- return fstype; -+ return (fstype); - } -@@ -88,3 +88,3 @@ sa_init(int init_service) - if (impl_handle == NULL) -- return NULL; -+ return (NULL); - -@@ -107,10 +107,2 @@ libshare_init(void) - libshare_smb_init(); -- -- /* -- * This bit causes /etc/dfs/sharetab to be updated before libzfs gets a -- * chance to read that file; this is necessary because the sharetab file -- * might be out of sync with the NFS kernel exports (e.g. 
due to reboots -- * or users manually removing shares) -- */ -- sa_fini(sa_init(0)); - } -@@ -245,3 +237,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 1; -+ return (1); - } -@@ -250,3 +242,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 0; -+ return (0); - } -@@ -256,3 +248,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 0; -+ return (0); - } -@@ -263,3 +255,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 0; -+ return (0); - } -@@ -268,3 +260,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - zfs_close(zhp); -- return 0; -+ return (0); - } -@@ -289,3 +281,3 @@ update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) - -- return 0; -+ return (0); - } -@@ -300,3 +292,3 @@ update_zfs_share(sa_share_impl_t impl_share, const char *proto) - if (impl_handle->zfs_libhandle == NULL) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -308,3 +300,3 @@ update_zfs_share(sa_share_impl_t impl_share, const char *proto) - if (zhp == NULL) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -314,3 +306,3 @@ update_zfs_share(sa_share_impl_t impl_share, const char *proto) - -- return SA_OK; -+ return (SA_OK); - } -@@ -323,3 +315,3 @@ update_zfs_shares(sa_handle_impl_t impl_handle, const char *proto) - if (impl_handle->zfs_libhandle == NULL) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -330,3 +322,3 @@ update_zfs_shares(sa_handle_impl_t impl_handle, const char *proto) - -- return SA_OK; -+ return (SA_OK); - } -@@ -353,3 +345,3 @@ process_share(sa_handle_impl_t impl_handle, sa_share_impl_t impl_share, - !S_ISDIR(statbuf.st_mode)) -- return SA_BAD_PATH; -+ return (SA_BAD_PATH); - -@@ -423,3 +415,3 @@ err: - -- return rc; -+ return (rc); - } -@@ -489,3 +481,3 @@ find_share(sa_handle_impl_t impl_handle, const char *sharepath) - -- return impl_share; -+ return (impl_share); - } -@@ -495,3 +487,3 @@ sa_find_share(sa_handle_t handle, char *sharepath) - { -- return (sa_share_t)find_share((sa_handle_impl_t)handle, sharepath); -+ return ((sa_share_t)find_share((sa_handle_impl_t)handle, sharepath)); - } -@@ -717,6 +709,6 @@ sa_parse_legacy_options(sa_group_t group, char *options, char *proto) - -- return fstype->ops->validate_shareopts(options); -+ return (fstype->ops->validate_shareopts(options)); - } - -- return SA_INVALID_PROTOCOL; -+ return (SA_INVALID_PROTOCOL); - } -@@ -726,3 +718,3 @@ sa_needs_refresh(sa_handle_t handle) - { -- return B_TRUE; -+ return (B_TRUE); - } -@@ -735,5 +727,5 @@ sa_get_zfs_handle(sa_handle_t handle) - if (impl_handle == NULL) -- return NULL; -+ return (NULL); - -- return impl_handle->zfs_libhandle; -+ return (impl_handle->zfs_libhandle); - } -@@ -748,3 +740,3 @@ alloc_share(const char *sharepath) - if (impl_share == NULL) -- return NULL; -+ return (NULL); - -@@ -754,3 +746,3 @@ alloc_share(const char *sharepath) - free(impl_share); -- return NULL; -+ return (NULL); - } -@@ -762,6 +754,6 @@ alloc_share(const char *sharepath) - free(impl_share); -- return NULL; -+ return (NULL); - } - -- return impl_share; -+ return (impl_share); - } -@@ -801,4 +793,4 @@ sa_zfs_process_share(sa_handle_t handle, sa_group_t group, sa_share_t share, - -- return process_share(impl_handle, impl_share, mountpoint, NULL, -- proto, shareopts, NULL, dataset, B_FALSE); -+ return (process_share(impl_handle, impl_share, mountpoint, NULL, -+ proto, shareopts, NULL, dataset, B_FALSE)); - } -diff --git 
a/lib/libshare/libshare_impl.h b/lib/libshare/libshare_impl.h -index dfcec2f..18d619b 100644 ---- a/lib/libshare/libshare_impl.h -+++ b/lib/libshare/libshare_impl.h -@@ -45,3 +45,3 @@ typedef struct sa_share_impl { - --#define FSINFO(impl_share, fstype) (&(impl_share->fsinfo[fstype->fsinfo_index])) -+#define FSINFO(impl_share, fstype) (&(impl_share->fsinfo[fstype->fsinfo_index])) - -diff --git a/lib/libshare/nfs.c b/lib/libshare/nfs.c -index 00ba0f6..d1b207e 100644 ---- a/lib/libshare/nfs.c -+++ b/lib/libshare/nfs.c -@@ -52,3 +52,3 @@ typedef int (*nfs_host_callback_t)(const char *sharepath, const char *host, - --/** -+/* - * Invokes the specified callback function for each Solaris share option -@@ -64,3 +64,3 @@ foreach_nfs_shareopt(const char *shareopts, - if (shareopts == NULL) -- return SA_OK; -+ return (SA_OK); - -@@ -69,3 +69,3 @@ foreach_nfs_shareopt(const char *shareopts, - if (shareopts_dup == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -97,3 +97,3 @@ foreach_nfs_shareopt(const char *shareopts, - free(shareopts_dup); -- return rc; -+ return (rc); - } -@@ -109,3 +109,3 @@ foreach_nfs_shareopt(const char *shareopts, - -- return 0; -+ return (0); - } -@@ -119,3 +119,3 @@ typedef struct nfs_host_cookie_s { - --/** -+/* - * Helper function for foreach_nfs_host. This function checks whether the -@@ -148,3 +148,3 @@ foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) - if (host_dup == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -165,3 +165,3 @@ foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) - -- return rc; -+ return (rc); - } -@@ -174,6 +174,6 @@ foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Invokes a callback function for all NFS hosts that are set for a share. -@@ -198,3 +198,3 @@ foreach_nfs_host(sa_share_impl_t impl_share, nfs_host_callback_t callback, - --/** -+/* - * Converts a Solaris NFS host specification to its Linux equivalent. -@@ -219,9 +219,9 @@ get_linux_hostspec(const char *solaris_hostspec, char **plinux_hostspec) - if (*plinux_hostspec == NULL) { -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - } - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Used internally by nfs_enable_share to enable sharing for a single host. -@@ -283,8 +283,8 @@ nfs_enable_share_one(const char *sharepath, const char *host, - if (rc < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - else -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Adds a Linux share option to an array of NFS options. 
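foreach_nfs_shareopt() above walks a comma-separated Solaris option string and hands each key/value pair to a callback; a stand-alone version of that parsing pattern, with hypothetical names, looks like this.

/* Illustrative; assumes <stdlib.h>, <string.h>, and the SA_* codes from libshare. */
typedef int (*opt_cb_t)(const char *key, const char *value, void *cookie);

static int
walk_shareopts(const char *shareopts, opt_cb_t cb, void *cookie)
{
    char *dup, *opt, *next, *value;
    int rc = SA_OK;

    if (shareopts == NULL)
        return (SA_OK);

    if ((dup = strdup(shareopts)) == NULL)
        return (SA_NO_MEMORY);

    for (opt = dup; opt != NULL; opt = next) {
        /* Split on ',' to isolate one option, then on '=' for its value. */
        if ((next = strchr(opt, ',')) != NULL)
            *next++ = '\0';
        if ((value = strchr(opt, '=')) != NULL)
            *value++ = '\0';

        if ((rc = cb(opt, value, cookie)) != SA_OK)
            break;
    }

    free(dup);
    return (rc);
}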
-@@ -304,3 +304,3 @@ add_linux_shareopt(char **plinux_opts, const char *key, const char *value) - if (new_linux_opts == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -320,6 +320,6 @@ add_linux_shareopt(char **plinux_opts, const char *key, const char *value) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Validates and converts a single Solaris share option to its Linux -@@ -335,3 +335,3 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) - strcmp(key, "sec") == 0) -- return SA_OK; -+ return (SA_OK); - -@@ -340,6 +340,6 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) - -- if (strcmp(key, "root_mapping") == 0) { -- (void) add_linux_shareopt(plinux_opts, "root_squash", NULL); -- key = "anonuid"; -- } -+ if (strcmp(key, "root_mapping") == 0) { -+ (void) add_linux_shareopt(plinux_opts, "root_squash", NULL); -+ key = "anonuid"; -+ } - -@@ -366,3 +366,3 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) - strcmp(key, "anonuid") != 0 && strcmp(key, "anongid") != 0) { -- return SA_SYNTAX_ERR; -+ return (SA_SYNTAX_ERR); - } -@@ -371,6 +371,6 @@ get_linux_shareopts_cb(const char *key, const char *value, void *cookie) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Takes a string containing Solaris share options (e.g. "sync,no_acl") and -@@ -392,3 +392,4 @@ get_linux_shareopts(const char *shareopts, char **plinux_opts) - -- rc = foreach_nfs_shareopt(shareopts, get_linux_shareopts_cb, plinux_opts); -+ rc = foreach_nfs_shareopt(shareopts, get_linux_shareopts_cb, -+ plinux_opts); - -@@ -399,6 +400,6 @@ get_linux_shareopts(const char *shareopts, char **plinux_opts) - -- return rc; -+ return (rc); - } - --/** -+/* - * Enables NFS sharing for the specified share. -@@ -412,3 +413,3 @@ nfs_enable_share(sa_share_impl_t impl_share) - if (!nfs_available()) { -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - } -@@ -418,3 +419,3 @@ nfs_enable_share(sa_share_impl_t impl_share) - if (shareopts == NULL) -- return SA_OK; -+ return (SA_OK); - -@@ -423,3 +424,3 @@ nfs_enable_share(sa_share_impl_t impl_share) - if (rc != SA_OK) -- return rc; -+ return (rc); - -@@ -429,6 +430,6 @@ nfs_enable_share(sa_share_impl_t impl_share) - -- return rc; -+ return (rc); - } - --/** -+/* - * Used internally by nfs_disable_share to disable sharing for a single host. -@@ -473,8 +474,8 @@ nfs_disable_share_one(const char *sharepath, const char *host, - if (rc < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - else -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Disables NFS sharing for the specified share. -@@ -489,9 +490,9 @@ nfs_disable_share(sa_share_impl_t impl_share) - */ -- return SA_OK; -+ return (SA_OK); - } - -- return foreach_nfs_host(impl_share, nfs_disable_share_one, NULL); -+ return (foreach_nfs_host(impl_share, nfs_disable_share_one, NULL)); - } - --/** -+/* - * Checks whether the specified NFS share options are syntactically correct. -@@ -507,3 +508,3 @@ nfs_validate_shareopts(const char *shareopts) - if (rc != SA_OK) -- return rc; -+ return (rc); - -@@ -511,6 +512,6 @@ nfs_validate_shareopts(const char *shareopts) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Checks whether a share is currently active. 
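Under the rules in get_linux_shareopts_cb() above, a Solaris-style option such as root_mapping=65534 ends up (roughly) as the Linux pair root_squash,anonuid=65534. A hypothetical table-driven variant of that key translation follows; the whitelist check the real callback performs is omitted here.

/* Hypothetical; assumes <string.h>. */
struct opt_map {
    const char *solaris_key;
    const char *linux_key;
    const char *extra_linux_opt;    /* added with no value, or NULL */
};

static const struct opt_map nfs_opt_map[] = {
    { "root_mapping", "anonuid", "root_squash" },
    { NULL,           NULL,      NULL }
};

static const char *
translate_nfs_key(const char *key, const char **extra)
{
    const struct opt_map *m;

    *extra = NULL;
    for (m = nfs_opt_map; m->solaris_key != NULL; m++) {
        if (strcmp(key, m->solaris_key) == 0) {
            *extra = m->extra_linux_opt;
            return (m->linux_key);
        }
    }
    return (key);   /* unknown keys pass through; the real code whitelists them */
}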
-@@ -525,3 +526,3 @@ nfs_is_share_active(sa_share_impl_t impl_share) - if (!nfs_available()) -- return B_FALSE; -+ return (B_FALSE); - -@@ -532,6 +533,6 @@ nfs_is_share_active(sa_share_impl_t impl_share) - fclose(nfs_exportfs_temp_fp); -- return B_FALSE; -+ return (B_FALSE); - } - -- while (fgets(line, sizeof(line), nfs_exportfs_temp_fp) != NULL) { -+ while (fgets(line, sizeof (line), nfs_exportfs_temp_fp) != NULL) { - /* -@@ -566,3 +567,3 @@ nfs_is_share_active(sa_share_impl_t impl_share) - fclose(nfs_exportfs_temp_fp); -- return B_TRUE; -+ return (B_TRUE); - } -@@ -572,6 +573,6 @@ nfs_is_share_active(sa_share_impl_t impl_share) - -- return B_FALSE; -+ return (B_FALSE); - } - --/** -+/* - * Called to update a share's options. A share's options might be out of -@@ -606,3 +607,3 @@ nfs_update_shareopts(sa_share_impl_t impl_share, const char *resource, - if (shareopts_dup == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -616,6 +617,6 @@ nfs_update_shareopts(sa_share_impl_t impl_share, const char *resource, - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Clears a share's NFS options. Used by libshare to -@@ -668,3 +669,3 @@ nfs_check_exportfs(void) - if (nfs_exportfs_temp_fd < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -679,3 +680,3 @@ nfs_check_exportfs(void) - nfs_exportfs_temp_fd = -1; -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - } -@@ -683,4 +684,3 @@ nfs_check_exportfs(void) - if (pid > 0) { -- while ((rc = waitpid(pid, &status, 0)) <= 0 && errno == EINTR) -- ; /* empty loop body */ -+ while ((rc = waitpid(pid, &status, 0)) <= 0 && errno == EINTR); - -@@ -689,3 +689,3 @@ nfs_check_exportfs(void) - nfs_exportfs_temp_fd = -1; -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - } -@@ -695,6 +695,6 @@ nfs_check_exportfs(void) - nfs_exportfs_temp_fd = -1; -- return SA_CONFIG_ERR; -+ return (SA_CONFIG_ERR); - } - -- return SA_OK; -+ return (SA_OK); - } -@@ -726,6 +726,6 @@ nfs_available(void) - -- return (nfs_exportfs_temp_fd != -1) ? B_TRUE : B_FALSE; -+ return ((nfs_exportfs_temp_fd != -1) ? B_TRUE : B_FALSE); - } - --/** -+/* - * Initializes the NFS functionality of libshare. -diff --git a/lib/libshare/smb.c b/lib/libshare/smb.c -index a545bfb..1ac1a8d 100644 ---- a/lib/libshare/smb.c -+++ b/lib/libshare/smb.c -@@ -28,3 +28,3 @@ - * shares using the 'net share' command that comes with Samba. -- -+ * - * TESTING -@@ -66,3 +66,3 @@ static sa_fstype_t *smb_fstype; - --/** -+/* - * Retrieve the list of SMB shares. 
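The EINTR-safe waitpid() loop used by nfs_check_exportfs() above, pulled out as a stand-alone helper; it assumes <sys/wait.h>, <errno.h>, and the SA_* codes from libshare, and leaves the fork()/exec() step to the caller.

/* Sketch only. */
static int
wait_for_child(pid_t pid)
{
    int status;
    pid_t rc;

    /* Retry waitpid() whenever it is interrupted by a signal. */
    while ((rc = waitpid(pid, &status, 0)) <= 0 && errno == EINTR)
        ;

    if (rc <= 0)
        return (SA_SYSTEM_ERR);

    /* The child must exit normally with status 0. */
    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
        return (SA_CONFIG_ERR);

    return (SA_OK);
}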
-@@ -85,3 +85,3 @@ smb_retrieve_shares(void) - if (shares_dir == NULL) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -93,3 +93,3 @@ smb_retrieve_shares(void) - snprintf(file_path, sizeof (file_path), -- "%s/%s", SHARE_DIR, directory->d_name); -+ "%s/%s", SHARE_DIR, directory->d_name); - -@@ -110,7 +110,7 @@ smb_retrieve_shares(void) - if (name == NULL) { -- rc = SA_NO_MEMORY; -- goto out; -+ rc = SA_NO_MEMORY; -+ goto out; - } - -- while (fgets(line, sizeof(line), share_file_fp)) { -+ while (fgets(line, sizeof (line), share_file_fp)) { - if (line[0] == '#') -@@ -120,3 +120,3 @@ smb_retrieve_shares(void) - while (line[strlen(line) - 1] == '\r' || -- line[strlen(line) - 1] == '\n') -+ line[strlen(line) - 1] == '\n') - line[strlen(line) - 1] = '\0'; -@@ -157,11 +157,12 @@ smb_retrieve_shares(void) - sizeof (shares->name)); -- shares->name [sizeof(shares->name)-1] = '\0'; -+ shares->name [sizeof (shares->name) - 1] = '\0'; - - strncpy(shares->path, path, -- sizeof (shares->path)); -- shares->path [sizeof(shares->path)-1] = '\0'; -+ sizeof (shares->path)); -+ shares->path [sizeof (shares->path) - 1] = '\0'; - - strncpy(shares->comment, comment, -- sizeof (shares->comment)); -- shares->comment[sizeof(shares->comment)-1]='\0'; -+ sizeof (shares->comment)); -+ shares->comment[sizeof (shares->comment)-1] = -+ '\0'; - -@@ -172,5 +173,5 @@ smb_retrieve_shares(void) - -- name = NULL; -- path = NULL; -- comment = NULL; -+ name = NULL; -+ path = NULL; -+ comment = NULL; - guest_ok = NULL; -@@ -192,6 +193,6 @@ out: - -- return rc; -+ return (rc); - } - --/** -+/* - * Used internally by smb_enable_share to enable sharing for a single host. -@@ -206,4 +207,4 @@ smb_enable_share_one(const char *sharename, const char *sharepath) - /* Support ZFS share name regexp '[[:alnum:]_-.: ]' */ -- strncpy(name, sharename, sizeof(name)); -- name [sizeof(name)-1] = '\0'; -+ strncpy(name, sharename, sizeof (name)); -+ name [sizeof (name)-1] = '\0'; - -@@ -222,14 +223,16 @@ smb_enable_share_one(const char *sharename, const char *sharepath) - -- /* CMD: net -S NET_CMD_ARG_HOST usershare add Test1 /share/Test1 \ -- * "Comment" "Everyone:F" */ -- snprintf(comment, sizeof(comment), "Comment: %s", sharepath); -- -- argv[0] = NET_CMD_PATH; -- argv[1] = (char*)"-S"; -- argv[2] = NET_CMD_ARG_HOST; -- argv[3] = (char*)"usershare"; -- argv[4] = (char*)"add"; -- argv[5] = (char*)name; -- argv[6] = (char*)sharepath; -- argv[7] = (char*)comment; -+ /* -+ * CMD: net -S NET_CMD_ARG_HOST usershare add Test1 /share/Test1 \ -+ * "Comment" "Everyone:F" -+ */ -+ snprintf(comment, sizeof (comment), "Comment: %s", sharepath); -+ -+ argv[0] = NET_CMD_PATH; -+ argv[1] = (char *)"-S"; -+ argv[2] = NET_CMD_ARG_HOST; -+ argv[3] = (char *)"usershare"; -+ argv[4] = (char *)"add"; -+ argv[5] = (char *)name; -+ argv[6] = (char *)sharepath; -+ argv[7] = (char *)comment; - argv[8] = "Everyone:F"; -@@ -239,3 +242,3 @@ smb_enable_share_one(const char *sharename, const char *sharepath) - if (rc < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -244,6 +247,6 @@ smb_enable_share_one(const char *sharename, const char *sharepath) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Enables SMB sharing for the specified share. 
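The argv assembled in smb_enable_share_one() above amounts to running net -S 127.0.0.1 usershare add <name> <path> "Comment: <path>" Everyone:F. One way such an argv might be executed is sketched below with plain fork()/execvp(); the helper libshare actually uses is not visible in this hunk.

/* Illustrative; assumes <unistd.h>, <sys/wait.h>, <errno.h>, and libshare's SA_* codes. */
static int
run_command(char *const argv[])
{
    int status = 0;
    pid_t pid = fork();

    if (pid < 0)
        return (SA_SYSTEM_ERR);

    if (pid == 0) {
        execvp(argv[0], argv);
        _exit(127);     /* exec failed in the child */
    }

    while (waitpid(pid, &status, 0) < 0 && errno == EINTR)
        ;

    if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
        return (SA_OK);

    return (SA_SYSTEM_ERR);
}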
-@@ -256,3 +259,3 @@ smb_enable_share(sa_share_impl_t impl_share) - if (!smb_available()) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - -@@ -260,12 +263,13 @@ smb_enable_share(sa_share_impl_t impl_share) - if (shareopts == NULL) /* on/off */ -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - - if (strcmp(shareopts, "off") == 0) -- return SA_OK; -+ return (SA_OK); - - /* Magic: Enable (i.e., 'create new') share */ -- return smb_enable_share_one(impl_share->dataset, impl_share->sharepath); -+ return (smb_enable_share_one(impl_share->dataset, -+ impl_share->sharepath)); - } - --/** -+/* - * Used internally by smb_disable_share to disable sharing for a single host. -@@ -280,6 +284,6 @@ smb_disable_share_one(const char *sharename) - argv[0] = NET_CMD_PATH; -- argv[1] = (char*)"-S"; -+ argv[1] = (char *)"-S"; - argv[2] = NET_CMD_ARG_HOST; -- argv[3] = (char*)"usershare"; -- argv[4] = (char*)"delete"; -+ argv[3] = (char *)"usershare"; -+ argv[4] = (char *)"delete"; - argv[5] = strdup(sharename); -@@ -289,8 +293,8 @@ smb_disable_share_one(const char *sharename) - if (rc < 0) -- return SA_SYSTEM_ERR; -+ return (SA_SYSTEM_ERR); - else -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Disables SMB sharing for the specified share. -@@ -307,3 +311,3 @@ smb_disable_share(sa_share_impl_t impl_share) - */ -- return SA_OK; -+ return (SA_OK); - } -@@ -312,3 +316,3 @@ smb_disable_share(sa_share_impl_t impl_share) - if (strcmp(impl_share->sharepath, shares->path) == 0) -- return smb_disable_share_one(shares->name); -+ return (smb_disable_share_one(shares->name)); - -@@ -317,6 +321,6 @@ smb_disable_share(sa_share_impl_t impl_share) - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Checks whether the specified SMB share options are syntactically correct. -@@ -328,8 +332,8 @@ smb_validate_shareopts(const char *shareopts) - if ((strcmp(shareopts, "off") == 0) || (strcmp(shareopts, "on") == 0)) -- return SA_OK; -+ return (SA_OK); - -- return SA_SYNTAX_ERR; -+ return (SA_SYNTAX_ERR); - } - --/** -+/* - * Checks whether a share is currently active. -@@ -340,3 +344,3 @@ smb_is_share_active(sa_share_impl_t impl_share) - if (!smb_available()) -- return B_FALSE; -+ return (B_FALSE); - -@@ -347,3 +351,3 @@ smb_is_share_active(sa_share_impl_t impl_share) - if (strcmp(impl_share->sharepath, smb_shares->path) == 0) -- return B_TRUE; -+ return (B_TRUE); - -@@ -352,6 +356,6 @@ smb_is_share_active(sa_share_impl_t impl_share) - -- return B_FALSE; -+ return (B_FALSE); - } - --/** -+/* - * Called to update a share's options. A share's options might be out of -@@ -369,4 +373,4 @@ smb_update_shareopts(sa_share_impl_t impl_share, const char *resource, - -- if(!impl_share) -- return SA_SYSTEM_ERR; -+ if (!impl_share) -+ return (SA_SYSTEM_ERR); - -@@ -386,3 +390,3 @@ smb_update_shareopts(sa_share_impl_t impl_share, const char *resource, - if (shareopts_dup == NULL) -- return SA_NO_MEMORY; -+ return (SA_NO_MEMORY); - -@@ -396,6 +400,6 @@ smb_update_shareopts(sa_share_impl_t impl_share, const char *resource, - -- return SA_OK; -+ return (SA_OK); - } - --/** -+/* - * Clears a share's SMB options. Used by libshare to -@@ -429,11 +433,11 @@ smb_available(void) - !S_ISDIR(statbuf.st_mode)) -- return B_FALSE; -+ return (B_FALSE); - - if (access(NET_CMD_PATH, F_OK) != 0) -- return B_FALSE; -+ return (B_FALSE); - -- return B_TRUE; -+ return (B_TRUE); - } - --/** -+/* - * Initializes the SMB functionality of libshare. 
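smb_retrieve_shares() above walks SHARE_DIR and reads one key=value definition file per share; a minimal stand-alone reader for that kind of file, kept deliberately simpler than the libshare parser (only path= and comment= are handled, and the names here are illustrative).

/* Illustrative; assumes <stdio.h> and <string.h>. */
static int
read_usershare_file(const char *file, char *path, size_t pathlen,
    char *comment, size_t commentlen)
{
    char line[512];
    FILE *fp = fopen(file, "r");

    if (fp == NULL)
        return (-1);

    path[0] = '\0';
    comment[0] = '\0';

    while (fgets(line, sizeof (line), fp) != NULL) {
        /* Skip comment lines and strip the trailing CR/LF. */
        if (line[0] == '#')
            continue;
        line[strcspn(line, "\r\n")] = '\0';

        if (strncmp(line, "path=", 5) == 0) {
            strncpy(path, line + 5, pathlen - 1);
            path[pathlen - 1] = '\0';
        } else if (strncmp(line, "comment=", 8) == 0) {
            strncpy(comment, line + 8, commentlen - 1);
            comment[commentlen - 1] = '\0';
        }
    }

    fclose(fp);
    return (0);
}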
-diff --git a/lib/libshare/smb.h b/lib/libshare/smb.h -index f5ac83a..7a0c0fd 100644 ---- a/lib/libshare/smb.h -+++ b/lib/libshare/smb.h -@@ -30,8 +30,8 @@ - --#define SMB_NAME_MAX 255 --#define SMB_COMMENT_MAX 255 -+#define SMB_NAME_MAX 255 -+#define SMB_COMMENT_MAX 255 - --#define SHARE_DIR "/var/lib/samba/usershares" --#define NET_CMD_PATH "/usr/bin/net" --#define NET_CMD_ARG_HOST "127.0.0.1" -+#define SHARE_DIR "/var/lib/samba/usershares" -+#define NET_CMD_PATH "/usr/bin/net" -+#define NET_CMD_ARG_HOST "127.0.0.1" - -diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am -index 089056c..dbf85c4 100644 ---- a/lib/libspl/Makefile.am -+++ b/lib/libspl/Makefile.am -@@ -32,2 +32,2 @@ libspl_la_SOURCES = \ - --libspl_la_LDFLAGS = -lrt -+libspl_la_LIBADD = -lrt -diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/asm-generic/atomic.c -index a3223ea..f5eb4f3 100644 ---- a/lib/libspl/asm-generic/atomic.c -+++ b/lib/libspl/asm-generic/atomic.c -@@ -42,3 +42,3 @@ pthread_mutex_t atomic_lock = PTHREAD_MUTEX_INITIALIZER; - --#define ATOMIC_INC(name, type) \ -+#define ATOMIC_INC(name, type) \ - void atomic_inc_##name(volatile type *target) \ -@@ -61,3 +61,3 @@ ATOMIC_INC(64, uint64_t) - --#define ATOMIC_DEC(name, type) \ -+#define ATOMIC_DEC(name, type) \ - void atomic_dec_##name(volatile type *target) \ -@@ -80,3 +80,3 @@ ATOMIC_DEC(64, uint64_t) - --#define ATOMIC_ADD(name, type1, type2) \ -+#define ATOMIC_ADD(name, type1, type2) \ - void atomic_add_##name(volatile type1 *target, type2 bits) \ -@@ -97,3 +97,4 @@ ATOMIC_ADD(64, uint64_t, int64_t) - --void atomic_add_ptr(volatile void *target, ssize_t bits) -+void -+atomic_add_ptr(volatile void *target, ssize_t bits) - { -@@ -105,3 +106,3 @@ void atomic_add_ptr(volatile void *target, ssize_t bits) - --#define ATOMIC_SUB(name, type1, type2) \ -+#define ATOMIC_SUB(name, type1, type2) \ - void atomic_sub_##name(volatile type1 *target, type2 bits) \ -@@ -122,3 +123,4 @@ ATOMIC_SUB(64, uint64_t, int64_t) - --void atomic_sub_ptr(volatile void *target, ssize_t bits) -+void -+atomic_sub_ptr(volatile void *target, ssize_t bits) - { -@@ -130,3 +132,3 @@ void atomic_sub_ptr(volatile void *target, ssize_t bits) - --#define ATOMIC_OR(name, type) \ -+#define ATOMIC_OR(name, type) \ - void atomic_or_##name(volatile type *target, type bits) \ -@@ -148,3 +150,3 @@ ATOMIC_OR(64, uint64_t) - --#define ATOMIC_AND(name, type) \ -+#define ATOMIC_AND(name, type) \ - void atomic_and_##name(volatile type *target, type bits) \ -@@ -170,3 +172,3 @@ ATOMIC_AND(64, uint64_t) - --#define ATOMIC_INC_NV(name, type) \ -+#define ATOMIC_INC_NV(name, type) \ - type atomic_inc_##name##_nv(volatile type *target) \ -@@ -177,3 +179,3 @@ ATOMIC_AND(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -191,3 +193,3 @@ ATOMIC_INC_NV(64, uint64_t) - --#define ATOMIC_DEC_NV(name, type) \ -+#define ATOMIC_DEC_NV(name, type) \ - type atomic_dec_##name##_nv(volatile type *target) \ -@@ -198,3 +200,3 @@ ATOMIC_INC_NV(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -212,3 +214,3 @@ ATOMIC_DEC_NV(64, uint64_t) - --#define ATOMIC_ADD_NV(name, type1, type2) \ -+#define ATOMIC_ADD_NV(name, type1, type2) \ - type1 atomic_add_##name##_nv(volatile type1 *target, type2 bits)\ -@@ -219,3 +221,3 @@ ATOMIC_DEC_NV(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -231,3 +233,4 @@ ATOMIC_ADD_NV(64, uint64_t, 
int64_t) - --void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) -+void * -+atomic_add_ptr_nv(volatile void *target, ssize_t bits) - { -@@ -239,3 +242,3 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) - -- return ptr; -+ return (ptr); - } -@@ -243,3 +246,3 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) - --#define ATOMIC_SUB_NV(name, type1, type2) \ -+#define ATOMIC_SUB_NV(name, type1, type2) \ - type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits)\ -@@ -250,3 +253,3 @@ void *atomic_add_ptr_nv(volatile void *target, ssize_t bits) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -262,3 +265,4 @@ ATOMIC_SUB_NV(64, uint64_t, int64_t) - --void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) -+void * -+atomic_sub_ptr_nv(volatile void *target, ssize_t bits) - { -@@ -270,3 +274,3 @@ void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) - -- return ptr; -+ return (ptr); - } -@@ -274,3 +278,3 @@ void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) - --#define ATOMIC_OR_NV(name, type) \ -+#define ATOMIC_OR_NV(name, type) \ - type atomic_or_##name##_nv(volatile type *target, type bits) \ -@@ -281,3 +285,3 @@ void *atomic_sub_ptr_nv(volatile void *target, ssize_t bits) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -295,3 +299,3 @@ ATOMIC_OR_NV(64, uint64_t) - --#define ATOMIC_AND_NV(name, type) \ -+#define ATOMIC_AND_NV(name, type) \ - type atomic_and_##name##_nv(volatile type *target, type bits) \ -@@ -302,3 +306,3 @@ ATOMIC_OR_NV(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return rc; \ -+ return (rc); \ - } -@@ -320,3 +324,3 @@ ATOMIC_AND_NV(64, uint64_t) - --#define ATOMIC_CAS(name, type) \ -+#define ATOMIC_CAS(name, type) \ - type atomic_cas_##name(volatile type *target, type arg1, type arg2) \ -@@ -329,3 +333,3 @@ ATOMIC_AND_NV(64, uint64_t) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return old; \ -+ return (old); \ - } -@@ -341,3 +345,4 @@ ATOMIC_CAS(64, uint64_t) - --void *atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) -+void * -+atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) - { -@@ -347,7 +352,7 @@ void *atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) - old = *(void **)target; -- if (old == arg1) -- *(void **)target = arg2; -+ if (old == arg1) -+ *(void **)target = arg2; - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); - -- return old; -+ return (old); - } -@@ -359,3 +364,3 @@ void *atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) - --#define ATOMIC_SWAP(name, type) \ -+#define ATOMIC_SWAP(name, type) \ - type atomic_swap_##name(volatile type *target, type bits) \ -@@ -367,3 +372,3 @@ void *atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ -- return old; \ -+ return (old); \ - } -@@ -379,3 +384,4 @@ ATOMIC_SWAP(64, uint64_t) - --void *atomic_swap_ptr(volatile void *target, void *bits) -+void * -+atomic_swap_ptr(volatile void *target, void *bits) - { -@@ -388,3 +394,3 @@ void *atomic_swap_ptr(volatile void *target, void *bits) - -- return old; -+ return (old); - } -@@ -392,3 +398,4 @@ void *atomic_swap_ptr(volatile void *target, void *bits) - --int atomic_set_long_excl(volatile ulong_t *target, uint_t value) -+int -+atomic_set_long_excl(volatile ulong_t *target, uint_t value) - { -@@ -400,3 +407,3 @@ int atomic_set_long_excl(volatile ulong_t 
*target, uint_t value) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); -- return -1; -+ return (-1); - } -@@ -405,6 +412,7 @@ int atomic_set_long_excl(volatile ulong_t *target, uint_t value) - -- return 0; -+ return (0); - } - --int atomic_clear_long_excl(volatile ulong_t *target, uint_t value) -+int -+atomic_clear_long_excl(volatile ulong_t *target, uint_t value) - { -@@ -416,3 +424,3 @@ int atomic_clear_long_excl(volatile ulong_t *target, uint_t value) - VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); -- return -1; -+ return (-1); - } -@@ -421,6 +429,7 @@ int atomic_clear_long_excl(volatile ulong_t *target, uint_t value) - -- return 0; -+ return (0); - } - --void membar_enter(void) -+void -+membar_enter(void) - { -@@ -429,3 +438,4 @@ void membar_enter(void) - --void membar_exit(void) -+void -+membar_exit(void) - { -@@ -434,3 +444,4 @@ void membar_exit(void) - --void membar_producer(void) -+void -+membar_producer(void) - { -@@ -439,3 +450,4 @@ void membar_producer(void) - --void membar_consumer(void) -+void -+membar_consumer(void) - { -@@ -446,35 +458,42 @@ void membar_consumer(void) - --uint8_t cas8(uint8_t *target, uint8_t arg1, uint8_t arg2) -+uint8_t -+cas8(uint8_t *target, uint8_t arg1, uint8_t arg2) - { -- return atomic_cas_8(target, arg1, arg2); -+ return (atomic_cas_8(target, arg1, arg2)); - } - --uint32_t cas32(uint32_t *target, uint32_t arg1, uint32_t arg2) -+uint32_t -+cas32(uint32_t *target, uint32_t arg1, uint32_t arg2) - { -- return atomic_cas_32(target, arg1, arg2); -+ return (atomic_cas_32(target, arg1, arg2)); - } - --uint64_t cas64(uint64_t *target, uint64_t arg1, uint64_t arg2) -+uint64_t -+cas64(uint64_t *target, uint64_t arg1, uint64_t arg2) - { -- return atomic_cas_64(target, arg1, arg2); -+ return (atomic_cas_64(target, arg1, arg2)); - } - --ulong_t caslong(ulong_t *target, ulong_t arg1, ulong_t arg2) -+ulong_t -+caslong(ulong_t *target, ulong_t arg1, ulong_t arg2) - { -- return atomic_cas_ulong(target, arg1, arg2); -+ return (atomic_cas_ulong(target, arg1, arg2)); - } - --void *casptr(void *target, void *arg1, void *arg2) -+void * -+casptr(void *target, void *arg1, void *arg2) - { -- return atomic_cas_ptr(target, arg1, arg2); -+ return (atomic_cas_ptr(target, arg1, arg2)); - } - --void atomic_and_long(ulong_t *target, ulong_t bits) -+void -+atomic_and_long(ulong_t *target, ulong_t bits) - { -- return atomic_and_ulong(target, bits); -+ return (atomic_and_ulong(target, bits)); - } - --void atomic_or_long(ulong_t *target, ulong_t bits) -+void -+atomic_or_long(ulong_t *target, ulong_t bits) - { -- return atomic_or_ulong(target, bits); -+ return (atomic_or_ulong(target, bits)); - } -diff --git a/lib/libspl/getexecname.c b/lib/libspl/getexecname.c -index c564eed..478351c 100644 ---- a/lib/libspl/getexecname.c -+++ b/lib/libspl/getexecname.c -@@ -43,3 +43,4 @@ getexecname(void) - if (strlen(execname) == 0) { -- rc = readlink("/proc/self/exe", execname, sizeof(execname) - 1); -+ rc = readlink("/proc/self/exe", -+ execname, sizeof (execname) - 1); - if (rc == -1) { -@@ -55,3 +56,3 @@ getexecname(void) - pthread_mutex_unlock(&mtx); -- return ptr; -+ return (ptr); - } -diff --git a/lib/libspl/gethrestime.c b/lib/libspl/gethrestime.c -index be163f8..d37cc2d 100644 ---- a/lib/libspl/gethrestime.c -+++ b/lib/libspl/gethrestime.c -@@ -32,7 +32,7 @@ gethrestime(timestruc_t *ts) - { -- struct timeval tv; -+ struct timeval tv; - -- gettimeofday(&tv, NULL); -- ts->tv_sec = tv.tv_sec; -- ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC; -+ gettimeofday(&tv, NULL); -+ ts->tv_sec = 
tv.tv_sec; -+ ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC; - } -diff --git a/lib/libspl/gethrtime.c b/lib/libspl/gethrtime.c -index c2fd5e0..95ceb18 100644 ---- a/lib/libspl/gethrtime.c -+++ b/lib/libspl/gethrtime.c -@@ -40,6 +40,6 @@ gethrtime(void) - fprintf(stderr, "Error: clock_gettime() = %d\n", rc); -- abort(); -+ abort(); - } - -- return (((u_int64_t)ts.tv_sec) * NANOSEC) + ts.tv_nsec; -+ return ((((u_int64_t)ts.tv_sec) * NANOSEC) + ts.tv_nsec); - } -diff --git a/lib/libspl/getmntany.c b/lib/libspl/getmntany.c -index f0b1cda..d78357a 100644 ---- a/lib/libspl/getmntany.c -+++ b/lib/libspl/getmntany.c -@@ -39,3 +39,3 @@ - --#define BUFSIZE (MNT_LINE_MAX + 2) -+#define BUFSIZE (MNT_LINE_MAX + 2) - -@@ -43,4 +43,5 @@ __thread char buf[BUFSIZE]; - --#define DIFF(xx) ((mrefp->xx != NULL) && \ -- (mgetp->xx == NULL || strcmp(mrefp->xx, mgetp->xx) != 0)) -+#define DIFF(xx) ( \ -+ (mrefp->xx != NULL) && \ -+ (mgetp->xx == NULL || strcmp(mrefp->xx, mgetp->xx) != 0)) - -@@ -51,7 +52,8 @@ getmntany(FILE *fp, struct mnttab *mgetp, struct mnttab *mrefp) - -- while (((ret = _sol_getmntent(fp, mgetp)) == 0) && -- (DIFF(mnt_special) || DIFF(mnt_mountp) || -- DIFF(mnt_fstype) || DIFF(mnt_mntopts))); -+ while ( -+ ((ret = _sol_getmntent(fp, mgetp)) == 0) && ( -+ DIFF(mnt_special) || DIFF(mnt_mountp) || -+ DIFF(mnt_fstype) || DIFF(mnt_mntopts))); - -- return ret; -+ return (ret); - } -@@ -71,3 +73,3 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp) - mgetp->mnt_mntopts = mntbuf.mnt_opts; -- return 0; -+ return (0); - } -@@ -75,5 +77,5 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp) - if (feof(fp)) -- return -1; -+ return (-1); - -- return MNT_TOOLONG; -+ return (MNT_TOOLONG); - } -@@ -91,3 +93,3 @@ getextmntent(FILE *fp, struct extmnttab *mp, int len) - mp->mnt_minor = 0; -- return ret; -+ return (ret); - } -@@ -97,3 +99,3 @@ getextmntent(FILE *fp, struct extmnttab *mp, int len) - -- return ret; -+ return (ret); - } -diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h -index 3704165..d749d1e 100644 ---- a/lib/libspl/include/assert.h -+++ b/lib/libspl/include/assert.h -@@ -29,3 +29,3 @@ - #ifndef _LIBSPL_ASSERT_H --#define _LIBSPL_ASSERT_H -+#define _LIBSPL_ASSERT_H - -diff --git a/lib/libspl/include/devid.h b/lib/libspl/include/devid.h -index 9dfdae8..5406c33 100644 ---- a/lib/libspl/include/devid.h -+++ b/lib/libspl/include/devid.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_DEVID_H --#define _LIBSPL_DEVID_H -+#define _LIBSPL_DEVID_H - -@@ -38,10 +38,70 @@ typedef struct devid_nmlist { - --static inline int devid_str_decode(char *devidstr, ddi_devid_t *retdevid, char **retminor_name) { abort(); } --static inline int devid_deviceid_to_nmlist(char *search_path, ddi_devid_t devid, char *minor_name, devid_nmlist_t **retlist) { abort(); } --static inline void devid_str_free(char *str) { abort(); } --static inline void devid_free(ddi_devid_t devid) { abort(); } --static inline void devid_free_nmlist(devid_nmlist_t *list) { abort(); } --static inline int devid_get(int fd, ddi_devid_t *retdevid) { return -1; } --static inline int devid_get_minor_name(int fd, char **retminor_name) { abort(); } --static inline char *devid_str_encode(ddi_devid_t devid, char *minor_name) { abort(); } -+static inline -+int -+devid_str_decode( -+ char *devidstr, -+ ddi_devid_t *retdevid, -+ char **retminor_name) -+{ -+ abort(); -+} -+ -+static inline -+int -+devid_deviceid_to_nmlist( -+ char *search_path, -+ ddi_devid_t devid, -+ char *minor_name, -+ devid_nmlist_t **retlist) -+{ -+ abort(); -+} -+ -+static inline -+void 
-+devid_str_free(char *str) -+{ -+ abort(); -+} -+ -+static inline -+void -+devid_free(ddi_devid_t devid) -+{ -+ abort(); -+} -+ -+static inline -+void -+devid_free_nmlist(devid_nmlist_t *list) -+{ -+ abort(); -+} -+ -+static inline -+int -+devid_get( -+ int fd, -+ ddi_devid_t *retdevid) -+{ -+ return (-1); -+} -+ -+static inline -+int -+devid_get_minor_name( -+ int fd, -+ char **retminor_name) -+{ -+ abort(); -+} -+ -+static inline -+char * -+devid_str_encode( -+ ddi_devid_t devid, -+ char *minor_name) -+{ -+ abort(); -+} - -diff --git a/lib/libspl/include/libdevinfo.h b/lib/libspl/include/libdevinfo.h -index f0f9d7e..be1d291 100644 ---- a/lib/libspl/include/libdevinfo.h -+++ b/lib/libspl/include/libdevinfo.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_LIBDEVINFO_H --#define _LIBSPL_LIBDEVINFO_H -+#define _LIBSPL_LIBDEVINFO_H - -diff --git a/lib/libspl/include/libgen.h b/lib/libspl/include/libgen.h -index 29e5400..7c03d81 100644 ---- a/lib/libspl/include/libgen.h -+++ b/lib/libspl/include/libgen.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_LIBGEN_H --#define _LIBSPL_LIBGEN_H -+#define _LIBSPL_LIBGEN_H - -diff --git a/lib/libspl/include/libshare.h b/lib/libspl/include/libshare.h -index a35bfac..4016ff0 100644 ---- a/lib/libspl/include/libshare.h -+++ b/lib/libspl/include/libshare.h -@@ -26,3 +26,3 @@ - #ifndef _LIBSPL_LIBSHARE_H --#define _LIBSPL_LIBSHARE_H -+#define _LIBSPL_LIBSHARE_H - -diff --git a/lib/libspl/include/limits.h b/lib/libspl/include/limits.h -index 341a2eb..1a42cfe 100644 ---- a/lib/libspl/include/limits.h -+++ b/lib/libspl/include/limits.h -@@ -29,11 +29,11 @@ - #ifndef _LIBSPL_LIMITS_H --#define _LIBSPL_LIMITS_H -+#define _LIBSPL_LIMITS_H - --#define DBL_DIG 15 --#define DBL_MAX 1.7976931348623157081452E+308 --#define DBL_MIN 2.2250738585072013830903E-308 -+#define DBL_DIG 15 -+#define DBL_MAX 1.7976931348623157081452E+308 -+#define DBL_MIN 2.2250738585072013830903E-308 - --#define FLT_DIG 6 --#define FLT_MAX 3.4028234663852885981170E+38F --#define FLT_MIN 1.1754943508222875079688E-38F -+#define FLT_DIG 6 -+#define FLT_MAX 3.4028234663852885981170E+38F -+#define FLT_MIN 1.1754943508222875079688E-38F - -diff --git a/lib/libspl/include/locale.h b/lib/libspl/include/locale.h -index 98ca330..6c74df7 100644 ---- a/lib/libspl/include/locale.h -+++ b/lib/libspl/include/locale.h -@@ -29,3 +29,3 @@ - #ifndef _LIBSPL_LOCALE_H --#define _LIBSPL_LOCALE_H -+#define _LIBSPL_LOCALE_H - -diff --git a/lib/libspl/include/note.h b/lib/libspl/include/note.h -index ed6b4ba..cb6b33e 100644 ---- a/lib/libspl/include/note.h -+++ b/lib/libspl/include/note.h -@@ -38,3 +38,3 @@ - #ifndef _NOTE_H --#define _NOTE_H -+#define _NOTE_H - -@@ -46,3 +46,3 @@ extern "C" { - --#define NOTE _NOTE -+#define NOTE _NOTE - -diff --git a/lib/libspl/include/priv.h b/lib/libspl/include/priv.h -index 3e8b138..15b76a4 100644 ---- a/lib/libspl/include/priv.h -+++ b/lib/libspl/include/priv.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_PRIV_H --#define _LIBSPL_PRIV_H -+#define _LIBSPL_PRIV_H - -@@ -32,3 +32,3 @@ - /* Couldn't find this definition in OpenGrok */ --#define PRIV_SYS_CONFIG "sys_config" -+#define PRIV_SYS_CONFIG "sys_config" - -diff --git a/lib/libspl/include/rpc/types.h b/lib/libspl/include/rpc/types.h -index 68c17f1..aa9901f 100644 ---- a/lib/libspl/include/rpc/types.h -+++ b/lib/libspl/include/rpc/types.h -@@ -26,3 +26,3 @@ - #ifndef LIBSPL_RPC_TYPES_H --#define LIBSPL_RPC_TYPES_H -+#define LIBSPL_RPC_TYPES_H - -diff --git a/lib/libspl/include/rpc/xdr.h b/lib/libspl/include/rpc/xdr.h -index cd6680f..99500d6 100644 ---- 
a/lib/libspl/include/rpc/xdr.h -+++ b/lib/libspl/include/rpc/xdr.h -@@ -32,3 +32,3 @@ - #ifndef LIBSPL_RPC_XDR_H --#define LIBSPL_RPC_XDR_H -+#define LIBSPL_RPC_XDR_H - -@@ -57,6 +57,6 @@ typedef struct xdr_bytesrec { - */ --#define XDR_PEEK 2 --#define XDR_SKIPBYTES 3 --#define XDR_RDMAGET 4 --#define XDR_RDMASET 5 -+#define XDR_PEEK 2 -+#define XDR_SKIPBYTES 3 -+#define XDR_RDMAGET 4 -+#define XDR_RDMASET 5 - -diff --git a/lib/libspl/include/stdio.h b/lib/libspl/include/stdio.h -index f80fdc0..6152b09 100644 ---- a/lib/libspl/include/stdio.h -+++ b/lib/libspl/include/stdio.h -@@ -29,5 +29,5 @@ - #ifndef _LIBSPL_STDIO_H --#define _LIBSPL_STDIO_H -+#define _LIBSPL_STDIO_H - --#define enable_extended_FILE_stdio(fd, sig) ((void) 0) -+#define enable_extended_FILE_stdio(fd, sig) ((void) 0) - -diff --git a/lib/libspl/include/stdlib.h b/lib/libspl/include/stdlib.h -index 67d6e96..a4ce4f7 100644 ---- a/lib/libspl/include/stdlib.h -+++ b/lib/libspl/include/stdlib.h -@@ -29,3 +29,3 @@ - #ifndef _LIBSPL_STDLIB_H --#define _LIBSPL_STDLIB_H -+#define _LIBSPL_STDLIB_H - -diff --git a/lib/libspl/include/string.h b/lib/libspl/include/string.h -index 213977d..9e5133e 100644 ---- a/lib/libspl/include/string.h -+++ b/lib/libspl/include/string.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_STRING_H --#define _LIBSPL_STRING_H -+#define _LIBSPL_STRING_H - -diff --git a/lib/libspl/include/strings.h b/lib/libspl/include/strings.h -index 48944e1..3f35af4 100644 ---- a/lib/libspl/include/strings.h -+++ b/lib/libspl/include/strings.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_STRINGS_H --#define _LIBSPL_STRINGS_H -+#define _LIBSPL_STRINGS_H - -diff --git a/lib/libspl/include/synch.h b/lib/libspl/include/synch.h -index 2da270a..7ce2a53 100644 ---- a/lib/libspl/include/synch.h -+++ b/lib/libspl/include/synch.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYNCH_H --#define _LIBSPL_SYNCH_H -+#define _LIBSPL_SYNCH_H - -diff --git a/lib/libspl/include/sys/bitmap.h b/lib/libspl/include/sys/bitmap.h -index 8fef7fc..95122ab 100644 ---- a/lib/libspl/include/sys/bitmap.h -+++ b/lib/libspl/include/sys/bitmap.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_BITMAP_H --#define _LIBSPL_SYS_BITMAP_H -+#define _LIBSPL_SYS_BITMAP_H - -diff --git a/lib/libspl/include/sys/callb.h b/lib/libspl/include/sys/callb.h -index 29a6a67..8ffd187 100644 ---- a/lib/libspl/include/sys/callb.h -+++ b/lib/libspl/include/sys/callb.h -@@ -27,3 +27,3 @@ - #ifndef _SYS_CALLB_H --#define _SYS_CALLB_H -+#define _SYS_CALLB_H - -diff --git a/lib/libspl/include/sys/cmn_err.h b/lib/libspl/include/sys/cmn_err.h -index d199361..63ff4eb 100644 ---- a/lib/libspl/include/sys/cmn_err.h -+++ b/lib/libspl/include/sys/cmn_err.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_CMN_ERR_H --#define _LIBSPL_SYS_CMN_ERR_H -+#define _LIBSPL_SYS_CMN_ERR_H - -diff --git a/lib/libspl/include/sys/compress.h b/lib/libspl/include/sys/compress.h -index 6e03e73..282f178 100644 ---- a/lib/libspl/include/sys/compress.h -+++ b/lib/libspl/include/sys/compress.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_COMPRESS_H --#define _LIBSPL_SYS_COMPRESS_H -+#define _LIBSPL_SYS_COMPRESS_H - -diff --git a/lib/libspl/include/sys/cred.h b/lib/libspl/include/sys/cred.h -index 6a58315..463b3ab 100644 ---- a/lib/libspl/include/sys/cred.h -+++ b/lib/libspl/include/sys/cred.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_CRED_H --#define _LIBSPL_SYS_CRED_H -+#define _LIBSPL_SYS_CRED_H - -diff --git a/lib/libspl/include/sys/debug.h b/lib/libspl/include/sys/debug.h -index 0069620..fde4a01 100644 ---- a/lib/libspl/include/sys/debug.h -+++ 
b/lib/libspl/include/sys/debug.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_DEBUG_H --#define _LIBSPL_SYS_DEBUG_H -+#define _LIBSPL_SYS_DEBUG_H - -diff --git a/lib/libspl/include/sys/feature_tests.h b/lib/libspl/include/sys/feature_tests.h -index 96f6271..1a68b75 100644 ---- a/lib/libspl/include/sys/feature_tests.h -+++ b/lib/libspl/include/sys/feature_tests.h -@@ -27,5 +27,5 @@ - #ifndef _SYS_FEATURE_TESTS_H --#define _SYS_FEATURE_TESTS_H -+#define _SYS_FEATURE_TESTS_H - --#define __NORETURN __attribute__((__noreturn__)) -+#define __NORETURN __attribute__((__noreturn__)) - -diff --git a/lib/libspl/include/sys/file.h b/lib/libspl/include/sys/file.h -index 9aaba35..163a4dc 100644 ---- a/lib/libspl/include/sys/file.h -+++ b/lib/libspl/include/sys/file.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_FILE_H --#define _LIBSPL_SYS_FILE_H -+#define _LIBSPL_SYS_FILE_H - -@@ -33,17 +33,17 @@ - --#define FREAD 1 --#define FWRITE 2 --//#define FAPPEND 8 -+#define FREAD 1 -+#define FWRITE 2 -+// #define FAPPEND 8 - --#define FCREAT O_CREAT --#define FTRUNC O_TRUNC --#define FOFFMAX O_LARGEFILE --#define FSYNC O_SYNC --#define FDSYNC O_DSYNC --#define FRSYNC O_RSYNC --#define FEXCL O_EXCL -+#define FCREAT O_CREAT -+#define FTRUNC O_TRUNC -+#define FOFFMAX O_LARGEFILE -+#define FSYNC O_SYNC -+#define FDSYNC O_DSYNC -+#define FRSYNC O_RSYNC -+#define FEXCL O_EXCL - --#define FNODSYNC 0x10000 /* fsync pseudo flag */ --#define FNOFOLLOW 0x20000 /* don't follow symlinks */ --#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ -+#define FNODSYNC 0x10000 /* fsync pseudo flag */ -+#define FNOFOLLOW 0x20000 /* don't follow symlinks */ -+#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ - -diff --git a/lib/libspl/include/sys/frame.h b/lib/libspl/include/sys/frame.h -index f936ab8..a4c7d8b 100644 ---- a/lib/libspl/include/sys/frame.h -+++ b/lib/libspl/include/sys/frame.h -@@ -27,3 +27,3 @@ - #ifndef _SYS_FRAME_H --#define _SYS_FRAME_H -+#define _SYS_FRAME_H - -diff --git a/lib/libspl/include/sys/int_limits.h b/lib/libspl/include/sys/int_limits.h -index 2b50ddd..7af68cd 100644 ---- a/lib/libspl/include/sys/int_limits.h -+++ b/lib/libspl/include/sys/int_limits.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_INT_LIMITS_H --#define _LIBSPL_SYS_INT_LIMITS_H -+#define _LIBSPL_SYS_INT_LIMITS_H - -diff --git a/lib/libspl/include/sys/int_types.h b/lib/libspl/include/sys/int_types.h -index b325122..51e9e02 100644 ---- a/lib/libspl/include/sys/int_types.h -+++ b/lib/libspl/include/sys/int_types.h -@@ -27,3 +27,3 @@ - #ifndef _SOL_SYS_INT_TYPES_H --#define _SOL_SYS_INT_TYPES_H -+#define _SOL_SYS_INT_TYPES_H - -diff --git a/lib/libspl/include/sys/inttypes.h b/lib/libspl/include/sys/inttypes.h -index 7630f2d..d7d0639 100644 ---- a/lib/libspl/include/sys/inttypes.h -+++ b/lib/libspl/include/sys/inttypes.h -@@ -27,3 +27,3 @@ - #ifndef _SOL_SYS_INTTYPES_H --#define _SOL_SYS_INTTYPES_H -+#define _SOL_SYS_INTTYPES_H - -@@ -31,3 +31,3 @@ - --#define _INT64_TYPE -+#define _INT64_TYPE - -diff --git a/lib/libspl/include/sys/isa_defs.h b/lib/libspl/include/sys/isa_defs.h -index 4ab07eb..446dbfc 100644 ---- a/lib/libspl/include/sys/isa_defs.h -+++ b/lib/libspl/include/sys/isa_defs.h -@@ -37,3 +37,3 @@ extern "C" { - #if !defined(__x86_64) --#define __x86_64 -+#define __x86_64 - #endif -@@ -41,3 +41,3 @@ extern "C" { - #if !defined(__amd64) --#define __amd64 -+#define __amd64 - #endif -@@ -45,3 +45,3 @@ extern "C" { - #if !defined(__x86) --#define __x86 -+#define __x86 - #endif -@@ -49,3 +49,3 @@ extern "C" { - #if 
!defined(_LP64) --#define _LP64 -+#define _LP64 - #endif -@@ -53,6 +53,6 @@ extern "C" { - #if !defined(_LITTLE_ENDIAN) --#define _LITTLE_ENDIAN -+#define _LITTLE_ENDIAN - #endif - --#define _SUNOS_VTOC_16 -+#define _SUNOS_VTOC_16 - -@@ -62,3 +62,3 @@ extern "C" { - #if !defined(__i386) --#define __i386 -+#define __i386 - #endif -@@ -66,3 +66,3 @@ extern "C" { - #if !defined(__x86) --#define __x86 -+#define __x86 - #endif -@@ -70,3 +70,3 @@ extern "C" { - #if !defined(_ILP32) --#define _ILP32 -+#define _ILP32 - #endif -@@ -74,6 +74,6 @@ extern "C" { - #if !defined(_LITTLE_ENDIAN) --#define _LITTLE_ENDIAN -+#define _LITTLE_ENDIAN - #endif - --#define _SUNOS_VTOC_16 -+#define _SUNOS_VTOC_16 - -@@ -83,3 +83,3 @@ extern "C" { - #if !defined(__powerpc) --#define __powerpc -+#define __powerpc - #endif -@@ -87,3 +87,3 @@ extern "C" { - #if !defined(__powerpc__) --#define __powerpc__ -+#define __powerpc__ - #endif -@@ -92,5 +92,5 @@ extern "C" { - #ifdef __powerpc64__ --#define _LP64 -+#define _LP64 - #else --#define _LP32 -+#define _LP32 - #endif -@@ -99,6 +99,6 @@ extern "C" { - #if !defined(_BIG_ENDIAN) --#define _BIG_ENDIAN -+#define _BIG_ENDIAN - #endif - --#define _SUNOS_VTOC_16 -+#define _SUNOS_VTOC_16 - -@@ -108,3 +108,3 @@ extern "C" { - #if !defined(__arm) --#define __arm -+#define __arm - #endif -@@ -112,3 +112,3 @@ extern "C" { - #if !defined(__arm__) --#define __arm__ -+#define __arm__ - #endif -@@ -116,10 +116,38 @@ extern "C" { - #if defined(__ARMEL__) --#define _LITTLE_ENDIAN -+#define _LITTLE_ENDIAN - #else --#define _BIG_ENDIAN -+#define _BIG_ENDIAN - #endif - --#define _SUNOS_VTOC_16 -+#define _SUNOS_VTOC_16 - --#else /* Currently only x86_64, i386, arm, and powerpc arches supported */ -+/* sparc arch specific defines */ -+#elif defined(__sparc) || defined(__sparc__) -+ -+#if !defined(__sparc) -+#define __sparc -+#endif -+ -+#if !defined(__sparc__) -+#define __sparc__ -+#endif -+ -+#define _BIG_ENDIAN -+#define _SUNOS_VTOC_16 -+ -+/* sparc64 arch specific defines */ -+#elif defined(__sparc64) || defined(__sparc64__) -+ -+#if !defined(__sparc64) -+#define __sparc64 -+#endif -+ -+#if !defined(__sparc64__) -+#define __sparc64__ -+#endif -+ -+#define _BIG_ENDIAN -+#define _SUNOS_VTOC_16 -+ -+#else /* Currently x86_64, i386, arm, powerpc, and sparc are supported */ - #error "Unsupported ISA type" -diff --git a/lib/libspl/include/sys/kmem.h b/lib/libspl/include/sys/kmem.h -index 401e040..83d4756 100644 ---- a/lib/libspl/include/sys/kmem.h -+++ b/lib/libspl/include/sys/kmem.h -@@ -37,4 +37,4 @@ extern "C" { - --#define kmem_alloc(size, flags) malloc(size) --#define kmem_free(ptr, size) free(ptr) -+#define kmem_alloc(size, flags) malloc(size) -+#define kmem_free(ptr, size) free(ptr) - -diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h -index 6bd2ec8..fcd3ed9 100644 ---- a/lib/libspl/include/sys/kstat.h -+++ b/lib/libspl/include/sys/kstat.h -@@ -230,6 +230,4 @@ typedef struct kstat32 { - /* ks_ndata >= 1 */ --#define KSTAT_TYPE_TXG 5 /* txg statistics */ -- /* ks_ndata >= 0 */ - --#define KSTAT_NUM_TYPES 6 -+#define KSTAT_NUM_TYPES 5 - -@@ -702,25 +700,2 @@ typedef struct kstat_timer { - --/* -- * TXG statistics - bytes read/written and iops performed -- */ --typedef enum kstat_txg_state { -- TXG_STATE_OPEN = 1, -- TXG_STATE_QUIESCING = 2, -- TXG_STATE_SYNCING = 3, -- TXG_STATE_COMMITTED = 4, --} kstat_txg_state_t; -- --typedef struct kstat_txg { -- u_longlong_t txg; /* txg id */ -- kstat_txg_state_t state; /* txg state */ -- hrtime_t birth; /* birth time stamp 
*/ -- u_longlong_t nread; /* number of bytes read */ -- u_longlong_t nwritten; /* number of bytes written */ -- uint_t reads; /* number of read operations */ -- uint_t writes; /* number of write operations */ -- hrtime_t open_time; /* open time */ -- hrtime_t quiesce_time; /* quiesce time */ -- hrtime_t sync_time; /* sync time */ --} kstat_txg_t; -- - #if defined(_KERNEL) -diff --git a/lib/libspl/include/sys/mkdev.h b/lib/libspl/include/sys/mkdev.h -index 76e3a4f..5978de6 100644 ---- a/lib/libspl/include/sys/mkdev.h -+++ b/lib/libspl/include/sys/mkdev.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_MKDEV_H --#define _LIBSPL_SYS_MKDEV_H -+#define _LIBSPL_SYS_MKDEV_H - -diff --git a/lib/libspl/include/sys/mntent.h b/lib/libspl/include/sys/mntent.h -index 8fad65b..b57ffee 100644 ---- a/lib/libspl/include/sys/mntent.h -+++ b/lib/libspl/include/sys/mntent.h -@@ -41,2 +41,3 @@ - #define MOUNT_SOMEOK 0x40 /* At least on mount succeeded */ -+#define MOUNT_BUSY 0x80 /* Mount failed due to EBUSY */ - -@@ -48,3 +49,2 @@ - #define MNTOPT_CONTEXT "context" /* selinux context */ --#define MNTOPT_NOCONTEXT "nocontext" /* No selinux context (zfs-only) */ - #define MNTOPT_FSCONTEXT "fscontext" /* selinux fscontext */ -@@ -94,6 +94,8 @@ - #define MNTOPT_ZFSUTIL "zfsutil" /* called by zfs utility */ -+#define MNTOPT_ACL "acl" /* passed by util-linux-2.24 mount */ -+#define MNTOPT_NOACL "noacl" /* likewise */ -+#define MNTOPT_POSIXACL "posixacl" /* likewise */ - --#define ZS_COMMENT 0x00000000 /* comment */ --#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */ --#define ZS_NOCONTEXT 0x00000002 /* do not add selinux context */ -+#define ZS_COMMENT 0x00000000 /* comment */ -+#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */ - -diff --git a/lib/libspl/include/sys/mnttab.h b/lib/libspl/include/sys/mnttab.h -index a30549a..6bfbdd6 100644 ---- a/lib/libspl/include/sys/mnttab.h -+++ b/lib/libspl/include/sys/mnttab.h -@@ -21,3 +21,3 @@ - */ --/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T*/ -+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ - /* All Rights Reserved */ -@@ -30,3 +30,3 @@ - #ifndef _SYS_MNTTAB_H --#define _SYS_MNTTAB_H -+#define _SYS_MNTTAB_H - -@@ -79,7 +79,7 @@ static inline char *_sol_hasmntopt(struct mnttab *mnt, char *opt) - -- return hasmntopt(&mnt_new, opt); -+ return (hasmntopt(&mnt_new, opt)); - } - --#define hasmntopt _sol_hasmntopt --#define getmntent _sol_getmntent -+#define hasmntopt _sol_hasmntopt -+#define getmntent _sol_getmntent - -diff --git a/lib/libspl/include/sys/mount.h b/lib/libspl/include/sys/mount.h -index 7b1e06b..41cd839 100644 ---- a/lib/libspl/include/sys/mount.h -+++ b/lib/libspl/include/sys/mount.h -@@ -29,3 +29,3 @@ - #ifndef _LIBSPL_SYS_MOUNT_H --#define _LIBSPL_SYS_MOUNT_H -+#define _LIBSPL_SYS_MOUNT_H - -@@ -41,3 +41,3 @@ - #if !defined(BLKGETSIZE64) --#define BLKGETSIZE64 _IOR(0x12, 114, size_t) -+#define BLKGETSIZE64 _IOR(0x12, 114, size_t) - #endif -@@ -50,3 +50,12 @@ - #if !defined(MS_DIRSYNC) --#define MS_DIRSYNC S_WRITE -+#define MS_DIRSYNC S_WRITE -+#endif -+ -+/* -+ * Some old glibc headers don't correctly define MS_POSIXACL and -+ * instead leave it undefined. When using these older headers define -+ * MS_POSIXACL to the reserved value of (1<<16). 
-+ */ -+#if !defined(MS_POSIXACL) -+#define MS_POSIXACL (1<<16) - #endif -@@ -64,5 +73,5 @@ - #ifdef MNT_FORCE --# define MS_FORCE MNT_FORCE -+#define MS_FORCE MNT_FORCE - #else --# define MS_FORCE 0x00000001 -+#define MS_FORCE 0x00000001 - #endif /* MNT_FORCE */ -@@ -70,5 +79,5 @@ - #ifdef MNT_DETACH --# define MS_DETACH MNT_DETACH -+#define MS_DETACH MNT_DETACH - #else --# define MS_DETACH 0x00000002 -+#define MS_DETACH 0x00000002 - #endif /* MNT_DETACH */ -@@ -80,3 +89,3 @@ - */ --#define MS_OVERLAY 0x00000004 -+#define MS_OVERLAY 0x00000004 - -diff --git a/lib/libspl/include/sys/param.h b/lib/libspl/include/sys/param.h -index 75cf0b7..4090cef 100644 ---- a/lib/libspl/include/sys/param.h -+++ b/lib/libspl/include/sys/param.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_PARAM_H --#define _LIBSPL_SYS_PARAM_H -+#define _LIBSPL_SYS_PARAM_H - -@@ -45,17 +45,17 @@ - */ --#define MAXBSIZE 8192 --#define DEV_BSIZE 512 --#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ -+#define MAXBSIZE 8192 -+#define DEV_BSIZE 512 -+#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ - --#define MAXNAMELEN 256 --#define MAXOFFSET_T LLONG_MAX -+#define MAXNAMELEN 256 -+#define MAXOFFSET_T LLONG_MAX - --#define UID_NOBODY 60001 /* user ID no body */ --#define GID_NOBODY UID_NOBODY --#define UID_NOACCESS 60002 /* user ID no access */ -+#define UID_NOBODY 60001 /* user ID no body */ -+#define GID_NOBODY UID_NOBODY -+#define UID_NOACCESS 60002 /* user ID no access */ - --#define MAXUID UINT32_MAX /* max user id */ --#define MAXPROJID MAXUID /* max project id */ -+#define MAXUID UINT32_MAX /* max user id */ -+#define MAXPROJID MAXUID /* max project id */ - --#define PAGESIZE (sysconf(_SC_PAGESIZE)) -+#define PAGESIZE (sysconf(_SC_PAGESIZE)) - -diff --git a/lib/libspl/include/sys/priv.h b/lib/libspl/include/sys/priv.h -index 4a3ab96..76c76d1 100644 ---- a/lib/libspl/include/sys/priv.h -+++ b/lib/libspl/include/sys/priv.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_PRIV_H --#define _LIBSPL_SYS_PRIV_H -+#define _LIBSPL_SYS_PRIV_H - -diff --git a/lib/libspl/include/sys/processor.h b/lib/libspl/include/sys/processor.h -index 0af9dc0..78e95d0 100644 ---- a/lib/libspl/include/sys/processor.h -+++ b/lib/libspl/include/sys/processor.h -@@ -27,5 +27,5 @@ - #ifndef _LIBSPL_SYS_PROCESSOR_H --#define _LIBSPL_SYS_PROCESSOR_H -+#define _LIBSPL_SYS_PROCESSOR_H - --#define getcpuid() (-1) -+#define getcpuid() (-1) - -diff --git a/lib/libspl/include/sys/sdt.h b/lib/libspl/include/sys/sdt.h -index 79733ee..f68f790 100644 ---- a/lib/libspl/include/sys/sdt.h -+++ b/lib/libspl/include/sys/sdt.h -@@ -27,9 +27,9 @@ - #ifndef _LIBSPL_SYS_SDT_H --#define _LIBSPL_SYS_SDT_H -+#define _LIBSPL_SYS_SDT_H - --#define DTRACE_PROBE(a) ((void) 0) --#define DTRACE_PROBE1(a,b,c) ((void) 0) --#define DTRACE_PROBE2(a,b,c,d,e) ((void) 0) --#define DTRACE_PROBE3(a,b,c,d,e,f,g) ((void) 0) --#define DTRACE_PROBE4(a,b,c,d,e,f,g,h,i) ((void) 0) -+#define DTRACE_PROBE(a) ((void) 0) -+#define DTRACE_PROBE1(a, b, c) ((void) 0) -+#define DTRACE_PROBE2(a, b, c, d, e) ((void) 0) -+#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void) 0) -+#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void) 0) - -diff --git a/lib/libspl/include/sys/stack.h b/lib/libspl/include/sys/stack.h -index 41f0beb..59807e9 100644 ---- a/lib/libspl/include/sys/stack.h -+++ b/lib/libspl/include/sys/stack.h -@@ -25,3 +25,3 @@ - #ifndef _SYS_STACK_H --#define _SYS_STACK_H -+#define _SYS_STACK_H - -@@ -29,3 +29,3 @@ - --#define STACK_BIAS 0 -+#define STACK_BIAS 0 - -@@ -41,3 +41,3 @@ stack_getbounds(stack_t *sp) - if 
(rc) -- return rc; -+ return (rc); - -@@ -49,3 +49,3 @@ stack_getbounds(stack_t *sp) - -- return rc; -+ return (rc); - } -@@ -59,7 +59,9 @@ thr_stksegment(stack_t *sp) - if (rc) -- return rc; -+ return (rc); - -- /* thr_stksegment() is expected to set sp.ss_sp to the high stack -- * address, but the stack_getbounds() interface is expected to -- * set sp.ss_sp to the low address. Adjust accordingly. */ -+ /* -+ * thr_stksegment() is expected to set sp.ss_sp to the high stack -+ * address, but the stack_getbounds() interface is expected to -+ * set sp.ss_sp to the low address. Adjust accordingly. -+ */ - sp->ss_sp = (void *)(((uintptr_t)sp->ss_sp) + sp->ss_size); -@@ -67,3 +69,3 @@ thr_stksegment(stack_t *sp) - -- return rc; -+ return (rc); - } -diff --git a/lib/libspl/include/sys/stat.h b/lib/libspl/include/sys/stat.h -index b9ad152..3e8d27e 100644 ---- a/lib/libspl/include/sys/stat.h -+++ b/lib/libspl/include/sys/stat.h -@@ -26,3 +26,3 @@ - #ifndef _LIBSPL_SYS_STAT_H --#define _LIBSPL_SYS_STAT_H -+#define _LIBSPL_SYS_STAT_H - -@@ -39,3 +39,3 @@ fstat64_blk(int fd, struct stat64 *st) - if (fstat64(fd, st) == -1) -- return -1; -+ return (-1); - -@@ -44,6 +44,6 @@ fstat64_blk(int fd, struct stat64 *st) - if (ioctl(fd, BLKGETSIZE64, &st->st_size) != 0) -- return -1; -+ return (-1); - } - -- return 0; -+ return (0); - } -diff --git a/lib/libspl/include/sys/stropts.h b/lib/libspl/include/sys/stropts.h -index e036b0e..08c2e79 100644 ---- a/lib/libspl/include/sys/stropts.h -+++ b/lib/libspl/include/sys/stropts.h -@@ -26,3 +26,3 @@ - #ifndef _LIBSPL_SYS_STROPTS_H --#define _LIBSPL_SYS_STROPTS_H -+#define _LIBSPL_SYS_STROPTS_H - -diff --git a/lib/libspl/include/sys/sysevent.h b/lib/libspl/include/sys/sysevent.h -index 980d145..074d841 100644 ---- a/lib/libspl/include/sys/sysevent.h -+++ b/lib/libspl/include/sys/sysevent.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_SYSEVENT_H --#define _LIBSPL_SYS_SYSEVENT_H -+#define _LIBSPL_SYS_SYSEVENT_H - -diff --git a/lib/libspl/include/sys/sysmacros.h b/lib/libspl/include/sys/sysmacros.h -index 07ab8c9..698b0a7 100644 ---- a/lib/libspl/include/sys/sysmacros.h -+++ b/lib/libspl/include/sys/sysmacros.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_SYSMACROS_H --#define _LIBSPL_SYS_SYSMACROS_H -+#define _LIBSPL_SYS_SYSMACROS_H - -@@ -33,14 +33,14 @@ - #ifndef MIN --#define MIN(a, b) ((a) < (b) ? (a) : (b)) -+#define MIN(a, b) ((a) < (b) ? (a) : (b)) - #endif - #ifndef MAX --#define MAX(a, b) ((a) < (b) ? (b) : (a)) -+#define MAX(a, b) ((a) < (b) ? (b) : (a)) - #endif - #ifndef ABS --#define ABS(a) ((a) < 0 ? -(a) : (a)) -+#define ABS(a) ((a) < 0 ? 
-(a) : (a)) - #endif - --#define makedevice(maj,min) makedev(maj,min) --#define _sysconf(a) sysconf(a) --#define __NORETURN __attribute__ ((noreturn)) -+#define makedevice(maj, min) makedev(maj, min) -+#define _sysconf(a) sysconf(a) -+#define __NORETURN __attribute__((noreturn)) - -@@ -49,15 +49,15 @@ - */ --#define P2ALIGN(x, align) ((x) & -(align)) --#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) --#define P2ROUNDUP(x, align) (-(-(x) & -(align))) --#define P2ROUNDUP_TYPED(x, align, type) \ -+#define P2ALIGN(x, align) ((x) & -(align)) -+#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) -+#define P2ROUNDUP(x, align) (-(-(x) & -(align))) -+#define P2ROUNDUP_TYPED(x, align, type) \ - (-(-(type)(x) & -(type)(align))) --#define P2BOUNDARY(off, len, align) \ -+#define P2BOUNDARY(off, len, align) \ - (((off) ^ ((off) + (len) - 1)) > (align) - 1) --#define P2PHASE(x, align) ((x) & ((align) - 1)) --#define P2NPHASE(x, align) (-(x) & ((align) - 1)) --#define P2NPHASE_TYPED(x, align, type) \ -+#define P2PHASE(x, align) ((x) & ((align) - 1)) -+#define P2NPHASE(x, align) (-(x) & ((align) - 1)) -+#define P2NPHASE_TYPED(x, align, type) \ - (-(type)(x) & ((type)(align) - 1)) --#define ISP2(x) (((x) & ((x) - 1)) == 0) --#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) -+#define ISP2(x) (((x) & ((x) - 1)) == 0) -+#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) - -@@ -74,18 +74,18 @@ - */ --#define P2ALIGN_TYPED(x, align, type) \ -- ((type)(x) & -(type)(align)) --#define P2PHASE_TYPED(x, align, type) \ -- ((type)(x) & ((type)(align) - 1)) --#define P2NPHASE_TYPED(x, align, type) \ -- (-(type)(x) & ((type)(align) - 1)) --#define P2ROUNDUP_TYPED(x, align, type) \ -- (-(-(type)(x) & -(type)(align))) --#define P2END_TYPED(x, align, type) \ -- (-(~(type)(x) & -(type)(align))) --#define P2PHASEUP_TYPED(x, align, phase, type) \ -- ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) --#define P2CROSS_TYPED(x, y, align, type) \ -- (((type)(x) ^ (type)(y)) > (type)(align) - 1) --#define P2SAMEHIGHBIT_TYPED(x, y, type) \ -- (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) -+#define P2ALIGN_TYPED(x, align, type) \ -+ ((type)(x) & -(type)(align)) -+#define P2PHASE_TYPED(x, align, type) \ -+ ((type)(x) & ((type)(align) - 1)) -+#define P2NPHASE_TYPED(x, align, type) \ -+ (-(type)(x) & ((type)(align) - 1)) -+#define P2ROUNDUP_TYPED(x, align, type) \ -+ (-(-(type)(x) & -(type)(align))) -+#define P2END_TYPED(x, align, type) \ -+ (-(~(type)(x) & -(type)(align))) -+#define P2PHASEUP_TYPED(x, align, phase, type) \ -+ ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) -+#define P2CROSS_TYPED(x, y, align, type) \ -+ (((type)(x) ^ (type)(y)) > (type)(align) - 1) -+#define P2SAMEHIGHBIT_TYPED(x, y, type) \ -+ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) - -diff --git a/lib/libspl/include/sys/systeminfo.h b/lib/libspl/include/sys/systeminfo.h -index 9f561aa..3f7cef5 100644 ---- a/lib/libspl/include/sys/systeminfo.h -+++ b/lib/libspl/include/sys/systeminfo.h -@@ -27,6 +27,6 @@ - #ifndef _LIBSPL_SYS_SYSTEMINFO_H --#define _LIBSPL_SYS_SYSTEMINFO_H -+#define _LIBSPL_SYS_SYSTEMINFO_H - --#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ --#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ -+#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ -+#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ - /* to hold a decimal or hex */ -@@ -34,3 +34,3 @@ - --#define sysinfo(cmd,buf,cnt) (-1) -+#define 
sysinfo(cmd, buf, cnt) (-1) - -diff --git a/lib/libspl/include/sys/systm.h b/lib/libspl/include/sys/systm.h -index 5cb088d..1ed031d 100644 ---- a/lib/libspl/include/sys/systm.h -+++ b/lib/libspl/include/sys/systm.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_SYSTM_H --#define _LIBSPL_SYS_SYSTM_H -+#define _LIBSPL_SYS_SYSTM_H - -diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h -index 0cbbd92..f0da440 100644 ---- a/lib/libspl/include/sys/time.h -+++ b/lib/libspl/include/sys/time.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_TIME_H --#define _LIBSPL_SYS_TIME_H -+#define _LIBSPL_SYS_TIME_H - -@@ -33,3 +33,3 @@ - #ifndef SEC --#define SEC 1 -+#define SEC 1 - #endif -@@ -37,3 +37,3 @@ - #ifndef MILLISEC --#define MILLISEC 1000 -+#define MILLISEC 1000 - #endif -@@ -41,3 +41,3 @@ - #ifndef MICROSEC --#define MICROSEC 1000000 -+#define MICROSEC 1000000 - #endif -@@ -45,3 +45,3 @@ - #ifndef NANOSEC --#define NANOSEC 1000000000 -+#define NANOSEC 1000000000 - #endif -@@ -49,3 +49,11 @@ - #ifndef NSEC_PER_USEC --#define NSEC_PER_USEC 1000L -+#define NSEC_PER_USEC 1000L -+#endif -+ -+#ifndef MSEC2NSEC -+#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) -+#endif -+ -+#ifndef NSEC2MSEC -+#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) - #endif -diff --git a/lib/libspl/include/sys/types.h b/lib/libspl/include/sys/types.h -index 77a5b23..bd34dec 100644 ---- a/lib/libspl/include/sys/types.h -+++ b/lib/libspl/include/sys/types.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_TYPES_H --#define _LIBSPL_SYS_TYPES_H -+#define _LIBSPL_SYS_TYPES_H - -@@ -55,5 +55,5 @@ typedef longlong_t diskaddr_t; - --typedef ulong_t pfn_t; /* page frame number */ --typedef ulong_t pgcnt_t; /* number of pages */ --typedef long spgcnt_t; /* signed number of pages */ -+typedef ulong_t pfn_t; /* page frame number */ -+typedef ulong_t pgcnt_t; /* number of pages */ -+typedef long spgcnt_t; /* signed number of pages */ - -diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h -index 8adc923..97e8412 100644 ---- a/lib/libspl/include/sys/uio.h -+++ b/lib/libspl/include/sys/uio.h -@@ -47,4 +47,4 @@ typedef struct iovec iovec_t; - typedef enum uio_rw { -- UIO_READ = 0, -- UIO_WRITE = 1, -+ UIO_READ = 0, -+ UIO_WRITE = 1, - } uio_rw_t; -@@ -52,5 +52,5 @@ typedef enum uio_rw { - typedef enum uio_seg { -- UIO_USERSPACE = 0, -- UIO_SYSSPACE = 1, -- UIO_USERISPACE= 2, -+ UIO_USERSPACE = 0, -+ UIO_SYSSPACE = 1, -+ UIO_USERISPACE = 2, - } uio_seg_t; -@@ -104,4 +104,4 @@ typedef struct xuio { - --#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv --#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw -+#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -+#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - -diff --git a/lib/libspl/include/sys/utsname.h b/lib/libspl/include/sys/utsname.h -index fd323b9..e16e22d 100644 ---- a/lib/libspl/include/sys/utsname.h -+++ b/lib/libspl/include/sys/utsname.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_UTSNAME_H --#define _LIBSPL_UTSNAME_H -+#define _LIBSPL_UTSNAME_H - -diff --git a/lib/libspl/include/sys/va_list.h b/lib/libspl/include/sys/va_list.h -index cf60454..04ad148 100644 ---- a/lib/libspl/include/sys/va_list.h -+++ b/lib/libspl/include/sys/va_list.h -@@ -27,3 +27,3 @@ - #ifndef _SYS_VA_LIST_H --#define _SYS_VA_LIST_H -+#define _SYS_VA_LIST_H - -diff --git a/lib/libspl/include/sys/varargs.h b/lib/libspl/include/sys/varargs.h -index b8a63d8..3d00a33 100644 ---- a/lib/libspl/include/sys/varargs.h -+++ b/lib/libspl/include/sys/varargs.h -@@ -27,3 
+27,3 @@ - #ifndef _LIBSPL_SYS_VARARGS_H --#define _LIBSPL_SYS_VARARGS_H -+#define _LIBSPL_SYS_VARARGS_H - -diff --git a/lib/libspl/include/sys/vnode.h b/lib/libspl/include/sys/vnode.h -index f25e9e9..efcdd2c 100644 ---- a/lib/libspl/include/sys/vnode.h -+++ b/lib/libspl/include/sys/vnode.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_VNODE_H --#define _LIBSPL_SYS_VNODE_H -+#define _LIBSPL_SYS_VNODE_H - -diff --git a/lib/libspl/include/sys/zone.h b/lib/libspl/include/sys/zone.h -index ea7c8bd..bbb964d 100644 ---- a/lib/libspl/include/sys/zone.h -+++ b/lib/libspl/include/sys/zone.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_SYS_ZONE_H --#define _LIBSPL_SYS_ZONE_H -+#define _LIBSPL_SYS_ZONE_H - -diff --git a/lib/libspl/include/thread.h b/lib/libspl/include/thread.h -index a72f6d2..74694e2 100644 ---- a/lib/libspl/include/thread.h -+++ b/lib/libspl/include/thread.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_THREAD_H --#define _LIBSPL_THREAD_H -+#define _LIBSPL_THREAD_H - -diff --git a/lib/libspl/include/tzfile.h b/lib/libspl/include/tzfile.h -index 441b8cf..7bd4087 100644 ---- a/lib/libspl/include/tzfile.h -+++ b/lib/libspl/include/tzfile.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_TZFILE_H --#define _LIBSPL_TZFILE_H -+#define _LIBSPL_TZFILE_H - -diff --git a/lib/libspl/include/ucred.h b/lib/libspl/include/ucred.h -index 4ca424e..8178fde 100644 ---- a/lib/libspl/include/ucred.h -+++ b/lib/libspl/include/ucred.h -@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_UCRED_H --#define _LIBSPL_UCRED_H -+#define _LIBSPL_UCRED_H - -diff --git a/lib/libspl/include/umem.h b/lib/libspl/include/umem.h -index f102f66..0d0778c 100644 ---- a/lib/libspl/include/umem.h -+++ b/lib/libspl/include/umem.h -@@ -27,5 +27,6 @@ - #ifndef _LIBSPL_UMEM_H --#define _LIBSPL_UMEM_H -+#define _LIBSPL_UMEM_H - --/* XXX: We should use the real portable umem library if it is detected -+/* -+ * XXX: We should use the real portable umem library if it is detected - * at configure time. 
However, if the library is not available, we can -@@ -50,4 +51,4 @@ typedef void vmem_t; - */ --#define UMEM_DEFAULT 0x0000 /* normal -- may fail */ --#define UMEM_NOFAIL 0x0100 /* Never fails */ -+#define UMEM_DEFAULT 0x0000 /* normal -- may fail */ -+#define UMEM_NOFAIL 0x0100 /* Never fails */ - -@@ -56,8 +57,8 @@ typedef void vmem_t; - */ --#define UMC_NOTOUCH 0x00010000 --#define UMC_NODEBUG 0x00020000 --#define UMC_NOMAGAZINE 0x00040000 --#define UMC_NOHASH 0x00080000 -+#define UMC_NOTOUCH 0x00010000 -+#define UMC_NODEBUG 0x00020000 -+#define UMC_NOMAGAZINE 0x00040000 -+#define UMC_NOHASH 0x00080000 - --#define UMEM_CACHE_NAMELEN 31 -+#define UMEM_CACHE_NAMELEN 31 - -@@ -89,3 +90,3 @@ umem_alloc(size_t size, int flags) - -- return ptr; -+ return (ptr); - } -@@ -107,6 +108,8 @@ umem_alloc_aligned(size_t size, size_t align, int flags) - abort(); -- return NULL; -+ return (NULL); - } - -- return ptr; -+ ASSERT0(P2PHASE_TYPED(ptr, align, uint64_t)); -+ -+ return (ptr); - } -@@ -122,3 +125,3 @@ umem_zalloc(size_t size, int flags) - -- return ptr; -+ return (ptr); - } -@@ -135,7 +138,8 @@ umem_nofail_callback(umem_nofail_callback_t *cb) {} - static inline umem_cache_t * --umem_cache_create(char *name, size_t bufsize, size_t align, -- umem_constructor_t *constructor, -- umem_destructor_t *destructor, -- umem_reclaim_t *reclaim, -- void *priv, void *vmp, int cflags) -+umem_cache_create( -+ char *name, size_t bufsize, size_t align, -+ umem_constructor_t *constructor, -+ umem_destructor_t *destructor, -+ umem_reclaim_t *reclaim, -+ void *priv, void *vmp, int cflags) - { -@@ -143,3 +147,3 @@ umem_cache_create(char *name, size_t bufsize, size_t align, - -- cp = umem_alloc(sizeof(umem_cache_t), UMEM_DEFAULT); -+ cp = umem_alloc(sizeof (umem_cache_t), UMEM_DEFAULT); - if (cp) { -@@ -156,3 +160,3 @@ umem_cache_create(char *name, size_t bufsize, size_t align, - -- return cp; -+ return (cp); - } -@@ -162,3 +166,3 @@ umem_cache_destroy(umem_cache_t *cp) - { -- umem_free(cp, sizeof(umem_cache_t)); -+ umem_free(cp, sizeof (umem_cache_t)); - } -@@ -171,3 +175,4 @@ umem_cache_alloc(umem_cache_t *cp, int flags) - if (cp->cache_align != 0) -- ptr = umem_alloc_aligned(cp->cache_bufsize, cp->cache_align, flags); -+ ptr = umem_alloc_aligned( -+ cp->cache_bufsize, cp->cache_align, flags); - else -@@ -178,3 +183,3 @@ umem_cache_alloc(umem_cache_t *cp, int flags) - -- return ptr; -+ return (ptr); - } -diff --git a/lib/libspl/include/unistd.h b/lib/libspl/include/unistd.h -index dc95e28..53851f4 100644 ---- a/lib/libspl/include/unistd.h -+++ b/lib/libspl/include/unistd.h -@@ -29,17 +29,17 @@ - #ifndef _LIBSPL_UNISTD_H --#define _LIBSPL_UNISTD_H -+#define _LIBSPL_UNISTD_H - - #if !defined(HAVE_IOCTL_IN_UNISTD_H) --# if defined(HAVE_IOCTL_IN_SYS_IOCTL_H) --# include --# elif defined(HAVE_IOCTL_IN_STROPTS_H) --# include --# else --# error "System call ioctl() unavailable" --# endif --#endif -+#if defined(HAVE_IOCTL_IN_SYS_IOCTL_H) -+#include -+#elif defined(HAVE_IOCTL_IN_STROPTS_H) -+#include -+#else /* HAVE_IOCTL_IN_STROPTS_H */ -+#error "System call ioctl() unavailable" -+#endif /* HAVE_IOCTL_IN_SYS_IOCTL_H */ -+#endif /* !HAVE_IOCTL_IN_UNISTD_H */ - - #if !defined(HAVE_ISSETUGID) --# include --# define issetugid() (geteuid() == 0 || getegid() == 0) -+#include -+#define issetugid() (geteuid() == 0 || getegid() == 0) - #endif -diff --git a/lib/libspl/include/util/sscanf.h b/lib/libspl/include/util/sscanf.h -index 9d13bf2..ead36ac 100644 ---- a/lib/libspl/include/util/sscanf.h -+++ b/lib/libspl/include/util/sscanf.h 
-@@ -27,3 +27,3 @@ - #ifndef _LIBSPL_UTIL_SSCANF_H --#define _LIBSPL_UTIL_SSCANF_H -+#define _LIBSPL_UTIL_SSCANF_H - -diff --git a/lib/libspl/include/zone.h b/lib/libspl/include/zone.h -index dd24a1b..b4a6deb 100644 ---- a/lib/libspl/include/zone.h -+++ b/lib/libspl/include/zone.h -@@ -38,4 +38,4 @@ extern "C" { - --#define GLOBAL_ZONEID 0 --#define GLOBAL_ZONEID_NAME "global" -+#define GLOBAL_ZONEID 0 -+#define GLOBAL_ZONEID_NAME "global" - -diff --git a/lib/libspl/mkdirp.c b/lib/libspl/mkdirp.c -index f98e31e..2f09188 100644 ---- a/lib/libspl/mkdirp.c -+++ b/lib/libspl/mkdirp.c -@@ -148,4 +148,6 @@ simplify(const char *str) - -- if (!str) -+ if (!str) { -+ errno = ENOENT; - return (NULL); -+ } - -diff --git a/lib/libspl/zone.c b/lib/libspl/zone.c -index f4269a7..5ca93b2 100644 ---- a/lib/libspl/zone.c -+++ b/lib/libspl/zone.c -@@ -29,22 +29,25 @@ - --zoneid_t getzoneid() -+zoneid_t -+getzoneid() - { -- return GLOBAL_ZONEID; -+ return (GLOBAL_ZONEID); - } - --zoneid_t getzoneidbyname(const char *name) -+zoneid_t -+getzoneidbyname(const char *name) - { -- if(name == NULL) -- return GLOBAL_ZONEID; -+ if (name == NULL) -+ return (GLOBAL_ZONEID); - -- if(strcmp(name, GLOBAL_ZONEID_NAME) == 0) -- return GLOBAL_ZONEID; -+ if (strcmp(name, GLOBAL_ZONEID_NAME) == 0) -+ return (GLOBAL_ZONEID); - -- return EINVAL; -+ return (EINVAL); - } - --ssize_t getzonenamebyid(zoneid_t id, char *buf, size_t buflen) -+ssize_t -+getzonenamebyid(zoneid_t id, char *buf, size_t buflen) - { -- if(id != GLOBAL_ZONEID) -- return EINVAL; -+ if (id != GLOBAL_ZONEID) -+ return (EINVAL); - -@@ -52,4 +55,4 @@ ssize_t getzonenamebyid(zoneid_t id, char *buf, size_t buflen) - -- if(buf == NULL || buflen == 0) -- return ret; -+ if (buf == NULL || buflen == 0) -+ return (ret); - -@@ -58,3 +61,3 @@ ssize_t getzonenamebyid(zoneid_t id, char *buf, size_t buflen) - -- return ret; -+ return (ret); - } -diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am -index 524efaa..8b1f517 100644 ---- a/lib/libzfs/Makefile.am -+++ b/lib/libzfs/Makefile.am -@@ -24,2 +24,3 @@ libzfs_la_SOURCES = \ - libzfs_la_LIBADD = \ -+ $(top_builddir)/lib/libzfs_core/libzfs_core.la \ - $(top_builddir)/lib/libshare/libshare.la \ -@@ -28,2 +29,3 @@ libzfs_la_LIBADD = \ - --libzfs_la_LDFLAGS = -lm -ldl -version-info 1:1:0 $(LIBSELINUX) -+libzfs_la_LIBADD += -lm -ldl $(LIBBLKID) -+libzfs_la_LDFLAGS = -version-info 2:0:0 -diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c -index 3a83e2d..0bcfc04 100644 ---- a/lib/libzfs/libzfs_changelist.c -+++ b/lib/libzfs/libzfs_changelist.c -@@ -293,6 +293,2 @@ changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) - cn = uu_list_next(clp->cl_list, cn)) { -- zfs_handle_t *hdl; -- -- hdl = cn->cn_handle; -- - /* -@@ -300,3 +296,3 @@ changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) - */ -- if (!isa_child_of(hdl->zfs_name, src)) -+ if (!isa_child_of(cn->cn_handle->zfs_name, src)) - continue; -@@ -306,13 +302,9 @@ changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) - */ -- remove_mountpoint(hdl); -+ remove_mountpoint(cn->cn_handle); - - (void) strlcpy(newname, dst, sizeof (newname)); -- (void) strcat(newname, hdl->zfs_name + strlen(src)); -- -- if (ZFS_IS_VOLUME(hdl)) { -- (void) zvol_remove_link(hdl->zfs_hdl, hdl->zfs_name); -- (void) zvol_create_link(hdl->zfs_hdl, newname); -- } -+ (void) strcat(newname, cn->cn_handle->zfs_name + strlen(src)); - -- (void) strlcpy(hdl->zfs_name, newname, sizeof (hdl->zfs_name)); -+ (void) 
strlcpy(cn->cn_handle->zfs_name, newname, -+ sizeof (cn->cn_handle->zfs_name)); - } -diff --git a/lib/libzfs/libzfs_config.c b/lib/libzfs/libzfs_config.c -index ee94fe1..4175635 100644 ---- a/lib/libzfs/libzfs_config.c -+++ b/lib/libzfs/libzfs_config.c -@@ -108,3 +108,3 @@ namespace_reload(libzfs_handle_t *hdl) - nvpair_t *elem; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - void *cookie; -@@ -263,3 +263,3 @@ zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int error; -diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c -index 244b687..5532531 100644 ---- a/lib/libzfs/libzfs_dataset.c -+++ b/lib/libzfs/libzfs_dataset.c -@@ -23,6 +23,8 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. - * Copyright (c) 2012 Pawel Jakub Dawidek . -- * Copyright 2012 Nexenta Systems, Inc. All rights reserved. -+ * Copyright (c) 2013 Martin Matuska. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -63,3 +65,2 @@ - --static int zvol_create_link_common(libzfs_handle_t *, const char *, int); - static int userquota_propname_decode(const char *propname, boolean_t zoned, -@@ -315,3 +316,3 @@ get_recvd_props_ioctl(zfs_handle_t *zhp) - nvlist_t *recvdprops; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int err; -@@ -378,3 +379,3 @@ get_stats(zfs_handle_t *zhp) - int rc = 0; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -418,4 +419,3 @@ make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) - else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER) -- return (-1); /* zpios' and other testing datasets are -- of this type, ignore if encountered */ -+ return (-1); - else -@@ -441,3 +441,3 @@ make_dataset_handle(libzfs_handle_t *hdl, const char *path) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -642,2 +642,3 @@ libzfs_mnttab_update(libzfs_handle_t *hdl) - mnttab_node_t *mtn; -+ avl_index_t where; - -@@ -645,2 +646,3 @@ libzfs_mnttab_update(libzfs_handle_t *hdl) - continue; -+ - mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); -@@ -650,2 +652,13 @@ libzfs_mnttab_update(libzfs_handle_t *hdl) - mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); -+ -+ /* Exclude duplicate mounts */ -+ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, &where) != NULL) { -+ free(mtn->mtn_mt.mnt_special); -+ free(mtn->mtn_mt.mnt_mountp); -+ free(mtn->mtn_mt.mnt_fstype); -+ free(mtn->mtn_mt.mnt_mntopts); -+ free(mtn); -+ continue; -+ } -+ - avl_add(&hdl->libzfs_mnttab_cache, mtn); -@@ -1410,2 +1423,3 @@ zfs_is_namespace_prop(zfs_prop_t prop) - case ZFS_PROP_ATIME: -+ case ZFS_PROP_RELATIME: - case ZFS_PROP_DEVICES: -@@ -1429,3 +1443,3 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int ret = -1; -@@ -1436,4 +1450,3 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) - zfs_prop_t prop; -- boolean_t do_prefix; -- uint64_t idx; -+ boolean_t do_prefix = B_TRUE; - int added_resv = 0; -@@ -1476,8 +1489,13 @@ zfs_prop_set(zfs_handle_t *zhp, const char 
*propname, const char *propval) - /* -- * If the dataset's canmount property is being set to noauto, -- * then we want to prevent unmounting & remounting it. -+ * We don't want to unmount & remount the dataset when changing -+ * its canmount property to 'on' or 'noauto'. We only use -+ * the changelist logic to unmount when setting canmount=off. - */ -- do_prefix = !((prop == ZFS_PROP_CANMOUNT) && -- (zprop_string_to_index(prop, propval, &idx, -- ZFS_TYPE_DATASET) == 0) && (idx == ZFS_CANMOUNT_NOAUTO)); -+ if (prop == ZFS_PROP_CANMOUNT) { -+ uint64_t idx; -+ int err = zprop_string_to_index(prop, propval, &idx, -+ ZFS_TYPE_DATASET); -+ if (err == 0 && idx != ZFS_CANMOUNT_OFF) -+ do_prefix = B_FALSE; -+ } - -@@ -1551,3 +1569,3 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int ret; -@@ -1639,2 +1657,11 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) - (void) get_stats(zhp); -+ -+ /* -+ * Remount the filesystem to propagate the change -+ * if one of the options handled by the generic -+ * Linux namespace layer has been modified. -+ */ -+ if (zfs_is_namespace_prop(prop) && -+ zfs_is_mounted(zhp, NULL)) -+ ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); - } -@@ -1726,3 +1753,3 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - nvlist_t *zplprops = NULL; -@@ -1741,2 +1768,7 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, - -+ case ZFS_PROP_RELATIME: -+ mntopt_on = MNTOPT_RELATIME; -+ mntopt_off = MNTOPT_NORELATIME; -+ break; -+ - case ZFS_PROP_DEVICES: -@@ -1801,2 +1833,3 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, - case ZFS_PROP_ATIME: -+ case ZFS_PROP_RELATIME: - case ZFS_PROP_DEVICES: -@@ -1868,2 +1901,6 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, - -+ case ZFS_PROP_INCONSISTENT: -+ *val = zhp->zfs_dmustats.dds_inconsistent; -+ break; -+ - default: -@@ -2000,6 +2037,3 @@ get_clones_cb(zfs_handle_t *zhp, void *arg) - if (strcmp(gca->buf, gca->origin) == 0) { -- if (nvlist_add_boolean(gca->value, zfs_get_name(zhp)) != 0) { -- zfs_close(zhp); -- return (no_memory(zhp->zfs_hdl)); -- } -+ fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); - gca->numclones--; -@@ -2116,3 +2150,4 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, - &t) == 0) -- (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t) val); -+ (void) snprintf(propbuf, proplen, "%llu", -+ (u_longlong_t) val); - } -@@ -2578,3 +2613,3 @@ zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, - int err; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -2623,3 +2658,3 @@ zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, - (void) snprintf(propbuf, proplen, "%llu", -- (u_longlong_t)propvalue); -+ (u_longlong_t)propvalue); - } else if (propvalue == 0 && -@@ -2638,3 +2673,3 @@ zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, - int err; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - const char *snapname; -@@ -2680,3 +2715,4 @@ zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, - if (literal) { -- (void) snprintf(propbuf, proplen, "%llu", (long long unsigned int)propvalue); -+ (void) snprintf(propbuf, proplen, "%llu", -+ 
(u_longlong_t)propvalue); - } else { -@@ -2688,21 +2724,2 @@ zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, - --int --zfs_get_snapused_int(zfs_handle_t *firstsnap, zfs_handle_t *lastsnap, -- uint64_t *usedp) --{ -- int err; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -- -- (void) strlcpy(zc.zc_name, lastsnap->zfs_name, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, firstsnap->zfs_name, sizeof (zc.zc_value)); -- -- err = ioctl(lastsnap->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_SNAPS, &zc); -- if (err) -- return (err); -- -- *usedp = zc.zc_cookie; -- -- return (0); --} -- - /* -@@ -2777,3 +2794,3 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char parent[ZFS_MAXNAMELEN]; -@@ -2907,3 +2924,2 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) - (cp = strchr(cp, '/')); *cp = '/', cp++) { -- char *logstr; - -@@ -2918,7 +2934,4 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) - -- logstr = hdl->libzfs_log_str; -- hdl->libzfs_log_str = NULL; - if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, - NULL) != 0) { -- hdl->libzfs_log_str = logstr; - opname = dgettext(TEXT_DOMAIN, "create"); -@@ -2927,3 +2940,2 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) - -- hdl->libzfs_log_str = logstr; - h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); -@@ -2985,3 +2997,2 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; - int ret; -@@ -2991,2 +3002,3 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - uint64_t zoned; -+ dmu_objset_type_t ost; - -@@ -3010,4 +3022,3 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - */ -- (void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name)); -- if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { -+ if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -@@ -3018,5 +3029,5 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - if (type == ZFS_TYPE_VOLUME) -- zc.zc_objset_type = DMU_OST_ZVOL; -+ ost = DMU_OST_ZVOL; - else -- zc.zc_objset_type = DMU_OST_ZFS; -+ ost = DMU_OST_ZFS; - -@@ -3072,22 +3083,5 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, - -- if (props && zcmd_write_src_nvlist(hdl, &zc, props) != 0) -- return (-1); -- nvlist_free(props); -- - /* create the dataset */ -- ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc); -- -- if (ret == 0 && type == ZFS_TYPE_VOLUME) { -- ret = zvol_create_link(hdl, path); -- if (ret) { -- (void) zfs_standard_error(hdl, errno, -- dgettext(TEXT_DOMAIN, -- "Volume successfully created, but device links " -- "were not created")); -- zcmd_free_nvlists(&zc); -- return (-1); -- } -- } -- -- zcmd_free_nvlists(&zc); -+ ret = lzc_create(path, ost, props); -+ nvlist_free(props); - -@@ -3149,3 +3143,3 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -3154,5 +3148,2 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer) - if (ZFS_IS_VOLUME(zhp)) { -- if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) -- return (-1); -- - zc.zc_objset_type = DMU_OST_ZVOL; -@@ -3184,3 +3175,2 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) - struct destroydata *dd = arg; -- zfs_handle_t *szhp; - char name[ZFS_MAXNAMELEN]; -@@ -3191,16 +3181,4 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) - -- szhp 
= make_dataset_handle(zhp->zfs_hdl, name); -- if (szhp) { -+ if (lzc_exists(name)) - verify(nvlist_add_boolean(dd->nvl, name) == 0); -- zfs_close(szhp); -- } -- -- if (zhp->zfs_type == ZFS_TYPE_VOLUME) { -- (void) zvol_remove_link(zhp->zfs_hdl, name); -- /* -- * NB: this is simply a best-effort. We don't want to -- * return an error, because then we wouldn't visit all -- * the volumes. -- */ -- } - -@@ -3224,3 +3202,3 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) - -- if (nvlist_next_nvpair(dd.nvl, NULL) == NULL) { -+ if (nvlist_empty(dd.nvl)) { - ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, -@@ -3229,3 +3207,3 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) - } else { -- ret = zfs_destroy_snaps_nvl(zhp, dd.nvl, defer); -+ ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); - } -@@ -3236,32 +3214,39 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) - /* -- * Destroys all the snapshots named in the nvlist. They must be underneath -- * the zhp (either snapshots of it, or snapshots of its descendants). -+ * Destroys all the snapshots named in the nvlist. - */ - int --zfs_destroy_snaps_nvl(zfs_handle_t *zhp, nvlist_t *snaps, boolean_t defer) -+zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) - { - int ret; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ nvlist_t *errlist; -+ nvpair_t *pair; - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- if (zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, snaps) != 0) -- return (-1); -- zc.zc_defer_destroy = defer; -+ ret = lzc_destroy_snaps(snaps, defer, &errlist); - -- ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS_NVL, &zc); -- if (ret != 0) { -+ if (ret == 0) -+ return (0); -+ -+ if (nvlist_empty(errlist)) { - char errbuf[1024]; -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); - -- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot destroy snapshots in %s"), zc.zc_name); -+ ret = zfs_standard_error(hdl, ret, errbuf); -+ } -+ for (pair = nvlist_next_nvpair(errlist, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { -+ char errbuf[1024]; -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), -+ nvpair_name(pair)); - -- switch (errno) { -+ switch (fnvpair_value_int32(pair)) { - case EEXIST: -- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, -- "snapshot is cloned")); -- return (zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf)); -- -+ zfs_error_aux(hdl, -+ dgettext(TEXT_DOMAIN, "snapshot is cloned")); -+ ret = zfs_error(hdl, EZFS_EXISTS, errbuf); -+ break; - default: -- return (zfs_standard_error(zhp->zfs_hdl, errno, -- errbuf)); -+ ret = zfs_standard_error(hdl, errno, errbuf); -+ break; - } -@@ -3269,3 +3254,3 @@ zfs_destroy_snaps_nvl(zfs_handle_t *zhp, nvlist_t *snaps, boolean_t defer) - -- return (0); -+ return (ret); - } -@@ -3278,3 +3263,2 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; - char parent[ZFS_MAXNAMELEN]; -@@ -3283,3 +3267,2 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - libzfs_handle_t *hdl = zhp->zfs_hdl; -- zfs_type_t type; - uint64_t zoned; -@@ -3302,11 +3285,10 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - /* do the clone */ -- if (ZFS_IS_VOLUME(zhp)) { -- zc.zc_objset_type = DMU_OST_ZVOL; -- type = ZFS_TYPE_VOLUME; -- } else { -- zc.zc_objset_type 
= DMU_OST_ZFS; -- type = ZFS_TYPE_FILESYSTEM; -- } - - if (props) { -+ zfs_type_t type; -+ if (ZFS_IS_VOLUME(zhp)) { -+ type = ZFS_TYPE_VOLUME; -+ } else { -+ type = ZFS_TYPE_FILESYSTEM; -+ } - if ((props = zfs_valid_proplist(hdl, type, props, zoned, -@@ -3314,16 +3296,6 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - return (-1); -- -- if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { -- nvlist_free(props); -- return (-1); -- } -- -- nvlist_free(props); - } - -- (void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); -- ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_CREATE, &zc); -- -- zcmd_free_nvlists(&zc); -+ ret = lzc_clone(target, zhp->zfs_name, props); -+ nvlist_free(props); - -@@ -3356,4 +3328,2 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - } -- } else if (ZFS_IS_VOLUME(zhp)) { -- ret = zvol_create_link(zhp->zfs_hdl, target); - } -@@ -3363,59 +3333,2 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) - --typedef struct promote_data { -- char cb_mountpoint[MAXPATHLEN]; -- const char *cb_target; -- const char *cb_errbuf; -- uint64_t cb_pivot_txg; --} promote_data_t; -- --static int --promote_snap_cb(zfs_handle_t *zhp, void *data) --{ -- promote_data_t *pd = data; -- zfs_handle_t *szhp; -- char snapname[MAXPATHLEN]; -- int rv = 0; -- -- /* We don't care about snapshots after the pivot point */ -- if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) { -- zfs_close(zhp); -- return (0); -- } -- -- /* Remove the device link if it's a zvol. */ -- if (ZFS_IS_VOLUME(zhp)) -- (void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name); -- -- /* Check for conflicting names */ -- (void) strlcpy(snapname, pd->cb_target, sizeof (snapname)); -- (void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname)); -- szhp = make_dataset_handle(zhp->zfs_hdl, snapname); -- if (szhp != NULL) { -- zfs_close(szhp); -- zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, -- "snapshot name '%s' from origin \n" -- "conflicts with '%s' from target"), -- zhp->zfs_name, snapname); -- rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf); -- } -- zfs_close(zhp); -- return (rv); --} -- --static int --promote_snap_done_cb(zfs_handle_t *zhp, void *data) --{ -- promote_data_t *pd = data; -- -- /* We don't care about snapshots after the pivot point */ -- if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) { -- /* Create the device link if it's a zvol. 
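The hunk above replaces the hand-rolled ZFS_IOC_CREATE ioctl in zfs_clone() with the libzfs_core entry point lzc_clone(). The following sketch is not part of the patch; it only shows the shape of a direct caller of that new-style interface. The dataset names and the mountpoint property are hypothetical, and libzfs_core_init() is assumed to have been called already.

#include <libzfs_core.h>
#include <libnvpair.h>
#include <stdio.h>

static int
clone_example(void)
{
	nvlist_t *props = fnvlist_alloc();
	int err;

	/* Optional properties are handed straight to lzc_clone(). */
	fnvlist_add_string(props, "mountpoint", "/clones/example");

	/* New clone "pool/clone" from existing snapshot "pool/fs@snap". */
	err = lzc_clone("pool/clone", "pool/fs@snap", props);
	if (err != 0)
		(void) fprintf(stderr, "lzc_clone failed: %d\n", err);

	fnvlist_free(props);
	return (err);
}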
*/ -- if (ZFS_IS_VOLUME(zhp)) -- (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name); -- } -- -- zfs_close(zhp); -- return (0); --} -- - /* -@@ -3427,8 +3340,5 @@ zfs_promote(zfs_handle_t *zhp) - libzfs_handle_t *hdl = zhp->zfs_hdl; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char parent[MAXPATHLEN]; -- char *cp; - int ret; -- zfs_handle_t *pzhp; -- promote_data_t pd; - char errbuf[1024]; -@@ -3450,25 +3360,3 @@ zfs_promote(zfs_handle_t *zhp) - } -- cp = strchr(parent, '@'); -- *cp = '\0'; - -- /* Walk the snapshots we will be moving */ -- pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); -- if (pzhp == NULL) -- return (-1); -- pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG); -- zfs_close(pzhp); -- pd.cb_target = zhp->zfs_name; -- pd.cb_errbuf = errbuf; -- pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET); -- if (pzhp == NULL) -- return (-1); -- (void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint, -- sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE); -- ret = zfs_iter_snapshots(pzhp, B_FALSE, promote_snap_cb, &pd); -- if (ret != 0) { -- zfs_close(pzhp); -- return (-1); -- } -- -- /* issue the ioctl */ - (void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin, -@@ -3481,13 +3369,5 @@ zfs_promote(zfs_handle_t *zhp) - -- (void) zfs_iter_snapshots(pzhp, B_FALSE, promote_snap_done_cb, -- &pd); -- zfs_close(pzhp); -- - switch (save_errno) { - case EEXIST: -- /* -- * There is a conflicting snapshot name. We -- * should have caught this above, but they could -- * have renamed something in the mean time. -- */ -+ /* There is a conflicting snapshot name. */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -@@ -3500,8 +3380,3 @@ zfs_promote(zfs_handle_t *zhp) - } -- } else { -- (void) zfs_iter_snapshots(zhp, B_FALSE, promote_snap_done_cb, -- &pd); - } -- -- zfs_close(pzhp); - return (ret); -@@ -3509,33 +3384,25 @@ zfs_promote(zfs_handle_t *zhp) - --struct createdata { -- const char *cd_snapname; -- int cd_ifexists; --}; -+typedef struct snapdata { -+ nvlist_t *sd_nvl; -+ const char *sd_snapname; -+} snapdata_t; - - static int --zfs_create_link_cb(zfs_handle_t *zhp, void *arg) -+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) - { -- struct createdata *cd = arg; -- int ret; -- -- if (zhp->zfs_type == ZFS_TYPE_VOLUME) { -- char name[MAXPATHLEN]; -+ snapdata_t *sd = arg; -+ char name[ZFS_MAXNAMELEN]; -+ int rv = 0; - -- (void) strlcpy(name, zhp->zfs_name, sizeof (name)); -- (void) strlcat(name, "@", sizeof (name)); -- (void) strlcat(name, cd->cd_snapname, sizeof (name)); -- (void) zvol_create_link_common(zhp->zfs_hdl, name, -- cd->cd_ifexists); -- /* -- * NB: this is simply a best-effort. We don't want to -- * return an error, because then we wouldn't visit all -- * the volumes. -- */ -- } -+ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { -+ (void) snprintf(name, sizeof (name), -+ "%s@%s", zfs_get_name(zhp), sd->sd_snapname); - -- ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd); -+ fnvlist_add_boolean(sd->sd_nvl, name); - -+ rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); -+ } - zfs_close(zhp); - -- return (ret); -+ return (rv); - } -@@ -3543,89 +3410,107 @@ zfs_create_link_cb(zfs_handle_t *zhp, void *arg) - /* -- * Takes a snapshot of the given dataset. -+ * Creates snapshots. The keys in the snaps nvlist are the snapshots to be -+ * created. 
- */ - int --zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, -- nvlist_t *props) -+zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) - { -- const char *delim; -- char parent[ZFS_MAXNAMELEN]; -- zfs_handle_t *zhp; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; - int ret; - char errbuf[1024]; -+ nvpair_t *elem; -+ nvlist_t *errors; - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot snapshot '%s'"), path); -+ "cannot create snapshots ")); - -- /* validate the target name */ -- if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) -- return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); -- -- if (props) { -- if ((props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, -- props, B_FALSE, NULL, errbuf)) == NULL) -- return (-1); -+ elem = NULL; -+ while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { -+ const char *snapname = nvpair_name(elem); - -- if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { -- nvlist_free(props); -- return (-1); -+ /* validate the target name */ -+ if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, -+ B_TRUE)) { -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot create snapshot '%s'"), snapname); -+ return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - } -+ } - -- nvlist_free(props); -+ if (props != NULL && -+ (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, -+ props, B_FALSE, NULL, errbuf)) == NULL) { -+ return (-1); - } - -- /* make sure the parent exists and is of the appropriate type */ -- delim = strchr(path, '@'); -- (void) strncpy(parent, path, delim - path); -- parent[delim - path] = '\0'; -+ ret = lzc_snapshot(snaps, props, &errors); - -- if ((zhp = zfs_open(hdl, parent, ZFS_TYPE_FILESYSTEM | -- ZFS_TYPE_VOLUME)) == NULL) { -- zcmd_free_nvlists(&zc); -- return (-1); -+ if (ret != 0) { -+ boolean_t printed = B_FALSE; -+ for (elem = nvlist_next_nvpair(errors, NULL); -+ elem != NULL; -+ elem = nvlist_next_nvpair(errors, elem)) { -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot create snapshot '%s'"), nvpair_name(elem)); -+ (void) zfs_standard_error(hdl, -+ fnvpair_value_int32(elem), errbuf); -+ printed = B_TRUE; -+ } -+ if (!printed) { -+ switch (ret) { -+ case EXDEV: -+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -+ "multiple snapshots of same " -+ "fs not allowed")); -+ (void) zfs_error(hdl, EZFS_EXISTS, errbuf); -+ -+ break; -+ default: -+ (void) zfs_standard_error(hdl, ret, errbuf); -+ } -+ } - } - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, delim+1, sizeof (zc.zc_value)); -- if (ZFS_IS_VOLUME(zhp)) -- zc.zc_objset_type = DMU_OST_ZVOL; -- else -- zc.zc_objset_type = DMU_OST_ZFS; -- zc.zc_cookie = recursive; -- ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SNAPSHOT, &zc); -+ nvlist_free(props); -+ nvlist_free(errors); -+ return (ret); -+} - -- zcmd_free_nvlists(&zc); -+int -+zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, -+ nvlist_t *props) -+{ -+ int ret; -+ snapdata_t sd = { 0 }; -+ char fsname[ZFS_MAXNAMELEN]; -+ char *cp; -+ zfs_handle_t *zhp; -+ char errbuf[1024]; - -- /* -- * if it was recursive, the one that actually failed will be in -- * zc.zc_name. 
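A minimal sketch, not part of the patch, of how a caller drives the nvlist-based interfaces above: every key in the snaps nvlist is a full snapshot name, lzc_snapshot() creates them together, and on failure the returned error list carries a per-snapshot errno, which is exactly what zfs_snapshot_nvl() walks. The companion lzc_destroy_snaps() call mirrors the rewritten zfs_destroy_snaps_nvl(). Dataset names are hypothetical and libzfs_core_init() is assumed to have been called.

#include <libzfs_core.h>
#include <libnvpair.h>
#include <stdio.h>

static int
snapshot_example(void)
{
	nvlist_t *snaps = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	nvpair_t *pair;
	int err;

	/* Keys are full snapshot names; the boolean values are unused. */
	fnvlist_add_boolean(snaps, "pool/fs1@backup");
	fnvlist_add_boolean(snaps, "pool/fs2@backup");

	err = lzc_snapshot(snaps, NULL, &errlist);
	if (err != 0) {
		/* One entry per snapshot that could not be created. */
		for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(errlist, pair)) {
			(void) fprintf(stderr, "cannot snapshot %s: %d\n",
			    nvpair_name(pair),
			    (int)fnvpair_value_int32(pair));
		}
	} else {
		/* Clean up again; the same nvlist shape is accepted. */
		nvlist_free(errlist);
		errlist = NULL;
		err = lzc_destroy_snaps(snaps, B_FALSE, &errlist);
	}

	nvlist_free(errlist);
	fnvlist_free(snaps);
	return (err);
}

The per-snapshot error list is what lets zfs_snapshot_nvl() above report one message per failed name rather than a single aggregate error.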
-- */ -- if (ret != 0) -- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value); -+ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -+ "cannot snapshot %s"), path); - -- if (ret == 0 && recursive) { -- struct createdata cd; -+ if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) -+ return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - -- cd.cd_snapname = delim + 1; -- cd.cd_ifexists = B_FALSE; -- (void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd); -- } -- if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) { -- ret = zvol_create_link(zhp->zfs_hdl, path); -- if (ret != 0) { -- (void) zfs_standard_error(hdl, errno, -- dgettext(TEXT_DOMAIN, -- "Volume successfully snapshotted, but device links " -- "were not created")); -- zfs_close(zhp); -- return (-1); -- } -+ (void) strlcpy(fsname, path, sizeof (fsname)); -+ cp = strchr(fsname, '@'); -+ *cp = '\0'; -+ sd.sd_snapname = cp + 1; -+ -+ if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | -+ ZFS_TYPE_VOLUME)) == NULL) { -+ return (-1); - } - -- if (ret != 0) -- (void) zfs_standard_error(hdl, errno, errbuf); -+ verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0); -+ if (recursive) { -+ (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); -+ } else { -+ fnvlist_add_boolean(sd.sd_nvl, path); -+ } - -+ ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); -+ nvlist_free(sd.sd_nvl); - zfs_close(zhp); -- - return (ret); -@@ -3657,3 +3542,2 @@ rollback_destroy(zfs_handle_t *zhp, void *data) - cbp->cb_create) { -- char *logstr; - -@@ -3664,6 +3548,3 @@ rollback_destroy(zfs_handle_t *zhp, void *data) - -- logstr = zhp->zfs_hdl->libzfs_log_str; -- zhp->zfs_hdl->libzfs_log_str = NULL; - cbp->cb_error |= zfs_destroy(zhp, B_FALSE); -- zhp->zfs_hdl->libzfs_log_str = logstr; - } -@@ -3704,3 +3585,2 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - int err; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; - boolean_t restore_resv = 0; -@@ -3729,4 +3609,2 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - if (zhp->zfs_type == ZFS_TYPE_VOLUME) { -- if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) -- return (-1); - if (zfs_which_resv_prop(zhp, &resv_prop) < 0) -@@ -3738,9 +3616,2 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- -- if (ZFS_IS_VOLUME(zhp)) -- zc.zc_objset_type = DMU_OST_ZVOL; -- else -- zc.zc_objset_type = DMU_OST_ZFS; -- - /* -@@ -3751,5 +3622,5 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - * snapshot since we verified that this was the most recent. 
-- * - */ -- if ((err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_ROLLBACK, &zc)) != 0) { -+ err = lzc_rollback(zhp->zfs_name, NULL, 0); -+ if (err != 0) { - (void) zfs_standard_error_fmt(zhp->zfs_hdl, errno, -@@ -3768,6 +3639,2 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) - (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { -- if ((err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name))) { -- zfs_close(zhp); -- return (err); -- } - if (restore_resv) { -@@ -3791,3 +3658,3 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - int ret; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char *delim; -@@ -3885,3 +3752,2 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - if (recursive) { -- struct destroydata dd; - -@@ -3900,11 +3766,2 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - -- dd.snapname = delim + 1; -- -- /* We remove any zvol links prior to renaming them */ -- verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0); -- ret = zfs_iter_filesystems(zhrp, zfs_check_snap_cb, &dd); -- nvlist_free(dd.nvl); -- if (ret) { -- goto error; -- } - } else { -@@ -3958,23 +3815,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, - */ -- if (recursive) { -- struct createdata cd; -- -- /* only create links for datasets that had existed */ -- cd.cd_snapname = delim + 1; -- cd.cd_ifexists = B_TRUE; -- (void) zfs_iter_filesystems(zhrp, zfs_create_link_cb, -- &cd); -- } else { -+ if (!recursive) - (void) changelist_postfix(cl); -- } - } else { -- if (recursive) { -- struct createdata cd; -- -- /* only create links for datasets that had existed */ -- cd.cd_snapname = strchr(target, '@') + 1; -- cd.cd_ifexists = B_TRUE; -- ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb, -- &cd); -- } else { -+ if (!recursive) { - changelist_rename(cl, zfs_get_name(zhp), target); -@@ -3997,122 +3837,2 @@ error: - --/* -- * Given a zvol dataset, issue the ioctl to create the appropriate minor node, -- * and wait briefly for udev to create the /dev link. -- */ --int --zvol_create_link(libzfs_handle_t *hdl, const char *dataset) --{ -- return (zvol_create_link_common(hdl, dataset, B_FALSE)); --} -- --static int --zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists) --{ -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -- char path[MAXPATHLEN]; -- int error; -- -- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); -- -- /* -- * Issue the appropriate ioctl. -- */ -- if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) { -- switch (errno) { -- case EEXIST: -- /* -- * Silently ignore the case where the link already -- * exists. This allows 'zfs volinit' to be run multiple -- * times without errors. -- */ -- return (0); -- -- case ENODEV: -- /* -- * snapdev set to hidden : -- * device creation was not permitted (see zvol.c) -- * ignore error quietly -- */ -- return (0); -- -- case ENOENT: -- /* -- * Dataset does not exist in the kernel. If we -- * don't care (see zfs_rename), then ignore the -- * error quietly. -- */ -- if (ifexists) { -- return (0); -- } -- -- /* FALLTHROUGH */ -- -- default: -- return (zfs_standard_error_fmt(hdl, errno, -- dgettext(TEXT_DOMAIN, "cannot create device links " -- "for '%s'"), dataset)); -- } -- } -- -- /* -- * Wait up to 10 seconds for udev to create the device. 
-- */ -- (void) snprintf(path, sizeof (path), "%s/%s", ZVOL_DIR, dataset); -- error = zpool_label_disk_wait(path, 10000); -- if (error) -- (void) printf(gettext("%s may not be immediately " -- "available\n"), path); -- -- return (0); --} -- --/* -- * Remove a minor node for the given zvol and the associated /dev links. -- */ --int --zvol_remove_link(libzfs_handle_t *hdl, const char *dataset) --{ -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -- int timeout = 3000; /* in milliseconds */ -- int error = 0; -- int i; -- -- (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); -- -- /* -- * Due to concurrent updates by udev the device may be reported as -- * busy. In this case don't immediately fail. Instead briefly delay -- * and retry the ioctl() which is now likely to succeed. If unable -- * remove the link after timeout milliseconds return the failure. -- */ -- for (i = 0; i < timeout; i++) { -- error = ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc); -- if (error && errno == EBUSY) { -- usleep(1000); -- continue; -- } else { -- break; -- } -- } -- -- if (error) { -- switch (errno) { -- case ENXIO: -- /* -- * Silently ignore the case where the link no longer -- * exists, so that 'zfs volfini' can be run multiple -- * times without errors. -- */ -- return (0); -- -- default: -- return (zfs_standard_error_fmt(hdl, errno, -- dgettext(TEXT_DOMAIN, "cannot remove device " -- "links for '%s': %s"), dataset, strerror(errno))); -- } -- } -- -- return (0); --} -- - nvlist_t * -@@ -4137,3 +3857,4 @@ zfs_get_user_props(zfs_handle_t *zhp) - int --zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) -+zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, -+ boolean_t literal) - { -@@ -4199,3 +3920,3 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) - for (entry = *plp; entry != NULL; entry = entry->pl_next) { -- if (entry->pl_fixed) -+ if (entry->pl_fixed && !literal) - continue; -@@ -4204,3 +3925,3 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) - if (zfs_prop_get(zhp, entry->pl_prop, -- buf, sizeof (buf), NULL, NULL, 0, B_FALSE) == 0) { -+ buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { - if (strlen(buf) > entry->pl_width) -@@ -4210,3 +3931,3 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) - zfs_prop_to_name(entry->pl_prop), -- buf, sizeof (buf), B_FALSE) == 0) -+ buf, sizeof (buf), literal) == 0) - if (strlen(buf) > entry->pl_recvd_width) -@@ -4223,3 +3944,3 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) - entry->pl_user_prop, -- buf, sizeof (buf), B_FALSE) == 0) -+ buf, sizeof (buf), literal) == 0) - if (strlen(buf) > entry->pl_recvd_width) -@@ -4267,3 +3988,3 @@ zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - nvlist_t *nvlist = NULL; -@@ -4349,3 +4070,3 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_useracct_t buf[100]; -@@ -4386,33 +4107,104 @@ zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, - -+struct holdarg { -+ nvlist_t *nvl; -+ const char *snapname; -+ const char *tag; -+ boolean_t recursive; -+ int error; -+}; -+ -+static int -+zfs_hold_one(zfs_handle_t *zhp, void *arg) -+{ -+ struct holdarg *ha = arg; -+ char name[ZFS_MAXNAMELEN]; -+ int rv = 0; -+ -+ (void) snprintf(name, sizeof (name), -+ 
"%s@%s", zhp->zfs_name, ha->snapname); -+ -+ if (lzc_exists(name)) -+ fnvlist_add_string(ha->nvl, name, ha->tag); -+ -+ if (ha->recursive) -+ rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); -+ zfs_close(zhp); -+ return (rv); -+} -+ - int - zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, -- boolean_t recursive, boolean_t temphold, boolean_t enoent_ok, -- int cleanup_fd, uint64_t dsobj, uint64_t createtxg) -+ boolean_t recursive, int cleanup_fd) -+{ -+ int ret; -+ struct holdarg ha; -+ -+ ha.nvl = fnvlist_alloc(); -+ ha.snapname = snapname; -+ ha.tag = tag; -+ ha.recursive = recursive; -+ (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); -+ -+ if (nvlist_empty(ha.nvl)) { -+ char errbuf[1024]; -+ -+ fnvlist_free(ha.nvl); -+ ret = ENOENT; -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot hold snapshot '%s@%s'"), -+ zhp->zfs_name, snapname); -+ (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); -+ return (ret); -+ } -+ -+ ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); -+ fnvlist_free(ha.nvl); -+ -+ return (ret); -+} -+ -+int -+zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ int ret; -+ nvlist_t *errors; - libzfs_handle_t *hdl = zhp->zfs_hdl; -+ char errbuf[1024]; -+ nvpair_t *elem; - -- ASSERT(!recursive || dsobj == 0); -+ errors = NULL; -+ ret = lzc_hold(holds, cleanup_fd, &errors); - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); -- if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) -- >= sizeof (zc.zc_string)) -- return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); -- zc.zc_cookie = recursive; -- zc.zc_temphold = temphold; -- zc.zc_cleanup_fd = cleanup_fd; -- zc.zc_sendobj = dsobj; -- zc.zc_createtxg = createtxg; -+ if (ret == 0) { -+ /* There may be errors even in the success case. */ -+ fnvlist_free(errors); -+ return (0); -+ } - -- if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) { -- char errbuf[ZFS_MAXNAMELEN+32]; -+ if (nvlist_empty(errors)) { -+ /* no hold-specific errors */ -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, "cannot hold")); -+ switch (ret) { -+ case ENOTSUP: -+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -+ "pool must be upgraded")); -+ (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); -+ break; -+ case EINVAL: -+ (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); -+ break; -+ default: -+ (void) zfs_standard_error(hdl, ret, errbuf); -+ } -+ } - -- /* -- * if it was recursive, the one that actually failed will be in -- * zc.zc_name. 
-- */ -- (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot hold '%s@%s'"), zc.zc_name, snapname); -- switch (errno) { -+ for (elem = nvlist_next_nvpair(errors, NULL); -+ elem != NULL; -+ elem = nvlist_next_nvpair(errors, elem)) { -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot hold snapshot '%s'"), nvpair_name(elem)); -+ switch (fnvpair_value_int32(elem)) { - case E2BIG: -@@ -4424,17 +4216,13 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, - */ -- return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf)); -- case ENOTSUP: -- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -- "pool must be upgraded")); -- return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); -+ (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); -+ break; - case EINVAL: -- return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); -+ (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); -+ break; - case EEXIST: -- return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf)); -- case ENOENT: -- if (enoent_ok) -- return (ENOENT); -- /* FALLTHROUGH */ -+ (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); -+ break; - default: -- return (zfs_standard_error_fmt(hdl, errno, errbuf)); -+ (void) zfs_standard_error(hdl, -+ fnvpair_value_int32(elem), errbuf); - } -@@ -4442,3 +4230,32 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, - -- return (0); -+ fnvlist_free(errors); -+ return (ret); -+} -+ -+static int -+zfs_release_one(zfs_handle_t *zhp, void *arg) -+{ -+ struct holdarg *ha = arg; -+ char name[ZFS_MAXNAMELEN]; -+ int rv = 0; -+ nvlist_t *existing_holds; -+ -+ (void) snprintf(name, sizeof (name), -+ "%s@%s", zhp->zfs_name, ha->snapname); -+ -+ if (lzc_get_holds(name, &existing_holds) != 0) { -+ ha->error = ENOENT; -+ } else if (!nvlist_exists(existing_holds, ha->tag)) { -+ ha->error = ESRCH; -+ } else { -+ nvlist_t *torelease = fnvlist_alloc(); -+ fnvlist_add_boolean(torelease, ha->tag); -+ fnvlist_add_nvlist(ha->nvl, name, torelease); -+ fnvlist_free(torelease); -+ } -+ -+ if (ha->recursive) -+ rv = zfs_iter_filesystems(zhp, zfs_release_one, ha); -+ zfs_close(zhp); -+ return (rv); - } -@@ -4449,25 +4266,45 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ int ret; -+ struct holdarg ha; -+ nvlist_t *errors = NULL; -+ nvpair_t *elem; - libzfs_handle_t *hdl = zhp->zfs_hdl; -+ char errbuf[1024]; - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); -- (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); -- if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) -- >= sizeof (zc.zc_string)) -- return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); -- zc.zc_cookie = recursive; -+ ha.nvl = fnvlist_alloc(); -+ ha.snapname = snapname; -+ ha.tag = tag; -+ ha.recursive = recursive; -+ ha.error = 0; -+ (void) zfs_release_one(zfs_handle_dup(zhp), &ha); -+ -+ if (nvlist_empty(ha.nvl)) { -+ fnvlist_free(ha.nvl); -+ ret = ha.error; -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot release hold from snapshot '%s@%s'"), -+ zhp->zfs_name, snapname); -+ if (ret == ESRCH) { -+ (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); -+ } else { -+ (void) zfs_standard_error(hdl, ret, errbuf); -+ } -+ return (ret); -+ } - -- if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) { -- char errbuf[ZFS_MAXNAMELEN+32]; -+ ret = lzc_release(ha.nvl, &errors); -+ fnvlist_free(ha.nvl); - -- /* -- * if it was recursive, the one that actually failed will be in -- * zc.zc_name. 
-- */ -+ if (ret == 0) { -+ /* There may be errors even in the success case. */ -+ fnvlist_free(errors); -+ return (0); -+ } -+ -+ if (nvlist_empty(errors)) { -+ /* no hold-specific errors */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, -- "cannot release '%s' from '%s@%s'"), tag, zc.zc_name, -- snapname); -+ "cannot release")); - switch (errno) { -- case ESRCH: -- return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf)); - case ENOTSUP: -@@ -4475,7 +4312,26 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, - "pool must be upgraded")); -- return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); -+ (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); -+ break; -+ default: -+ (void) zfs_standard_error_fmt(hdl, errno, errbuf); -+ } -+ } -+ -+ for (elem = nvlist_next_nvpair(errors, NULL); -+ elem != NULL; -+ elem = nvlist_next_nvpair(errors, elem)) { -+ (void) snprintf(errbuf, sizeof (errbuf), -+ dgettext(TEXT_DOMAIN, -+ "cannot release hold from snapshot '%s'"), -+ nvpair_name(elem)); -+ switch (fnvpair_value_int32(elem)) { -+ case ESRCH: -+ (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); -+ break; - case EINVAL: -- return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); -+ (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); -+ break; - default: -- return (zfs_standard_error_fmt(hdl, errno, errbuf)); -+ (void) zfs_standard_error_fmt(hdl, -+ fnvpair_value_int32(elem), errbuf); - } -@@ -4483,3 +4339,4 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, - -- return (0); -+ fnvlist_free(errors); -+ return (ret); - } -@@ -4489,3 +4346,3 @@ zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zfs_hdl; -@@ -4494,3 +4351,3 @@ zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) - int err = 0; -- char errbuf[ZFS_MAXNAMELEN+32]; -+ char errbuf[1024]; - -@@ -4556,6 +4413,6 @@ zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zfs_hdl; - char *nvbuf; -- char errbuf[ZFS_MAXNAMELEN+32]; -+ char errbuf[1024]; - size_t nvsz; -@@ -4610,34 +4467,14 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -- libzfs_handle_t *hdl = zhp->zfs_hdl; -- int nvsz = 2048; -- void *nvbuf; -- int err = 0; -- char errbuf[ZFS_MAXNAMELEN+32]; -- -- assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); -- --tryagain: -- -- nvbuf = malloc(nvsz); -- if (nvbuf == NULL) { -- err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); -- goto out; -- } -+ int err; -+ char errbuf[1024]; - -- zc.zc_nvlist_dst_size = nvsz; -- zc.zc_nvlist_dst = (uintptr_t)nvbuf; -+ err = lzc_get_holds(zhp->zfs_name, nvl); - -- (void) strlcpy(zc.zc_name, zhp->zfs_name, ZFS_MAXNAMELEN); -+ if (err != 0) { -+ libzfs_handle_t *hdl = zhp->zfs_hdl; - -- if (zfs_ioctl(hdl, ZFS_IOC_GET_HOLDS, &zc) != 0) { - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), -- zc.zc_name); -- switch (errno) { -- case ENOMEM: -- free(nvbuf); -- nvsz = zc.zc_nvlist_dst_size; -- goto tryagain; -- -+ zhp->zfs_name); -+ switch (err) { - case ENOTSUP: -@@ -4657,15 +4494,4 @@ tryagain: - } -- } else { -- /* success */ -- int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); -- if (rc) { -- (void) snprintf(errbuf, sizeof (errbuf), -- dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), -- zc.zc_name); -- err = zfs_standard_error_fmt(hdl, rc, errbuf); -- } 
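A sketch, not part of the patch, of the nvlist shapes the rewritten hold code above expects: lzc_hold() takes snapshot-name/tag string pairs, lzc_release() takes a nested list of tags per snapshot, and lzc_get_holds() reports the tags currently held on one snapshot. The snapshot name and tag are hypothetical, passing -1 as the cleanup fd is assumed to mean no automatic release, and libzfs_core_init() is assumed to have been called.

#include <libzfs_core.h>
#include <libnvpair.h>
#include <stdio.h>

static int
hold_release_example(const char *snap, const char *tag)
{
	nvlist_t *holds, *release, *tags, *errlist = NULL;
	int err;

	/* Place the hold: one entry per snapshot, the value is the tag. */
	holds = fnvlist_alloc();
	fnvlist_add_string(holds, snap, tag);
	err = lzc_hold(holds, -1, &errlist);
	fnvlist_free(holds);
	nvlist_free(errlist);
	if (err != 0)
		return (err);

	/* Confirm the tag shows up in the snapshot's hold list. */
	if (lzc_get_holds(snap, &tags) == 0) {
		(void) printf("%s held by '%s': %s\n", snap, tag,
		    nvlist_exists(tags, tag) ? "yes" : "no");
		fnvlist_free(tags);
	}

	/* Release it: the value is a nested nvlist of tags to drop. */
	release = fnvlist_alloc();
	tags = fnvlist_alloc();
	fnvlist_add_boolean(tags, tag);
	fnvlist_add_nvlist(release, snap, tags);
	fnvlist_free(tags);

	errlist = NULL;
	err = lzc_release(release, &errlist);
	fnvlist_free(release);
	nvlist_free(errlist);
	return (err);
}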
- } - -- free(nvbuf); --out: - return (err); -@@ -4673,2 +4499,7 @@ out: - -+/* -+ * Convert the zvol's volume size to an appropriate reservation. -+ * Note: If this routine is updated, it is necessary to update the ZFS test -+ * suite's shell version in reservation.kshlib. -+ */ - uint64_t -diff --git a/lib/libzfs/libzfs_diff.c b/lib/libzfs/libzfs_diff.c -index 77d5a09..7472d24 100644 ---- a/lib/libzfs/libzfs_diff.c -+++ b/lib/libzfs/libzfs_diff.c -@@ -92,3 +92,3 @@ get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int error; -@@ -381,3 +381,3 @@ write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *lhdl = di->zhp->zfs_hdl; -@@ -509,3 +509,3 @@ make_temp_snapshot(differ_info_t *di) - libzfs_handle_t *hdl = di->zhp->zfs_hdl; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -751,3 +751,3 @@ zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char errbuf[1024]; -diff --git a/lib/libzfs/libzfs_fru.c b/lib/libzfs/libzfs_fru.c -index 78f2f9c..6be927f 100644 ---- a/lib/libzfs/libzfs_fru.c -+++ b/lib/libzfs/libzfs_fru.c -@@ -363,3 +363,3 @@ zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru) - { -- zfs_cmd_t zc = { 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -463,3 +463,2 @@ libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final) - { -- return; - } -@@ -467,3 +466 @@ libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final) - #endif /* HAVE_LIBTOPO */ -- -- -diff --git a/lib/libzfs/libzfs_graph.c b/lib/libzfs/libzfs_graph.c -index 0e538e3..63d9138 100644 ---- a/lib/libzfs/libzfs_graph.c -+++ b/lib/libzfs/libzfs_graph.c -@@ -381,3 +381,3 @@ iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_vertex_t *zvp; -@@ -475,3 +475,3 @@ external_dependents(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c -index 9e79bd9..b5a079c 100644 ---- a/lib/libzfs/libzfs_import.c -+++ b/lib/libzfs/libzfs_import.c -@@ -170,3 +170,3 @@ fix_paths(nvlist_t *nv, name_entry_t *names) - if ((strlen(path) == strlen(ne->ne_name)) && -- !strncmp(path, ne->ne_name, strlen(path))) { -+ strncmp(path, ne->ne_name, strlen(path)) == 0) { - best = ne; -@@ -367,3 +367,3 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) - nvlist_t *nvl; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int err; -@@ -967,3 +967,3 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, pool_list_t *pools) - -- err = blkid_dev_set_search(iter, "TYPE", "zfs"); -+ err = blkid_dev_set_search(iter, "TYPE", "zfs_member"); - if (err != 0) { -@@ -999,3 +999,3 @@ err_blkid2: - err_blkid1: -- return err; -+ return (err); - } -@@ -1127,10 +1127,10 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) - if ((strncmp(name, "watchdog", 8) == 0) || -- (strncmp(name, "fuse", 4) == 0) || -- (strncmp(name, "ppp", 3) == 0) || -- (strncmp(name, "tty", 3) == 0) || -- (strncmp(name, "vcs", 3) == 0) || -- (strncmp(name, "parport", 7) == 0) || -- (strncmp(name, "lp", 2) == 0) || -- (strncmp(name, "fd", 2) == 0) || -- 
(strncmp(name, "hpet", 4) == 0) || -+ (strncmp(name, "fuse", 4) == 0) || -+ (strncmp(name, "ppp", 3) == 0) || -+ (strncmp(name, "tty", 3) == 0) || -+ (strncmp(name, "vcs", 3) == 0) || -+ (strncmp(name, "parport", 7) == 0) || -+ (strncmp(name, "lp", 2) == 0) || -+ (strncmp(name, "fd", 2) == 0) || -+ (strncmp(name, "hpet", 4) == 0) || - (strncmp(name, "core", 4) == 0)) -@@ -1167,3 +1167,3 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) - if (strcmp(iarg->poolname, pname)) -- matched = B_FALSE; -+ matched = B_FALSE; - -diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c -index 8215d3c..e527bdc 100644 ---- a/lib/libzfs/libzfs_iter.c -+++ b/lib/libzfs/libzfs_iter.c -@@ -23,4 +23,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2010 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -105,3 +105,3 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_handle_t *nzhp; -@@ -142,3 +142,3 @@ zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_handle_t *nzhp; -@@ -308,4 +308,3 @@ zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, - { -- char buf[ZFS_MAXNAMELEN]; -- char *comma_separated, *cp; -+ char *buf, *comma_separated, *cp; - int err = 0; -@@ -313,3 +312,3 @@ zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, - -- (void) strlcpy(buf, spec_orig, sizeof (buf)); -+ buf = zfs_strdup(fs_zhp->zfs_hdl, spec_orig); - cp = buf; -@@ -371,2 +370,3 @@ zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, - -+ free(buf); - return (ret); -@@ -451,4 +451,8 @@ iter_dependents_cb(zfs_handle_t *zhp, void *arg) - } -+ - if (!first && err == 0) - err = ida->func(zhp, ida->data); -+ else -+ zfs_close(zhp); -+ - return (err); -diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c -index bded1f0..b85c5d0 100644 ---- a/lib/libzfs/libzfs_mount.c -+++ b/lib/libzfs/libzfs_mount.c -@@ -281,3 +281,3 @@ do_mount(const char *src, const char *mntpt, char *opts) - (char *)src, -- (char *)mntpt, -+ (char *)mntpt, - (char *)NULL }; -@@ -289,16 +289,18 @@ do_mount(const char *src, const char *mntpt, char *opts) - if (rc & MOUNT_FILEIO) -- return EIO; -+ return (EIO); - if (rc & MOUNT_USER) -- return EINTR; -+ return (EINTR); - if (rc & MOUNT_SOFTWARE) -- return EPIPE; -+ return (EPIPE); -+ if (rc & MOUNT_BUSY) -+ return (EBUSY); - if (rc & MOUNT_SYSERR) -- return EAGAIN; -+ return (EAGAIN); - if (rc & MOUNT_USAGE) -- return EINVAL; -+ return (EINVAL); - -- return ENXIO; /* Generic error */ -+ return (ENXIO); /* Generic error */ - } - -- return 0; -+ return (0); - } -@@ -890,3 +892,3 @@ zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, - for (curr_proto = proto; *curr_proto != PROTO_END; -- curr_proto++) { -+ curr_proto++) { - -@@ -1165,3 +1167,6 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) - -- rewind(hdl->libzfs_mnttab); -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) -+ return (ENOENT); -+ - used = alloc = 0; -diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c -index a6cacd3..b1ddd98 100644 ---- 
a/lib/libzfs/libzfs_pool.c -+++ b/lib/libzfs/libzfs_pool.c -@@ -36,2 +36,3 @@ - #include -+#include - #include -@@ -65,3 +66,3 @@ zpool_get_all_props(zpool_handle_t *zhp) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zpool_hdl; -@@ -237,4 +238,3 @@ zpool_pool_state_to_name(pool_state_t state) - /* -- * Get a zpool property value for 'prop' and return the value in -- * a pre-allocated buffer. -+ * API compatibility wrapper around zpool_get_prop_literal - */ -@@ -244,2 +244,13 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, - { -+ return (zpool_get_prop_literal(zhp, prop, buf, len, srctype, B_FALSE)); -+} -+ -+/* -+ * Get a zpool property value for 'prop' and return the value in -+ * a pre-allocated buffer. -+ */ -+int -+zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, -+ size_t len, zprop_source_t *srctype, boolean_t literal) -+{ - uint64_t intval; -@@ -309,3 +320,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, - case ZPOOL_PROP_ASHIFT: -- (void) zfs_nicenum(intval, buf, len); -+ if (literal) -+ (void) snprintf(buf, len, "%llu", -+ (u_longlong_t)intval); -+ else -+ (void) zfs_nicenum(intval, buf, len); - break; -@@ -693,3 +708,3 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int ret = -1; -@@ -1142,3 +1157,3 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - nvlist_t *zc_fsprops = NULL; -@@ -1146,3 +1161,2 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, - char msg[1024]; -- char *altroot; - int ret = -1; -@@ -1210,4 +1224,5 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -- "one or more vdevs refer to the same device, or one of\n" -- "the devices is part of an active md or lvm device")); -+ "one or more vdevs refer to the same device, or " -+ "one of\nthe devices is part of an active md or " -+ "lvm device")); - return (zfs_error(hdl, EZFS_BADDEV, msg)); -@@ -1247,17 +1262,2 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, - -- /* -- * If this is an alternate root pool, then we automatically set the -- * mountpoint of the root dataset to be '/'. 
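A small sketch, not part of the patch, contrasting the compatibility wrapper with the new literal-aware call added above: with the literal flag set, numeric properties such as ashift are printed as exact integers instead of being run through zfs_nicenum(). The pool name is hypothetical.

#include <libzfs.h>
#include <stdio.h>

static void
print_ashift(libzfs_handle_t *hdl)
{
	zpool_handle_t *zhp;
	char nice[64], exact[64];

	if ((zhp = zpool_open(hdl, "tank")) == NULL)
		return;

	/* Existing callers keep the old human-readable behaviour... */
	(void) zpool_get_prop(zhp, ZPOOL_PROP_ASHIFT, nice,
	    sizeof (nice), NULL);

	/* ...while new callers can ask for the raw value. */
	(void) zpool_get_prop_literal(zhp, ZPOOL_PROP_ASHIFT, exact,
	    sizeof (exact), NULL, B_TRUE);

	(void) printf("ashift: %s (nicenum) / %s (literal)\n", nice, exact);
	zpool_close(zhp);
}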
-- */ -- if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), -- &altroot) == 0) { -- zfs_handle_t *zhp; -- -- verify((zhp = zfs_open(hdl, pool, ZFS_TYPE_DATASET)) != NULL); -- verify(zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), -- "/") == 0); -- -- zfs_close(zhp); -- } -- - create_failed: -@@ -1274,5 +1274,5 @@ create_failed: - int --zpool_destroy(zpool_handle_t *zhp) -+zpool_destroy(zpool_handle_t *zhp, const char *log_str) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_handle_t *zfp = NULL; -@@ -1286,2 +1286,3 @@ zpool_destroy(zpool_handle_t *zhp) - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); -+ zc.zc_history = (uint64_t)(uintptr_t)log_str; - -@@ -1319,3 +1320,3 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int ret; -@@ -1411,9 +1412,2 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) - -- case EDOM: -- zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, -- "root pool can not have multiple vdevs" -- " or separate logs")); -- (void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg); -- break; -- - case ENOTBLK: -@@ -1442,6 +1436,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) - */ --int --zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) -+static int -+zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, -+ const char *log_str) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -1454,2 +1449,3 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) - zc.zc_guid = hardforce; -+ zc.zc_history = (uint64_t)(uintptr_t)log_str; - -@@ -1475,5 +1471,5 @@ zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) - int --zpool_export(zpool_handle_t *zhp, boolean_t force) -+zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) - { -- return (zpool_export_common(zhp, force, B_FALSE)); -+ return (zpool_export_common(zhp, force, B_FALSE, log_str)); - } -@@ -1481,5 +1477,5 @@ zpool_export(zpool_handle_t *zhp, boolean_t force) - int --zpool_export_force(zpool_handle_t *zhp) -+zpool_export_force(zpool_handle_t *zhp, const char *log_str) - { -- return (zpool_export_common(zhp, B_TRUE, B_TRUE)); -+ return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); - } -@@ -1719,3 +1715,3 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zpool_rewind_policy_t policy; -@@ -1911,3 +1907,3 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2173,3 +2169,3 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, - -- guid = strtoull(path, &end, 10); -+ guid = strtoull(path, &end, 0); - if (guid != 0 && *end == '\0') { -@@ -2387,3 +2383,3 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2434,3 +2430,3 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, - error = zfs_resolve_shortname(path, buf, -- sizeof(buf)); -+ sizeof (buf)); - if (error != 0) -@@ -2471,3 +2467,3 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc 
= {"\0"}; - char msg[1024]; -@@ -2521,3 +2517,3 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2526,3 +2522,3 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) - (void) snprintf(msg, sizeof (msg), -- dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid); -+ dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid); - -@@ -2556,3 +2552,3 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2561,3 +2557,3 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) - (void) snprintf(msg, sizeof (msg), -- dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid); -+ dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid); - -@@ -2610,3 +2606,3 @@ zpool_vdev_attach(zpool_handle_t *zhp, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2786,3 +2782,3 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -2884,3 +2880,3 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3095,3 +3091,3 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3140,3 +3136,3 @@ zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3216,3 +3212,3 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3222,3 +3218,3 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) - dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), -- (u_longlong_t)guid); -+ (u_longlong_t)guid); - -@@ -3242,3 +3238,3 @@ zpool_reguid(zpool_handle_t *zhp) - libzfs_handle_t *hdl = zhp->zpool_hdl; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -3260,3 +3256,3 @@ zpool_reopen(zpool_handle_t *zhp) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3340,3 +3336,3 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -3467,3 +3463,3 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, - &value) == 0 && value) { -- return strip_partition(hdl, path); -+ return (strip_partition(hdl, path)); - } -@@ -3515,3 +3511,3 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - uint64_t count; -@@ -3611,3 +3607,3 @@ zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zpool_hdl; -@@ -3625,4 +3621,3 @@ zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) - void --zpool_set_history_str(const char *subcommand, int argc, char **argv, -- char *history_str) -+zfs_save_arguments(int argc, char **argv, char *string, int len) - { -@@ -3630,9 +3625,6 
@@ zpool_set_history_str(const char *subcommand, int argc, char **argv, - -- (void) strlcpy(history_str, subcommand, HIS_MAX_RECORD_LEN); -+ (void) strlcpy(string, basename(argv[0]), len); - for (i = 1; i < argc; i++) { -- if (strlen(history_str) + 1 + strlen(argv[i]) > -- HIS_MAX_RECORD_LEN) -- break; -- (void) strlcat(history_str, " ", HIS_MAX_RECORD_LEN); -- (void) strlcat(history_str, argv[i], HIS_MAX_RECORD_LEN); -+ (void) strlcat(string, " ", len); -+ (void) strlcat(string, argv[i], len); - } -@@ -3640,21 +3632,17 @@ zpool_set_history_str(const char *subcommand, int argc, char **argv, - --/* -- * Stage command history for logging. -- */ - int --zpool_stage_history(libzfs_handle_t *hdl, const char *history_str) -+zpool_log_history(libzfs_handle_t *hdl, const char *message) - { -- if (history_str == NULL) -- return (EINVAL); -- -- if (strlen(history_str) > HIS_MAX_RECORD_LEN) -- return (EINVAL); -- -- if (hdl->libzfs_log_str != NULL) -- free(hdl->libzfs_log_str); -- -- if ((hdl->libzfs_log_str = strdup(history_str)) == NULL) -- return (no_memory(hdl)); -- -- return (0); -+ zfs_cmd_t zc = {"\0"}; -+ nvlist_t *args; -+ int err; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_string(args, "message", message); -+ err = zcmd_write_src_nvlist(hdl, &zc, args); -+ if (err == 0) -+ err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc); -+ nvlist_free(args); -+ zcmd_free_nvlists(&zc); -+ return (err); - } -@@ -3673,3 +3661,3 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zpool_hdl; -@@ -3797,10 +3785,11 @@ zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp) - /* -- * Retrieve the next event. If there is a new event available 'nvp' will -- * contain a newly allocated nvlist and 'dropped' will be set to the number -- * of missed events since the last call to this function. When 'nvp' is -- * set to NULL it indicates no new events are available. In either case -- * the function returns 0 and it is up to the caller to free 'nvp'. In -- * the case of a fatal error the function will return a non-zero value. -- * When the function is called in blocking mode it will not return until -- * a new event is available. -+ * Retrieve the next event given the passed 'zevent_fd' file descriptor. -+ * If there is a new event available 'nvp' will contain a newly allocated -+ * nvlist and 'dropped' will be set to the number of missed events since -+ * the last call to this function. When 'nvp' is set to NULL it indicates -+ * no new events are available. In either case the function returns 0 and -+ * it is up to the caller to free 'nvp'. In the case of a fatal error the -+ * function will return a non-zero value. When the function is called in -+ * blocking mode (the default, unless the ZEVENT_NONBLOCK flag is passed), -+ * it will not return until a new event is available. 
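A brief sketch, not part of the patch, of how a command-line consumer is expected to combine the two helpers introduced above: flatten argv into a single record with zfs_save_arguments(), then log it once per invocation with zpool_log_history() instead of staging it on the handle as the removed code did. HIS_MAX_RECORD_LEN is assumed to be visible through the libzfs headers, as it was for the removed code.

#include <libzfs.h>

static void
log_invocation(libzfs_handle_t *hdl, int argc, char **argv)
{
	char history[HIS_MAX_RECORD_LEN];

	/* Produces a "zpool create tank mirror sda sdb" style record. */
	zfs_save_arguments(argc, argv, history, sizeof (history));

	/* Best effort: history logging failures are not fatal. */
	(void) zpool_log_history(hdl, history);
}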
- */ -@@ -3808,5 +3797,5 @@ int - zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp, -- int *dropped, int block, int cleanup_fd) -+ int *dropped, unsigned flags, int zevent_fd) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int error = 0; -@@ -3815,5 +3804,5 @@ zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp, - *dropped = 0; -- zc.zc_cleanup_fd = cleanup_fd; -+ zc.zc_cleanup_fd = zevent_fd; - -- if (!block) -+ if (flags & ZEVENT_NONBLOCK) - zc.zc_guid = ZEVENT_NONBLOCK; -@@ -3832,3 +3821,3 @@ retry: - /* Blocking error case should not occur */ -- if (block) -+ if (!(flags & ZEVENT_NONBLOCK)) - error = zpool_standard_error_fmt(hdl, errno, -@@ -3869,3 +3858,3 @@ zpool_events_clear(libzfs_handle_t *hdl, int *count) - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - char msg[1024]; -@@ -3884,2 +3873,38 @@ zpool_events_clear(libzfs_handle_t *hdl, int *count) - -+/* -+ * Seek to a specific EID, ZEVENT_SEEK_START, or ZEVENT_SEEK_END for -+ * the passed zevent_fd file handle. On success zero is returned, -+ * otherwise -1 is returned and hdl->libzfs_error is set to the errno. -+ */ -+int -+zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd) -+{ -+ zfs_cmd_t zc = {"\0"}; -+ int error = 0; -+ -+ zc.zc_guid = eid; -+ zc.zc_cleanup_fd = zevent_fd; -+ -+ if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_SEEK, &zc) != 0) { -+ switch (errno) { -+ case ENOENT: -+ error = zfs_error_fmt(hdl, EZFS_NOENT, -+ dgettext(TEXT_DOMAIN, "cannot get event")); -+ break; -+ -+ case ENOMEM: -+ error = zfs_error_fmt(hdl, EZFS_NOMEM, -+ dgettext(TEXT_DOMAIN, "cannot get event")); -+ break; -+ -+ default: -+ error = zpool_standard_error_fmt(hdl, errno, -+ dgettext(TEXT_DOMAIN, "cannot get event")); -+ break; -+ } -+ } -+ -+ return (error); -+} -+ - void -@@ -3888,3 +3913,3 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - boolean_t mounted = B_FALSE; -@@ -3895,3 +3920,4 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, - /* special case for the MOS */ -- (void) snprintf(pathname, len, ":<0x%llx>", (longlong_t)obj); -+ (void) snprintf(pathname, len, ":<0x%llx>", -+ (longlong_t)obj); - return; -@@ -3927,3 +3953,4 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, - } else { -- (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, (longlong_t)obj); -+ (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, -+ (longlong_t)obj); - } -@@ -4027,3 +4054,3 @@ zpool_label_disk_check(char *path) - if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) -- return errno; -+ return (errno); - -@@ -4031,3 +4058,3 @@ zpool_label_disk_check(char *path) - (void) close(fd); -- return err; -+ return (err); - } -@@ -4037,3 +4064,3 @@ zpool_label_disk_check(char *path) - (void) close(fd); -- return EIDRM; -+ return (EIDRM); - } -@@ -4042,3 +4069,3 @@ zpool_label_disk_check(char *path) - (void) close(fd); -- return 0; -+ return (0); - } -@@ -4182,3 +4209,3 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) - -- return 0; -+ return (0); - } -diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c -index 9dbfb16..12ac9bd 100644 ---- a/lib/libzfs/libzfs_sendrecv.c -+++ b/lib/libzfs/libzfs_sendrecv.c -@@ -24,5 +24,6 @@ - * Copyright (c) 2012 by Delphix. All rights reserved. -- * Copyright (c) 2012 Pawel Jakub Dawidek . - * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
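A sketch, not part of the patch, of a non-blocking consumer of the zpool_events_next() interface documented above: each call either returns a freshly allocated event nvlist, which the caller must free, or sets it to NULL once the queue is drained, with 'dropped' counting any missed events. How zevent_fd is obtained (normally by opening the ZFS control device) is left outside the sketch.

#include <libzfs.h>
#include <libnvpair.h>
#include <stdio.h>

static int
drain_events(libzfs_handle_t *hdl, int zevent_fd)
{
	nvlist_t *event;
	int dropped;

	for (;;) {
		if (zpool_events_next(hdl, &event, &dropped,
		    ZEVENT_NONBLOCK, zevent_fd) != 0)
			return (-1);	/* fatal error */

		if (event == NULL)
			return (0);	/* queue drained */

		if (dropped > 0)
			(void) fprintf(stderr, "missed %d events\n", dropped);

		nvlist_print(stdout, event);
		nvlist_free(event);
	}
}

In the patch above the same ZEVENT_NONBLOCK flag replaces the old 'int block' argument, so existing blocking callers simply pass 0 for flags.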
-+ * Copyright (c) 2012 Pawel Jakub Dawidek . - * All rights reserved -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ -@@ -801,2 +802,3 @@ typedef struct send_dump_data { - nvlist_t *fss; -+ nvlist_t *snapholds; - avl_tree_t *fsavl; -@@ -814,3 +816,3 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zfs_hdl; -@@ -878,3 +880,3 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - libzfs_handle_t *hdl = zhp->zfs_hdl; -@@ -950,39 +952,15 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, - --static int --hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd) -+static void -+gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) - { -- zfs_handle_t *pzhp; -- int error = 0; -- char *thissnap; -- - assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); - -- if (sdd->dryrun) -- return (0); -- - /* -- * zfs_send() only opens a cleanup_fd for sends that need it, -+ * zfs_send() only sets snapholds for sends that need them, - * e.g. replication and doall. - */ -- if (sdd->cleanup_fd == -1) -- return (0); -- -- thissnap = strchr(zhp->zfs_name, '@') + 1; -- *(thissnap - 1) = '\0'; -- pzhp = zfs_open(zhp->zfs_hdl, zhp->zfs_name, ZFS_TYPE_DATASET); -- *(thissnap - 1) = '@'; -- -- /* -- * It's OK if the parent no longer exists. The send code will -- * handle that error. -- */ -- if (pzhp) { -- error = zfs_hold(pzhp, thissnap, sdd->holdtag, -- B_FALSE, B_TRUE, B_TRUE, sdd->cleanup_fd, -- zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID), -- zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG)); -- zfs_close(pzhp); -- } -+ if (sdd->snapholds == NULL) -+ return; - -- return (error); -+ fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); - } -@@ -994,3 +972,3 @@ send_progress_thread(void *arg) - -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - zfs_handle_t *zhp = pa->pa_zhp; -@@ -1042,3 +1020,2 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) - pthread_t tid; -- - char *thissnap; -@@ -1048,2 +1025,3 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) - -+ err = 0; - thissnap = strchr(zhp->zfs_name, '@') + 1; -@@ -1053,13 +1031,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) - if (!sdd->seenfrom && isfromsnap) { -- err = hold_for_send(zhp, sdd); -- if (err == 0) { -- sdd->seenfrom = B_TRUE; -- (void) strcpy(sdd->prevsnap, thissnap); -- sdd->prevsnap_obj = zfs_prop_get_int(zhp, -- ZFS_PROP_OBJSETID); -- } else if (err == ENOENT) { -- err = 0; -- } -+ gather_holds(zhp, sdd); -+ sdd->seenfrom = B_TRUE; -+ (void) strcpy(sdd->prevsnap, thissnap); -+ sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); - zfs_close(zhp); -- return (err); -+ return (0); - } -@@ -1114,10 +1087,3 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) - -- err = hold_for_send(zhp, sdd); -- if (err) { -- if (err == ENOENT) -- err = 0; -- zfs_close(zhp); -- return (err); -- } -- -+ gather_holds(zhp, sdd); - fromorigin = sdd->prevsnap[0] == '\0' && -@@ -1197,3 +1163,3 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) - boolean_t missingfrom = B_FALSE; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -1389,3 +1355,3 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - int spa_version; -- pthread_t tid; -+ pthread_t tid = 0; - int pipefd[2]; -@@ -1462,7 +1428,4 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const 
char *tosnap, - nvlist_free(hdrnv); -- if (err) { -- fsavl_destroy(fsavl); -- nvlist_free(fss); -+ if (err) - goto stderr_out; -- } - } -@@ -1490,4 +1453,2 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - if (err == -1) { -- fsavl_destroy(fsavl); -- nvlist_free(fss); - err = errno; -@@ -1502,4 +1463,2 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - if (err == -1) { -- fsavl_destroy(fsavl); -- nvlist_free(fss); - err = errno; -@@ -1515,3 +1474,3 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - sdd.tosnap = tosnap; -- if (flags->dedup) -+ if (tid != 0) - sdd.outfd = pipefd[0]; -@@ -1552,10 +1511,12 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - } -+ sdd.snapholds = fnvlist_alloc(); - } else { - sdd.cleanup_fd = -1; -+ sdd.snapholds = NULL; - } -- if (flags->verbose) { -+ if (flags->verbose || sdd.snapholds != NULL) { - /* - * Do a verbose no-op dry run to get all the verbose output -- * before generating any data. Then do a non-verbose real -- * run to generate the streams. -+ * or to gather snapshot hold's before generating any data, -+ * then do a non-verbose real run to generate the streams. - */ -@@ -1563,14 +1524,41 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - err = dump_filesystems(zhp, &sdd); -- sdd.dryrun = flags->dryrun; -- sdd.verbose = B_FALSE; -- if (flags->parsable) { -- (void) fprintf(stderr, "size\t%llu\n", -- (longlong_t)sdd.size); -- } else { -- char buf[16]; -- zfs_nicenum(sdd.size, buf, sizeof (buf)); -- (void) fprintf(stderr, dgettext(TEXT_DOMAIN, -- "total estimated size is %s\n"), buf); -+ -+ if (err != 0) -+ goto stderr_out; -+ -+ if (flags->verbose) { -+ if (flags->parsable) { -+ (void) fprintf(stderr, "size\t%llu\n", -+ (longlong_t)sdd.size); -+ } else { -+ char buf[16]; -+ zfs_nicenum(sdd.size, buf, sizeof (buf)); -+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN, -+ "total estimated size is %s\n"), buf); -+ } -+ } -+ -+ /* Ensure no snaps found is treated as an error. */ -+ if (!sdd.seento) { -+ err = ENOENT; -+ goto err_out; -+ } -+ -+ /* Skip the second run if dryrun was requested. */ -+ if (flags->dryrun) -+ goto err_out; -+ -+ if (sdd.snapholds != NULL) { -+ err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); -+ if (err != 0) -+ goto stderr_out; -+ -+ fnvlist_free(sdd.snapholds); -+ sdd.snapholds = NULL; - } -+ -+ sdd.dryrun = B_FALSE; -+ sdd.verbose = B_FALSE; - } -+ - err = dump_filesystems(zhp, &sdd); -@@ -1579,3 +1567,9 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - -- if (flags->dedup) { -+ /* Ensure no snaps found is treated as an error. 
*/ -+ if (err == 0 && !sdd.seento) -+ err = ENOENT; -+ -+ if (tid != 0) { -+ if (err != 0) -+ (void) pthread_cancel(tid); - (void) close(pipefd[0]); -@@ -1609,8 +1603,12 @@ stderr_out: - err_out: -+ fsavl_destroy(fsavl); -+ nvlist_free(fss); -+ fnvlist_free(sdd.snapholds); -+ - if (sdd.cleanup_fd != -1) - VERIFY(0 == close(sdd.cleanup_fd)); -- if (flags->dedup) { -+ if (tid != 0) { - (void) pthread_cancel(tid); -- (void) pthread_join(tid, NULL); - (void) close(pipefd[0]); -+ (void) pthread_join(tid, NULL); - } -@@ -1685,3 +1683,3 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, - static int seq; -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int err; -@@ -1721,8 +1719,7 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, - -- if (err != 0 && strncmp(name+baselen, "recv-", 5) != 0) { -+ if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { - seq++; - -- (void) strncpy(newname, name, baselen); -- (void) snprintf(newname+baselen, ZFS_MAXNAMELEN-baselen, -- "recv-%ld-%u", (long) getpid(), seq); -+ (void) snprintf(newname, ZFS_MAXNAMELEN, "%.*srecv-%u-%u", -+ baselen, name, getpid(), seq); - (void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value)); -@@ -1758,3 +1755,3 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - int err = 0; -@@ -2017,3 +2014,3 @@ again: - /* promote it! */ -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - nvlist_t *origin_nvfs; -@@ -2089,3 +2086,3 @@ again: - stream_snapname, &props)) { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - -@@ -2520,3 +2517,3 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - { -- zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc = {"\0"}; - time_t begin_time; -@@ -2651,3 +2648,2 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - */ -- (void) strcpy(zc.zc_top_ds, tosnap); - (void) strcpy(zc.zc_value, tosnap); -@@ -2796,8 +2792,2 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - } -- if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME && -- zvol_remove_link(hdl, zhp->zfs_name) != 0) { -- zfs_close(zhp); -- zcmd_free_nvlists(&zc); -- return (-1); -- } - zfs_close(zhp); -@@ -2894,3 +2884,3 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - if (err == 0 && snapprops_nvlist) { -- zfs_cmd_t zc2 = { "\0", "\0", "\0", "\0", 0 }; -+ zfs_cmd_t zc2 = {"\0"}; - -@@ -3007,6 +2997,2 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - *cp = '@'; -- err = zvol_create_link(hdl, h->zfs_name); -- if (err == 0 && ioctl_err == 0) -- err = zvol_create_link(hdl, -- zc.zc_value); - } else if (newfs || stream_avl) { -diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c -index e6e9230..534ff85 100644 ---- a/lib/libzfs/libzfs_status.c -+++ b/lib/libzfs/libzfs_status.c -@@ -24,2 +24,3 @@ - * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. 
- */ -@@ -68,2 +69,3 @@ static char *zfs_msgid_table[] = { - "ZFS-8000-K4", -+ "ZFS-8000-ER", - }; -@@ -152,2 +154,12 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) - -+ /* -+ * Check any L2 cache devs -+ */ -+ if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, -+ &children) == 0) { -+ for (c = 0; c < children; c++) -+ if (find_vdev_problem(child[c], func)) -+ return (B_TRUE); -+ } -+ - return (B_FALSE); -@@ -173,3 +185,3 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) - static zpool_status_t --check_status(nvlist_t *config, boolean_t isimport) -+check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) - { -@@ -184,2 +196,3 @@ check_status(nvlist_t *config, boolean_t isimport) - uint64_t hostid = 0; -+ uint64_t errata = 0; - unsigned long system_hostid = gethostid() & 0xffffffff; -@@ -347,2 +360,11 @@ check_status(nvlist_t *config, boolean_t isimport) - -+ /* -+ * Informational errata available. -+ */ -+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRATA, &errata); -+ if (errata) { -+ *erratap = errata; -+ return (ZPOOL_STATUS_ERRATA); -+ } -+ - return (ZPOOL_STATUS_OK); -@@ -351,5 +373,5 @@ check_status(nvlist_t *config, boolean_t isimport) - zpool_status_t --zpool_get_status(zpool_handle_t *zhp, char **msgid) -+zpool_get_status(zpool_handle_t *zhp, char **msgid, zpool_errata_t *errata) - { -- zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE); -+ zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata); - -@@ -364,5 +386,5 @@ zpool_get_status(zpool_handle_t *zhp, char **msgid) - zpool_status_t --zpool_import_status(nvlist_t *config, char **msgid) -+zpool_import_status(nvlist_t *config, char **msgid, zpool_errata_t *errata) - { -- zpool_status_t ret = check_status(config, B_TRUE); -+ zpool_status_t ret = check_status(config, B_TRUE, errata); - -diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c -index 5bb88e9..e99603b 100644 ---- a/lib/libzfs/libzfs_util.c -+++ b/lib/libzfs/libzfs_util.c -@@ -47,2 +47,3 @@ - #include -+#include - -@@ -618,4 +619,4 @@ libzfs_module_loaded(const char *module) - -- memcpy(path, path_prefix, sizeof(path_prefix) - 1); -- strcpy(path + sizeof(path_prefix) - 1, module); -+ memcpy(path, path_prefix, sizeof (path_prefix) - 1); -+ strcpy(path + sizeof (path_prefix) - 1, module); - -@@ -653,8 +654,8 @@ libzfs_run_process(const char *path, char *argv[], int flags) - if (rc < 0 || !WIFEXITED(status)) -- return -1; -+ return (-1); - -- return WEXITSTATUS(status); -+ return (WEXITSTATUS(status)); - } - -- return -1; -+ return (-1); - } -@@ -667,5 +668,5 @@ libzfs_load_module(const char *module) - if (libzfs_module_loaded(module)) -- return 0; -+ return (0); - -- return libzfs_run_process("/sbin/modprobe", argv, 0); -+ return (libzfs_run_process("/sbin/modprobe", argv, 0)); - } -@@ -679,4 +680,4 @@ libzfs_init(void) - (void) fprintf(stderr, gettext("Failed to load ZFS module " -- "stack.\nLoad the module manually by running " -- "'insmod /zfs.ko' as root.\n")); -+ "stack.\nLoad the module manually by running " -+ "'insmod /zfs.ko' as root.\n")); - return (NULL); -@@ -690,7 +691,7 @@ libzfs_init(void) - (void) fprintf(stderr, gettext("Unable to open %s: %s.\n"), -- ZFS_DEV, strerror(errno)); -+ ZFS_DEV, strerror(errno)); - if (errno == ENOENT) - (void) fprintf(stderr, -- gettext("Verify the ZFS module stack is " -- "loaded by running '/sbin/modprobe zfs'.\n")); -+ gettext("Verify the ZFS module stack is " -+ "loaded by running 
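The widened zpool_get_status()/zpool_import_status() signatures above add an out-parameter for informational errata. A hedged sketch of a caller, relying only on what this hunk shows (zpool_errata_t, ZPOOL_STATUS_ERRATA, and the new msgid table entry):

#include <stdio.h>
#include <libzfs.h>

static void
report_errata(zpool_handle_t *zhp)
{
	char *msgid = NULL;
	zpool_errata_t errata = 0;
	zpool_status_t status;

	status = zpool_get_status(zhp, &msgid, &errata);
	if (status == ZPOOL_STATUS_ERRATA) {
		/* msgid is expected to point at "ZFS-8000-ER" here. */
		(void) printf("pool errata %d (see %s)\n",
		    (int)errata, msgid != NULL ? msgid : "-");
	}
}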
'/sbin/modprobe zfs'.\n")); - -@@ -714,2 +715,10 @@ libzfs_init(void) - -+ if (libzfs_core_init() != 0) { -+ (void) close(hdl->libzfs_fd); -+ (void) fclose(hdl->libzfs_mnttab); -+ (void) fclose(hdl->libzfs_sharetab); -+ free(hdl); -+ return (NULL); -+ } -+ - zfs_prop_init(); -@@ -735,4 +744,2 @@ libzfs_fini(libzfs_handle_t *hdl) - zfs_uninit_libshare(hdl); -- if (hdl->libzfs_log_str) -- (void) free(hdl->libzfs_log_str); - zpool_free_handles(hdl); -@@ -741,2 +748,3 @@ libzfs_fini(libzfs_handle_t *hdl) - libzfs_mnttab_fini(hdl); -+ libzfs_core_fini(); - free(hdl); -@@ -787,3 +795,6 @@ zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype) - -- rewind(hdl->libzfs_mnttab); -+ /* Reopen MNTTAB to prevent reading stale data from open file */ -+ if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) -+ return (NULL); -+ - while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) { -@@ -908,3 +919,3 @@ zfs_strcmp_shortname(char *name, char *cmp_name, int wholedisk) - -- if ((path_len == cmp_len) && !strcmp(path_name, cmp_name)) { -+ if ((path_len == cmp_len) && strcmp(path_name, cmp_name) == 0) { - error = 0; -@@ -951,3 +962,3 @@ zfs_strcmp_pathname(char *name, char *cmp, int wholedisk) - if (name[0] != '/') -- return zfs_strcmp_shortname(name, cmp_name, wholedisk); -+ return (zfs_strcmp_shortname(name, cmp_name, wholedisk)); - -@@ -1063,13 +1074,3 @@ zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) - { -- int error; -- -- zc->zc_history = (uint64_t)(uintptr_t)hdl->libzfs_log_str; -- error = ioctl(hdl->libzfs_fd, request, zc); -- if (hdl->libzfs_log_str) { -- free(hdl->libzfs_log_str); -- hdl->libzfs_log_str = NULL; -- } -- zc->zc_history = 0; -- -- return (error); -+ return (ioctl(hdl->libzfs_fd, request, zc)); - } -@@ -1317,6 +1318,6 @@ str2shift(libzfs_handle_t *hdl, const char *buf) - (toupper(buf[0]) != 'B' && -- ((toupper(buf[1]) == 'B' && buf[2] == '\0') || -- (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'B' && -- buf[3] == '\0')))) -- return (10*i); -+ ((toupper(buf[1]) == 'B' && buf[2] == '\0') || -+ (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'B' && -+ buf[3] == '\0')))) -+ return (10 * i); - -diff --git a/lib/libzfs_core/Makefile.am b/lib/libzfs_core/Makefile.am -new file mode 100644 -index 0000000..0ecd208 ---- /dev/null -+++ b/lib/libzfs_core/Makefile.am -@@ -0,0 +1,15 @@ -+include $(top_srcdir)/config/Rules.am -+ -+DEFAULT_INCLUDES += \ -+ -I$(top_srcdir)/include \ -+ -I$(top_srcdir)/lib/libspl/include -+ -+lib_LTLIBRARIES = libzfs_core.la -+ -+libzfs_core_la_SOURCES = \ -+ $(top_srcdir)/lib/libzfs_core/libzfs_core.c -+ -+libzfs_core_la_LIBADD = \ -+ $(top_builddir)/lib/libnvpair/libnvpair.la -+ -+libzfs_core_la_LDFLAGS = -version-info 1:0:0 -diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c -new file mode 100644 -index 0000000..3befa4d ---- /dev/null -+++ b/lib/libzfs_core/libzfs_core.c -@@ -0,0 +1,607 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ */ -+ -+/* -+ * LibZFS_Core (lzc) is intended to replace most functionality in libzfs. -+ * It has the following characteristics: -+ * -+ * - Thread Safe. libzfs_core is accessible concurrently from multiple -+ * threads. This is accomplished primarily by avoiding global data -+ * (e.g. caching). Since it's thread-safe, there is no reason for a -+ * process to have multiple libzfs "instances". Therefore, we store -+ * our few pieces of data (e.g. the file descriptor) in global -+ * variables. The fd is reference-counted so that the libzfs_core -+ * library can be "initialized" multiple times (e.g. by different -+ * consumers within the same process). -+ * -+ * - Committed Interface. The libzfs_core interface will be committed, -+ * therefore consumers can compile against it and be confident that -+ * their code will continue to work on future releases of this code. -+ * Currently, the interface is Evolving (not Committed), but we intend -+ * to commit to it once it is more complete and we determine that it -+ * meets the needs of all consumers. -+ * -+ * - Programatic Error Handling. libzfs_core communicates errors with -+ * defined error numbers, and doesn't print anything to stdout/stderr. -+ * -+ * - Thin Layer. libzfs_core is a thin layer, marshaling arguments -+ * to/from the kernel ioctls. There is generally a 1:1 correspondence -+ * between libzfs_core functions and ioctls to /dev/zfs. -+ * -+ * - Clear Atomicity. Because libzfs_core functions are generally 1:1 -+ * with kernel ioctls, and kernel ioctls are general atomic, each -+ * libzfs_core function is atomic. For example, creating multiple -+ * snapshots with a single call to lzc_snapshot() is atomic -- it -+ * can't fail with only some of the requested snapshots created, even -+ * in the event of power loss or system crash. -+ * -+ * - Continued libzfs Support. Some higher-level operations (e.g. -+ * support for "zfs send -R") are too complicated to fit the scope of -+ * libzfs_core. This functionality will continue to live in libzfs. -+ * Where appropriate, libzfs will use the underlying atomic operations -+ * of libzfs_core. For example, libzfs may implement "zfs send -R | -+ * zfs receive" by using individual "send one snapshot", rename, -+ * destroy, and "receive one snapshot" operations in libzfs_core. -+ * /sbin/zfs and /zbin/zpool will link with both libzfs and -+ * libzfs_core. Other consumers should aim to use only libzfs_core, -+ * since that will be the supported, stable interface going forwards. 
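A bare-bones consumer of the new library, sketched from the init/fini and lzc_exists() entry points defined in this file; the header name and the dataset name are assumptions for illustration.

#include <stdio.h>
#include <libzfs_core.h>

int
main(void)
{
	int err;

	/* Opens (or reference-counts) the global /dev/zfs descriptor. */
	if ((err = libzfs_core_init()) != 0)
		return (err);

	/* The thin, thread-safe wrappers can now be called from any thread. */
	if (lzc_exists("tank/home"))		/* hypothetical dataset */
		(void) printf("tank/home exists\n");

	libzfs_core_fini();	/* drops the reference, closes fd on the last one */
	return (0);
}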
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static int g_fd; -+static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; -+static int g_refcount; -+ -+int -+libzfs_core_init(void) -+{ -+ (void) pthread_mutex_lock(&g_lock); -+ if (g_refcount == 0) { -+ g_fd = open("/dev/zfs", O_RDWR); -+ if (g_fd < 0) { -+ (void) pthread_mutex_unlock(&g_lock); -+ return (errno); -+ } -+ } -+ g_refcount++; -+ (void) pthread_mutex_unlock(&g_lock); -+ return (0); -+} -+ -+void -+libzfs_core_fini(void) -+{ -+ (void) pthread_mutex_lock(&g_lock); -+ ASSERT3S(g_refcount, >, 0); -+ g_refcount--; -+ if (g_refcount == 0) -+ (void) close(g_fd); -+ (void) pthread_mutex_unlock(&g_lock); -+} -+ -+static int -+lzc_ioctl(zfs_ioc_t ioc, const char *name, -+ nvlist_t *source, nvlist_t **resultp) -+{ -+ zfs_cmd_t zc = {"\0"}; -+ int error = 0; -+ char *packed; -+ size_t size; -+ -+ ASSERT3S(g_refcount, >, 0); -+ -+ (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); -+ -+ packed = fnvlist_pack(source, &size); -+ zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; -+ zc.zc_nvlist_src_size = size; -+ -+ if (resultp != NULL) { -+ *resultp = NULL; -+ zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); -+ zc.zc_nvlist_dst = (uint64_t)(uintptr_t) -+ malloc(zc.zc_nvlist_dst_size); -+ if (zc.zc_nvlist_dst == (uint64_t)0) { -+ error = ENOMEM; -+ goto out; -+ } -+ } -+ -+ while (ioctl(g_fd, ioc, &zc) != 0) { -+ if (errno == ENOMEM && resultp != NULL) { -+ free((void *)(uintptr_t)zc.zc_nvlist_dst); -+ zc.zc_nvlist_dst_size *= 2; -+ zc.zc_nvlist_dst = (uint64_t)(uintptr_t) -+ malloc(zc.zc_nvlist_dst_size); -+ if (zc.zc_nvlist_dst == (uint64_t)0) { -+ error = ENOMEM; -+ goto out; -+ } -+ } else { -+ error = errno; -+ break; -+ } -+ } -+ if (zc.zc_nvlist_dst_filled) { -+ *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, -+ zc.zc_nvlist_dst_size); -+ } -+ -+out: -+ fnvlist_pack_free(packed, size); -+ free((void *)(uintptr_t)zc.zc_nvlist_dst); -+ return (error); -+} -+ -+int -+lzc_create(const char *fsname, dmu_objset_type_t type, nvlist_t *props) -+{ -+ int error; -+ nvlist_t *args = fnvlist_alloc(); -+ fnvlist_add_int32(args, "type", type); -+ if (props != NULL) -+ fnvlist_add_nvlist(args, "props", props); -+ error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL); -+ nvlist_free(args); -+ return (error); -+} -+ -+int -+lzc_clone(const char *fsname, const char *origin, -+ nvlist_t *props) -+{ -+ int error; -+ nvlist_t *args = fnvlist_alloc(); -+ fnvlist_add_string(args, "origin", origin); -+ if (props != NULL) -+ fnvlist_add_nvlist(args, "props", props); -+ error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL); -+ nvlist_free(args); -+ return (error); -+} -+ -+/* -+ * Creates snapshots. -+ * -+ * The keys in the snaps nvlist are the snapshots to be created. -+ * They must all be in the same pool. -+ * -+ * The props nvlist is properties to set. Currently only user properties -+ * are supported. { user:prop_name -> string value } -+ * -+ * The returned results nvlist will have an entry for each snapshot that failed. -+ * The value will be the (int32) error code. -+ * -+ * The return value will be 0 if all snapshots were created, otherwise it will -+ * be the errno of a (unspecified) snapshot that failed. 
-+ */ -+int -+lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist) -+{ -+ nvpair_t *elem; -+ nvlist_t *args; -+ int error; -+ char pool[MAXNAMELEN]; -+ -+ *errlist = NULL; -+ -+ /* determine the pool name */ -+ elem = nvlist_next_nvpair(snaps, NULL); -+ if (elem == NULL) -+ return (0); -+ (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); -+ pool[strcspn(pool, "/@")] = '\0'; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_nvlist(args, "snaps", snaps); -+ if (props != NULL) -+ fnvlist_add_nvlist(args, "props", props); -+ -+ error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist); -+ nvlist_free(args); -+ -+ return (error); -+} -+ -+/* -+ * Destroys snapshots. -+ * -+ * The keys in the snaps nvlist are the snapshots to be destroyed. -+ * They must all be in the same pool. -+ * -+ * Snapshots that do not exist will be silently ignored. -+ * -+ * If 'defer' is not set, and a snapshot has user holds or clones, the -+ * destroy operation will fail and none of the snapshots will be -+ * destroyed. -+ * -+ * If 'defer' is set, and a snapshot has user holds or clones, it will be -+ * marked for deferred destruction, and will be destroyed when the last hold -+ * or clone is removed/destroyed. -+ * -+ * The return value will be 0 if all snapshots were destroyed (or marked for -+ * later destruction if 'defer' is set) or didn't exist to begin with. -+ * -+ * Otherwise the return value will be the errno of a (unspecified) snapshot -+ * that failed, no snapshots will be destroyed, and the errlist will have an -+ * entry for each snapshot that failed. The value in the errlist will be -+ * the (int32) error code. -+ */ -+int -+lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist) -+{ -+ nvpair_t *elem; -+ nvlist_t *args; -+ int error; -+ char pool[MAXNAMELEN]; -+ -+ /* determine the pool name */ -+ elem = nvlist_next_nvpair(snaps, NULL); -+ if (elem == NULL) -+ return (0); -+ (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); -+ pool[strcspn(pool, "/@")] = '\0'; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_nvlist(args, "snaps", snaps); -+ if (defer) -+ fnvlist_add_boolean(args, "defer"); -+ -+ error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist); -+ nvlist_free(args); -+ -+ return (error); -+} -+ -+int -+lzc_snaprange_space(const char *firstsnap, const char *lastsnap, -+ uint64_t *usedp) -+{ -+ nvlist_t *args; -+ nvlist_t *result; -+ int err; -+ char fs[MAXNAMELEN]; -+ char *atp; -+ -+ /* determine the fs name */ -+ (void) strlcpy(fs, firstsnap, sizeof (fs)); -+ atp = strchr(fs, '@'); -+ if (atp == NULL) -+ return (EINVAL); -+ *atp = '\0'; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_string(args, "firstsnap", firstsnap); -+ -+ err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result); -+ nvlist_free(args); -+ if (err == 0) -+ *usedp = fnvlist_lookup_uint64(result, "used"); -+ fnvlist_free(result); -+ -+ return (err); -+} -+ -+boolean_t -+lzc_exists(const char *dataset) -+{ -+ /* -+ * The objset_stats ioctl is still legacy, so we need to construct our -+ * own zfs_cmd_t rather than using zfsc_ioctl(). -+ */ -+ zfs_cmd_t zc = {"\0"}; -+ -+ (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); -+ return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); -+} -+ -+/* -+ * Create "user holds" on snapshots. If there is a hold on a snapshot, -+ * the snapshot can not be destroyed. (However, it can be marked for deletion -+ * by lzc_destroy_snaps(defer=B_TRUE).) -+ * -+ * The keys in the nvlist are snapshot names. -+ * The snapshots must all be in the same pool. 
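A sketch of the atomic snapshot create/destroy pair documented above; illustrative only, with made-up dataset names and errlist handling reduced to a free.

#include <libnvpair.h>
#include <libzfs_core.h>

static int
snapshot_then_destroy(void)
{
	nvlist_t *snaps = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	int err;

	/* The keys are the snapshot names; both are created atomically. */
	fnvlist_add_boolean(snaps, "tank/a@backup");
	fnvlist_add_boolean(snaps, "tank/b@backup");
	err = lzc_snapshot(snaps, NULL, &errlist);
	nvlist_free(errlist);

	if (err == 0) {
		errlist = NULL;
		err = lzc_destroy_snaps(snaps, B_FALSE, &errlist);
		nvlist_free(errlist);
	}

	fnvlist_free(snaps);
	return (err);
}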
-+ * The value is the name of the hold (string type). -+ * -+ * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL). -+ * In this case, when the cleanup_fd is closed (including on process -+ * termination), the holds will be released. If the system is shut down -+ * uncleanly, the holds will be released when the pool is next opened -+ * or imported. -+ * -+ * Holds for snapshots which don't exist will be skipped and have an entry -+ * added to errlist, but will not cause an overall failure. -+ * -+ * The return value will be 0 if all holds, for snapshots that existed, -+ * were succesfully created. -+ * -+ * Otherwise the return value will be the errno of a (unspecified) hold that -+ * failed and no holds will be created. -+ * -+ * In all cases the errlist will have an entry for each hold that failed -+ * (name = snapshot), with its value being the error code (int32). -+ */ -+int -+lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist) -+{ -+ char pool[MAXNAMELEN]; -+ nvlist_t *args; -+ nvpair_t *elem; -+ int error; -+ -+ /* determine the pool name */ -+ elem = nvlist_next_nvpair(holds, NULL); -+ if (elem == NULL) -+ return (0); -+ (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); -+ pool[strcspn(pool, "/@")] = '\0'; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_nvlist(args, "holds", holds); -+ if (cleanup_fd != -1) -+ fnvlist_add_int32(args, "cleanup_fd", cleanup_fd); -+ -+ error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist); -+ nvlist_free(args); -+ return (error); -+} -+ -+/* -+ * Release "user holds" on snapshots. If the snapshot has been marked for -+ * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have -+ * any clones, and all the user holds are removed, then the snapshot will be -+ * destroyed. -+ * -+ * The keys in the nvlist are snapshot names. -+ * The snapshots must all be in the same pool. -+ * The value is a nvlist whose keys are the holds to remove. -+ * -+ * Holds which failed to release because they didn't exist will have an entry -+ * added to errlist, but will not cause an overall failure. -+ * -+ * The return value will be 0 if the nvl holds was empty or all holds that -+ * existed, were successfully removed. -+ * -+ * Otherwise the return value will be the errno of a (unspecified) hold that -+ * failed to release and no holds will be released. -+ * -+ * In all cases the errlist will have an entry for each hold that failed to -+ * to release. -+ */ -+int -+lzc_release(nvlist_t *holds, nvlist_t **errlist) -+{ -+ char pool[MAXNAMELEN]; -+ nvpair_t *elem; -+ -+ /* determine the pool name */ -+ elem = nvlist_next_nvpair(holds, NULL); -+ if (elem == NULL) -+ return (0); -+ (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); -+ pool[strcspn(pool, "/@")] = '\0'; -+ -+ return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist)); -+} -+ -+/* -+ * Retrieve list of user holds on the specified snapshot. -+ * -+ * On success, *holdsp will be set to a nvlist which the caller must free. -+ * The keys are the names of the holds, and the value is the creation time -+ * of the hold (uint64) in seconds since the epoch. -+ */ -+int -+lzc_get_holds(const char *snapname, nvlist_t **holdsp) -+{ -+ int error; -+ nvlist_t *innvl = fnvlist_alloc(); -+ error = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, innvl, holdsp); -+ fnvlist_free(innvl); -+ return (error); -+} -+ -+/* -+ * If fromsnap is NULL, a full (non-incremental) stream will be sent. 
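The hold/release cycle documented above, sketched with placeholder names; passing a cleanup_fd of -1 makes the hold persist rather than being released when a descriptor closes.

#include <libnvpair.h>
#include <libzfs_core.h>

static int
hold_then_release(const char *snap)
{
	nvlist_t *holds, *tags, *errlist = NULL;
	int err;

	/* snapshot name -> hold tag */
	holds = fnvlist_alloc();
	fnvlist_add_string(holds, snap, "example-tag");
	err = lzc_hold(holds, -1, &errlist);
	fnvlist_free(holds);
	nvlist_free(errlist);
	if (err != 0)
		return (err);

	/* snapshot name -> nvlist whose keys are the tags to drop */
	errlist = NULL;
	tags = fnvlist_alloc();
	fnvlist_add_boolean(tags, "example-tag");
	holds = fnvlist_alloc();
	fnvlist_add_nvlist(holds, snap, tags);
	err = lzc_release(holds, &errlist);
	fnvlist_free(tags);
	fnvlist_free(holds);
	nvlist_free(errlist);
	return (err);
}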
-+ */ -+int -+lzc_send(const char *snapname, const char *fromsnap, int fd) -+{ -+ nvlist_t *args; -+ int err; -+ -+ args = fnvlist_alloc(); -+ fnvlist_add_int32(args, "fd", fd); -+ if (fromsnap != NULL) -+ fnvlist_add_string(args, "fromsnap", fromsnap); -+ err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); -+ nvlist_free(args); -+ return (err); -+} -+ -+/* -+ * If fromsnap is NULL, a full (non-incremental) stream will be estimated. -+ */ -+int -+lzc_send_space(const char *snapname, const char *fromsnap, uint64_t *spacep) -+{ -+ nvlist_t *args; -+ nvlist_t *result; -+ int err; -+ -+ args = fnvlist_alloc(); -+ if (fromsnap != NULL) -+ fnvlist_add_string(args, "fromsnap", fromsnap); -+ err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); -+ nvlist_free(args); -+ if (err == 0) -+ *spacep = fnvlist_lookup_uint64(result, "space"); -+ nvlist_free(result); -+ return (err); -+} -+ -+static int -+recv_read(int fd, void *buf, int ilen) -+{ -+ char *cp = buf; -+ int rv; -+ int len = ilen; -+ -+ do { -+ rv = read(fd, cp, len); -+ cp += rv; -+ len -= rv; -+ } while (rv > 0); -+ -+ if (rv < 0 || len != 0) -+ return (EIO); -+ -+ return (0); -+} -+ -+/* -+ * The simplest receive case: receive from the specified fd, creating the -+ * specified snapshot. Apply the specified properties a "received" properties -+ * (which can be overridden by locally-set properties). If the stream is a -+ * clone, its origin snapshot must be specified by 'origin'. The 'force' -+ * flag will cause the target filesystem to be rolled back or destroyed if -+ * necessary to receive. -+ * -+ * Return 0 on success or an errno on failure. -+ * -+ * Note: this interface does not work on dedup'd streams -+ * (those with DMU_BACKUP_FEATURE_DEDUP). -+ */ -+int -+lzc_receive(const char *snapname, nvlist_t *props, const char *origin, -+ boolean_t force, int fd) -+{ -+ /* -+ * The receive ioctl is still legacy, so we need to construct our own -+ * zfs_cmd_t rather than using zfsc_ioctl(). -+ */ -+ zfs_cmd_t zc = {"\0"}; -+ char *atp; -+ char *packed = NULL; -+ size_t size; -+ dmu_replay_record_t drr; -+ int error; -+ -+ ASSERT3S(g_refcount, >, 0); -+ -+ /* zc_name is name of containing filesystem */ -+ (void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name)); -+ atp = strchr(zc.zc_name, '@'); -+ if (atp == NULL) -+ return (EINVAL); -+ *atp = '\0'; -+ -+ /* if the fs does not exist, try its parent. 
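lzc_send_space() and lzc_send() combined, as described above. The snapshot names are placeholders and the output descriptor may be any writable fd; passing NULL for fromsnap would produce a full rather than incremental stream.

#include <stdio.h>
#include <libzfs_core.h>

static int
send_incremental(int outfd)
{
	const char *tosnap = "tank/fs@today";		/* placeholder */
	const char *fromsnap = "tank/fs@yesterday";	/* placeholder */
	uint64_t bytes;
	int err;

	/* Estimate first, then generate the stream to 'outfd'. */
	err = lzc_send_space(tosnap, fromsnap, &bytes);
	if (err == 0)
		(void) printf("estimated stream size: %llu bytes\n",
		    (unsigned long long)bytes);

	return (err != 0 ? err : lzc_send(tosnap, fromsnap, outfd));
}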
*/ -+ if (!lzc_exists(zc.zc_name)) { -+ char *slashp = strrchr(zc.zc_name, '/'); -+ if (slashp == NULL) -+ return (ENOENT); -+ *slashp = '\0'; -+ -+ } -+ -+ /* zc_value is full name of the snapshot to create */ -+ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); -+ -+ if (props != NULL) { -+ /* zc_nvlist_src is props to set */ -+ packed = fnvlist_pack(props, &size); -+ zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; -+ zc.zc_nvlist_src_size = size; -+ } -+ -+ /* zc_string is name of clone origin (if DRR_FLAG_CLONE) */ -+ if (origin != NULL) -+ (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string)); -+ -+ /* zc_begin_record is non-byteswapped BEGIN record */ -+ error = recv_read(fd, &drr, sizeof (drr)); -+ if (error != 0) -+ goto out; -+ zc.zc_begin_record = drr.drr_u.drr_begin; -+ -+ /* zc_cookie is fd to read from */ -+ zc.zc_cookie = fd; -+ -+ /* zc guid is force flag */ -+ zc.zc_guid = force; -+ -+ /* zc_cleanup_fd is unused */ -+ zc.zc_cleanup_fd = -1; -+ -+ error = ioctl(g_fd, ZFS_IOC_RECV, &zc); -+ if (error != 0) -+ error = errno; -+ -+out: -+ if (packed != NULL) -+ fnvlist_pack_free(packed, size); -+ free((void*)(uintptr_t)zc.zc_nvlist_dst); -+ return (error); -+} -+ -+/* -+ * Roll back this filesystem or volume to its most recent snapshot. -+ * If snapnamebuf is not NULL, it will be filled in with the name -+ * of the most recent snapshot. -+ * -+ * Return 0 on success or an errno on failure. -+ */ -+int -+lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen) -+{ -+ nvlist_t *args; -+ nvlist_t *result; -+ int err; -+ -+ args = fnvlist_alloc(); -+ err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); -+ nvlist_free(args); -+ if (err == 0 && snapnamebuf != NULL) { -+ const char *snapname = fnvlist_lookup_string(result, "target"); -+ (void) strlcpy(snapnamebuf, snapname, snapnamelen); -+ } -+ return (err); -+} -diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am -index cbba388..e4189a3 100644 ---- a/lib/libzpool/Makefile.am -+++ b/lib/libzpool/Makefile.am -@@ -27,2 +27,3 @@ libzpool_la_SOURCES = \ - $(top_srcdir)/module/zfs/dbuf.c \ -+ $(top_srcdir)/module/zfs/dbuf_stats.c \ - $(top_srcdir)/module/zfs/ddt.c \ -@@ -47,2 +48,4 @@ libzpool_la_SOURCES = \ - $(top_srcdir)/module/zfs/dsl_synctask.c \ -+ $(top_srcdir)/module/zfs/dsl_destroy.c \ -+ $(top_srcdir)/module/zfs/dsl_userhold.c \ - $(top_srcdir)/module/zfs/fm.c \ -@@ -62,2 +65,3 @@ libzpool_la_SOURCES = \ - $(top_srcdir)/module/zfs/spa_misc.c \ -+ $(top_srcdir)/module/zfs/spa_stats.c \ - $(top_srcdir)/module/zfs/space_map.c \ -@@ -99,3 +103,4 @@ libzpool_la_LIBADD = \ - --libzpool_la_LDFLAGS = -pthread -version-info 1:1:0 -+libzpool_la_LIBADD += $(ZLIB) -+libzpool_la_LDFLAGS = -version-info 2:0:0 - -diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c -index f7aeeb4..05bbd06 100644 ---- a/lib/libzpool/kernel.c -+++ b/lib/libzpool/kernel.c -@@ -36,2 +36,3 @@ - #include -+#include - #include -@@ -75,3 +76,3 @@ thread_init(void) - /* Create entry for primary kthread */ -- kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL); -+ kt = umem_zalloc(sizeof (kthread_t), UMEM_NOFAIL); - kt->t_tid = pthread_self(); -@@ -94,3 +95,3 @@ thread_fini(void) - -- umem_free(kt, sizeof(kthread_t)); -+ umem_free(kt, sizeof (kthread_t)); - -@@ -118,3 +119,3 @@ zk_thread_current(void) - -- return kt; -+ return (kt); - } -@@ -138,3 +139,3 @@ zk_thread_helper(void *arg) - -- return NULL; -+ return (NULL); - } -@@ -143,3 +144,3 @@ kthread_t * - zk_thread_create(caddr_t stk, size_t stksize, thread_func_t 
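The simplest receive path from the lzc_receive() comment above, sketched with a placeholder target snapshot and no received properties or clone origin.

#include <libzfs_core.h>

static int
receive_full_stream(int infd)
{
	/*
	 * Read a full, non-deduplicated stream from 'infd' and create the
	 * named snapshot.  B_TRUE forces a rollback or destroy of the
	 * target filesystem if that is required to apply the stream.
	 */
	return (lzc_receive("tank/restore@received", NULL, NULL,
	    B_TRUE, infd));
}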
func, void *arg, -- size_t len, proc_t *pp, int state, pri_t pri, int detachstate) -+ size_t len, proc_t *pp, int state, pri_t pri, int detachstate) - { -@@ -151,3 +152,3 @@ zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, - -- kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL); -+ kt = umem_zalloc(sizeof (kthread_t), UMEM_NOFAIL); - kt->t_func = func; -@@ -189,3 +190,3 @@ zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, - -- return kt; -+ return (kt); - } -@@ -199,3 +200,3 @@ zk_thread_exit(void) - -- umem_free(kt, sizeof(kthread_t)); -+ umem_free(kt, sizeof (kthread_t)); - -@@ -225,4 +226,4 @@ zk_thread_join(kt_did_t tid) - kstat_t * --kstat_create(char *module, int instance, char *name, char *class, -- uchar_t type, ulong_t ndata, uchar_t ks_flag) -+kstat_create(const char *module, int instance, const char *name, -+ const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag) - { -@@ -241,2 +242,39 @@ kstat_delete(kstat_t *ksp) - -+/*ARGSUSED*/ -+void -+kstat_waitq_enter(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_waitq_exit(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_runq_enter(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_runq_exit(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_waitq_to_runq(kstat_io_t *kiop) -+{} -+ -+/*ARGSUSED*/ -+void -+kstat_runq_back_to_waitq(kstat_io_t *kiop) -+{} -+ -+void -+kstat_set_raw_ops(kstat_t *ksp, -+ int (*headers)(char *buf, size_t size), -+ int (*data)(char *buf, size_t size, void *data), -+ void *(*addr)(kstat_t *ksp, loff_t index)) -+{} -+ - /* -@@ -492,2 +530,37 @@ top: - -+/*ARGSUSED*/ -+clock_t -+cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, -+ int flag) -+{ -+ int error; -+ timestruc_t ts; -+ hrtime_t delta; -+ -+ ASSERT(flag == 0); -+ -+top: -+ delta = tim - gethrtime(); -+ if (delta <= 0) -+ return (-1); -+ -+ ts.tv_sec = delta / NANOSEC; -+ ts.tv_nsec = delta % NANOSEC; -+ -+ ASSERT(mutex_owner(mp) == curthread); -+ mp->m_owner = NULL; -+ error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts); -+ mp->m_owner = curthread; -+ -+ if (error == ETIME) -+ return (-1); -+ -+ if (error == EINTR) -+ goto top; -+ -+ ASSERT(error == 0); -+ -+ return (1); -+} -+ - void -@@ -666,3 +739,3 @@ vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, - */ -- abort(); -+ abort(); - } -@@ -1044,2 +1117,4 @@ kernel_init(int mode) - { -+ extern uint_t rrw_tsd_key; -+ - umem_nofail_callback(umem_out_of_memory); -@@ -1061,2 +1136,4 @@ kernel_init(int mode) - spa_init(mode); -+ -+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy); - } -@@ -1084,2 +1161,8 @@ crgetuid(cred_t *cr) - -+uid_t -+crgetruid(cred_t *cr) -+{ -+ return (0); -+} -+ - gid_t -diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c -index 96c0d5c..72807f6 100644 ---- a/lib/libzpool/taskq.c -+++ b/lib/libzpool/taskq.c -@@ -159,3 +159,3 @@ taskq_empty_ent(taskq_ent_t *t) - { -- return t->tqent_next == NULL; -+ return (t->tqent_next == NULL); - } -@@ -289,3 +289,4 @@ taskq_create(const char *name, int nthreads, pri_t pri, - tq->tq_task.tqent_prev = &tq->tq_task; -- tq->tq_threadlist = kmem_alloc(nthreads*sizeof(kthread_t *), KM_SLEEP); -+ tq->tq_threadlist = kmem_alloc(nthreads * sizeof (kthread_t *), -+ KM_SLEEP); - -diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am -index 9d44398..113cd0d 100644 ---- a/man/man1/Makefile.am -+++ b/man/man1/Makefile.am -@@ -1,3 +1,3 @@ --man_MANS = zhack.1 zpios.1 ztest.1 --EXTRA_DIST = $(man_MANS) -+dist_man_MANS = 
zhack.1 zpios.1 ztest.1 -+EXTRA_DIST = cstyle.1 - -diff --git a/man/man1/cstyle.1 b/man/man1/cstyle.1 -new file mode 100644 -index 0000000..f467c55 ---- /dev/null -+++ b/man/man1/cstyle.1 -@@ -0,0 +1,167 @@ -+.\" Copyright 2009 Sun Microsystems, Inc. All rights reserved. -+.\" Use is subject to license terms. -+.\" -+.\" CDDL HEADER START -+.\" -+.\" The contents of this file are subject to the terms of the -+.\" Common Development and Distribution License (the "License"). -+.\" You may not use this file except in compliance with the License. -+.\" -+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+.\" or http://www.opensolaris.org/os/licensing. -+.\" See the License for the specific language governing permissions -+.\" and limitations under the License. -+.\" -+.\" When distributing Covered Code, include this CDDL HEADER in each -+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+.\" If applicable, add the following below this CDDL HEADER, with the -+.\" fields enclosed by brackets "[]" replaced with your own identifying -+.\" information: Portions Copyright [yyyy] [name of copyright owner] -+.\" -+.\" CDDL HEADER END -+.\" -+.TH cstyle 1 "28 March 2005" -+.SH NAME -+.I cstyle -+\- check for some common stylistic errors in C source files -+.SH SYNOPSIS -+\fBcstyle [-chpvCP] [-o constructs] [file...]\fP -+.LP -+.SH DESCRIPTION -+.IX "OS-Net build tools" "cstyle" "" "\fBcstyle\fP" -+.LP -+.I cstyle -+inspects C source files (*.c and *.h) for common sylistic errors. It -+attempts to check for the cstyle documented in -+\fIhttp://www.cis.upenn.edu/~lee/06cse480/data/cstyle.ms.pdf\fP. -+Note that there is much in that document that -+.I cannot -+be checked for; just because your code is \fBcstyle(1)\fP clean does not -+mean that you've followed Sun's C style. \fICaveat emptor\fP. -+.LP -+.SH OPTIONS -+.LP -+The following options are supported: -+.TP 4 -+.B \-c -+Check continuation line indentation inside of functions. Sun's C style -+states that all statements must be indented to an appropriate tab stop, -+and any continuation lines after them must be indented \fIexactly\fP four -+spaces from the start line. This option enables a series of checks -+designed to find contination line problems within functions only. The -+checks have some limitations; see CONTINUATION CHECKING, below. -+.LP -+.TP 4 -+.B \-h -+Performs heuristic checks that are sometimes wrong. Not generally used. -+.LP -+.TP 4 -+.B \-p -+Performs some of the more picky checks. Includes ANSI #else and #endif -+rules, and tries to detect spaces after casts. Used as part of the -+putback checks. -+.LP -+.TP 4 -+.B \-v -+Verbose output; includes the text of the line of error, and, for -+\fB-c\fP, the first statement in the current continuation block. -+.LP -+.TP 4 -+.B \-C -+Ignore errors in header comments (i.e. block comments starting in the -+first column). Not generally used. -+.LP -+.TP 4 -+.B \-P -+Check for use of non-POSIX types. Historically, types like "u_int" and -+"u_long" were used, but they are now deprecated in favor of the POSIX -+types uint_t, ulong_t, etc. This detects any use of the deprecated -+types. Used as part of the putback checks. -+.LP -+.TP 4 -+.B \-o \fIconstructs\fP -+Allow a comma-seperated list of additional constructs. 
Available -+constructs include: -+.LP -+.TP 10 -+.B doxygen -+Allow doxygen-style block comments (\fB/**\fP and \fB/*!\fP) -+.LP -+.TP 10 -+.B splint -+Allow splint-style lint comments (\fB/*@...@*/\fP) -+.LP -+.SH NOTES -+.LP -+The cstyle rule for the OS/Net consolidation is that all new files must -+be \fB-pP\fP clean. For existing files, the following invocations are -+run against both the old and new files: -+.LP -+.TP 4 -+\fBcstyle file\fB -+.LP -+.TP 4 -+\fBcstyle -p file\fB -+.LP -+.TP 4 -+\fBcstyle -pP file\fB -+.LP -+If the old file gave no errors for one of the invocations, the new file -+must also give no errors. This way, files can only become more clean. -+.LP -+.SH CONTINUATION CHECKING -+.LP -+The continuation checker is a resonably simple state machine that knows -+something about how C is layed out, and can match parenthesis, etc. over -+multiple lines. It does have some limitations: -+.LP -+.TP 4 -+.B 1. -+Preprocessor macros which cause unmatched parenthesis will confuse the -+checker for that line. To fix this, you'll need to make sure that each -+branch of the #if statement has balanced parenthesis. -+.LP -+.TP 4 -+.B 2. -+Some \fBcpp\fP macros do not require ;s after them. Any such macros -+*must* be ALL_CAPS; any lower case letters will cause bad output. -+.LP -+The bad output will generally be corrected after the next \fB;\fP, -+\fB{\fP, or \fB}\fP. -+.LP -+Some continuation error messages deserve some additional explanation -+.LP -+.TP 4 -+.B -+multiple statements continued over multiple lines -+A multi-line statement which is not broken at statement -+boundries. For example: -+.RS 4 -+.HP 4 -+if (this_is_a_long_variable == another_variable) a = -+.br -+b + c; -+.LP -+Will trigger this error. Instead, do: -+.HP 8 -+if (this_is_a_long_variable == another_variable) -+.br -+a = b + c; -+.RE -+.LP -+.TP 4 -+.B -+empty if/for/while body not on its own line -+For visibility, empty bodies for if, for, and while statements should be -+on their own line. For example: -+.RS 4 -+.HP 4 -+while (do_something(&x) == 0); -+.LP -+Will trigger this error. Instead, do: -+.HP 8 -+while (do_something(&x) == 0) -+.br -+; -+.RE -+ -diff --git a/man/man1/zhack.1 b/man/man1/zhack.1 -index 26a46f1..007be77 100644 ---- a/man/man1/zhack.1 -+++ b/man/man1/zhack.1 -@@ -25,4 +25,5 @@ - .TH zhack 1 "2013 MAR 16" "ZFS on Linux" "User Commands" -+ - .SH NAME --.BR zhack " \- libzpool debugging tool" -+zhack \- libzpool debugging tool - .SH DESCRIPTION -diff --git a/man/man5/Makefile.am b/man/man5/Makefile.am -index aac4d0b..fcb73f4 100644 ---- a/man/man5/Makefile.am -+++ b/man/man5/Makefile.am -@@ -1,3 +1,2 @@ --man_MANS = vdev_id.conf.5 zpool-features.5 --EXTRA_DIST = $(man_MANS) -+dist_man_MANS = vdev_id.conf.5 zpool-features.5 zfs-module-parameters.5 - -diff --git a/man/man5/vdev_id.conf.5 b/man/man5/vdev_id.conf.5 -index df3f59f..7ac3247 100644 ---- a/man/man5/vdev_id.conf.5 -+++ b/man/man5/vdev_id.conf.5 -@@ -51,5 +51,13 @@ connected to the disk enclosure being mapped. - .TP --\fIslot\fR --Maps a disk slot number as reported by the operating system --to an alternative slot number. -+\fIslot\fR [channel] -+Maps a disk slot number as reported by the operating system to an -+alternative slot number. If the \fIchannel\fR parameter is specified -+then the mapping is only applied to slots in the named channel, -+otherwise the mapping is applied to all channels. The first-specified -+\fIslot\fR rule that can match a slot takes precedence. 
Therefore a -+channel-specific mapping for a given slot should generally appear before -+a generic mapping for the same slot. In this way a custom mapping may -+be applied to a particular channel and a default mapping applied to the -+others. -+ - .TP -@@ -86,46 +94,29 @@ arbitrary slot re-mapping. - .P -+.nf - multipath no --.br - topology sas_direct --.br - phys_per_port 4 --.br - --.br - # PCI_SLOT HBA PORT CHANNEL NAME --.br - channel 85:00.0 1 A --.br - channel 85:00.0 0 B --.br - channel 86:00.0 1 C --.br - channel 86:00.0 0 D --.br - --.br -+ # Custom mapping for Channel A -+ - # Linux Mapped --.br -- # Slot Slot --.br -- slot 1 7 --.br -- slot 2 10 --.br -- slot 3 3 --.br -- slot 4 6 --.br -- slot 5 2 --.br -- slot 6 8 --.br -- slot 7 1 --.br -- slot 8 4 --.br -- slot 9 9 --.br -- slot 10 5 --.br -+ # Slot Slot Channel -+ slot 1 7 A -+ slot 2 10 A -+ slot 3 3 A -+ slot 4 6 A -+ -+ # Default mapping for B, C, and D -+ -+ slot 1 4 -+ slot 2 2 -+ slot 3 1 -+ slot 4 3 -+.fi - .P -@@ -135,16 +126,11 @@ keyword takes only two arguments in this example. - .P -+.nf - topology sas_switch --.br - --.br - # SWITCH PORT CHANNEL NAME --.br - channel 1 A --.br - channel 2 B --.br - channel 3 C --.br - channel 4 D --.br -+.fi - .P -@@ -153,16 +139,11 @@ definitions - one per physical path. - .P -+.nf - multipath yes --.br - --.br - # PCI_SLOT HBA PORT CHANNEL NAME --.br - channel 85:00.0 1 A --.br - channel 85:00.0 0 B --.br - channel 86:00.0 1 A --.br - channel 86:00.0 0 B --.br -+.fi - .P -@@ -170,11 +151,8 @@ A configuration using device link aliases. - .P --.br -+.nf - # by-vdev --.br - # name fully qualified or base name of device link --.br - alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca --.br - alias d2 wwn-0x5000c5002def789e --.br -+.fi - .P -diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 -new file mode 100644 -index 0000000..e0d44d2 ---- /dev/null -+++ b/man/man5/zfs-module-parameters.5 -@@ -0,0 +1,1375 @@ -+'\" te -+.\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. -+.\" The contents of this file are subject to the terms of the Common Development -+.\" and Distribution License (the "License"). You may not use this file except -+.\" in compliance with the License. You can obtain a copy of the license at -+.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. -+.\" -+.\" See the License for the specific language governing permissions and -+.\" limitations under the License. When distributing Covered Code, include this -+.\" CDDL HEADER in each file and include the License file at -+.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this -+.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your -+.\" own identifying information: -+.\" Portions Copyright [yyyy] [name of copyright owner] -+.TH ZFS-MODULE-PARAMETERS 5 "Nov 16, 2013" -+.SH NAME -+zfs\-module\-parameters \- ZFS module parameters -+.SH DESCRIPTION -+.sp -+.LP -+Description of the different parameters to the ZFS module. -+ -+.SS "Module parameters" -+.sp -+.LP -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_feed_again\fR (int) -+.ad -+.RS 12n -+Turbo L2ARC warmup -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_feed_min_ms\fR (ulong) -+.ad -+.RS 12n -+Min feed interval in milliseconds -+.sp -+Default value: \fB200\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_feed_secs\fR (ulong) -+.ad -+.RS 12n -+Seconds between L2ARC writing -+.sp -+Default value: \fB1\fR. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_headroom\fR (ulong) -+.ad -+.RS 12n -+Number of max device writes to precache -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_headroom_boost\fR (ulong) -+.ad -+.RS 12n -+Compressed l2arc_headroom multiplier -+.sp -+Default value: \fB200\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_nocompress\fR (int) -+.ad -+.RS 12n -+Skip compressing L2ARC buffers -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_noprefetch\fR (int) -+.ad -+.RS 12n -+Skip caching prefetched buffers -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_norw\fR (int) -+.ad -+.RS 12n -+No reads during writes -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_write_boost\fR (ulong) -+.ad -+.RS 12n -+Extra write bytes during device warmup -+.sp -+Default value: \fB8,388,608\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBl2arc_write_max\fR (ulong) -+.ad -+.RS 12n -+Max write bytes per interval -+.sp -+Default value: \fB8,388,608\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBmetaslab_debug\fR (int) -+.ad -+.RS 12n -+Keep space maps in core to verify frees -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspa_config_path\fR (charp) -+.ad -+.RS 12n -+SPA config file -+.sp -+Default value: \fB/etc/zfs/zpool.cache\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBspa_asize_inflation\fR (int) -+.ad -+.RS 12n -+Multiplication factor used to estimate actual disk consumption from the -+size of data being written. The default value is a worst case estimate, -+but lower values may be valid for a given pool depending on its -+configuration. Pool administrators who understand the factors involved -+may wish to specify a more realistic inflation factor, particularly if -+they operate close to quota or capacity limits. -+.sp -+Default value: 24 -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfetch_array_rd_sz\fR (ulong) -+.ad -+.RS 12n -+Number of bytes in a array_read -+.sp -+Default value: \fB1,048,576\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfetch_block_cap\fR (uint) -+.ad -+.RS 12n -+Max number of blocks to fetch at a time -+.sp -+Default value: \fB256\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfetch_max_streams\fR (uint) -+.ad -+.RS 12n -+Max number of streams per zfetch -+.sp -+Default value: \fB8\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfetch_min_sec_reap\fR (uint) -+.ad -+.RS 12n -+Min time before stream reclaim -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_grow_retry\fR (int) -+.ad -+.RS 12n -+Seconds before growing arc size -+.sp -+Default value: \fB5\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_max\fR (ulong) -+.ad -+.RS 12n -+Max arc size -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_memory_throttle_disable\fR (int) -+.ad -+.RS 12n -+Disable memory throttle -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_meta_limit\fR (ulong) -+.ad -+.RS 12n -+Meta limit for arc size -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_meta_prune\fR (int) -+.ad -+.RS 12n -+Bytes of meta data to prune -+.sp -+Default value: \fB1,048,576\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_min\fR (ulong) -+.ad -+.RS 12n -+Min arc size -+.sp -+Default value: \fB100\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_min_prefetch_lifespan\fR (int) -+.ad -+.RS 12n -+Min life of prefetch block -+.sp -+Default value: \fB100\fR. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_p_aggressive_disable\fR (int) -+.ad -+.RS 12n -+Disable aggressive arc_p growth -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_p_dampener_disable\fR (int) -+.ad -+.RS 12n -+Disable arc_p adapt dampener -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_arc_shrink_shift\fR (int) -+.ad -+.RS 12n -+log2(fraction of arc to reclaim) -+.sp -+Default value: \fB5\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_autoimport_disable\fR (int) -+.ad -+.RS 12n -+Disable pool import at module load -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dbuf_state_index\fR (int) -+.ad -+.RS 12n -+Calculate arc header index -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_deadman_enabled\fR (int) -+.ad -+.RS 12n -+Enable deadman timer -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_deadman_synctime_ms\fR (ulong) -+.ad -+.RS 12n -+Expiration time in milliseconds. This value has two meanings. First it is -+used to determine when the spa_deadman() logic should fire. By default the -+spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. -+Secondly, the value determines if an I/O is considered "hung". Any I/O that -+has not completed in zfs_deadman_synctime_ms is considered "hung" resulting -+in a zevent being logged. -+.sp -+Default value: \fB1,000,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dedup_prefetch\fR (int) -+.ad -+.RS 12n -+Enable prefetching dedup-ed blks -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_delay_min_dirty_percent\fR (int) -+.ad -+.RS 12n -+Start to delay each transaction once there is this amount of dirty data, -+expressed as a percentage of \fBzfs_dirty_data_max\fR. -+This value should be >= zfs_vdev_async_write_active_max_dirty_percent. -+See the section "ZFS TRANSACTION DELAY". -+.sp -+Default value: \fB60\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_delay_scale\fR (int) -+.ad -+.RS 12n -+This controls how quickly the transaction delay approaches infinity. -+Larger values cause longer delays for a given amount of dirty data. -+.sp -+For the smoothest delay, this value should be about 1 billion divided -+by the maximum number of operations per second. This will smoothly -+handle between 10x and 1/10th this number. -+.sp -+See the section "ZFS TRANSACTION DELAY". -+.sp -+Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64. -+.sp -+Default value: \fB500,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_max\fR (int) -+.ad -+.RS 12n -+Determines the dirty space limit in bytes. Once this limit is exceeded, new -+writes are halted until space frees up. This parameter takes precedence -+over \fBzfs_dirty_data_max_percent\fR. -+See the section "ZFS TRANSACTION DELAY". -+.sp -+Default value: 10 percent of all memory, capped at \fBzfs_dirty_data_max_max\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_max_max\fR (int) -+.ad -+.RS 12n -+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes. -+This limit is only enforced at module load time, and will be ignored if -+\fBzfs_dirty_data_max\fR is later changed. This parameter takes -+precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section -+"ZFS TRANSACTION DELAY". -+.sp -+Default value: 25% of physical RAM. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_max_max_percent\fR (int) -+.ad -+.RS 12n -+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a -+percentage of physical RAM. This limit is only enforced at module load -+time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed. -+The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this -+one. See the section "ZFS TRANSACTION DELAY". -+.sp -+Default value: 25 -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_max_percent\fR (int) -+.ad -+.RS 12n -+Determines the dirty space limit, expressed as a percentage of all -+memory. Once this limit is exceeded, new writes are halted until space frees -+up. The parameter \fBzfs_dirty_data_max\fR takes precedence over this -+one. See the section "ZFS TRANSACTION DELAY". -+.sp -+Default value: 10%, subject to \fBzfs_dirty_data_max_max\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_dirty_data_sync\fR (int) -+.ad -+.RS 12n -+Start syncing out a transaction group if there is at least this much dirty data. -+.sp -+Default value: \fB67,108,864\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_read_max_active\fR (int) -+.ad -+.RS 12n -+Maxium asynchronous read I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB3\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_read_min_active\fR (int) -+.ad -+.RS 12n -+Minimum asynchronous read I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB1\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_write_active_max_dirty_percent\fR (int) -+.ad -+.RS 12n -+When the pool has more than -+\fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use -+\fBzfs_vdev_async_write_max_active\fR to limit active async writes. If -+the dirty data is between min and max, the active I/O limit is linearly -+interpolated. See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB60\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_write_active_min_dirty_percent\fR (int) -+.ad -+.RS 12n -+When the pool has less than -+\fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use -+\fBzfs_vdev_async_write_min_active\fR to limit active async writes. If -+the dirty data is between min and max, the active I/O limit is linearly -+interpolated. See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB30\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_write_max_active\fR (int) -+.ad -+.RS 12n -+Maxium asynchronous write I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_async_write_min_active\fR (int) -+.ad -+.RS 12n -+Minimum asynchronous write I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB1\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_max_active\fR (int) -+.ad -+.RS 12n -+The maximum number of I/Os active to each device. Ideally, this will be >= -+the sum of each queue's max_active. It must be at least the sum of each -+queue's min_active. See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB1,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_scrub_max_active\fR (int) -+.ad -+.RS 12n -+Maxium scrub I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_scrub_min_active\fR (int) -+.ad -+.RS 12n -+Minimum scrub I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB1\fR. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_sync_read_max_active\fR (int) -+.ad -+.RS 12n -+Maxium synchronous read I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_sync_read_min_active\fR (int) -+.ad -+.RS 12n -+Minimum synchronous read I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_sync_write_max_active\fR (int) -+.ad -+.RS 12n -+Maxium synchronous write I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_sync_write_min_active\fR (int) -+.ad -+.RS 12n -+Minimum synchronous write I/Os active to each device. -+See the section "ZFS I/O SCHEDULER". -+.sp -+Default value: \fB10\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_disable_dup_eviction\fR (int) -+.ad -+.RS 12n -+Disable duplicate buffer eviction -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_expire_snapshot\fR (int) -+.ad -+.RS 12n -+Seconds to expire .zfs/snapshot -+.sp -+Default value: \fB300\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_flags\fR (int) -+.ad -+.RS 12n -+Set additional debugging flags -+.sp -+Default value: \fB1\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_free_min_time_ms\fR (int) -+.ad -+.RS 12n -+Min millisecs to free per txg -+.sp -+Default value: \fB1,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_immediate_write_sz\fR (long) -+.ad -+.RS 12n -+Largest data block to write to zil -+.sp -+Default value: \fB32,768\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_mdcomp_disable\fR (int) -+.ad -+.RS 12n -+Disable meta data compression -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_no_scrub_io\fR (int) -+.ad -+.RS 12n -+Set for no scrub I/O -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_no_scrub_prefetch\fR (int) -+.ad -+.RS 12n -+Set for no scrub prefetching -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_nocacheflush\fR (int) -+.ad -+.RS 12n -+Disable cache flushes -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_nopwrite_enabled\fR (int) -+.ad -+.RS 12n -+Enable NOP writes -+.sp -+Use \fB1\fR for yes (default) and \fB0\fR to disable. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_pd_blks_max\fR (int) -+.ad -+.RS 12n -+Max number of blocks to prefetch -+.sp -+Default value: \fB100\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_prefetch_disable\fR (int) -+.ad -+.RS 12n -+Disable all ZFS prefetching -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_read_chunk_size\fR (long) -+.ad -+.RS 12n -+Bytes to read per chunk -+.sp -+Default value: \fB1,048,576\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_read_history\fR (int) -+.ad -+.RS 12n -+Historic statistics for the last N reads -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_read_history_hits\fR (int) -+.ad -+.RS 12n -+Include cache hits in read history -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_recover\fR (int) -+.ad -+.RS 12n -+Set to attempt to recover from fatal errors. This should only be used as a -+last resort, as it typically results in leaked space, or worse. -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_resilver_delay\fR (int) -+.ad -+.RS 12n -+Number of ticks to delay resilver -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_resilver_min_time_ms\fR (int) -+.ad -+.RS 12n -+Min millisecs to resilver per txg -+.sp -+Default value: \fB3,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_scan_idle\fR (int) -+.ad -+.RS 12n -+Idle window in clock ticks -+.sp -+Default value: \fB50\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_scan_min_time_ms\fR (int) -+.ad -+.RS 12n -+Min millisecs to scrub per txg -+.sp -+Default value: \fB1,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_scrub_delay\fR (int) -+.ad -+.RS 12n -+Number of ticks to delay scrub -+.sp -+Default value: \fB4\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_send_corrupt_data\fR (int) -+.ad -+.RS 12n -+Allow to send corrupt data (ignore read/checksum errors when sending data) -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_sync_pass_deferred_free\fR (int) -+.ad -+.RS 12n -+Defer frees starting in this pass -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_sync_pass_dont_compress\fR (int) -+.ad -+.RS 12n -+Don't compress starting in this pass -+.sp -+Default value: \fB5\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_sync_pass_rewrite\fR (int) -+.ad -+.RS 12n -+Rewrite new bps starting in this pass -+.sp -+Default value: \fB2\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_top_maxinflight\fR (int) -+.ad -+.RS 12n -+Max I/Os per top-level -+.sp -+Default value: \fB32\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_txg_history\fR (int) -+.ad -+.RS 12n -+Historic statistics for the last N txgs -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_txg_timeout\fR (int) -+.ad -+.RS 12n -+Max seconds worth of delta per txg -+.sp -+Default value: \fB5\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_aggregation_limit\fR (int) -+.ad -+.RS 12n -+Max vdev I/O aggregation size -+.sp -+Default value: \fB131,072\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_cache_bshift\fR (int) -+.ad -+.RS 12n -+Shift size to inflate reads too -+.sp -+Default value: \fB16\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_cache_max\fR (int) -+.ad -+.RS 12n -+Inflate reads small than max -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_cache_size\fR (int) -+.ad -+.RS 12n -+Total size of the per-disk cache -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_mirror_switch_us\fR (int) -+.ad -+.RS 12n -+Switch mirrors every N usecs -+.sp -+Default value: \fB10,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_read_gap_limit\fR (int) -+.ad -+.RS 12n -+Aggregate read I/O over gap -+.sp -+Default value: \fB32,768\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_scheduler\fR (charp) -+.ad -+.RS 12n -+I/O scheduler -+.sp -+Default value: \fBnoop\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_vdev_write_gap_limit\fR (int) -+.ad -+.RS 12n -+Aggregate write I/O over gap -+.sp -+Default value: \fB4,096\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_zevent_cols\fR (int) -+.ad -+.RS 12n -+Max event column width -+.sp -+Default value: \fB80\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_zevent_console\fR (int) -+.ad -+.RS 12n -+Log events to the console -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzfs_zevent_len_max\fR (int) -+.ad -+.RS 12n -+Max event queue length -+.sp -+Default value: \fB0\fR. 
-+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzil_replay_disable\fR (int) -+.ad -+.RS 12n -+Disable intent logging replay -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzil_slog_limit\fR (ulong) -+.ad -+.RS 12n -+Max commit bytes to separate log device -+.sp -+Default value: \fB1,048,576\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzio_bulk_flags\fR (int) -+.ad -+.RS 12n -+Additional flags to pass to bulk buffers -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzio_delay_max\fR (int) -+.ad -+.RS 12n -+Max zio millisec delay before posting event -+.sp -+Default value: \fB30,000\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzio_injection_enabled\fR (int) -+.ad -+.RS 12n -+Enable fault injection -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzio_requeue_io_start_cut_in_line\fR (int) -+.ad -+.RS 12n -+Prioritize requeued I/O -+.sp -+Default value: \fB0\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzvol_inhibit_dev\fR (uint) -+.ad -+.RS 12n -+Do not create zvol device nodes -+.sp -+Use \fB1\fR for yes and \fB0\fR for no (default). -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzvol_major\fR (uint) -+.ad -+.RS 12n -+Major number for zvol device -+.sp -+Default value: \fB230\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzvol_max_discard_blocks\fR (ulong) -+.ad -+.RS 12n -+Max number of blocks to discard at once -+.sp -+Default value: \fB16,384\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fBzvol_threads\fR (uint) -+.ad -+.RS 12n -+Number of threads for zvol device -+.sp -+Default value: \fB32\fR. -+.RE -+ -+.SH ZFS I/O SCHEDULER -+ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os. -+The I/O scheduler determines when and in what order those operations are -+issued. The I/O scheduler divides operations into five I/O classes -+prioritized in the following order: sync read, sync write, async read, -+async write, and scrub/resilver. Each queue defines the minimum and -+maximum number of concurrent operations that may be issued to the -+device. In addition, the device has an aggregate maximum, -+\fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums -+must not exceed the aggregate maximum. If the sum of the per-queue -+maximums exceeds the aggregate maximum, then the number of active I/Os -+may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will -+be issued regardless of whether all per-queue minimums have been met. -+.sp -+For many physical devices, throughput increases with the number of -+concurrent operations, but latency typically suffers. Further, physical -+devices typically have a limit at which more concurrent operations have no -+effect on throughput or can actually cause it to decrease. -+.sp -+The scheduler selects the next operation to issue by first looking for an -+I/O class whose minimum has not been satisfied. Once all are satisfied and -+the aggregate maximum has not been hit, the scheduler looks for classes -+whose maximum has not been satisfied. Iteration through the I/O classes is -+done in the order specified above. No further operations are issued if the -+aggregate maximum number of concurrent operations has been hit or if there -+are no operations queued for an I/O class that has not hit its maximum. -+Every time an I/O is queued or an operation completes, the I/O scheduler -+looks for new operations to issue. -+.sp -+In general, smaller max_active's will lead to lower latency of synchronous -+operations. 
Larger max_active's may lead to higher overall throughput, -+depending on underlying storage. -+.sp -+The ratio of the queues' max_actives determines the balance of performance -+between reads, writes, and scrubs. E.g., increasing -+\fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete -+more quickly, but reads and writes to have higher latency and lower throughput. -+.sp -+All I/O classes have a fixed maximum number of outstanding operations -+except for the async write class. Asynchronous writes represent the data -+that is committed to stable storage during the syncing stage for -+transaction groups. Transaction groups enter the syncing state -+periodically so the number of queued async writes will quickly burst up -+and then bleed down to zero. Rather than servicing them as quickly as -+possible, the I/O scheduler changes the maximum number of active async -+write I/Os according to the amount of dirty data in the pool. Since -+both throughput and latency typically increase with the number of -+concurrent operations issued to physical devices, reducing the -+burstiness in the number of concurrent operations also stabilizes the -+response time of operations from other -- and in particular synchronous -+-- queues. In broad strokes, the I/O scheduler will issue more -+concurrent operations from the async write queue as there's more dirty -+data in the pool. -+.sp -+Async Writes -+.sp -+The number of concurrent operations issued for the async write I/O class -+follows a piece-wise linear function defined by a few adjustable points. -+.nf -+ -+ | o---------| <-- zfs_vdev_async_write_max_active -+ ^ | /^ | -+ | | / | | -+active | / | | -+ I/O | / | | -+count | / | | -+ | / | | -+ |-------o | | <-- zfs_vdev_async_write_min_active -+ 0|_______^______|_________| -+ 0% | | 100% of zfs_dirty_data_max -+ | | -+ | `-- zfs_vdev_async_write_active_max_dirty_percent -+ `--------- zfs_vdev_async_write_active_min_dirty_percent -+ -+.fi -+Until the amount of dirty data exceeds a minimum percentage of the dirty -+data allowed in the pool, the I/O scheduler will limit the number of -+concurrent operations to the minimum. As that threshold is crossed, the -+number of concurrent operations issued increases linearly to the maximum at -+the specified maximum percentage of the dirty data allowed in the pool. -+.sp -+Ideally, the amount of dirty data on a busy pool will stay in the sloped -+part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR -+and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the -+maximum percentage, this indicates that the rate of incoming data is -+greater than the rate that the backend storage can handle. In this case, we -+must further throttle incoming writes, as described in the next section. -+ -+.SH ZFS TRANSACTION DELAY -+We delay transactions when we've determined that the backend storage -+isn't able to accommodate the rate of incoming writes. -+.sp -+If there is already a transaction waiting, we delay relative to when -+that transaction will finish waiting. This way the calculated delay time -+is independent of the number of threads concurrently executing -+transactions. -+.sp -+If we are the only waiter, wait relative to when the transaction -+started, rather than the current time. This credits the transaction for -+"time already served", e.g. reading indirect blocks. 
-+.sp -+The minimum time for a transaction to take is calculated as: -+.nf -+ min_time = zfs_delay_scale * (dirty - min) / (max - dirty) -+ min_time is then capped at 100 milliseconds. -+.fi -+.sp -+The delay has two degrees of freedom that can be adjusted via tunables. The -+percentage of dirty data at which we start to delay is defined by -+\fBzfs_delay_min_dirty_percent\fR. This should typically be at or above -+\fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to -+delay after writing at full speed has failed to keep up with the incoming write -+rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking, -+this variable determines the amount of delay at the midpoint of the curve. -+.sp -+.nf -+delay -+ 10ms +-------------------------------------------------------------*+ -+ | *| -+ 9ms + *+ -+ | *| -+ 8ms + *+ -+ | * | -+ 7ms + * + -+ | * | -+ 6ms + * + -+ | * | -+ 5ms + * + -+ | * | -+ 4ms + * + -+ | * | -+ 3ms + * + -+ | * | -+ 2ms + (midpoint) * + -+ | | ** | -+ 1ms + v *** + -+ | zfs_delay_scale ----------> ******** | -+ 0 +-------------------------------------*********----------------+ -+ 0% <- zfs_dirty_data_max -> 100% -+.fi -+.sp -+Note that since the delay is added to the outstanding time remaining on the -+most recent transaction, the delay is effectively the inverse of IOPS. -+Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve -+was chosen such that small changes in the amount of accumulated dirty data -+in the first 3/4 of the curve yield relatively small differences in the -+amount of delay. -+.sp -+The effects can be easier to understand when the amount of delay is -+represented on a log scale: -+.sp -+.nf -+delay -+100ms +-------------------------------------------------------------++ -+ + + -+ | | -+ + *+ -+ 10ms + *+ -+ + ** + -+ | (midpoint) ** | -+ + | ** + -+ 1ms + v **** + -+ + zfs_delay_scale ----------> ***** + -+ | **** | -+ + **** + -+100us + ** + -+ + * + -+ | * | -+ + * + -+ 10us + * + -+ + + -+ | | -+ + + -+ +--------------------------------------------------------------+ -+ 0% <- zfs_dirty_data_max -> 100% -+.fi -+.sp -+Note here that only as the amount of dirty data approaches its limit does -+the delay start to increase rapidly. The goal of a properly tuned system -+should be to keep the amount of dirty data out of that range by first -+ensuring that the appropriate limits are set for the I/O scheduler to reach -+optimal throughput on the backend storage, and then by changing the value -+of \fBzfs_delay_scale\fR to increase the steepness of the curve. 
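
As a rough illustration of the delay curve just described, the standalone C sketch below evaluates the quoted formula for a few dirty-data levels, working in percent of zfs_dirty_data_max. Reading "min" as zfs_delay_min_dirty_percent of zfs_dirty_data_max, along with every name prefixed "example_", is an assumption made for this sketch; it is not the in-tree dmu_tx code.

    /*
     * Sketch of:  min_time = zfs_delay_scale * (dirty - min) / (max - dirty),
     * capped at 100 ms, expressed in percent of zfs_dirty_data_max.
     * Illustrative only; names and the percent-based form are assumptions.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define EX_DELAY_CAP_NS 100000000ULL    /* delay is capped at 100 ms */

    static uint64_t
    example_tx_delay_ns(int dirty_pct, int min_dirty_pct, uint64_t delay_scale)
    {
            uint64_t delay;

            if (dirty_pct <= min_dirty_pct)
                    return (0);
            if (dirty_pct >= 100)
                    return (EX_DELAY_CAP_NS);

            delay = delay_scale * (uint64_t)(dirty_pct - min_dirty_pct) /
                (uint64_t)(100 - dirty_pct);
            return (delay < EX_DELAY_CAP_NS ? delay : EX_DELAY_CAP_NS);
    }

    int
    main(void)
    {
            int pct;

            /* Defaults: zfs_delay_min_dirty_percent=60, zfs_delay_scale=500000. */
            for (pct = 55; pct <= 95; pct += 5)
                    printf("%3d%% of zfs_dirty_data_max -> %llu ns\n", pct,
                        (unsigned long long)example_tx_delay_ns(pct, 60, 500000));
            return (0);
    }

With the default tunables, the midpoint of the ramp (80% dirty when delaying starts at 60%) comes out to 500,000 ns, i.e. the 500us midpoint annotated on the curves above.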
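
Similarly, the piece-wise linear async-write scaling drawn in the "Async Writes" diagram above can be written down directly. The helper below is only a sketch: its parameters mirror the tunables (zfs_vdev_async_write_min_active/max_active and the two dirty-percent breakpoints), but the function itself is invented for illustration.

    /*
     * Sketch of the async-write scaling described above: below min_dirty_pct
     * use min_active, above max_dirty_pct use max_active, and interpolate
     * linearly in between.  Illustrative only.
     */
    static int
    example_async_write_max_active(int dirty_pct, /* % of zfs_dirty_data_max */
        int min_dirty_pct,  /* zfs_vdev_async_write_active_min_dirty_percent */
        int max_dirty_pct,  /* zfs_vdev_async_write_active_max_dirty_percent */
        int min_active,     /* zfs_vdev_async_write_min_active */
        int max_active)     /* zfs_vdev_async_write_max_active */
    {
            if (dirty_pct <= min_dirty_pct)
                    return (min_active);
            if (dirty_pct >= max_dirty_pct)
                    return (max_active);

            return (min_active + (max_active - min_active) *
                (dirty_pct - min_dirty_pct) / (max_dirty_pct - min_dirty_pct));
    }

With the defaults listed above (breakpoints 30% and 60%, limits 1 and 10), a pool sitting at 45% of zfs_dirty_data_max would be allowed 5 concurrent async writes, about half way up the ramp.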
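
Finally, the class-selection order from the ZFS I/O SCHEDULER section above (honor per-class minimums first, then per-class maximums, both in priority order and both subject to zfs_vdev_max_active) can be sketched as follows. The enum, struct, and bookkeeping are invented for the example; this is not the in-tree vdev_queue implementation.

    /*
     * Illustrative sketch of the I/O class selection order described above.
     * Names and queue-depth bookkeeping are invented for the example.
     */
    #include <stdio.h>

    enum example_zio_class {
            EX_SYNC_READ, EX_SYNC_WRITE, EX_ASYNC_READ, EX_ASYNC_WRITE,
            EX_SCRUB, EX_NUM_CLASSES
    };

    struct example_class_state {
            int min_active;     /* e.g. zfs_vdev_sync_read_min_active */
            int max_active;     /* e.g. zfs_vdev_sync_read_max_active */
            int active;         /* I/Os currently issued from this class */
            int queued;         /* I/Os waiting in this class */
    };

    /*
     * Return the class to issue from next, or -1 if nothing may be issued.
     * total_active and max_total model the zfs_vdev_max_active aggregate.
     */
    static int
    example_pick_class(struct example_class_state cls[EX_NUM_CLASSES],
        int total_active, int max_total)
    {
            int c;

            if (total_active >= max_total)
                    return (-1);

            /* First pass: any class below its minimum, in priority order. */
            for (c = 0; c < EX_NUM_CLASSES; c++)
                    if (cls[c].queued > 0 && cls[c].active < cls[c].min_active)
                            return (c);

            /* Second pass: any class below its maximum, in priority order. */
            for (c = 0; c < EX_NUM_CLASSES; c++)
                    if (cls[c].queued > 0 && cls[c].active < cls[c].max_active)
                            return (c);

            return (-1);
    }

    int
    main(void)
    {
            /* Defaults from the parameter list above: sync read 10/10,
             * sync write 10/10, async read 1/3, async write 1/10, scrub 1/2. */
            struct example_class_state cls[EX_NUM_CLASSES] = {
                    { 10, 10, 0, 4 },       /* sync read */
                    { 10, 10, 0, 0 },       /* sync write */
                    { 1, 3, 0, 8 },         /* async read */
                    { 1, 10, 0, 100 },      /* async write */
                    { 1, 2, 0, 50 },        /* scrub */
            };

            printf("next class to issue from: %d\n",
                example_pick_class(cls, 0, 1000));
            return (0);
    }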
-diff --git a/man/man8/.gitignore b/man/man8/.gitignore -new file mode 100644 -index 0000000..be7e904 ---- /dev/null -+++ b/man/man8/.gitignore -@@ -0,0 +1 @@ -+/zed.8 -diff --git a/man/man8/Makefile.am b/man/man8/Makefile.am -index be7bc1d..b89e34d 100644 ---- a/man/man8/Makefile.am -+++ b/man/man8/Makefile.am -@@ -1,2 +1,2 @@ --man_MANS = \ -+dist_man_MANS = \ - fsck.zfs.8 \ -@@ -10,3 +10,21 @@ man_MANS = \ - --EXTRA_DIST = $(man_MANS) -+nodist_man_MANS = \ -+ zed.8 -+ -+EXTRA_DIST = \ -+ zed.8.in -+ -+zed.8: $(srcdir)/zed.8.in -+ -+do_subst = $(SED) \ -+ -e 's|@libexecdir[@]|$(libexecdir)|g' \ -+ -e 's|@runstatedir[@]|$(runstatedir)|g' \ -+ -e 's|@sysconfdir[@]|$(sysconfdir)|g' -+ -+$(nodist_man_MANS): Makefile -+ $(RM) $@ $@.tmp -+ srcdir=''; \ -+ test -f ./$@.in || srcdir=$(srcdir)/; \ -+ $(do_subst) $${srcdir}$@.in >$@.tmp -+ mv $@.tmp $@ - -@@ -14 +32,4 @@ install-data-local: - $(INSTALL) -d -m 0755 "$(DESTDIR)$(mandir)/man8" -+ -+CLEANFILES = \ -+ $(nodist_man_MANS) -diff --git a/man/man8/fsck.zfs.8 b/man/man8/fsck.zfs.8 -index 08b4308..baa8c33 100644 ---- a/man/man8/fsck.zfs.8 -+++ b/man/man8/fsck.zfs.8 -@@ -27,3 +27,3 @@ - .SH NAME --.BR fsck.zfs " \- Dummy ZFS filesystem checker." -+fsck.zfs \- Dummy ZFS filesystem checker. - -diff --git a/man/man8/mount.zfs.8 b/man/man8/mount.zfs.8 -index 60c36fe..b4e2406 100644 ---- a/man/man8/mount.zfs.8 -+++ b/man/man8/mount.zfs.8 -@@ -77,2 +77,15 @@ Print the usage message. - .TP -+.BI "\-o context" -+This flag sets the SELinux context for all files in the filesytem -+under that mountpoint. -+.TP -+.BI "\-o fscontext" -+This flag sets the SELinux context for the filesytem being mounted. -+.TP -+.BI "\-o defcontext" -+This flag sets the SELinux context for unlabled files. -+.TP -+.BI "\-o rootcontext" -+This flag sets the SELinux context for the root inode of the filesystem. -+.TP - .BI "\-o legacy" -diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 -index 364cf30..6f4f40d 100644 ---- a/man/man8/zdb.8 -+++ b/man/man8/zdb.8 -@@ -135,2 +135,8 @@ the allocated (physically present on disk) and referenced (logically - referenced in the pool) block counts and sizes by reference count. -+.sp -+If specified a third time, display the statistics independently for each deduplication table. -+.sp -+If specified a fourth time, dump the contents of the deduplication tables describing duplicate blocks. -+.sp -+If specified a fifth time, also dump the contents of the deduplication tables describing unique blocks. - .RE -diff --git a/man/man8/zed.8.in b/man/man8/zed.8.in -new file mode 100644 -index 0000000..b853d86 ---- /dev/null -+++ b/man/man8/zed.8.in -@@ -0,0 +1,265 @@ -+.\" -+.\" CDDL HEADER START -+.\" -+.\" The contents of this file are subject to the terms of the -+.\" Common Development and Distribution License (the "License"). -+.\" You may not use this file except in compliance with the License. -+.\" -+.\" You can obtain a copy of the license from the top-level -+.\" OPENSOLARIS.LICENSE or . -+.\" See the License for the specific language governing permissions -+.\" and limitations under the License. -+.\" -+.\" When distributing Covered Code, include this CDDL HEADER in each file -+.\" and include the License file from the top-level OPENSOLARIS.LICENSE. 
-+.\" If applicable, add the following below this CDDL HEADER, with the -+.\" fields enclosed by brackets "[]" replaced with your own identifying -+.\" information: Portions Copyright [yyyy] [name of copyright owner] -+.\" -+.\" CDDL HEADER END -+.\" -+.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). -+.\" Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. -+.\" -+.TH ZED 8 "Octember 1, 2013" "ZFS on Linux" "System Administration Commands" -+ -+.SH NAME -+zed \- ZFS Event Daemon -+ -+.SH SYNOPSIS -+.HP -+.B zed -+.\" [\fB\-c\fR \fIconfigfile\fR] -+[\fB\-d\fR \fIscriptdir\fR] -+[\fB\-f\fR] -+[\fB\-F\fR] -+[\fB\-h\fR] -+[\fB\-L\fR] -+[\fB\-M\fR] -+[\fB\-p\fR \fIpidfile\fR] -+[\fB\-s\fR \fIstatefile\fR] -+[\fB\-v\fR] -+[\fB\-V\fR] -+[\fB\-Z\fR] -+ -+.SH DESCRIPTION -+.PP -+\fBzed\fR (ZFS Event Daemon) monitors events generated by the ZFS kernel -+module. When a ZFS event (zevent) is posted, \fBzed\fR will run any scripts -+that have been enabled for the corresponding zevent class. -+ -+.SH OPTIONS -+.TP -+.BI \-h -+Display a summary of the command-line options. -+.TP -+.BI \-L -+Display license information. -+.TP -+.BI \-V -+Display version information. -+.TP -+.BI \-v -+Be verbose. -+.TP -+.BI \-f -+Force the daemon to run if at all possible, disabling security checks and -+throwing caution to the wind. Not recommended for use in production. -+.TP -+.BI \-F -+Run the daemon in the foreground. -+.TP -+.BI \-M -+Lock all current and future pages in the virtual memory address space. -+This may help the daemon remain responsive when the system is under heavy -+memory pressure. -+.TP -+.BI \-Z -+Zero the daemon's state, thereby allowing zevents still within the kernel -+to be reprocessed. -+.\" .TP -+.\" .BI \-c\ configfile -+.\" Read the configuration from the specified file. -+.TP -+.BI \-d\ scriptdir -+Read the enabled scripts from the specified directory. -+.TP -+.BI \-p\ pidfile -+Write the daemon's process ID to the specified file. -+.TP -+.BI \-s\ statefile -+Write the daemon's state to the specified file. -+ -+.SH ZEVENTS -+.PP -+A zevent is comprised of a list of name/value pairs (nvpairs). Each zevent -+contains an EID (Event IDentifier) that uniquely identifies it throughout -+the lifetime of the loaded ZFS kernel module; this EID is a monotonically -+increasing integer that resets to 1 each time the kernel module is loaded. -+Each zevent also contains a class string that identifies the type of event. -+For brevity, a subclass string is defined that omits the leading components -+of the class string. Additional nvpairs exist to provide event details. -+.PP -+The kernel maintains a list of recent zevents that can be viewed (along with -+their associated lists of nvpairs) using the "\fBzpool events \-v\fR" command. -+ -+.SH CONFIGURATION -+.PP -+The scripts to be invoked in response to zevents are located in the -+enabled-scripts directory. These can be symlinked or copied from the -+installed-scripts directory; symlinks allow for automatic updates from the -+installed scripts, whereas copies preserve local modifications. As a security -+measure, scripts must be owned by root. They must have execute permissions -+for the user, but they must not have write permissions for group or other. -+Dotfiles are ignored. -+.PP -+Scripts are named after the zevent class for which they should be invoked. 
-+In particular, a script will be invoked for a given zevent if either its -+class or subclass string is a prefix of its filename (and is followed by -+a non-alphabetic character). As a special case, the prefix "all" matches -+all zevents. Multiple scripts may be invoked for a given zevent. -+ -+.SH SCRIPTS -+.PP -+Scripts should be written under the presumption they can be invoked -+concurrently, and they should use appropriate locking to access any shared -+resources. Common variables used by the scripts can be stored in the default -+rc file which is sourced by the scripts; these variables should be prefixed -+with "ZED_". -+.PP -+The zevent nvpairs are passed to the scripts as environment variables. -+Each nvpair name is converted to an environment variable in the following -+manner: 1) it is prefixed with "ZEVENT_", 2) it is converted to uppercase, -+and 3) each non-alphanumeric character is converted to an underscore. -+Some additional environment variables have been defined to present certain -+nvpair values in a more convenient form. An incomplete list of zevent -+environment variables is as follows: -+.TP -+.B -+ZEVENT_EID -+The Event IDentifier. -+.TP -+.B -+ZEVENT_CLASS -+The zevent class string. -+.TP -+.B -+ZEVENT_SUBCLASS -+The zevent subclass string. -+.TP -+.B -+ZEVENT_TIME -+The time at which the zevent was posted as -+"\fIseconds\fR\ \fInanoseconds\fR" since the Epoch. -+.TP -+.B -+ZEVENT_TIME_SECS -+The \fIseconds\fR component of ZEVENT_TIME. -+.TP -+.B -+ZEVENT_TIME_NSECS -+The \fInanoseconds\fR component of ZEVENT_TIME. -+.TP -+.B -+ZEVENT_TIME_STRING -+An almost-RFC3339-compliant string for ZEVENT_TIME. -+.PP -+Additionally, the following ZED & ZFS variables are defined: -+.TP -+.B -+ZED_PID -+The daemon's process ID. -+.TP -+.B -+ZED_SCRIPT_DIR -+The daemon's current enabled-scripts directory. -+.TP -+.B -+ZFS_ALIAS -+The ZFS alias (name-ver-rel) string used to build the daemon. -+.TP -+.B -+ZFS_VERSION -+The ZFS version used to build the daemon. -+.TP -+.B -+ZFS_RELEASE -+The ZFS release used to build the daemon. -+.PP -+Scripts may need to call other ZFS commands. The installation paths of -+the following executables are defined: \fBZDB\fR, \fBZED\fR, \fBZFS\fR, -+\fBZINJECT\fR, and \fBZPOOL\fR. These variables can be overridden in the -+zed.rc if needed. -+ -+.SH FILES -+.\" .TP -+.\" @sysconfdir@/zfs/zed.conf -+.\" The default configuration file for the daemon. -+.TP -+.I @sysconfdir@/zfs/zed.d -+The default directory for enabled scripts. -+.TP -+.I @sysconfdir@/zfs/zed.d/zed.rc -+The default rc file for common variables used by the scripts. -+.TP -+.I @libexecdir@/zfs/zed.d -+The default directory for installed scripts. -+.TP -+.I @runstatedir@/zed.pid -+The default file containing the daemon's process ID. -+.TP -+.I @runstatedir@/zed.state -+The default file containing the daemon's state. -+ -+.SH SIGNALS -+.TP -+.B HUP -+Reconfigure the daemon and rescan the directory for enabled scripts. -+.TP -+.B TERM -+Terminate the daemon. -+ -+.SH NOTES -+.PP -+\fBzed\fR requires root privileges. -+.\" Do not taunt zed. -+ -+.SH BUGS -+.PP -+Events are processed synchronously by a single thread. This can delay the -+processing of simultaneous zevents. -+.PP -+There is no maximum timeout for script execution. Consequently, a misbehaving -+script can delay the processing of subsequent zevents. -+.PP -+The ownership and permissions of the enabled-scripts directory (along -+with all parent directories) are not checked. 
If any of these directories -+are improperly owned or permissioned, an unprivileged user could insert a -+script to be executed as root. The requirement that scripts be owned by -+root mitigates this to some extent. -+.PP -+Scripts are unable to return state/status information to the kernel. -+.PP -+Some zevent nvpair types are not handled. These are denoted by zevent -+environment variables having a "_NOT_IMPLEMENTED_" value. -+.PP -+Internationalization support via gettext has not been added. -+.PP -+The configuration file is not yet implemented. -+.PP -+The diagnosis engine is not yet implemented. -+ -+.SH COPYRIGHT -+.PP -+Developed at Lawrence Livermore National Laboratory (LLNL\-CODE\-403049). -+.br -+Copyright (C) 2013\-2014 Lawrence Livermore National Security, LLC. -+ -+.SH LICENSE -+.PP -+\fBzed\fR (ZFS Event Daemon) is distributed under the terms of the -+Common Development and Distribution License (CDDL\-1.0). -+ -+.SH SEE ALSO -+.BR zfs (8), -+.BR zpool (8) -diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 -index a0089e6..a45f640 100644 ---- a/man/man8/zfs.8 -+++ b/man/man8/zfs.8 -@@ -29,3 +29,3 @@ - .\" --.TH zfs 8 "Jan 10, 2013" "ZFS pool 28, filesystem 5" "System Administration Commands" -+.TH zfs 8 "Nov 19, 2013" "ZFS pool 28, filesystem 5" "System Administration Commands" - .SH NAME -@@ -60,4 +60,4 @@ zfs \- configures ZFS file systems - .nf --\fBzfs\fR \fBsnapshot | snap\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR]... -- \fIfilesystem@snapname\fR|\fIvolume@snapname\fR -+\fBzfs\fR \fBsnapshot | snap\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... -+ \fIfilesystem@snapname\fR|\fIvolume@snapname\fR ... - .fi -@@ -97,4 +97,4 @@ zfs \- configures ZFS file systems - .nf --\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-H\fR][\fB-o\fR \fIproperty\fR[,...]] [\fB-t\fR \fItype\fR[,...]] -- [\fB-s\fR \fIproperty\fR] ... [\fB-S\fR \fIproperty\fR] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fIsnap\fR] ... -+\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIproperty\fR[,\fIproperty\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]..] -+ [\fB-s\fR \fIproperty\fR] ... [\fB-S\fR \fIproperty\fR] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR] ... - .fi -@@ -246,3 +246,2 @@ zfs \- configures ZFS file systems - .SH DESCRIPTION --.sp - .LP -@@ -297,3 +296,2 @@ A read-only version of a file system or volume at a given point in time. It is s - .SS "ZFS File System Hierarchy" --.sp - .LP -@@ -307,3 +305,2 @@ See \fBzpool\fR(8) for more information on creating and administering pools. - .SS "Snapshots" --.sp - .LP -@@ -312,3 +309,3 @@ A snapshot is a read-only copy of a file system or volume. Snapshots can be crea - .LP --Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, but cannot be accessed independently. -+Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back. Visibility is determined by the \fBsnapdev\fR property of the parent volume. 
- .sp -@@ -317,3 +314,2 @@ File system snapshots can be accessed under the \fB\&.zfs/snapshot\fR directory - .SS "Clones" --.sp - .LP -@@ -327,3 +323,2 @@ The clone parent-child dependency relationship can be reversed by using the \fBp - .SS "Mount Points" --.sp - .LP -@@ -343,3 +338,2 @@ If needed, \fBZFS\fR file systems can also be managed with traditional tools (\f - .SS "Deduplication" --.sp - .LP -@@ -347,3 +341,2 @@ Deduplication is the process for removing redundant data at the block-level, red - .SS "Native Properties" --.sp - .LP -@@ -433,2 +426,36 @@ This property is \fBon\fR if the snapshot has been marked for deferred destructi - .na -+\fB\fBlogicalreferenced\fR\fR -+.ad -+.sp .6 -+.RS 4n -+The amount of space that is "logically" accessible by this dataset. See -+the \fBreferenced\fR property. The logical space ignores the effect of -+the \fBcompression\fR and \fBcopies\fR properties, giving a quantity -+closer to the amount of data that applications see. However, it does -+include space consumed by metadata. -+.sp -+This property can also be referred to by its shortened column name, -+\fBlrefer\fR. -+.RE -+ -+.sp -+.ne 2 -+.na -+\fB\fBlogicalused\fR\fR -+.ad -+.sp .6 -+.RS 4n -+The amount of space that is "logically" consumed by this dataset and all -+its descendents. See the \fBused\fR property. The logical space -+ignores the effect of the \fBcompression\fR and \fBcopies\fR properties, -+giving a quantity closer to the amount of data that applications see. -+However, it does include space consumed by metadata. -+.sp -+This property can also be referred to by its shortened column name, -+\fBlused\fR. -+.RE -+ -+.sp -+.ne 2 -+.na - \fB\fBmounted\fR\fR -@@ -678,2 +705,4 @@ Controls how \fBACL\fR entries are inherited when files and directories are crea - When the property value is set to \fBpassthrough\fR, files are created with a mode determined by the inheritable \fBACE\fRs. If no inheritable \fBACE\fRs exist that affect the mode, then the mode is set in accordance to the requested mode from the application. -+.sp -+The \fBaclinherit\fR property does not apply to Posix ACLs. - .RE -@@ -684,3 +713,3 @@ When the property value is set to \fBpassthrough\fR, files are created with a mo - .na --\fB\fBaclmode\fR=\fBdiscard\fR | \fBgroupmask\fR | \fBpassthrough\fR\fR -+\fB\fBacltype\fR=\fBnoacl\fR | \fBposixacl\fR \fR - .ad -@@ -688,3 +717,15 @@ When the property value is set to \fBpassthrough\fR, files are created with a mo - .RS 4n --Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with an \fBaclmode\fR property of \fBdiscard\fR deletes all \fBACL\fR entries that do not represent the mode of the file. An \fBaclmode\fR property of \fBgroupmask\fR (the default) reduces user or group permissions. The permissions are reduced, such that they are no greater than the group permission bits, unless it is a user entry that has the same \fBUID\fR as the owner of the file or directory. In this case, the \fBACL\fR permissions are reduced so that they are no greater than owner permission bits. A file system with an \fBaclmode\fR property of \fBpassthrough\fR indicates that no changes are made to the \fBACL\fR other than generating the necessary \fBACL\fR entries to represent the new mode of the file or directory. -+Controls whether ACLs are enabled and if so what type of ACL to use. When -+a file system has the \fBacltype\fR property set to \fBnoacl\fR (the default) -+then ACLs are disabled. 
Setting the \fBacltype\fR property to \fBposixacl\fR -+indicates Posix ACLs should be used. Posix ACLs are specific to Linux and -+are not functional on other platforms. Posix ACLs are stored as an xattr and -+therefore will not overwrite any existing ZFS/NFSv4 ACLs which may be set. -+Currently only \fBposixacls\fR are supported on Linux. -+.sp -+To obtain the best performance when setting \fBposixacl\fR users are strongly -+encouraged to set the \fBxattr=sa\fR property. This will result in the -+Posix ACL being stored more efficiently on disk. But as a consequence of this -+all new xattrs will only be accessable from ZFS implementations which support -+the \fBxattr=sa\fR property. See the \fBxattr\fR property for more details. - .RE -@@ -699,3 +740,3 @@ Controls how an \fBACL\fR is modified during \fBchmod\fR(2). A file system with - .RS 4n --Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value is \fBon\fR. -+Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value is \fBon\fR. See also \fBrelatime\fR below. - .RE -@@ -986,2 +1027,13 @@ This property can also be referred to by its shortened column name, \fBrefreserv - .na -+\fB\fBrelatime\fR=\fBon\fR | \fBoff\fR\fR -+.ad -+.sp .6 -+.RS 4n -+Controls the manner in which the access time is updated when \fBatime=on\fR is set. Turning this property \fBon\fR causes the access time to be updated relative to the modify or change time. Access time is only updated if the previous access time was earlier than the current modify or change time or if the existing access time hasn't been updated within the past 24 hours. The default value is \fBoff\fR. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na - \fB\fBreservation\fR=\fIsize\fR | \fBnone\fR\fR -@@ -1178,3 +1230,3 @@ Controls whether regular files should be scanned for viruses when a file is open - .na --\fB\fBxattr\fR=\fBon\fR | \fBoff\fR\fR -+\fB\fBxattr\fR=\fBon\fR | \fBoff\fR | \fBsa\fR\fR - .ad -@@ -1182,3 +1234,25 @@ Controls whether regular files should be scanned for viruses when a file is open - .RS 4n --Controls whether extended attributes are enabled for this file system. The default value is \fBon\fR. -+Controls whether extended attributes are enabled for this file system. Two -+styles of extended attributes are supported either directory based or system -+attribute based. -+.sp -+The default value of \fBon\fR enables directory based extended attributes. -+This style of xattr imposes no practical limit on either the size or number of -+xattrs which may be set on a file. Although under Linux the \fBgetxattr\fR(2) -+and \fBsetxattr\fR(2) system calls limit the maximum xattr size to 64K. This -+is the most compatible style of xattr and it is supported by the majority of -+ZFS implementations. -+.sp -+System attribute based xattrs may be enabled by setting the value to \fBsa\fR. -+The key advantage of this type of xattr is improved performance. Storing -+xattrs as system attributes significantly decreases the amount of disk IO -+required. Up to 64K of xattr data may be stored per file in the space reserved -+for system attributes. 
If there is not enough space available for an xattr then -+it will be automatically written as a directory based xattr. System attribute -+based xattrs are not accessable on platforms which do not support the -+\fBxattr=sa\fR feature. -+.sp -+The use of system attribute based xattrs is strongly encouraged for users of -+SELinux or Posix ACLs. Both of these features heavily rely of xattrs and -+benefit significantly from the reduced xattr access time. - .RE -@@ -1237,4 +1311,49 @@ Indicates whether the file system should reject file names that include characte - The \fBcasesensitivity\fR, \fBnormalization\fR, and \fButf8only\fR properties are also new permissions that can be assigned to non-privileged users by using the \fBZFS\fR delegated administration feature. --.SS "Temporary Mount Point Properties" -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na -+\fB\fBcontext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR -+.ad -+.sp .6 -+.RS 4n -+This flag sets the SELinux context for all files in the filesytem under the mountpoint for that filesystem. See \fBselinux\fR(8) for more information. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na -+\fB\fBfscontext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR -+.ad -+.sp .6 -+.RS 4n -+This flag sets the SELinux context for the filesytem being mounted. See \fBselinux\fR(8) for more information. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na -+\fB\fBdefntext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR -+.ad -+.sp .6 -+.RS 4n -+This flag sets the SELinux context for unlabeled files. See \fBselinux\fR(8) for more information. -+.RE -+ - .sp -+.ne 2 -+.mk -+.na -+\fB\fBrootcontext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR -+.ad -+.sp .6 -+.RS 4n -+This flag sets the SELinux context for the root inode of the filesystem. See \fBselinux\fR(8) for more information. -+.RE -+ -+.SS "Temporary Mount Point Properties" - .LP -@@ -1258,3 +1377,2 @@ In addition, these options can be set on a per-mount basis using the \fB-o\fR op - .SS "User Properties" --.sp - .LP -@@ -1271,3 +1389,2 @@ The values of user properties are arbitrary strings, are always inherited, and a - .SS "ZFS Volumes as Swap" --.sp - .LP -@@ -1278,3 +1395,2 @@ with the \fBzfs create\fR command set up and enable the swap area using the - .SH SUBCOMMANDS --.sp - .LP -@@ -1513,3 +1629,5 @@ Destroy (or mark for deferred destruction) all snapshots with this name in desce - .RS 4n --Recursively destroy all dependents. -+Recursively destroy all clones of these snapshots, including the clones, -+snapshots, and children. If this flag is specified, the \fB-d\fR flag will -+have no effect. - .RE -@@ -1549,3 +1667,3 @@ Print verbose information about the deleted data. - .sp --Extreme care should be taken when applying either the \fB-r\fR or the \fB-f\fR -+Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR - options, as they can destroy large portions of a pool and cause unexpected -@@ -1560,3 +1678,3 @@ behavior for mounted file systems in use. - .na --\fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR -+\fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR ... - .ad -@@ -1564,3 +1682,3 @@ behavior for mounted file systems in use. - .RS 4n --Creates a snapshot with the given name. All previous modifications by successful system calls to the file system are part of the snapshot. 
See the "Snapshots" section for details. -+Creates snapshots with the given names. All previous modifications by successful system calls to the file system are part of the snapshots. Snapshots are taken atomically, so that all snapshots correspond to the same moment in time. See the "Snapshots" section for details. - .sp -@@ -1573,3 +1691,3 @@ Creates a snapshot with the given name. All previous modifications by successful - .RS 4n --Recursively create snapshots of all descendent datasets. Snapshots are taken atomically, so that all recursive snapshots correspond to the same moment in time. -+Recursively create snapshots of all descendent datasets. - .RE -@@ -1736,3 +1854,3 @@ Recursively rename the snapshots of all descendent datasets. Snapshots are the o - .na --\fB\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-H\fR] [\fB-o\fR \fIproperty\fR[,\fI\&...\fR]] [ \fB-t\fR \fItype\fR[,\fI\&...\fR]] [ \fB-s\fR \fIproperty\fR ] ... [ \fB-S\fR \fIproperty\fR ] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR|\fIsnap\fR] ...\fR -+\fB\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-Hp\fR] [\fB-o\fR \fIproperty\fR[,\fI\&...\fR]] [ \fB-t\fR \fItype\fR[,\fI\&...\fR]] [ \fB-s\fR \fIproperty\fR ] ... [ \fB-S\fR \fIproperty\fR ] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR] ...\fR - .ad -@@ -1740,3 +1858,3 @@ Recursively rename the snapshots of all descendent datasets. Snapshots are the o - .RS 4n --Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all file systems and volumes are displayed. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the default is \fBoff\fR) . The following fields are displayed, \fBname,used,available,referenced,mountpoint\fR. -+Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all file systems and volumes are displayed. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the default is \fBoff\fR). When listing hundreds or thousands of snapshots performance can be improved by restricting the output to only the name. In that case, it is recommended to use \fB-o name -s name\fR. The following fields are displayed by default, \fBname,used,available,referenced,mountpoint\fR. - .sp -@@ -1756,2 +1874,12 @@ Used for scripting mode. Do not print headers and separate fields by a single ta - .na -+\fB\fB-p\fR\fR -+.sp .6 -+.RS 4n -+Display numbers in parsable (exact) values. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na - \fB\fB-r\fR\fR -@@ -1865,3 +1993,3 @@ Same as the \fB-s\fR option, but sorts by property in descending order. - .RS 4n --A comma-separated list of types to display, where \fItype\fR is one of \fBfilesystem\fR, \fBsnapshot\fR , \fBvolume\fR, or \fBall\fR. For example, specifying \fB-t snapshot\fR displays only snapshots. -+A comma-separated list of types to display, where \fItype\fR is one of \fBfilesystem\fR, \fBsnapshot\fR, \fBsnap\fR, \fBvolume\fR, or \fBall\fR. For example, specifying \fB-t snapshot\fR displays only snapshots. - .RE -@@ -1883,4 +2011,3 @@ Sets the property to the given value for each dataset. Only some properties can - .ne 2 --.mk --.na -+.mk .na - \fB\fBzfs get\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-Hp\fR] [\fB-o\fR \fIfield\fR[,...] [\fB-t\fR \fItype\fR[,...]] [\fB-s\fR \fIsource\fR[,...] "\fIall\fR" | \fIproperty\fR[,...] 
\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR -@@ -1968,3 +2095,3 @@ A comma-separated list of sources to display. Those properties coming from a sou - .RS 4n --Display numbers in parseable (exact) values. -+Display numbers in parsable (exact) values. - .RE -@@ -2674,4 +2801,4 @@ userused other Allows reading any userused@... property - -+acltype property - aclinherit property --aclmode property - atime property -@@ -2908,3 +3035,3 @@ F Regular file - .RS 4n --Give more parseable tab-separated output, without header lines and without arrows. -+Give more parsable tab-separated output, without header lines and without arrows. - .RE -@@ -3046,3 +3173,3 @@ pool/home/bob zoned off default - pool/home/bob snapdir hidden default --pool/home/bob aclmode groupmask default -+pool/home/bob acltype off default - pool/home/bob aclinherit restricted default -@@ -3070,2 +3197,3 @@ pool/home/bob dedup off default - pool/home/bob mlslabel none default -+pool/home/bob relatime off default - .fi -@@ -3421,3 +3549,2 @@ M F /tank/test/modified - .SH EXIT STATUS --.sp - .LP -@@ -3458,3 +3585,2 @@ Invalid command line options were specified. - .SH SEE ALSO --.sp - .LP -diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 -index b4b0f46..2cfa855 100644 ---- a/man/man8/zpool.8 -+++ b/man/man8/zpool.8 -@@ -64,3 +64,3 @@ zpool \- configures ZFS storage pools - .nf --\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ... -+\fBzpool get\fR [\fB-p\fR] "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ... - .fi -@@ -86,3 +86,3 @@ zpool \- configures ZFS storage pools - \fBzpool import\fR [\fB-o \fImntopts\fR\fR] [\fB-o\fR \fIproperty=value\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] -- [\fB-D\fR] [\fB-f\fR] [\fB-m\fR] [\fB-R\fR \fIroot\fR] [\fB-F\fR [\fB-n\fR]] \fIpool\fR |\fIid\fR [\fInewpool\fR] -+ [\fB-D\fR] [\fB-f\fR] [\fB-m\fR] [\fB-R\fR \fIroot\fR] [\fB-F\fR [\fB-n\fR]] [\fB-t\fR]] \fIpool\fR |\fIid\fR [\fInewpool\fR] - .fi -@@ -91,3 +91,3 @@ zpool \- configures ZFS storage pools - .nf --\fBzpool iostat\fR [\fB-T\fR u | d ] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]] -+\fBzpool iostat\fR [\fB-T\fR d | u ] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]] - .fi -@@ -101,3 +101,4 @@ zpool \- configures ZFS storage pools - .nf --\fBzpool list\fR [\fB-T\fR u | d ] [\fB-Hv\fR] [\fB-o\fR \fIproperty\fR[,...]] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]] -+\fBzpool list\fR [\fB-T\fR d | u ] [\fB-Hv\fR] [\fB-o\fR \fIproperty\fR[,...]] [\fIpool\fR] ... -+ [\fIinterval\fR[\fIcount\fR]] - .fi -@@ -151,3 +152,3 @@ zpool \- configures ZFS storage pools - .nf --\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ... -+\fBzpool status\fR [\fB-xvD\fR] [\fB-T\fR d | u] [\fIpool\fR] ... [\fIinterval\fR [\fIcount\fR]] - .fi -@@ -682,2 +683,13 @@ A text string consisting of printable ASCII characters that will be stored such - .na -+\fB\fBdedupditto\fR=\fB\fInumber\fR\fR -+.ad -+.sp .6 -+.RS 4n -+Threshold for the number of block ditto copies. If the reference count for a deduplicated block increases above this number, a new ditto copy of this block is automatically stored. The default setting is 0 which causes no ditto copies to be created for deduplicated blocks. The miniumum legal nonzero setting is 100. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na - \fB\fBdelegation\fR=\fBon\fR | \fBoff\fR\fR -@@ -1003,3 +1015,3 @@ Forces any active datasets contained within the pool to be unmounted. - .RS 4n --Detaches \fIdevice\fR from a mirror. 
The operation is refused if there are no other valid replicas of the data. -+Detaches \fIdevice\fR from a mirror. The operation is refused if there are no other valid replicas of the data. If \fIdevice\fR may be re-added to the pool later on then consider the "\fBzpool offline\fR" command instead. - .RE -@@ -1038,3 +1050,3 @@ This command will forcefully export the pool even if it has a shared spare that - .na --\fB\fBzpool get\fR "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...\fR -+\fB\fBzpool get\fR [\fB-p\fR] "\fIall\fR" | \fIproperty\fR[,...] \fIpool\fR ...\fR - .ad -@@ -1055,2 +1067,13 @@ Retrieves the given list of properties (or all properties if "\fBall\fR" is used - See the "Properties" section for more information on the available pool properties. -+.sp -+.ne 2 -+.mk -+.na -+\fB\fB-p\fR\fR -+.ad -+.RS 6n -+.rt -+Display numbers in parseable (exact) values. -+.RE -+ - .RE -@@ -1279,3 +1302,3 @@ Used with the \fB-F\fR recovery option. Determines whether a non-importable pool - .na --\fB\fBzpool import\fR [\fB-o\fR \fImntopts\fR] [ \fB-o\fR \fIproperty\fR=\fIvalue\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] [\fB-D\fR] [\fB-f\fR] [\fB-m\fR] [\fB-R\fR \fIroot\fR] [\fB-F\fR [\fB-n\fR]] \fIpool\fR | \fIid\fR [\fInewpool\fR]\fR -+\fB\fBzpool import\fR [\fB-o\fR \fImntopts\fR] [ \fB-o\fR \fIproperty\fR=\fIvalue\fR] ... [\fB-d\fR \fIdir\fR | \fB-c\fR \fIcachefile\fR] [\fB-D\fR] [\fB-f\fR] [\fB-m\fR] [\fB-R\fR \fIroot\fR] [\fB-F\fR [\fB-n\fR]] [\fB-t\fR]] \fIpool\fR | \fIid\fR [\fInewpool\fR]\fR - .ad -@@ -1389,2 +1412,13 @@ Used with the \fB-F\fR recovery option. Determines whether a non-importable pool - .na -+\fB\fB-t\fR\fR -+.ad -+.sp .6 -+.RS 4n -+Used with "\fBnewpool\fR". Specifies that "\fBnewpool\fR" is temporary. Temporary pool names last until export. Ensures that the original pool name will be used in all label updates and therefore is retained upon export. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na - \fB\fB-m\fR\fR -@@ -1402,3 +1436,3 @@ Allows a pool to import when there is a missing log device. - .na --\fB\fBzpool iostat\fR [\fB-T\fR \fBu\fR | \fBd\fR] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR -+\fB\fBzpool iostat\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-v\fR] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR - .ad -@@ -1459,3 +1493,3 @@ Treat exported or foreign devices as inactive. - .na --\fB\fBzpool list\fR [\fB-T\fR \fBu\fR | \fBd\fR] [\fB-Hv\fR] [\fB-o\fR \fIprops\fR[,...]] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR -+\fB\fBzpool list\fR [\fB-T\fR \fBd\fR | \fBu\fR] [\fB-Hv\fR] [\fB-o\fR \fIprops\fR[,...]] [\fIpool\fR] ... [\fIinterval\fR[\fIcount\fR]]\fR - .ad -@@ -1478,3 +1512,3 @@ Scripted mode. Do not display headers, and separate fields by a single tab inste - .na --\fB\fB-T\fR \fBu\fR | \fBd\fR\fR -+\fB\fB-T\fR \fBd\fR | \fBu\fR\fR - .ad -@@ -1703,3 +1737,3 @@ Sets the specified property for \fInewpool\fR. See the “Properties” section - .na --\fB\fBzpool status\fR [\fB-xv\fR] [\fIpool\fR] ...\fR -+\fBzpool status\fR [\fB-xvD\fR] [\fB-T\fR d | u] [\fIpool\fR] ... 
[\fIinterval\fR [\fIcount\fR]] - .ad -@@ -1716,3 +1750,3 @@ If a scrub or resilver is in progress, this command reports the percentage done - .ad --.RS 6n -+.RS 12n - .rt -@@ -1727,3 +1761,3 @@ Only display status for pools that are exhibiting errors or are otherwise unavai - .ad --.RS 6n -+.RS 12n - .rt -@@ -1732,2 +1766,25 @@ Displays verbose data error information, printing out a complete list of all dat - -+.sp -+.ne 2 -+.mk -+.na -+\fB\fB-D\fR\fR -+.ad -+.RS 12n -+.rt -+Display a histogram of deduplication statistics, showing the allocated (physically present on disk) and -+referenced (logically referenced in the pool) block counts and sizes by reference count. -+.RE -+ -+.sp -+.ne 2 -+.mk -+.na -+\fB\fB-T\fR \fBd\fR | \fBu\fR\fR -+.ad -+.RS 12n -+.rt -+Display a time stamp. -+.sp -+Specify \fBu\fR for a printed representation of the internal representation of time. See \fBtime\fR(2). Specify \fBd\fR for standard date format. See \fBdate\fR(1). - .RE -diff --git a/module/avl/avl.c b/module/avl/avl.c -index e000647..b598cc9 100644 ---- a/module/avl/avl.c -+++ b/module/avl/avl.c -@@ -1042,2 +1042,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - -diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c -index 1758371..a91b952 100644 ---- a/module/nvpair/fnvpair.c -+++ b/module/nvpair/fnvpair.c -@@ -28,2 +28,3 @@ - #include -+#include - #ifndef _KERNEL -@@ -116,2 +117,14 @@ fnvlist_merge(nvlist_t *dst, nvlist_t *src) - -+size_t -+fnvlist_num_pairs(nvlist_t *nvl) -+{ -+ size_t count = 0; -+ nvpair_t *pair; -+ -+ for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL; -+ pair = nvlist_next_nvpair(nvl, pair)) -+ count++; -+ return (count); -+} -+ - void -@@ -503,2 +516,3 @@ EXPORT_SYMBOL(fnvlist_size); - EXPORT_SYMBOL(fnvlist_pack); -+EXPORT_SYMBOL(fnvlist_pack_free); - EXPORT_SYMBOL(fnvlist_unpack); -@@ -564,2 +578,3 @@ EXPORT_SYMBOL(fnvpair_value_string); - EXPORT_SYMBOL(fnvpair_value_nvlist); -+EXPORT_SYMBOL(fnvlist_num_pairs); - -diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c -index 36f4e4d..f5c3166 100644 ---- a/module/nvpair/nvpair.c -+++ b/module/nvpair/nvpair.c -@@ -264,11 +264,5 @@ nvlist_nvflag(nvlist_t *nvl) - --/* -- * nvlist_alloc - Allocate nvlist. -- */ --/*ARGSUSED1*/ --int --nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) -+static nv_alloc_t * -+nvlist_nv_alloc(int kmflag) - { -- nv_alloc_t *nva = nv_alloc_nosleep; -- - #if defined(_KERNEL) && !defined(_BOOT) -@@ -276,16 +270,20 @@ nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) - case KM_SLEEP: -- nva = nv_alloc_sleep; -- break; -+ return (nv_alloc_sleep); - case KM_PUSHPAGE: -- nva = nv_alloc_pushpage; -- break; -- case KM_NOSLEEP: -- nva = nv_alloc_nosleep; -- break; -+ return (nv_alloc_pushpage); - default: -- return (EINVAL); -+ return (nv_alloc_nosleep); - } --#endif -+#else -+ return (nv_alloc_nosleep); -+#endif /* _KERNEL && !_BOOT */ -+} - -- return (nvlist_xalloc(nvlp, nvflag, nva)); -+/* -+ * nvlist_alloc - Allocate nvlist. -+ */ -+int -+nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) -+{ -+ return (nvlist_xalloc(nvlp, nvflag, nvlist_nv_alloc(kmflag))); - } -@@ -616,3 +614,2 @@ nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp) - */ --/*ARGSUSED1*/ - int -@@ -620,8 +617,3 @@ nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag) - { --#if defined(_KERNEL) && !defined(_BOOT) -- return (nvlist_xdup(nvl, nvlp, -- (kmflag == KM_SLEEP ? 
nv_alloc_sleep : nv_alloc_nosleep))); --#else -- return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep)); --#endif -+ return (nvlist_xdup(nvl, nvlp, nvlist_nv_alloc(kmflag))); - } -@@ -1626,3 +1618,3 @@ nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep, - const char *np; -- char *sepp=NULL; -+ char *sepp = NULL; - char *idxp, *idxep; -@@ -2354,3 +2346,2 @@ nvlist_size(nvlist_t *nvl, size_t *size, int encoding) - */ --/*ARGSUSED1*/ - int -@@ -2359,8 +2350,4 @@ nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, - { --#if defined(_KERNEL) && !defined(_BOOT) - return (nvlist_xpack(nvl, bufp, buflen, encoding, -- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); --#else -- return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep)); --#endif -+ nvlist_nv_alloc(kmflag))); - } -@@ -2417,3 +2404,2 @@ nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, - */ --/*ARGSUSED1*/ - int -@@ -2421,8 +2407,3 @@ nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) - { --#if defined(_KERNEL) && !defined(_BOOT) -- return (nvlist_xunpack(buf, buflen, nvlp, -- (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); --#else -- return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep)); --#endif -+ return (nvlist_xunpack(buf, buflen, nvlp, nvlist_nv_alloc(kmflag))); - } -@@ -2601,3 +2582,4 @@ nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) - */ -- bzero(&packed->nvl_priv, sizeof (packed->nvl_priv)); -+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv), -+ sizeof (uint64_t)); - } -@@ -2629,3 +2611,4 @@ nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) - */ -- bzero(&packed->nvl_priv, sizeof (packed->nvl_priv)); -+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv), -+ sizeof (uint64_t)); - } -@@ -3322,2 +3305,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - -diff --git a/module/nvpair/nvpair_alloc_spl.c b/module/nvpair/nvpair_alloc_spl.c -index be6e8f0..a75b4a6 100644 ---- a/module/nvpair/nvpair_alloc_spl.c -+++ b/module/nvpair/nvpair_alloc_spl.c -@@ -54,7 +54,7 @@ nv_free_spl(nv_alloc_t *nva, void *buf, size_t size) - const nv_alloc_ops_t spl_sleep_ops_def = { -- NULL, /* nv_ao_init() */ -- NULL, /* nv_ao_fini() */ -- nv_alloc_sleep_spl, /* nv_ao_alloc() */ -- nv_free_spl, /* nv_ao_free() */ -- NULL /* nv_ao_reset() */ -+ NULL, /* nv_ao_init() */ -+ NULL, /* nv_ao_fini() */ -+ nv_alloc_sleep_spl, /* nv_ao_alloc() */ -+ nv_free_spl, /* nv_ao_free() */ -+ NULL /* nv_ao_reset() */ - }; -@@ -62,7 +62,7 @@ const nv_alloc_ops_t spl_sleep_ops_def = { - const nv_alloc_ops_t spl_pushpage_ops_def = { -- NULL, /* nv_ao_init() */ -- NULL, /* nv_ao_fini() */ -- nv_alloc_pushpage_spl, /* nv_ao_alloc() */ -- nv_free_spl, /* nv_ao_free() */ -- NULL /* nv_ao_reset() */ -+ NULL, /* nv_ao_init() */ -+ NULL, /* nv_ao_fini() */ -+ nv_alloc_pushpage_spl, /* nv_ao_alloc() */ -+ nv_free_spl, /* nv_ao_free() */ -+ NULL /* nv_ao_reset() */ - }; -@@ -70,7 +70,7 @@ const nv_alloc_ops_t spl_pushpage_ops_def = { - const nv_alloc_ops_t spl_nosleep_ops_def = { -- NULL, /* nv_ao_init() */ -- NULL, /* nv_ao_fini() */ -- nv_alloc_nosleep_spl, /* nv_ao_alloc() */ -- nv_free_spl, /* nv_ao_free() */ -- NULL /* nv_ao_reset() */ -+ NULL, /* nv_ao_init() */ -+ NULL, /* nv_ao_fini() */ -+ nv_alloc_nosleep_spl, /* nv_ao_alloc() */ -+ nv_free_spl, /* nv_ao_free() */ -+ NULL /* nv_ao_reset() */ - }; -diff --git a/module/unicode/u8_textprep.c 
b/module/unicode/u8_textprep.c -index df6dcf5..60e586d 100644 ---- a/module/unicode/u8_textprep.c -+++ b/module/unicode/u8_textprep.c -@@ -148,3 +148,3 @@ - (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \ -- (((uint32_t)(b2) & 0x3F) << 6) | \ -+ (((uint32_t)(b2) & 0x3F) << 6) | \ - ((uint32_t)(b3) & 0x3F)); -@@ -2145,2 +2145,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - -diff --git a/module/zcommon/zfs_comutil.c b/module/zcommon/zfs_comutil.c -index ccf169b..6d0314f 100644 ---- a/module/zcommon/zfs_comutil.c -+++ b/module/zcommon/zfs_comutil.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -159,3 +160,7 @@ zfs_spa_version_map(int zpl_version) - --const char *zfs_history_event_names[LOG_END] = { -+/* -+ * This is the table of legacy internal event names; it should not be modified. -+ * The internal events are now stored in the history log as strings. -+ */ -+const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = { - "invalid event", -diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c -index b27e4f3..dd456b5 100644 ---- a/module/zcommon/zfs_prop.c -+++ b/module/zcommon/zfs_prop.c -@@ -22,3 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - */ -@@ -114,2 +115,10 @@ zfs_prop_init(void) - -+ static zprop_index_t acltype_table[] = { -+ { "off", ZFS_ACLTYPE_OFF }, -+ { "disabled", ZFS_ACLTYPE_OFF }, -+ { "noacl", ZFS_ACLTYPE_OFF }, -+ { "posixacl", ZFS_ACLTYPE_POSIXACL }, -+ { NULL } -+ }; -+ - static zprop_index_t acl_inherit_table[] = { -@@ -228,2 +237,5 @@ zfs_prop_init(void) - "hidden | visible", "SNAPDEV", snapdev_table); -+ zprop_register_index(ZFS_PROP_ACLTYPE, "acltype", ZFS_ACLTYPE_OFF, -+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, -+ "noacl | posixacl", "ACLTYPE", acltype_table); - zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", -@@ -253,2 +265,4 @@ zfs_prop_init(void) - ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); -+ zprop_register_index(ZFS_PROP_RELATIME, "relatime", 0, PROP_INHERIT, -+ ZFS_TYPE_FILESYSTEM, "on | off", "RELATIME", boolean_table); - zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, -@@ -322,2 +336,14 @@ zfs_prop_init(void) - "", "MLSLABEL"); -+ zprop_register_string(ZFS_PROP_SELINUX_CONTEXT, "context", -+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", -+ "CONTEXT"); -+ zprop_register_string(ZFS_PROP_SELINUX_FSCONTEXT, "fscontext", -+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", -+ "FSCONTEXT"); -+ zprop_register_string(ZFS_PROP_SELINUX_DEFCONTEXT, "defcontext", -+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", -+ "DEFCONTEXT"); -+ zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext", -+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", -+ "ROOTCONTEXT"); - -@@ -355,2 +381,6 @@ zfs_prop_init(void) - ZFS_TYPE_DATASET, "", "WRITTEN"); -+ zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, -+ PROP_READONLY, ZFS_TYPE_DATASET, "", "LUSED"); -+ zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", -+ 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); - -@@ -396,2 +426,4 @@ zfs_prop_init(void) - PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); -+ 
zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", -+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); - -@@ -644,2 +676,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - -diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c -index 9904645..90376f2 100644 ---- a/module/zcommon/zfs_uio.c -+++ b/module/zcommon/zfs_uio.c -@@ -72,15 +72,12 @@ uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio) - case UIO_USERISPACE: -- /* p = kernel data pointer -- * iov->iov_base = user data pointer */ -- -+ /* -+ * p = kernel data pointer -+ * iov->iov_base = user data pointer -+ */ - if (rw == UIO_READ) { - if (copy_to_user(iov->iov_base, p, cnt)) -- return EFAULT; -- /* error = xcopyout_nta(p, iov->iov_base, cnt, -- * (uio->uio_extflg & UIO_COPY_CACHED)); */ -+ return (EFAULT); - } else { -- /* error = xcopyin_nta(iov->iov_base, p, cnt, -- * (uio->uio_extflg & UIO_COPY_CACHED)); */ - if (copy_from_user(p, iov->iov_base, cnt)) -- return EFAULT; -+ return (EFAULT); - } -@@ -105,3 +102,3 @@ EXPORT_SYMBOL(uiomove); - --#define fuword8(uptr, vptr) get_user((*vptr), (uptr)) -+#define fuword8(uptr, vptr) get_user((*vptr), (uptr)) - -@@ -196,17 +193,14 @@ uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) - case UIO_USERISPACE: -- /* p = kernel data pointer -- * iov->iov_base = user data pointer */ -- -+ /* -+ * p = kernel data pointer -+ * iov->iov_base = user data pointer -+ */ - if (rw == UIO_READ) { -- /* * UIO_READ = copy data from kernel to user * */ -+ /* UIO_READ = copy data from kernel to user */ - if (copy_to_user(iov->iov_base, p, cnt)) -- return EFAULT; -- /* error = xcopyout_nta(p, iov->iov_base, cnt, -- * (uio->uio_extflg & UIO_COPY_CACHED)); */ -+ return (EFAULT); - } else { -- /* * UIO_WRITE = copy data from user to kernel * */ -- /* error = xcopyin_nta(iov->iov_base, p, cnt, -- * (uio->uio_extflg & UIO_COPY_CACHED)); */ -+ /* UIO_WRITE = copy data from user to kernel */ - if (copy_from_user(p, iov->iov_base, cnt)) -- return EFAULT; -+ return (EFAULT); - } -diff --git a/module/zcommon/zprop_common.c b/module/zcommon/zprop_common.c -index 0a0af23..6d9f89a 100644 ---- a/module/zcommon/zprop_common.c -+++ b/module/zcommon/zprop_common.c -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2012 by Delphix. All rights reserved. 
-+ */ - -@@ -131,3 +134,4 @@ zprop_register_hidden(int prop, const char *name, zprop_type_t type, - zprop_register_impl(prop, name, type, 0, NULL, attr, -- objset_types, NULL, colname, B_FALSE, B_FALSE, NULL); -+ objset_types, NULL, colname, -+ type == PROP_TYPE_NUMBER, B_FALSE, NULL); - } -diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in -index 81b1680..5552436 100644 ---- a/module/zfs/Makefile.in -+++ b/module/zfs/Makefile.in -@@ -10,2 +10,3 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o - $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o -+$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o - $(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o -@@ -46,2 +47,3 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o - $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o -+$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o - $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o -@@ -95 +97,3 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/zrlock.o - $(MODULE)-objs += @top_srcdir@/module/zfs/zvol.o -+$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_destroy.o -+$(MODULE)-objs += @top_srcdir@/module/zfs/dsl_userhold.o -diff --git a/module/zfs/arc.c b/module/zfs/arc.c -index ce4a023..00d2659 100644 ---- a/module/zfs/arc.c -+++ b/module/zfs/arc.c -@@ -22,5 +22,5 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -60,3 +60,3 @@ - * 3. The Megiddo and Modha model assumes a fixed page size. All -- * elements of the cache are therefor exactly the same size. So -+ * elements of the cache are therefore exactly the same size. So - * when adjusting the cache size following a cache miss, its simply -@@ -64,3 +64,3 @@ - * have variable sized cache blocks (rangeing from 512 bytes to -- * 128K bytes). We therefor choose a set of blocks to evict to make -+ * 128K bytes). We therefore choose a set of blocks to evict to make - * space for a cache miss that approximates as closely as possible -@@ -79,3 +79,3 @@ - * uses method 1, while the internal arc algorithms for -- * adjusting the cache use method 2. We therefor provide two -+ * adjusting the cache use method 2. We therefore provide two - * types of locks: 1) the hash table lock array, and 2) the -@@ -136,2 +136,3 @@ - #include -+#include - #ifdef _KERNEL -@@ -147,2 +148,7 @@ - -+#ifndef _KERNEL -+/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ -+boolean_t arc_watch = B_FALSE; -+#endif -+ - static kmutex_t arc_reclaim_thr_lock; -@@ -159,2 +165,8 @@ typedef enum arc_reclaim_strategy { - -+/* -+ * The number of iterations through arc_evict_*() before we -+ * drop & reacquire the lock. -+ */ -+int arc_evict_iterations = 100; -+ - /* number of seconds before growing cache again */ -@@ -162,4 +174,7 @@ int zfs_arc_grow_retry = 5; - --/* shift of arc_c for calculating both min and max arc_p */ --int zfs_arc_p_min_shift = 4; -+/* disable anon data aggressively growing arc_p */ -+int zfs_arc_p_aggressive_disable = 1; -+ -+/* disable arc_p adapt dampener in arc_adapt */ -+int zfs_arc_p_dampener_disable = 1; - -@@ -180,2 +195,7 @@ int zfs_disable_dup_eviction = 0; - -+/* -+ * If this percent of memory is free, don't throttle. 
-+ */ -+int arc_lotsfree_percent = 10; -+ - static int arc_dead; -@@ -234,2 +254,3 @@ typedef struct arc_state { - kmutex_t arcs_mtx; -+ arc_state_type_t arcs_state; - } arc_state_t; -@@ -261,3 +282,14 @@ typedef struct arc_stats { - kstat_named_t arcstat_recycle_miss; -+ /* -+ * Number of buffers that could not be evicted because the hash lock -+ * was held by another thread. The lock may not necessarily be held -+ * by something using the same buffer, since hash locks are shared -+ * by multiple buffers. -+ */ - kstat_named_t arcstat_mutex_miss; -+ /* -+ * Number of buffers skipped because they have I/O in progress, are -+ * indrect prefetch buffers that have not lived long enough, or are -+ * not from the spa we're trying to evict from. -+ */ - kstat_named_t arcstat_evict_skip; -@@ -278,2 +310,3 @@ typedef struct arc_stats { - kstat_named_t arcstat_data_size; -+ kstat_named_t arcstat_meta_size; - kstat_named_t arcstat_other_size; -@@ -365,2 +398,3 @@ static arc_stats_t arc_stats = { - { "data_size", KSTAT_DATA_UINT64 }, -+ { "meta_size", KSTAT_DATA_UINT64 }, - { "other_size", KSTAT_DATA_UINT64 }, -@@ -421,3 +455,3 @@ static arc_stats_t arc_stats = { - #define ARCSTAT_INCR(stat, val) \ -- atomic_add_64(&arc_stats.stat.value.ui64, (val)); -+ atomic_add_64(&arc_stats.stat.value.ui64, (val)) - -@@ -480,5 +514,5 @@ static arc_state_t *arc_l2c_only; - #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) --#define arc_meta_used ARCSTAT(arcstat_meta_used) --#define arc_meta_limit ARCSTAT(arcstat_meta_limit) --#define arc_meta_max ARCSTAT(arcstat_meta_max) -+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -+#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ -+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ - -@@ -504,2 +538,3 @@ struct arc_write_callback { - arc_done_func_t *awcb_ready; -+ arc_done_func_t *awcb_physdone; - arc_done_func_t *awcb_done; -@@ -536,2 +571,7 @@ struct arc_buf_hdr { - clock_t b_arc_access; -+ uint32_t b_mru_hits; -+ uint32_t b_mru_ghost_hits; -+ uint32_t b_mfu_hits; -+ uint32_t b_mfu_ghost_hits; -+ uint32_t b_l2_hits; - -@@ -554,2 +594,3 @@ static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type); -+static void arc_buf_watch(arc_buf_t *buf); - -@@ -650,5 +691,3 @@ uint64_t zfs_crc64_table[256]; - --/* -- * L2ARC Performance Tunables -- */ -+/* L2ARC Performance Tunables */ - unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -@@ -711,3 +750,4 @@ struct l2arc_buf_hdr { - /* real alloc'd buffer size depending on b_compress applied */ -- int b_asize; -+ uint32_t b_hits; -+ uint64_t b_asize; - /* temporary buffer holder for in-flight compressed data */ -@@ -866,2 +906,3 @@ static kmem_cache_t *hdr_cache; - static kmem_cache_t *buf_cache; -+static kmem_cache_t *l2arc_hdr_cache; - -@@ -873,4 +914,6 @@ buf_fini(void) - #if defined(_KERNEL) && defined(HAVE_SPL) -- /* Large allocations which do not require contiguous pages -- * should be using vmem_free() in the linux kernel */ -+ /* -+ * Large allocations which do not require contiguous pages -+ * should be using vmem_free() in the linux kernel\ -+ */ - vmem_free(buf_hash_table.ht_table, -@@ -885,2 +928,3 @@ buf_fini(void) - kmem_cache_destroy(buf_cache); -+ kmem_cache_destroy(l2arc_hdr_cache); - } -@@ -965,4 +1009,6 @@ retry: - #if defined(_KERNEL) && defined(HAVE_SPL) -- /* Large allocations which do not require contiguous pages -- * should be using vmem_alloc() in the linux 
kernel */ -+ /* -+ * Large allocations which do not require contiguous pages -+ * should be using vmem_alloc() in the linux kernel -+ */ - buf_hash_table.ht_table = -@@ -983,2 +1029,4 @@ retry: - 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); -+ l2arc_hdr_cache = kmem_cache_create("l2arc_buf_hdr_t", L2HDR_SIZE, -+ 0, NULL, NULL, NULL, NULL, NULL, 0); - -@@ -1042,3 +1090,3 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), -- KM_PUSHPAGE); -+ KM_PUSHPAGE); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, -@@ -1046,2 +1094,33 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) - mutex_exit(&buf->b_hdr->b_freeze_lock); -+ arc_buf_watch(buf); -+} -+ -+#ifndef _KERNEL -+void -+arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) -+{ -+ panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr); -+} -+#endif -+ -+/* ARGSUSED */ -+static void -+arc_buf_unwatch(arc_buf_t *buf) -+{ -+#ifndef _KERNEL -+ if (arc_watch) { -+ ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, -+ PROT_READ | PROT_WRITE)); -+ } -+#endif -+} -+ -+/* ARGSUSED */ -+static void -+arc_buf_watch(arc_buf_t *buf) -+{ -+#ifndef _KERNEL -+ if (arc_watch) -+ ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ)); -+#endif - } -@@ -1066,2 +1145,4 @@ arc_buf_thaw(arc_buf_t *buf) - mutex_exit(&buf->b_hdr->b_freeze_lock); -+ -+ arc_buf_unwatch(buf); - } -@@ -1083,2 +1164,3 @@ arc_buf_freeze(arc_buf_t *buf) - mutex_exit(hash_lock); -+ - } -@@ -1140,2 +1222,50 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) - /* -+ * Returns detailed information about a specific arc buffer. When the -+ * state_index argument is set the function will calculate the arc header -+ * list position for its arc state. Since this requires a linear traversal -+ * callers are strongly encourage not to do this. However, it can be helpful -+ * for targeted analysis so the functionality is provided. -+ */ -+void -+arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) -+{ -+ arc_buf_hdr_t *hdr = ab->b_hdr; -+ arc_state_t *state = hdr->b_state; -+ -+ memset(abi, 0, sizeof (arc_buf_info_t)); -+ abi->abi_flags = hdr->b_flags; -+ abi->abi_datacnt = hdr->b_datacnt; -+ abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; -+ abi->abi_state_contents = hdr->b_type; -+ abi->abi_state_index = -1; -+ abi->abi_size = hdr->b_size; -+ abi->abi_access = hdr->b_arc_access; -+ abi->abi_mru_hits = hdr->b_mru_hits; -+ abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; -+ abi->abi_mfu_hits = hdr->b_mfu_hits; -+ abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; -+ abi->abi_holds = refcount_count(&hdr->b_refcnt); -+ -+ if (hdr->b_l2hdr) { -+ abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; -+ abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; -+ abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; -+ abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; -+ } -+ -+ if (state && state_index && list_link_active(&hdr->b_arc_node)) { -+ list_t *list = &state->arcs_list[hdr->b_type]; -+ arc_buf_hdr_t *h; -+ -+ mutex_enter(&state->arcs_mtx); -+ for (h = list_head(list); h != NULL; h = list_next(list, h)) { -+ abi->abi_state_index++; -+ if (h == hdr) -+ break; -+ } -+ mutex_exit(&state->arcs_mtx); -+ } -+} -+ -+/* - * Move the supplied buffer to the indicated state. 
The mutex -@@ -1151,3 +1281,3 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) - ASSERT(MUTEX_HELD(hash_lock)); -- ASSERT(new_state != old_state); -+ ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || ab->b_datacnt > 0); -@@ -1241,2 +1371,5 @@ arc_space_consume(uint64_t space, arc_space_type_t type) - break; -+ case ARC_SPACE_META: -+ ARCSTAT_INCR(arcstat_meta_size, space); -+ break; - case ARC_SPACE_OTHER: -@@ -1252,3 +1385,5 @@ arc_space_consume(uint64_t space, arc_space_type_t type) - -- atomic_add_64(&arc_meta_used, space); -+ if (type != ARC_SPACE_DATA) -+ ARCSTAT_INCR(arcstat_meta_used, space); -+ - atomic_add_64(&arc_size, space); -@@ -1267,2 +1402,5 @@ arc_space_return(uint64_t space, arc_space_type_t type) - break; -+ case ARC_SPACE_META: -+ ARCSTAT_INCR(arcstat_meta_size, -space); -+ break; - case ARC_SPACE_OTHER: -@@ -1278,6 +1416,9 @@ arc_space_return(uint64_t space, arc_space_type_t type) - -- ASSERT(arc_meta_used >= space); -- if (arc_meta_max < arc_meta_used) -- arc_meta_max = arc_meta_used; -- atomic_add_64(&arc_meta_used, -space); -+ if (type != ARC_SPACE_DATA) { -+ ASSERT(arc_meta_used >= space); -+ if (arc_meta_max < arc_meta_used) -+ arc_meta_max = arc_meta_used; -+ ARCSTAT_INCR(arcstat_meta_used, -space); -+ } -+ - ASSERT(arc_size >= space); -@@ -1300,2 +1441,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) - hdr->b_arc_access = 0; -+ hdr->b_mru_hits = 0; -+ hdr->b_mru_ghost_hits = 0; -+ hdr->b_mfu_hits = 0; -+ hdr->b_mfu_ghost_hits = 0; -+ hdr->b_l2_hits = 0; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); -@@ -1437,5 +1583,6 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) - static void --arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), -- void *data, size_t size) -+arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) - { -+ arc_buf_hdr_t *hdr = buf->b_hdr; -+ - if (HDR_L2_WRITING(hdr)) { -@@ -1443,4 +1590,4 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_PUSHPAGE); -- df->l2df_data = data; -- df->l2df_size = size; -+ df->l2df_data = buf->b_data; -+ df->l2df_size = hdr->b_size; - df->l2df_func = free_func; -@@ -1451,3 +1598,3 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), - } else { -- free_func(data, size); -+ free_func(buf->b_data, hdr->b_size); - } -@@ -1467,2 +1614,3 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) - arc_cksum_verify(buf); -+ arc_buf_unwatch(buf); - -@@ -1470,11 +1618,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) - if (type == ARC_BUFC_METADATA) { -- arc_buf_data_free(buf->b_hdr, zio_buf_free, -- buf->b_data, size); -- arc_space_return(size, ARC_SPACE_DATA); -+ arc_buf_data_free(buf, zio_buf_free); -+ arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); -- arc_buf_data_free(buf->b_hdr, -- zio_data_buf_free, buf->b_data, size); -- ARCSTAT_INCR(arcstat_data_size, -size); -- atomic_add_64(&arc_size, -size); -+ arc_buf_data_free(buf, zio_data_buf_free); -+ arc_space_return(size, ARC_SPACE_DATA); - } -@@ -1554,3 +1699,3 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); -- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); -+ kmem_cache_free(l2arc_hdr_cache, l2hdr); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -1645,3 +1790,3 @@ arc_buf_free(arc_buf_t *buf, void *tag) - --int -+boolean_t - arc_buf_remove_ref(arc_buf_t *buf, 
void* tag) -@@ -1650,3 +1795,3 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) - kmutex_t *hash_lock = NULL; -- int no_callback = (buf->b_efunc == NULL); -+ boolean_t no_callback = (buf->b_efunc == NULL); - -@@ -1752,2 +1897,4 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - void *stolen = NULL; -+ arc_buf_hdr_t marker = {{{ 0 }}}; -+ int count = 0; - -@@ -1757,2 +1904,3 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - -+top: - mutex_enter(&state->arcs_mtx); -@@ -1775,2 +1923,29 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - continue; -+ -+ /* ignore markers */ -+ if (ab->b_spa == 0) -+ continue; -+ -+ /* -+ * It may take a long time to evict all the bufs requested. -+ * To avoid blocking all arc activity, periodically drop -+ * the arcs_mtx and give other threads a chance to run -+ * before reacquiring the lock. -+ * -+ * If we are looking for a buffer to recycle, we are in -+ * the hot code path, so don't sleep. -+ */ -+ if (!recycle && count++ > arc_evict_iterations) { -+ list_insert_after(list, ab, &marker); -+ mutex_exit(&evicted_state->arcs_mtx); -+ mutex_exit(&state->arcs_mtx); -+ kpreempt(KPREEMPT_SYNC); -+ mutex_enter(&state->arcs_mtx); -+ mutex_enter(&evicted_state->arcs_mtx); -+ ab_prev = list_prev(list, &marker); -+ list_remove(list, &marker); -+ count = 0; -+ continue; -+ } -+ - hash_lock = HDR_LOCK(ab); -@@ -1845,2 +2020,11 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - -+ if (list == &state->arcs_list[ARC_BUFC_DATA] && -+ (bytes < 0 || bytes_evicted < bytes)) { -+ /* Prevent second pass from recycling metadata into data */ -+ recycle = FALSE; -+ type = ARC_BUFC_METADATA; -+ list = &state->arcs_list[type]; -+ goto top; -+ } -+ - if (bytes_evicted < bytes) -@@ -1856,23 +2040,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - /* -- * We have just evicted some date into the ghost state, make -- * sure we also adjust the ghost state size if necessary. -+ * Note: we have just evicted some data into the ghost state, -+ * potentially putting the ghost size over the desired size. Rather -+ * that evicting from the ghost list in this hot code path, leave -+ * this chore to the arc_reclaim_thread(). - */ -- if (arc_no_grow && -- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { -- int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + -- arc_mru_ghost->arcs_size - arc_c; -- -- if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { -- int64_t todelete = -- MIN(arc_mru_ghost->arcs_lsize[type], mru_over); -- arc_evict_ghost(arc_mru_ghost, 0, todelete, -- ARC_BUFC_DATA); -- } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { -- int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], -- arc_mru_ghost->arcs_size + -- arc_mfu_ghost->arcs_size - arc_c); -- arc_evict_ghost(arc_mfu_ghost, 0, todelete, -- ARC_BUFC_DATA); -- } -- } - -@@ -1895,5 +2063,6 @@ arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - uint64_t bufs_skipped = 0; -+ int count = 0; - - ASSERT(GHOST_STATE(state)); -- bzero(&marker, sizeof(marker)); -+ bzero(&marker, sizeof (marker)); - top: -@@ -1902,2 +2071,4 @@ top: - ab_prev = list_prev(list, ab); -+ if (ab->b_type > ARC_BUFC_NUMTYPES) -+ panic("invalid ab=%p", (void *)ab); - if (spa && ab->b_spa != spa) -@@ -1913,2 +2084,19 @@ top: - continue; -+ -+ /* -+ * It may take a long time to evict all the bufs requested. 
-+ * To avoid blocking all arc activity, periodically drop -+ * the arcs_mtx and give other threads a chance to run -+ * before reacquiring the lock. -+ */ -+ if (count++ > arc_evict_iterations) { -+ list_insert_after(list, ab, &marker); -+ mutex_exit(&state->arcs_mtx); -+ kpreempt(KPREEMPT_SYNC); -+ mutex_enter(&state->arcs_mtx); -+ ab_prev = list_prev(list, &marker); -+ list_remove(list, &marker); -+ count = 0; -+ continue; -+ } - if (mutex_tryenter(hash_lock)) { -@@ -1948,4 +2136,5 @@ top: - list_remove(list, &marker); -- } else -+ } else { - bufs_skipped += 1; -+ } - } -@@ -1979,15 +2168,7 @@ arc_adjust(void) - adjustment = MIN((int64_t)(arc_size - arc_c), -- (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - -- arc_p)); -+ (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); - -- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { -- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); -+ if (adjustment > 0 && arc_mru->arcs_size > 0) { -+ delta = MIN(arc_mru->arcs_size, adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); -- adjustment -= delta; -- } -- -- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { -- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); -- (void) arc_evict(arc_mru, 0, delta, FALSE, -- ARC_BUFC_METADATA); - } -@@ -2000,13 +2181,5 @@ arc_adjust(void) - -- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { -- delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); -+ if (adjustment > 0 && arc_mfu->arcs_size > 0) { -+ delta = MIN(arc_mfu->arcs_size, adjustment); - (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); -- adjustment -= delta; -- } -- -- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { -- int64_t delta = MIN(adjustment, -- arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); -- (void) arc_evict(arc_mfu, 0, delta, FALSE, -- ARC_BUFC_METADATA); - } -@@ -2103,20 +2276,57 @@ arc_do_user_evicts(void) - */ --void --arc_adjust_meta(int64_t adjustment, boolean_t may_prune) -+static void -+arc_adjust_meta(void) - { -- int64_t delta; -+ int64_t adjustmnt, delta; -+ -+ /* -+ * This slightly differs than the way we evict from the mru in -+ * arc_adjust because we don't have a "target" value (i.e. no -+ * "meta" arc_p). As a result, I think we can completely -+ * cannibalize the metadata in the MRU before we evict the -+ * metadata from the MFU. I think we probably need to implement a -+ * "metadata arc_p" value to do this properly. -+ */ -+ adjustmnt = arc_meta_used - arc_meta_limit; - -- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { -- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); -+ if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { -+ delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); - arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); -- adjustment -= delta; -+ adjustmnt -= delta; - } - -- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { -- delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment); -+ /* -+ * We can't afford to recalculate adjustmnt here. If we do, -+ * new metadata buffers can sneak into the MRU or ANON lists, -+ * thus penalize the MFU metadata. Although the fudge factor is -+ * small, it has been empirically shown to be significant for -+ * certain workloads (e.g. creating many empty directories). 
As -+ * such, we use the original calculation for adjustmnt, and -+ * simply decrement the amount of data evicted from the MRU. -+ */ -+ -+ if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { -+ delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); - arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); -- adjustment -= delta; - } - -- if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit)) -+ adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] + -+ arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit; -+ -+ if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) { -+ delta = MIN(adjustmnt, -+ arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]); -+ arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA); -+ } -+ -+ adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] + -+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit; -+ -+ if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) { -+ delta = MIN(adjustmnt, -+ arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]); -+ arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA); -+ } -+ -+ if (arc_meta_used > arc_meta_limit) - arc_do_user_prune(zfs_arc_meta_prune); -@@ -2179,3 +2389,9 @@ arc_shrink(uint64_t bytes) - -- atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift)); -+ to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift; -+ -+ if (arc_p > to_free) -+ atomic_add_64(&arc_p, -to_free); -+ else -+ arc_p = 0; -+ - if (arc_c > arc_size) -@@ -2234,3 +2450,2 @@ arc_adapt_thread(void) - callb_cpr_t cpr; -- int64_t prune; - -@@ -2258,3 +2473,4 @@ arc_adapt_thread(void) - /* reset the growth delay for every reclaim */ -- arc_grow_time = ddi_get_lbolt()+(zfs_arc_grow_retry * hz); -+ arc_grow_time = ddi_get_lbolt() + -+ (zfs_arc_grow_retry * hz); - -@@ -2269,10 +2485,3 @@ arc_adapt_thread(void) - -- /* -- * Keep meta data usage within limits, arc_shrink() is not -- * used to avoid collapsing the arc_c value when only the -- * arc_meta_limit is being exceeded. 
-- */ -- prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit; -- if (prune > 0) -- arc_adjust_meta(prune, B_TRUE); -+ arc_adjust_meta(); - -@@ -2411,4 +2620,6 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) - arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan)); -+ pages = btop(arc_evictable_memory()); - } else { - arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan)); -+ pages = -1; - } -@@ -2432,3 +2643,3 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) - -- return (-1); -+ return (pages); - } -@@ -2448,3 +2659,2 @@ arc_adapt(int bytes, arc_state_t *state) - int mult; -- uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift); - -@@ -2465,5 +2675,7 @@ arc_adapt(int bytes, arc_state_t *state) - 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); -- mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - -- arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); -+ if (!zfs_arc_p_dampener_disable) -+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ -+ -+ arc_p = MIN(arc_c, arc_p + bytes * mult); - } else if (state == arc_mfu_ghost) { -@@ -2473,6 +2685,8 @@ arc_adapt(int bytes, arc_state_t *state) - 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); -- mult = MIN(mult, 10); -+ -+ if (!zfs_arc_p_dampener_disable) -+ mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); -- arc_p = MAX(arc_p_min, arc_p - delta); -+ arc_p = MAX(0, arc_p - delta); - } -@@ -2547,2 +2761,4 @@ arc_get_data_buf(arc_buf_t *buf) - arc_buf_contents_t type = buf->b_hdr->b_type; -+ arc_buf_contents_t evict = ARC_BUFC_DATA; -+ boolean_t recycle = TRUE; - -@@ -2557,3 +2773,3 @@ arc_get_data_buf(arc_buf_t *buf) - buf->b_data = zio_buf_alloc(size); -- arc_space_consume(size, ARC_SPACE_DATA); -+ arc_space_consume(size, ARC_SPACE_META); - } else { -@@ -2561,4 +2777,3 @@ arc_get_data_buf(arc_buf_t *buf) - buf->b_data = zio_data_buf_alloc(size); -- ARCSTAT_INCR(arcstat_data_size, size); -- atomic_add_64(&arc_size, size); -+ arc_space_consume(size, ARC_SPACE_DATA); - } -@@ -2587,6 +2802,23 @@ arc_get_data_buf(arc_buf_t *buf) - -- if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { -+ /* -+ * Evict data buffers prior to metadata buffers, unless we're -+ * over the metadata limit and adding a metadata buffer. -+ */ -+ if (type == ARC_BUFC_METADATA) { -+ if (arc_meta_used >= arc_meta_limit) -+ evict = ARC_BUFC_METADATA; -+ else -+ /* -+ * In this case, we're evicting data while -+ * adding metadata. Thus, to prevent recycling a -+ * data buffer into a metadata buffer, recycling -+ * is disabled in the following arc_evict call. -+ */ -+ recycle = FALSE; -+ } -+ -+ if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); -- arc_space_consume(size, ARC_SPACE_DATA); -+ arc_space_consume(size, ARC_SPACE_META); - -@@ -2598,4 +2830,6 @@ arc_get_data_buf(arc_buf_t *buf) - * thread to avoid deadlocking on the hash_lock. -+ * Of course, only do this when recycle is true. 
- */ -- cv_signal(&arc_reclaim_thr_cv); -+ if (recycle) -+ cv_signal(&arc_reclaim_thr_cv); - } else { -@@ -2603,7 +2837,8 @@ arc_get_data_buf(arc_buf_t *buf) - buf->b_data = zio_data_buf_alloc(size); -- ARCSTAT_INCR(arcstat_data_size, size); -- atomic_add_64(&arc_size, size); -+ arc_space_consume(size, ARC_SPACE_DATA); - } - -- ARCSTAT_BUMP(arcstat_recycle_miss); -+ /* Only bump this if we tried to recycle and failed */ -+ if (recycle) -+ ARCSTAT_BUMP(arcstat_recycle_miss); - } -@@ -2627,3 +2862,4 @@ out: - */ -- if (arc_size < arc_c && hdr->b_state == arc_anon && -+ if (!zfs_arc_p_aggressive_disable && -+ arc_size < arc_c && hdr->b_state == arc_anon && - arc_anon->arcs_size + arc_mru->arcs_size > arc_p) -@@ -2672,2 +2908,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - buf->b_flags &= ~ARC_PREFETCH; -+ atomic_inc_32(&buf->b_mru_hits); - ARCSTAT_BUMP(arcstat_mru_hits); -@@ -2693,2 +2930,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - } -+ atomic_inc_32(&buf->b_mru_hits); - ARCSTAT_BUMP(arcstat_mru_hits); -@@ -2715,2 +2953,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - -+ atomic_inc_32(&buf->b_mru_ghost_hits); - ARCSTAT_BUMP(arcstat_mru_ghost_hits); -@@ -2730,2 +2969,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - } -+ atomic_inc_32(&buf->b_mfu_hits); - ARCSTAT_BUMP(arcstat_mfu_hits); -@@ -2753,2 +2993,3 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) - -+ atomic_inc_32(&buf->b_mfu_ghost_hits); - ARCSTAT_BUMP(arcstat_mfu_ghost_hits); -@@ -2774,3 +3015,3 @@ arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); -- VERIFY(arc_buf_remove_ref(buf, arg) == 1); -+ VERIFY(arc_buf_remove_ref(buf, arg)); - } -@@ -2783,3 +3024,3 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) - if (zio && zio->io_error) { -- VERIFY(arc_buf_remove_ref(buf, arg) == 1); -+ VERIFY(arc_buf_remove_ref(buf, arg)); - *bufp = NULL; -@@ -2836,2 +3077,3 @@ arc_read_done(zio_t *zio) - arc_cksum_compute(buf, B_FALSE); -+ arc_buf_watch(buf); - -@@ -2937,3 +3179,3 @@ int - arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, -- void *private, int priority, int zio_flags, uint32_t *arc_flags, -+ void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, - const zbookmark_t *zb) -@@ -2945,2 +3187,3 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, - uint64_t guid = spa_load_guid(spa); -+ int rc = 0; - -@@ -2978,6 +3221,6 @@ top: - mutex_exit(hash_lock); -- return (0); -+ goto out; - } - mutex_exit(hash_lock); -- return (0); -+ goto out; - } -@@ -3025,4 +3268,6 @@ top: - vdev_t *vd = NULL; -- uint64_t addr = -1; -+ uint64_t addr = 0; - boolean_t devw = B_FALSE; -+ enum zio_compress b_compress = ZIO_COMPRESS_OFF; -+ uint64_t b_asize = 0; - -@@ -3096,3 +3341,3 @@ top: - -- if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && -+ if (hdr->b_l2hdr != NULL && - (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { -@@ -3100,2 +3345,4 @@ top: - addr = hdr->b_l2hdr->b_daddr; -+ b_compress = hdr->b_l2hdr->b_compress; -+ b_asize = hdr->b_l2hdr->b_asize; - /* -@@ -3110,2 +3357,6 @@ top: - -+ /* -+ * At this point, we have a level 1 cache miss. Try again in -+ * L2ARC if possible. 
-+ */ - ASSERT3U(hdr->b_size, ==, size); -@@ -3135,2 +3386,3 @@ top: - ARCSTAT_BUMP(arcstat_l2_hits); -+ atomic_inc_32(&hdr->b_l2hdr->b_hits); - -@@ -3143,3 +3395,7 @@ top: - cb->l2rcb_flags = zio_flags; -- cb->l2rcb_compress = hdr->b_l2hdr->b_compress; -+ cb->l2rcb_compress = b_compress; -+ -+ ASSERT(addr >= VDEV_LABEL_START_SIZE && -+ addr + size < vd->vdev_psize - -+ VDEV_LABEL_END_SIZE); - -@@ -3151,4 +3407,3 @@ top: - */ -- if (hdr->b_l2hdr->b_compress == -- ZIO_COMPRESS_EMPTY) { -+ if (b_compress == ZIO_COMPRESS_EMPTY) { - rzio = zio_null(pio, spa, vd, -@@ -3161,4 +3416,4 @@ top: - rzio = zio_read_phys(pio, vd, addr, -- hdr->b_l2hdr->b_asize, -- buf->b_data, ZIO_CHECKSUM_OFF, -+ b_asize, buf->b_data, -+ ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, -@@ -3171,4 +3426,3 @@ top: - zio_t *, rzio); -- ARCSTAT_INCR(arcstat_l2_read_bytes, -- hdr->b_l2hdr->b_asize); -+ ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); - -@@ -3176,3 +3430,3 @@ top: - zio_nowait(rzio); -- return (0); -+ goto out; - } -@@ -3181,3 +3435,3 @@ top: - if (zio_wait(rzio) == 0) -- return (0); -+ goto out; - -@@ -3205,4 +3459,6 @@ top: - -- if (*arc_flags & ARC_WAIT) -- return (zio_wait(rzio)); -+ if (*arc_flags & ARC_WAIT) { -+ rc = zio_wait(rzio); -+ goto out; -+ } - -@@ -3211,3 +3467,6 @@ top: - } -- return (0); -+ -+out: -+ spa_read_history_add(spa, zb, *arc_flags); -+ return (rc); - } -@@ -3219,3 +3478,3 @@ arc_add_prune_callback(arc_prune_func_t *func, void *private) - -- p = kmem_alloc(sizeof(*p), KM_SLEEP); -+ p = kmem_alloc(sizeof (*p), KM_SLEEP); - p->p_pfunc = func; -@@ -3371,4 +3630,4 @@ arc_buf_evict(arc_buf_t *buf) - /* -- * Release this buffer from the cache. This must be done -- * after a read and prior to modifying the buffer contents. -+ * Release this buffer from the cache, making it an anonymous buffer. This -+ * must be done after a read and prior to modifying the buffer contents. - * If the buffer has more than one reference, we must make -@@ -3410,4 +3669,5 @@ arc_release(arc_buf_t *buf, void *tag) - hdr->b_l2hdr = NULL; -- buf_size = hdr->b_size; -+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - } -+ buf_size = hdr->b_size; - -@@ -3455,2 +3715,3 @@ arc_release(arc_buf_t *buf, void *tag) - arc_cksum_verify(buf); -+ arc_buf_unwatch(buf); - -@@ -3465,2 +3726,7 @@ arc_release(arc_buf_t *buf, void *tag) - nhdr->b_arc_access = 0; -+ nhdr->b_mru_hits = 0; -+ nhdr->b_mru_ghost_hits = 0; -+ nhdr->b_mfu_hits = 0; -+ nhdr->b_mfu_ghost_hits = 0; -+ nhdr->b_l2_hits = 0; - nhdr->b_flags = flags & ARC_L2_WRITING; -@@ -3481,2 +3747,7 @@ arc_release(arc_buf_t *buf, void *tag) - hdr->b_arc_access = 0; -+ hdr->b_mru_hits = 0; -+ hdr->b_mru_ghost_hits = 0; -+ hdr->b_mfu_hits = 0; -+ hdr->b_mfu_ghost_hits = 0; -+ hdr->b_l2_hits = 0; - if (hash_lock) -@@ -3492,4 +3763,3 @@ arc_release(arc_buf_t *buf, void *tag) - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); -- list_remove(l2hdr->b_dev->l2ad_buflist, hdr); -- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); -+ kmem_cache_free(l2arc_hdr_cache, l2hdr); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -3563,2 +3833,14 @@ arc_write_ready(zio_t *zio) - -+/* -+ * The SPA calls this callback for each physical write that happens on behalf -+ * of a logical write. See the comment in dbuf_write_physdone() for details. 
-+ */ -+static void -+arc_write_physdone(zio_t *zio) -+{ -+ arc_write_callback_t *cb = zio->io_private; -+ if (cb->awcb_physdone != NULL) -+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); -+} -+ - static void -@@ -3611,2 +3893,8 @@ arc_write_done(zio_t *zio) - ASSERT3P(exists, ==, NULL); -+ } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { -+ /* nopwrite */ -+ ASSERT(zio->io_prop.zp_nopwrite); -+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) -+ panic("bad nopwrite, hdr=%p exists=%p", -+ (void *)hdr, (void *)exists); - } else { -@@ -3637,4 +3925,5 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, -- const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done, -- void *private, int priority, int zio_flags, const zbookmark_t *zb) -+ const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, -+ arc_done_func_t *done, void *private, zio_priority_t priority, -+ int zio_flags, const zbookmark_t *zb) - { -@@ -3655,2 +3944,3 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - callback->awcb_ready = ready; -+ callback->awcb_physdone = physdone; - callback->awcb_done = done; -@@ -3660,3 +3950,4 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, -- arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); -+ arc_write_ready, arc_write_physdone, arc_write_done, callback, -+ priority, zio_flags, zb); - -@@ -3666,7 +3957,5 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - static int --arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) -+arc_memory_throttle(uint64_t reserve, uint64_t txg) - { - #ifdef _KERNEL -- uint64_t available_memory; -- - if (zfs_arc_memory_throttle_disable) -@@ -3674,15 +3963,6 @@ arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) - -- /* Easily reclaimable memory (free + inactive + arc-evictable) */ -- available_memory = ptob(spl_kmem_availrmem()) + arc_evictable_memory(); -- -- if (available_memory <= zfs_write_limit_max) { -+ if (freemem <= physmem * arc_lotsfree_percent / 100) { - ARCSTAT_INCR(arcstat_memory_throttle_count, 1); - DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); -- return (EAGAIN); -- } -- -- if (inflight_data > available_memory / 4) { -- ARCSTAT_INCR(arcstat_memory_throttle_count, 1); -- DMU_TX_STAT_BUMP(dmu_tx_memory_inflight); -- return (ERESTART); -+ return (SET_ERROR(EAGAIN)); - } -@@ -3705,11 +3985,2 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) - --#ifdef ZFS_DEBUG -- /* -- * Once in a while, fail for no reason. Everything should cope. -- */ -- if (spa_get_random(10000) == 0) { -- dprintf("forcing random failure\n"); -- return (ERESTART); -- } --#endif - if (reserve > arc_c/4 && !arc_no_grow) -@@ -3718,3 +3989,3 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) - DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - } -@@ -3730,6 +4001,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) - * Writes will, almost always, require additional memory allocations -- * in order to compress/encrypt/etc the data. We therefor need to -+ * in order to compress/encrypt/etc the data. We therefore need to - * make sure that there is sufficient available memory for this. 
- */ -- if ((error = arc_memory_throttle(reserve, anon_size, txg))) -+ error = arc_memory_throttle(reserve, txg); -+ if (error != 0) - return (error); -@@ -3753,3 +4025,3 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) - DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); -- return (ERESTART); -+ return (SET_ERROR(ERESTART)); - } -@@ -3774,3 +4046,3 @@ arc_kstat_update(kstat_t *ksp, int rw) - if (rw == KSTAT_WRITE) { -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } else { -@@ -3828,6 +4100,6 @@ arc_init(void) - -- /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ -- arc_c_min = MAX(arc_c / 4, 64<<20); -+ /* set min cache to zero */ -+ arc_c_min = 4<<20; - /* set max to 1/2 of all memory */ -- arc_c_max = MAX(arc_c * 4, arc_c_max); -+ arc_c_max = arc_c * 4; - -@@ -3839,3 +4111,3 @@ arc_init(void) - arc_c_max = zfs_arc_max; -- if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) -+ if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max) - arc_c_min = zfs_arc_min; -@@ -3845,4 +4117,4 @@ arc_init(void) - -- /* limit meta-data to 1/4 of the arc capacity */ -- arc_meta_limit = arc_c_max / 4; -+ /* limit meta-data to 3/4 of the arc capacity */ -+ arc_meta_limit = (3 * arc_c_max) / 4; - arc_meta_max = 0; -@@ -3853,5 +4125,2 @@ arc_init(void) - -- if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) -- arc_c_min = arc_meta_limit / 2; -- - /* if kmem_flags are set, lets try to use less memory */ -@@ -3898,2 +4167,9 @@ arc_init(void) - -+ arc_anon->arcs_state = ARC_STATE_ANON; -+ arc_mru->arcs_state = ARC_STATE_MRU; -+ arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; -+ arc_mfu->arcs_state = ARC_STATE_MFU; -+ arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; -+ arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; -+ - buf_init(); -@@ -3923,7 +4199,20 @@ arc_init(void) - -- if (zfs_write_limit_max == 0) -- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; -- else -- zfs_write_limit_shift = 0; -- mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); -+ /* -+ * Calculate maximum amount of dirty data per pool. -+ * -+ * If it has been set by a module parameter, take that. -+ * Otherwise, use a percentage of physical memory defined by -+ * zfs_dirty_data_max_percent (default 10%) with a cap at -+ * zfs_dirty_data_max_max (default 25% of physical memory). -+ */ -+ if (zfs_dirty_data_max_max == 0) -+ zfs_dirty_data_max_max = physmem * PAGESIZE * -+ zfs_dirty_data_max_max_percent / 100; -+ -+ if (zfs_dirty_data_max == 0) { -+ zfs_dirty_data_max = physmem * PAGESIZE * -+ zfs_dirty_data_max_percent / 100; -+ zfs_dirty_data_max = MIN(zfs_dirty_data_max, -+ zfs_dirty_data_max_max); -+ } - } -@@ -3985,4 +4274,2 @@ arc_fini(void) - -- mutex_destroy(&zfs_write_limit_lock); -- - buf_fini(); -@@ -4335,2 +4622,9 @@ l2arc_write_done(zio_t *zio) - ab_prev = list_prev(buflist, ab); -+ abl2 = ab->b_l2hdr; -+ -+ /* -+ * Release the temporary compressed buffer as soon as possible. -+ */ -+ if (abl2->b_compress != ZIO_COMPRESS_OFF) -+ l2arc_release_cdata_buf(ab); - -@@ -4347,10 +4641,2 @@ l2arc_write_done(zio_t *zio) - -- abl2 = ab->b_l2hdr; -- -- /* -- * Release the temporary compressed buffer as soon as possible. 
-- */ -- if (abl2->b_compress != ZIO_COMPRESS_OFF) -- l2arc_release_cdata_buf(ab); -- - if (zio->io_error != 0) { -@@ -4362,3 +4648,3 @@ l2arc_write_done(zio_t *zio) - ab->b_l2hdr = NULL; -- kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); -+ kmem_cache_free(l2arc_hdr_cache, abl2); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -4439,3 +4725,3 @@ l2arc_read_done(zio_t *zio) - } else { -- zio->io_error = EIO; -+ zio->io_error = SET_ERROR(EIO); - } -@@ -4617,3 +4903,3 @@ top: - ab->b_l2hdr = NULL; -- kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); -+ kmem_cache_free(l2arc_hdr_cache, abl2); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -4753,3 +5039,3 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - cb = kmem_alloc(sizeof (l2arc_write_callback_t), -- KM_PUSHPAGE); -+ KM_PUSHPAGE); - cb->l2wcb_dev = dev; -@@ -4763,5 +5049,5 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - */ -- l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), -- KM_PUSHPAGE); -+ l2hdr = kmem_cache_alloc(l2arc_hdr_cache, KM_PUSHPAGE); - l2hdr->b_dev = dev; -+ l2hdr->b_daddr = 0; - arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS); -@@ -4781,2 +5067,3 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, - l2hdr->b_tmp_cdata = ab->b_buf->b_data; -+ l2hdr->b_hits = 0; - -@@ -5017,3 +5304,3 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) - hdr->b_size) != 0) -- zio->io_error = EIO; -+ zio->io_error = SET_ERROR(EIO); - zio_data_buf_free(cdata, csize); -@@ -5313,2 +5600,3 @@ EXPORT_SYMBOL(arc_read); - EXPORT_SYMBOL(arc_buf_remove_ref); -+EXPORT_SYMBOL(arc_buf_info); - EXPORT_SYMBOL(arc_getbuf_func); -@@ -5332,2 +5620,8 @@ MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); - -+module_param(zfs_arc_p_aggressive_disable, int, 0644); -+MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow"); -+ -+module_param(zfs_arc_p_dampener_disable, int, 0644); -+MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener"); -+ - module_param(zfs_arc_shrink_shift, int, 0644); -@@ -5335,5 +5629,2 @@ MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); - --module_param(zfs_arc_p_min_shift, int, 0644); --MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); -- - module_param(zfs_disable_dup_eviction, int, 0644); -diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c -index d196351..c3927e7 100644 ---- a/module/zfs/bplist.c -+++ b/module/zfs/bplist.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. - */ -@@ -54,2 +55,8 @@ bplist_append(bplist_t *bpl, const blkptr_t *bp) - -+/* -+ * To aid debugging, we keep the most recently removed entry. This way if -+ * we are in the callback, we can easily locate the entry. -+ */ -+static bplist_entry_t *bplist_iterate_last_removed; -+ - void -@@ -61,2 +68,3 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) - while ((bpe = list_head(&bpl->bpl_list))) { -+ bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); -diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c -index 1920da4..5787a6f 100644 ---- a/module/zfs/bpobj.c -+++ b/module/zfs/bpobj.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -368,2 +368,3 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) - uint64_t used, comp, uncomp, subsubobjs; -+ ASSERTV(dmu_object_info_t doi); - -@@ -394,2 +395,5 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) - -+ ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); -+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); -+ - mutex_enter(&bpo->bpo_lock); -@@ -416,2 +420,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) - 0, FTAG, &subdb, 0)); -+ /* -+ * Make sure that we are not asking dmu_write() -+ * to write more data than we have in our buffer. -+ */ -+ VERIFY3U(subdb->db_size, >=, -+ numsubsub * sizeof (subobj)); - dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, -diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c -index 73922db..c03cb1f 100644 ---- a/module/zfs/bptree.c -+++ b/module/zfs/bptree.c -@@ -22,3 +22,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -45,3 +45,3 @@ - * -- * Note that while bt_begin and bt_end are only ever incremented in this code -+ * Note that while bt_begin and bt_end are only ever incremented in this code, - * they are effectively reset to 0 every time the entire bptree is freed because -@@ -182,2 +182,3 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, - bptree_entry_phys_t bte; -+ int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; - -@@ -190,9 +191,9 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, - -+ if (zfs_recover) -+ flags |= TRAVERSE_HARD; - err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, -- bte.be_birth_txg, &bte.be_zb, -- TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST, -+ bte.be_birth_txg, &bte.be_zb, flags, - bptree_visit_cb, &ba); - if (free) { -- ASSERT(err == 0 || err == ERESTART); -- if (err != 0) { -+ if (err == ERESTART) { - /* save bookmark for future resume */ -@@ -204,7 +205,17 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, - break; -- } else { -- ba.ba_phys->bt_begin++; -- (void) dmu_free_range(os, obj, -- i * sizeof (bte), sizeof (bte), tx); - } -+ if (err != 0) { -+ /* -+ * We can not properly handle an i/o -+ * error, because the traversal code -+ * does not know how to resume from an -+ * arbitrary bookmark. -+ */ -+ zfs_panic_recover("error %u from " -+ "traverse_dataset_destroyed()", err); -+ } -+ -+ ba.ba_phys->bt_begin++; -+ (void) dmu_free_range(os, obj, -+ i * sizeof (bte), sizeof (bte), tx); - } -diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c -index faa6cc3..c8a5261 100644 ---- a/module/zfs/dbuf.c -+++ b/module/zfs/dbuf.c -@@ -23,3 +23,3 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -30,2 +30,3 @@ - #include -+#include - #include -@@ -65,4 +66,10 @@ static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); - -+/* -+ * Number of times that zfs_free_range() took the slow path while doing -+ * a zfs receive. A nonzero value indicates a potential performance problem. 
-+ */ -+uint64_t zfs_free_range_recv_miss; -+ - static void dbuf_destroy(dmu_buf_impl_t *db); --static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -+static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); - static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); -@@ -300,4 +307,6 @@ retry: - #if defined(_KERNEL) && defined(HAVE_SPL) -- /* Large allocations which do not require contiguous pages -- * should be using vmem_alloc() in the linux kernel */ -+ /* -+ * Large allocations which do not require contiguous pages -+ * should be using vmem_alloc() in the linux kernel -+ */ - h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE); -@@ -319,2 +328,4 @@ retry: - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); -+ -+ dbuf_stats_init(h); - } -@@ -327,2 +338,4 @@ dbuf_fini(void) - -+ dbuf_stats_destroy(); -+ - for (i = 0; i < DBUF_MUTEXES; i++) -@@ -330,4 +343,6 @@ dbuf_fini(void) - #if defined(_KERNEL) && defined(HAVE_SPL) -- /* Large allocations which do not require contiguous pages -- * should be using vmem_free() in the linux kernel */ -+ /* -+ * Large allocations which do not require contiguous pages -+ * should be using vmem_free() in the linux kernel -+ */ - vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); -@@ -548,3 +563,3 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) - ASSERT3P(db->db_buf, ==, NULL); -- VERIFY(arc_buf_remove_ref(buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(buf, db)); - db->db_state = DB_UNCACHED; -@@ -649,3 +664,3 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) - if (db->db_state == DB_NOFILL) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -689,2 +704,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) - } else { -+ /* -+ * Another reader came in while the dbuf was in flight -+ * between UNCACHED and CACHED. Either a writer will finish -+ * writing the buffer (sending the dbuf to CACHED) or the -+ * first reader's request will reach the read_done callback -+ * and send the dbuf to CACHED. Otherwise, a failure -+ * occurred and the dbuf went to UNCACHED. -+ */ - mutex_exit(&db->db_mtx); -@@ -697,2 +720,3 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) - -+ /* Skip the wait per the caller's request. */ - mutex_enter(&db->db_mtx); -@@ -706,3 +730,3 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) - if (db->db_state == DB_UNCACHED) -- err = EIO; -+ err = SET_ERROR(EIO); - } -@@ -812,3 +836,3 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) - /* free this block */ -- if (!BP_IS_HOLE(bp)) { -+ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) { - spa_t *spa; -@@ -819,2 +843,4 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; -+ dr->dt.dl.dr_nopwrite = B_FALSE; -+ - /* -@@ -833,5 +859,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) - * data blocks in the free range, so that any future readers will find -- * empty blocks. Also, if we happen accross any level-1 dbufs in the -+ * empty blocks. Also, if we happen across any level-1 dbufs in the - * range that have not already been marked dirty, mark them dirty so - * they stay in memory. -+ * -+ * This is a no-op if the dataset is in the middle of an incremental -+ * receive; see comment below for details. 
- */ -@@ -851,4 +880,20 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) - dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); -+ - mutex_enter(&dn->dn_dbufs_mtx); -- for (db = list_head(&dn->dn_dbufs); db; db = db_next) { -+ if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) { -+ /* There can't be any dbufs in this range; no need to search. */ -+ mutex_exit(&dn->dn_dbufs_mtx); -+ return; -+ } else if (dmu_objset_is_receiving(dn->dn_objset)) { -+ /* -+ * If we are receiving, we expect there to be no dbufs in -+ * the range to be freed, because receive modifies each -+ * block at most once, and in offset order. If this is -+ * not the case, it can lead to performance problems, -+ * so note that we unexpectedly took the slow path. -+ */ -+ atomic_inc_64(&zfs_free_range_recv_miss); -+ } -+ -+ for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); -@@ -877,6 +922,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) - /* found a level 0 buffer in the range */ -- if (dbuf_undirty(db, tx)) -+ mutex_enter(&db->db_mtx); -+ if (dbuf_undirty(db, tx)) { -+ /* mutex has been dropped and dbuf destroyed */ - continue; -+ } - -- mutex_enter(&db->db_mtx); - if (db->db_state == DB_UNCACHED || -@@ -1007,3 +1054,3 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) - dbuf_set_data(db, buf); -- VERIFY(arc_buf_remove_ref(obuf, db) == 1); -+ VERIFY(arc_buf_remove_ref(obuf, db)); - db->db.db_size = size; -@@ -1197,2 +1244,4 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - } -+ if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) -+ dr->dr_accounted = db->db.db_size; - dr->dr_dbuf = db; -@@ -1280,3 +1329,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - mutex_enter(&db->db_mtx); -- /* possible race with dbuf_undirty() */ -+ /* -+ * Since we've dropped the mutex, it's possible that -+ * dbuf_undirty() might have changed this out from under us. -+ */ - if (db->db_last_dirty == dr || -@@ -1308,3 +1360,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - --static int -+/* -+ * Undirty a buffer in the transaction group referenced by the given -+ * transaction. Return whether this evicted the dbuf. -+ */ -+static boolean_t - dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -@@ -1317,4 +1373,5 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - ASSERT(db->db_blkid != DMU_BONUS_BLKID); -+ ASSERT0(db->db_level); -+ ASSERT(MUTEX_HELD(&db->db_mtx)); - -- mutex_enter(&db->db_mtx); - /* -@@ -1325,6 +1382,4 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - break; -- if (dr == NULL || dr->dr_txg < txg) { -- mutex_exit(&db->db_mtx); -- return (0); -- } -+ if (dr == NULL || dr->dr_txg < txg) -+ return (B_FALSE); - ASSERT(dr->dr_txg == txg); -@@ -1336,20 +1391,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - /* -- * If this buffer is currently held, we cannot undirty -- * it, since one of the current holders may be in the -- * middle of an update. Note that users of dbuf_undirty() -- * should not place a hold on the dbuf before the call. -- * Also note: we can get here with a spill block, so -- * test for that similar to how dbuf_dirty does. -+ * Note: This code will probably work even if there are concurrent -+ * holders, but it is untested in that scenerio, as the ZPL and -+ * ztest have additional locking (the range locks) that prevents -+ * that type of concurrent access. 
- */ -- if (refcount_count(&db->db_holds) > db->db_dirtycnt) { -- mutex_exit(&db->db_mtx); -- /* Make sure we don't toss this buffer at sync phase */ -- if (db->db_blkid != DMU_SPILL_BLKID) { -- mutex_enter(&dn->dn_mtx); -- dnode_clear_range(dn, db->db_blkid, 1, tx); -- mutex_exit(&dn->dn_mtx); -- } -- DB_DNODE_EXIT(db); -- return (0); -- } -+ ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt); - -@@ -1359,3 +1402,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - -- /* XXX would be nice to fix up dn_towrite_space[] */ -+ /* -+ * Any space we accounted for in dp_dirty_* will be cleaned up by -+ * dsl_pool_sync(). This is relatively rare so the discrepancy -+ * is not a big deal. -+ */ - -@@ -1382,17 +1429,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - -- if (db->db_level == 0) { -- if (db->db_state != DB_NOFILL) { -- dbuf_unoverride(dr); -+ if (db->db_state != DB_NOFILL) { -+ dbuf_unoverride(dr); - -- ASSERT(db->db_buf != NULL); -- ASSERT(dr->dt.dl.dr_data != NULL); -- if (dr->dt.dl.dr_data != db->db_buf) -- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, -- db) == 1); -- } -- } else { - ASSERT(db->db_buf != NULL); -- ASSERT(list_head(&dr->dt.di.dr_children) == NULL); -- mutex_destroy(&dr->dt.di.dr_mtx); -- list_destroy(&dr->dt.di.dr_children); -+ ASSERT(dr->dt.dl.dr_data != NULL); -+ if (dr->dt.dl.dr_data != db->db_buf) -+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); - } -@@ -1408,9 +1447,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) - dbuf_set_data(db, NULL); -- VERIFY(arc_buf_remove_ref(buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); -- return (1); -+ return (B_TRUE); - } - -- mutex_exit(&db->db_mtx); -- return (0); -+ return (B_FALSE); - } -@@ -1513,3 +1551,3 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) - bcopy(buf->b_data, db->db.db_data, db->db.db_size); -- VERIFY(arc_buf_remove_ref(buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(buf, db)); - xuio_stat_wbuf_copied(); -@@ -1531,6 +1569,6 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) - dr->dt.dl.dr_data = buf; -- VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(db->db_buf, db)); - } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { - arc_release(db->db_buf, db); -- VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(db->db_buf, db)); - } -@@ -1548,3 +1586,3 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) - * "Clear" the contents of this dbuf. This will mark the dbuf -- * EVICTING and clear *most* of its references. Unfortunetely, -+ * EVICTING and clear *most* of its references. Unfortunately, - * when we are not holding the dn_dbufs_mtx, we can't clear the -@@ -1661,3 +1699,3 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, - /* the buffer has no parent yet */ -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } else if (level < nlevels-1) { -@@ -1668,4 +1706,3 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, - fail_sparse, NULL, parentp); -- } -- else { -+ } else { - __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, -@@ -1745,3 +1782,3 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - int blocksize = -- db->db_level ? 1<dn_indblkshift : dn->dn_datablksz; -+ db->db_level ? 
1 << dn->dn_indblkshift : dn->dn_datablksz; - db->db.db_size = blocksize; -@@ -1766,2 +1803,5 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - list_insert_head(&dn->dn_dbufs, db); -+ if (db->db_level == 0 && db->db_blkid >= -+ dn->dn_unlisted_l0_blkid) -+ dn->dn_unlisted_l0_blkid = db->db_blkid + 1; - db->db_state = DB_UNCACHED; -@@ -1851,3 +1891,3 @@ dbuf_destroy(dmu_buf_impl_t *db) - void --dbuf_prefetch(dnode_t *dn, uint64_t blkid) -+dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) - { -@@ -1875,4 +1915,2 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) - if (bp && !BP_IS_HOLE(bp)) { -- int priority = dn->dn_type == DMU_OT_DDT_ZAP ? -- ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; -@@ -1885,3 +1923,3 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) - (void) arc_read(NULL, dn->dn_objset->os_spa, -- bp, NULL, NULL, priority, -+ bp, NULL, NULL, prio, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, -@@ -1894,3 +1932,3 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) - --#define DBUF_HOLD_IMPL_MAX_DEPTH 20 -+#define DBUF_HOLD_IMPL_MAX_DEPTH 20 - -@@ -1923,4 +1961,5 @@ top: - if (dh->dh_fail_sparse) { -- if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) -- dh->dh_err = ENOENT; -+ if (dh->dh_err == 0 && -+ dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) -+ dh->dh_err = SET_ERROR(ENOENT); - if (dh->dh_err) { -@@ -2004,3 +2043,3 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, - -- dh = kmem_zalloc(sizeof(struct dbuf_hold_impl_data) * -+ dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE); -@@ -2010,3 +2049,3 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, - -- kmem_free(dh, sizeof(struct dbuf_hold_impl_data) * -+ kmem_free(dh, sizeof (struct dbuf_hold_impl_data) * - DBUF_HOLD_IMPL_MAX_DEPTH); -@@ -2062,3 +2101,3 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) - if (db->db_blkid != DMU_SPILL_BLKID) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (blksz == 0) -@@ -2170,6 +2209,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) - dbuf_set_data(db, NULL); -- VERIFY(arc_buf_remove_ref(buf, db) == 1); -+ VERIFY(arc_buf_remove_ref(buf, db)); - dbuf_evict(db); - } else { -- VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); -+ VERIFY(!arc_buf_remove_ref(db->db_buf, db)); - -@@ -2274,2 +2313,9 @@ dmu_buf_freeable(dmu_buf_t *dbuf) - -+blkptr_t * -+dmu_buf_get_blkptr(dmu_buf_t *db) -+{ -+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; -+ return (dbi->db_blkptr); -+} -+ - static void -@@ -2319,3 +2365,4 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) - --/* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it -+/* -+ * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it - * is critical the we not allow the compiler to inline this function in to -@@ -2339,2 +2386,3 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) - -+ /* Read the block if it hasn't been read yet. */ - if (db->db_buf == NULL) { -@@ -2349,2 +2397,3 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) - dn = DB_DNODE(db); -+ /* Indirect block size must match what the dnode thinks it is. 
*/ - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); -@@ -2353,2 +2402,3 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) - -+ /* Provide the pending dirty record to child dbufs */ - db->db_data_pending = dr; -@@ -2366,3 +2416,4 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) - --/* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is -+/* -+ * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is - * critical the we not allow the compiler to inline this function in to -@@ -2611,2 +2662,34 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) - -+/* -+ * The SPA will call this callback several times for each zio - once -+ * for every physical child i/o (zio->io_phys_children times). This -+ * allows the DMU to monitor the progress of each logical i/o. For example, -+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z -+ * block. There may be a long delay before all copies/fragments are completed, -+ * so this callback allows us to retire dirty space gradually, as the physical -+ * i/os complete. -+ */ -+/* ARGSUSED */ -+static void -+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) -+{ -+ dmu_buf_impl_t *db = arg; -+ objset_t *os = db->db_objset; -+ dsl_pool_t *dp = dmu_objset_pool(os); -+ dbuf_dirty_record_t *dr; -+ int delta = 0; -+ -+ dr = db->db_data_pending; -+ ASSERT3U(dr->dr_txg, ==, zio->io_txg); -+ -+ /* -+ * The callback will be called io_phys_children times. Retire one -+ * portion of our dirty space each time we are called. Any rounding -+ * error will be cleaned up by dsl_pool_sync()'s call to -+ * dsl_pool_undirty_space(). -+ */ -+ delta = dr->dr_accounted / zio->io_phys_children; -+ dsl_pool_undirty_space(dp, delta, zio->io_txg); -+} -+ - /* ARGSUSED */ -@@ -2624,3 +2707,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) - -- if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { -+ /* -+ * For nopwrites and rewrites we ensure that the bp matches our -+ * original and bypass all the accounting. -+ */ -+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { - ASSERT(BP_EQUAL(bp, bp_orig)); -@@ -2671,3 +2758,3 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, -- db) == 1); -+ db)); - else if (!arc_released(db->db_buf)) -@@ -2701,2 +2788,3 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) - db->db_data_pending = NULL; -+ - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); -@@ -2743,2 +2831,3 @@ dbuf_write_override_done(zio_t *zio) - -+/* Issue I/O to commit a dirty buffer to disk. */ - static void -@@ -2777,4 +2866,11 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - if (parent != dn->dn_dbuf) { -+ /* Our parent is an indirect block. */ -+ /* We have a dirty parent that has been scheduled for write. */ - ASSERT(parent && parent->db_data_pending); -+ /* Our parent's buffer is one level closer to the dnode. */ - ASSERT(db->db_level == parent->db_level-1); -+ /* -+ * We're about to modify our parent's db_data by modifying -+ * our block pointer, so the parent must be released. -+ */ - ASSERT(arc_released(parent->db_buf)); -@@ -2782,2 +2878,3 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - } else { -+ /* Our parent is the dnode itself. 
*/ - ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && -@@ -2810,4 +2907,4 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - db->db_blkptr, data->b_data, arc_buf_size(data), &zp, -- dbuf_write_override_ready, dbuf_write_override_done, dr, -- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); -+ dbuf_write_override_ready, NULL, dbuf_write_override_done, -+ dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - mutex_enter(&db->db_mtx); -@@ -2815,3 +2912,3 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, -- dr->dt.dl.dr_copies); -+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); - mutex_exit(&db->db_mtx); -@@ -2821,3 +2918,3 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - db->db_blkptr, NULL, db->db.db_size, &zp, -- dbuf_write_nofill_ready, dbuf_write_nofill_done, db, -+ dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, - ZIO_PRIORITY_ASYNC_WRITE, -@@ -2829,4 +2926,4 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) - DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, -- dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, -- ZIO_FLAG_MUSTSUCCEED, &zb); -+ dbuf_write_physdone, dbuf_write_done, db, -+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - } -diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c -new file mode 100644 -index 0000000..0cad9ef ---- /dev/null -+++ b/module/zfs/dbuf_stats.c -@@ -0,0 +1,230 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+#include -+#include -+#include -+ -+/* -+ * Calculate the index of the arc header for the state, disabled by default. 
-+ */ -+int zfs_dbuf_state_index = 0; -+ -+/* -+ * ========================================================================== -+ * Dbuf Hash Read Routines -+ * ========================================================================== -+ */ -+typedef struct dbuf_stats_t { -+ kmutex_t lock; -+ kstat_t *kstat; -+ dbuf_hash_table_t *hash; -+ int idx; -+} dbuf_stats_t; -+ -+static dbuf_stats_t dbuf_stats_hash_table; -+ -+static int -+dbuf_stats_hash_table_headers(char *buf, size_t size) -+{ -+ size = snprintf(buf, size - 1, -+ "%-88s | %-124s | %s\n" -+ "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " -+ "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " -+ "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " -+ "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", -+ "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", -+ "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", -+ "atype", "index", "flags", "count", "asize", "access", -+ "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", -+ "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", -+ "bsize", "lvls", "dholds", "blocks", "dsize"); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+int -+__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) -+{ -+ arc_buf_info_t abi = { 0 }; -+ dmu_object_info_t doi = { 0 }; -+ dnode_t *dn = DB_DNODE(db); -+ -+ if (db->db_buf) -+ arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); -+ -+ if (dn) -+ __dmu_object_info_from_dnode(dn, &doi); -+ -+ size = snprintf(buf, size - 1, -+ "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " -+ "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " -+ "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " -+ "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", -+ /* dmu_buf_impl_t */ -+ spa_name(dn->dn_objset->os_spa), -+ (u_longlong_t)dmu_objset_id(db->db_objset), -+ (longlong_t)db->db.db_object, -+ (longlong_t)db->db_level, -+ (longlong_t)db->db_blkid, -+ (u_longlong_t)db->db.db_offset, -+ (u_longlong_t)db->db.db_size, -+ !!dbuf_is_metadata(db), -+ db->db_state, -+ (ulong_t)refcount_count(&db->db_holds), -+ /* arc_buf_info_t */ -+ abi.abi_state_type, -+ abi.abi_state_contents, -+ (longlong_t)abi.abi_state_index, -+ abi.abi_flags, -+ (ulong_t)abi.abi_datacnt, -+ (u_longlong_t)abi.abi_size, -+ (u_longlong_t)abi.abi_access, -+ (ulong_t)abi.abi_mru_hits, -+ (ulong_t)abi.abi_mru_ghost_hits, -+ (ulong_t)abi.abi_mfu_hits, -+ (ulong_t)abi.abi_mfu_ghost_hits, -+ (ulong_t)abi.abi_l2arc_hits, -+ (u_longlong_t)abi.abi_l2arc_dattr, -+ (u_longlong_t)abi.abi_l2arc_asize, -+ abi.abi_l2arc_compress, -+ (ulong_t)abi.abi_holds, -+ /* dmu_object_info_t */ -+ doi.doi_type, -+ doi.doi_bonus_type, -+ (ulong_t)doi.doi_data_block_size, -+ (ulong_t)doi.doi_metadata_block_size, -+ (u_longlong_t)doi.doi_bonus_size, -+ (ulong_t)doi.doi_indirection, -+ (ulong_t)refcount_count(&dn->dn_holds), -+ (u_longlong_t)doi.doi_fill_count, -+ (u_longlong_t)doi.doi_max_offset); -+ buf[size] = '\0'; -+ -+ return (size); -+} -+ -+static int -+dbuf_stats_hash_table_data(char *buf, size_t size, void *data) -+{ -+ dbuf_stats_t *dsh = (dbuf_stats_t *)data; -+ dbuf_hash_table_t *h = dsh->hash; -+ dmu_buf_impl_t *db; -+ int length, error = 0; -+ -+ ASSERT3S(dsh->idx, >=, 0); -+ ASSERT3S(dsh->idx, <=, h->hash_table_mask); -+ memset(buf, 0, size); -+ -+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); -+ for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { -+ /* -+ * Returning ENOMEM will cause the data and header functions -+ * to be 
called with a larger scratch buffers. -+ */ -+ if (size < 512) { -+ error = ENOMEM; -+ break; -+ } -+ -+ mutex_enter(&db->db_mtx); -+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); -+ -+ length = __dbuf_stats_hash_table_data(buf, size, db); -+ buf += length; -+ size -= length; -+ -+ mutex_exit(&db->db_mtx); -+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); -+ } -+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); -+ -+ return (error); -+} -+ -+static void * -+dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n) -+{ -+ dbuf_stats_t *dsh = ksp->ks_private; -+ -+ ASSERT(MUTEX_HELD(&dsh->lock)); -+ -+ if (n <= dsh->hash->hash_table_mask) { -+ dsh->idx = n; -+ return (dsh); -+ } -+ -+ return (NULL); -+} -+ -+static void -+dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) -+{ -+ dbuf_stats_t *dsh = &dbuf_stats_hash_table; -+ kstat_t *ksp; -+ -+ mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); -+ dsh->hash = hash; -+ -+ ksp = kstat_create("zfs", 0, "dbufs", "misc", -+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); -+ dsh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &dsh->lock; -+ ksp->ks_ndata = UINT32_MAX; -+ ksp->ks_private = dsh; -+ kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, -+ dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); -+ kstat_install(ksp); -+ } -+} -+ -+static void -+dbuf_stats_hash_table_destroy(void) -+{ -+ dbuf_stats_t *dsh = &dbuf_stats_hash_table; -+ kstat_t *ksp; -+ -+ ksp = dsh->kstat; -+ if (ksp) -+ kstat_delete(ksp); -+ -+ mutex_destroy(&dsh->lock); -+} -+ -+void -+dbuf_stats_init(dbuf_hash_table_t *hash) -+{ -+ dbuf_stats_hash_table_init(hash); -+} -+ -+void -+dbuf_stats_destroy(void) -+{ -+ dbuf_stats_hash_table_destroy(); -+} -+ -+#if defined(_KERNEL) && defined(HAVE_SPL) -+module_param(zfs_dbuf_state_index, int, 0644); -+MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index"); -+#endif -diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c -index 286f3bb..070c831 100644 ---- a/module/zfs/ddt.c -+++ b/module/zfs/ddt.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -39,2 +39,5 @@ - -+static kmem_cache_t *ddt_cache; -+static kmem_cache_t *ddt_entry_cache; -+ - /* -@@ -172,3 +175,3 @@ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - if (!ddt_object_exists(ddt, type, class)) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -234,3 +237,3 @@ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - if (!ddt_object_exists(ddt, type, class)) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -517,3 +520,2 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) - -- /* XXX: Move to a slab */ - ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_PUSHPAGE); -@@ -661,2 +663,18 @@ ddt_exit(ddt_t *ddt) - -+void -+ddt_init(void) -+{ -+ ddt_cache = kmem_cache_create("ddt_cache", -+ sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -+ ddt_entry_cache = kmem_cache_create("ddt_entry_cache", -+ sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -+} -+ -+void -+ddt_fini(void) -+{ -+ kmem_cache_destroy(ddt_entry_cache); -+ kmem_cache_destroy(ddt_cache); -+} -+ - static ddt_entry_t * -@@ -666,4 +684,4 @@ ddt_alloc(const ddt_key_t *ddk) - -- /* XXX: Move to a slab */ -- dde = kmem_zalloc(sizeof (ddt_entry_t), KM_PUSHPAGE); -+ dde = kmem_cache_alloc(ddt_entry_cache, KM_PUSHPAGE); -+ bzero(dde, sizeof (ddt_entry_t)); - cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); -@@ -690,3 +708,3 @@ ddt_free(ddt_entry_t *dde) - cv_destroy(&dde->dde_cv); -- kmem_free(dde, sizeof (*dde)); -+ kmem_cache_free(ddt_entry_cache, dde); - } -@@ -815,4 +833,4 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) - -- /* XXX: Move to a slab */ -- ddt = kmem_zalloc(sizeof (*ddt), KM_PUSHPAGE | KM_NODEBUG); -+ ddt = kmem_cache_alloc(ddt_cache, KM_PUSHPAGE | KM_NODEBUG); -+ bzero(ddt, sizeof (ddt_t)); - -@@ -838,3 +856,3 @@ ddt_table_free(ddt_t *ddt) - mutex_destroy(&ddt->ddt_lock); -- kmem_free(ddt, sizeof (*ddt)); -+ kmem_cache_free(ddt_cache, ddt); - } -@@ -918,3 +936,3 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) - ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; -- dde = kmem_alloc(sizeof(ddt_entry_t), KM_PUSHPAGE); -+ dde = kmem_cache_alloc(ddt_entry_cache, KM_PUSHPAGE); - -@@ -925,3 +943,3 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) - if (ddt_object_lookup(ddt, type, class, dde) == 0) { -- kmem_free(dde, sizeof(ddt_entry_t)); -+ kmem_cache_free(ddt_entry_cache, dde); - return (B_TRUE); -@@ -931,3 +949,3 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) - -- kmem_free(dde, sizeof(ddt_entry_t)); -+ kmem_cache_free(ddt_entry_cache, dde); - return (B_FALSE); -@@ -1206,3 +1224,3 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) - -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1211,3 +1229,3 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) - module_param(zfs_dedup_prefetch, int, 0644); --MODULE_PARM_DESC(zfs_dedup_prefetch,"Enable prefetching dedup-ed blks"); -+MODULE_PARM_DESC(zfs_dedup_prefetch, "Enable prefetching dedup-ed blks"); - #endif -diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c -index 65b14ab..a21ed45 100644 ---- a/module/zfs/ddt_zap.c -+++ b/module/zfs/ddt_zap.c -@@ -143,3 +143,3 @@ ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count) - { -- return zap_count(os, object, count); -+ return (zap_count(os, object, count)); - } -diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c -index 0a90333..edad9b4 100644 ---- a/module/zfs/dmu.c -+++ b/module/zfs/dmu.c -@@ -22,3 +22,3 
@@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -43,2 +43,3 @@ - #include -+#include - #include -@@ -49,2 +50,7 @@ - -+/* -+ * Enable/disable nopwrite feature. -+ */ -+int zfs_nopwrite_enabled = 1; -+ - const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { -@@ -140,3 +146,3 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - if (db == NULL) { -- err = EIO; -+ err = SET_ERROR(EIO); - } else { -@@ -171,5 +177,5 @@ dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) - if (dn->dn_bonus != db) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } else if (newsize < 0 || newsize > db_fake->db_size) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } else { -@@ -194,5 +200,5 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) - if (!DMU_OT_IS_VALID(type)) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } else if (dn->dn_bonus != db) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } else { -@@ -323,3 +329,3 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) - if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else { -@@ -328,3 +334,3 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) - if (!dn->dn_have_spill) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { -@@ -366,3 +372,2 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - { -- dsl_pool_t *dp = NULL; - dmu_buf_t **dbp; -@@ -372,3 +377,2 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - zio_t *zio; -- hrtime_t start = 0; - -@@ -394,3 +398,3 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - rw_exit(&dn->dn_struct_rwlock); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -398,8 +402,5 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - } -- dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_PUSHPAGE | KM_NODEBUG); -+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, -+ KM_PUSHPAGE | KM_NODEBUG); - -- if (dn->dn_objset->os_dsl_dataset) -- dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; -- if (dp && dsl_pool_sync_context(dp)) -- start = gethrtime(); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); -@@ -412,3 +413,3 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - zio_nowait(zio); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -424,5 +425,2 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - err = zio_wait(zio); -- /* track read overhead when we are in sync context */ -- if (dp && dsl_pool_sync_context(dp)) -- dp->dp_read_overhead += gethrtime() - start; - if (err) { -@@ -441,3 +439,3 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - if (db->db_state == DB_UNCACHED) -- err = EIO; -+ err = SET_ERROR(EIO); - mutex_exit(&db->db_mtx); -@@ -508,2 +506,12 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) - -+/* -+ * Issue prefetch i/os for the given blocks. -+ * -+ * Note: The assumption is that we *know* these blocks will be needed -+ * almost immediately. 
Therefore, the prefetch i/os will be issued at -+ * ZIO_PRIORITY_SYNC_READ -+ * -+ * Note: indirect blocks and other metadata will be read synchronously, -+ * causing this function to block if they are not already cached. -+ */ - void -@@ -513,3 +521,3 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - uint64_t blkid; -- int nblks, i, err; -+ int nblks, err; - -@@ -526,3 +534,3 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); -- dbuf_prefetch(dn, blkid); -+ dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ); - rw_exit(&dn->dn_struct_rwlock); -@@ -543,4 +551,4 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - int blkshift = dn->dn_datablkshift; -- nblks = (P2ROUNDUP(offset+len, 1<> blkshift; -+ nblks = (P2ROUNDUP(offset + len, 1 << blkshift) - -+ P2ALIGN(offset, 1 << blkshift)) >> blkshift; - } else { -@@ -550,5 +558,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - if (nblks != 0) { -+ int i; -+ - blkid = dbuf_whichblock(dn, offset); - for (i = 0; i < nblks; i++) -- dbuf_prefetch(dn, blkid+i); -+ dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ); - } -@@ -565,16 +575,20 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) - * data by simply searching the allocated level 1 indirects. -+ * -+ * On input, *start should be the first offset that does not need to be -+ * freed (e.g. "offset + length"). On return, *start will be the first -+ * offset that should be freed. - */ - static int --get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) -+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) - { -- uint64_t len = *start - limit; -- uint64_t blkcnt = 0; -- uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); -+ uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); -+ /* bytes of data covered by a level-1 indirect block */ - uint64_t iblkrange = - dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); -+ uint64_t blks; - -- ASSERT(limit <= *start); -+ ASSERT3U(minimum, <=, *start); - -- if (len <= iblkrange * maxblks) { -- *start = limit; -+ if (*start - minimum <= iblkrange * maxblks) { -+ *start = minimum; - return (0); -@@ -583,6 +597,12 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) - -- while (*start > limit && blkcnt < maxblks) { -+ for (blks = 0; *start > minimum && blks < maxblks; blks++) { - int err; - -- /* find next allocated L1 indirect */ -+ /* -+ * dnode_next_offset(BACKWARDS) will find an allocated L1 -+ * indirect block at or before the input offset. We must -+ * decrement *start so that it is at the end of the region -+ * to search. 
-+ */ -+ (*start)--; - err = dnode_next_offset(dn, -@@ -590,18 +610,15 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) - -- /* if there are no more, then we are done */ -+ /* if there are no indirect blocks before start, we are done */ - if (err == ESRCH) { -- *start = limit; -- return (0); -- } else if (err) { -+ *start = minimum; -+ break; -+ } else if (err != 0) { - return (err); - } -- blkcnt += 1; - -- /* reset offset to end of "next" block back */ -+ /* set start to the beginning of this L1 indirect */ - *start = P2ALIGN(*start, iblkrange); -- if (*start <= limit) -- *start = limit; -- else -- *start -= 1; - } -+ if (*start < minimum) -+ *start = minimum; - return (0); -@@ -611,31 +628,29 @@ static int - dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, -- uint64_t length, boolean_t free_dnode) -+ uint64_t length) - { -- dmu_tx_t *tx; -- uint64_t object_size, start, end, len; -- boolean_t trunc = (length == DMU_OBJECT_END); -- int align, err; -- -- align = 1 << dn->dn_datablkshift; -- ASSERT(align > 0); -- object_size = align == 1 ? dn->dn_datablksz : -- (dn->dn_maxblkid + 1) << dn->dn_datablkshift; -- -- end = offset + length; -- if (trunc || end > object_size) -- end = object_size; -- if (end <= offset) -+ uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; -+ int err; -+ -+ if (offset >= object_size) - return (0); -- length = end - offset; - -- while (length) { -- start = end; -- /* assert(offset <= start) */ -- err = get_next_chunk(dn, &start, offset); -+ if (length == DMU_OBJECT_END || offset + length > object_size) -+ length = object_size - offset; -+ -+ while (length != 0) { -+ uint64_t chunk_end, chunk_begin; -+ dmu_tx_t *tx; -+ -+ chunk_end = chunk_begin = offset + length; -+ -+ /* move chunk_begin backwards to the beginning of this chunk */ -+ err = get_next_chunk(dn, &chunk_begin, offset); - if (err) - return (err); -- len = trunc ? DMU_OBJECT_END : end - start; -+ ASSERT3U(chunk_begin, >=, offset); -+ ASSERT3U(chunk_begin, <=, chunk_end); - - tx = dmu_tx_create(os); -- dmu_tx_hold_free(tx, dn->dn_object, start, len); -+ dmu_tx_hold_free(tx, dn->dn_object, -+ chunk_begin, chunk_end - chunk_begin); - err = dmu_tx_assign(tx, TXG_WAIT); -@@ -645,14 +660,6 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, - } -- -- dnode_free_range(dn, start, trunc ? -1 : len, tx); -- -- if (start == 0 && free_dnode) { -- ASSERT(trunc); -- dnode_free(dn, tx); -- } -- -- length -= end - start; -- -+ dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx); - dmu_tx_commit(tx); -- end = start; -+ -+ length -= chunk_end - chunk_begin; - } -@@ -671,3 +678,13 @@ dmu_free_long_range(objset_t *os, uint64_t object, - return (err); -- err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); -+ err = dmu_free_long_range_impl(os, dn, offset, length); -+ -+ /* -+ * It is important to zero out the maxblkid when freeing the entire -+ * file, so that (a) subsequent calls to dmu_free_long_range_impl() -+ * will take the fast path, and (b) dnode_reallocate() can verify -+ * that the entire file has been freed. 
-+ */ -+ if (offset == 0 && length == DMU_OBJECT_END) -+ dn->dn_maxblkid = 0; -+ - dnode_rele(dn, FTAG); -@@ -677,5 +694,4 @@ dmu_free_long_range(objset_t *os, uint64_t object, - int --dmu_free_object(objset_t *os, uint64_t object) -+dmu_free_long_object(objset_t *os, uint64_t object) - { -- dnode_t *dn; - dmu_tx_t *tx; -@@ -683,22 +699,17 @@ dmu_free_object(objset_t *os, uint64_t object) - -- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, -- FTAG, &dn); -+ err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); - if (err != 0) - return (err); -- if (dn->dn_nlevels == 1) { -- tx = dmu_tx_create(os); -- dmu_tx_hold_bonus(tx, object); -- dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); -- err = dmu_tx_assign(tx, TXG_WAIT); -- if (err == 0) { -- dnode_free_range(dn, 0, DMU_OBJECT_END, tx); -- dnode_free(dn, tx); -- dmu_tx_commit(tx); -- } else { -- dmu_tx_abort(tx); -- } -+ -+ tx = dmu_tx_create(os); -+ dmu_tx_hold_bonus(tx, object); -+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); -+ err = dmu_tx_assign(tx, TXG_WAIT); -+ if (err == 0) { -+ err = dmu_object_free(os, object, tx); -+ dmu_tx_commit(tx); - } else { -- err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); -+ dmu_tx_abort(tx); - } -- dnode_rele(dn, FTAG); -+ - return (err); -@@ -869,5 +880,5 @@ static xuio_stats_t xuio_stats = { - --#define XUIOSTAT_INCR(stat, val) \ -- atomic_add_64(&xuio_stats.stat.value.ui64, (val)) --#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) -+#define XUIOSTAT_INCR(stat, val) \ -+ atomic_add_64(&xuio_stats.stat.value.ui64, (val)) -+#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) - -@@ -995,31 +1006,38 @@ xuio_stat_wbuf_nocopy() - * Copy up to size bytes between arg_buf and req based on the data direction -- * described by the req. If an entire req's data cannot be transfered the -- * req's is updated such that it's current index and bv offsets correctly -- * reference any residual data which could not be copied. The return value -- * is the number of bytes successfully copied to arg_buf. -+ * described by the req. If an entire req's data cannot be transfered in one -+ * pass, you should pass in @req_offset to indicate where to continue. The -+ * return value is the number of bytes successfully copied to arg_buf. - */ - static int --dmu_req_copy(void *arg_buf, int size, int *offset, struct request *req) -+dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset) - { -- struct bio_vec *bv; -+ struct bio_vec bv, *bvp; - struct req_iterator iter; - char *bv_buf; -- int tocpy; -+ int tocpy, bv_len, bv_offset; -+ int offset = 0; - -- *offset = 0; -- rq_for_each_segment(bv, req, iter) { -- -- /* Fully consumed the passed arg_buf */ -- ASSERT3S(*offset, <=, size); -- if (size == *offset) -- break; -+ rq_for_each_segment4(bv, bvp, req, iter) { -+ /* -+ * Fully consumed the passed arg_buf. 
We use goto here because -+ * rq_for_each_segment is a double loop -+ */ -+ ASSERT3S(offset, <=, size); -+ if (size == offset) -+ goto out; - -- /* Skip fully consumed bv's */ -- if (bv->bv_len == 0) -+ /* Skip already copied bv */ -+ if (req_offset >= bv.bv_len) { -+ req_offset -= bv.bv_len; - continue; -+ } - -- tocpy = MIN(bv->bv_len, size - *offset); -+ bv_len = bv.bv_len - req_offset; -+ bv_offset = bv.bv_offset + req_offset; -+ req_offset = 0; -+ -+ tocpy = MIN(bv_len, size - offset); - ASSERT3S(tocpy, >=, 0); - -- bv_buf = page_address(bv->bv_page) + bv->bv_offset; -+ bv_buf = page_address(bv.bv_page) + bv_offset; - ASSERT3P(bv_buf, !=, NULL); -@@ -1027,57 +1045,10 @@ dmu_req_copy(void *arg_buf, int size, int *offset, struct request *req) - if (rq_data_dir(req) == WRITE) -- memcpy(arg_buf + *offset, bv_buf, tocpy); -+ memcpy(arg_buf + offset, bv_buf, tocpy); - else -- memcpy(bv_buf, arg_buf + *offset, tocpy); -+ memcpy(bv_buf, arg_buf + offset, tocpy); - -- *offset += tocpy; -- bv->bv_offset += tocpy; -- bv->bv_len -= tocpy; -- } -- -- return 0; --} -- --static void --dmu_bio_put(struct bio *bio) --{ -- struct bio *bio_next; -- -- while (bio) { -- bio_next = bio->bi_next; -- bio_put(bio); -- bio = bio_next; -- } --} -- --static int --dmu_bio_clone(struct bio *bio, struct bio **bio_copy) --{ -- struct bio *bio_root = NULL; -- struct bio *bio_last = NULL; -- struct bio *bio_new; -- -- if (bio == NULL) -- return EINVAL; -- -- while (bio) { -- bio_new = bio_clone(bio, GFP_NOIO); -- if (bio_new == NULL) { -- dmu_bio_put(bio_root); -- return ENOMEM; -- } -- -- if (bio_last) { -- bio_last->bi_next = bio_new; -- bio_last = bio_new; -- } else { -- bio_root = bio_new; -- bio_last = bio_new; -- } -- -- bio = bio->bi_next; -+ offset += tocpy; - } -- -- *bio_copy = bio_root; -- -- return 0; -+out: -+ return (offset); - } -@@ -1089,5 +1060,5 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - uint64_t offset = blk_rq_pos(req) << 9; -- struct bio *bio_saved = req->bio; - dmu_buf_t **dbp; - int numbufs, i, err; -+ size_t req_offset; - -@@ -1098,3 +1069,3 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - err = dmu_buf_hold_array(os, object, offset, size, TRUE, FTAG, -- &numbufs, &dbp); -+ &numbufs, &dbp); - if (err) -@@ -1102,13 +1073,3 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - -- /* -- * Clone the bio list so the bv->bv_offset and bv->bv_len members -- * can be safely modified. The original bio list is relinked in to -- * the request when the function exits. This is required because -- * some file systems blindly assume that these values will remain -- * constant between bio_submit() and the IO completion callback. 
-- */ -- err = dmu_bio_clone(bio_saved, &req->bio); -- if (err) -- goto error; -- -+ req_offset = 0; - for (i = 0; i < numbufs; i++) { -@@ -1124,3 +1085,4 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - -- err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req); -+ didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, -+ req_offset); - -@@ -1134,8 +1096,5 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) - offset += didcpy; -+ req_offset += didcpy; - err = 0; - } -- -- dmu_bio_put(req->bio); -- req->bio = bio_saved; --error: - dmu_buf_rele_array(dbp, numbufs, FTAG); -@@ -1150,7 +1109,5 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - uint64_t offset = blk_rq_pos(req) << 9; -- struct bio *bio_saved = req->bio; - dmu_buf_t **dbp; -- int numbufs; -- int err = 0; -- int i; -+ int numbufs, i, err; -+ size_t req_offset; - -@@ -1160,3 +1117,3 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, -- &numbufs, &dbp); -+ &numbufs, &dbp); - if (err) -@@ -1164,13 +1121,3 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - -- /* -- * Clone the bio list so the bv->bv_offset and bv->bv_len members -- * can be safely modified. The original bio list is relinked in to -- * the request when the function exits. This is required because -- * some file systems blindly assume that these values will remain -- * constant between bio_submit() and the IO completion callback. -- */ -- err = dmu_bio_clone(bio_saved, &req->bio); -- if (err) -- goto error; -- -+ req_offset = 0; - for (i = 0; i < numbufs; i++) { -@@ -1193,3 +1140,4 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - -- err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req); -+ didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, -+ req_offset); - -@@ -1206,2 +1154,3 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - offset += didcpy; -+ req_offset += didcpy; - err = 0; -@@ -1209,7 +1158,3 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) - -- dmu_bio_put(req->bio); -- req->bio = bio_saved; --error: - dmu_buf_rele_array(dbp, numbufs, FTAG); -- - return (err); -@@ -1384,3 +1329,3 @@ dmu_return_arcbuf(arc_buf_t *buf) - arc_return_buf(buf, FTAG); -- VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); -+ VERIFY(arc_buf_remove_ref(buf, FTAG)); - } -@@ -1476,2 +1421,12 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) - if (zio->io_error == 0) { -+ dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); -+ if (dr->dt.dl.dr_nopwrite) { -+ ASSERTV(blkptr_t *bp = zio->io_bp); -+ ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); -+ ASSERTV(uint8_t chksum = BP_GET_CHECKSUM(bp_orig)); -+ -+ ASSERT(BP_EQUAL(bp, bp_orig)); -+ ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); -+ ASSERT(zio_checksum_table[chksum].ci_dedup); -+ } - dr->dt.dl.dr_overridden_by = *zio->io_bp; -@@ -1497,7 +1452,18 @@ dmu_sync_late_arrival_done(zio_t *zio) - dmu_sync_arg_t *dsa = zio->io_private; -+ ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); - - if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { -- ASSERT(zio->io_bp->blk_birth == zio->io_txg); -- ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); -- zio_free(zio->io_spa, zio->io_txg, zio->io_bp); -+ /* -+ * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) -+ * then there is nothing to do here. 
Otherwise, free the -+ * newly allocated block in this txg. -+ */ -+ if (zio->io_flags & ZIO_FLAG_NOPWRITE) { -+ ASSERT(BP_EQUAL(bp, bp_orig)); -+ } else { -+ ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); -+ ASSERT(zio->io_bp->blk_birth == zio->io_txg); -+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); -+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp); -+ } - } -@@ -1522,3 +1488,4 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - dmu_tx_abort(tx); -- return (EIO); /* Make zl_get_data do txg_waited_synced() */ -+ /* Make zl_get_data do txg_waited_synced() */ -+ return (SET_ERROR(EIO)); - } -@@ -1533,4 +1500,4 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, -- dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, -- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, zb)); -+ dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, -+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL|ZIO_FLAG_FASTWRITE, zb)); - -@@ -1546,3 +1513,3 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - * -- * EEXIST: this txg has already been synced, so there's nothing to to. -+ * EEXIST: this txg has already been synced, so there's nothing to do. - * The caller should not log the write. -@@ -1578,3 +1545,2 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - ASSERT(pio != NULL); -- ASSERT(BP_IS_HOLE(bp)); - ASSERT(txg != 0); -@@ -1608,3 +1574,3 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - mutex_exit(&db->db_mtx); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -1630,5 +1596,22 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - mutex_exit(&db->db_mtx); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } - -+ ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); -+ -+ /* -+ * Assume the on-disk data is X, the current syncing data is Y, -+ * and the current in-memory data is Z (currently in dmu_sync). -+ * X and Z are identical but Y is has been modified. Normally, -+ * when X and Z are the same we will perform a nopwrite but if Y -+ * is different we must disable nopwrite since the resulting write -+ * of Y to disk can free the block containing X. If we allowed a -+ * nopwrite to occur the block pointing to Z would reference a freed -+ * block. Since this is a rare case we simplify this by disabling -+ * nopwrite if the current dmu_sync-ing dbuf has been modified in -+ * a previous transaction. 
-+ */ -+ if (dr->dr_next) -+ zp.zp_nopwrite = B_FALSE; -+ - ASSERT(dr->dr_txg == txg); -@@ -1642,3 +1625,3 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - mutex_exit(&db->db_mtx); -- return (EALREADY); -+ return (SET_ERROR(EALREADY)); - } -@@ -1657,4 +1640,5 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) - bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), -- DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done, -- dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb)); -+ DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, -+ NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, -+ ZIO_FLAG_CANFAIL, &zb)); - -@@ -1717,3 +1701,4 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - enum zio_checksum dedup_checksum = os->os_dedup_checksum; -- boolean_t dedup; -+ boolean_t dedup = B_FALSE; -+ boolean_t nopwrite = B_FALSE; - boolean_t dedup_verify = os->os_dedup_verify; -@@ -1722,3 +1707,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - /* -- * Determine checksum setting. -+ * We maintain different write policies for each of the following -+ * types of data: -+ * 1. metadata -+ * 2. preallocated blocks (i.e. level-0 blocks of a dump device) -+ * 3. all other level 0 blocks - */ -@@ -1726,2 +1715,9 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - /* -+ * XXX -- we should design a compression algorithm -+ * that specializes in arrays of bps. -+ */ -+ compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : -+ ZIO_COMPRESS_LZJB; -+ -+ /* - * Metadata always gets checksummed. If the data -@@ -1735,41 +1731,43 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - checksum = ZIO_CHECKSUM_FLETCHER_4; -- } else { -- checksum = zio_checksum_select(dn->dn_checksum, checksum); -- } -+ } else if (wp & WP_NOFILL) { -+ ASSERT(level == 0); - -- /* -- * Determine compression setting. -- */ -- if (ismd) { - /* -- * XXX -- we should design a compression algorithm -- * that specializes in arrays of bps. -+ * If we're writing preallocated blocks, we aren't actually -+ * writing them so don't set any policy properties. These -+ * blocks are currently only used by an external subsystem -+ * outside of zfs (i.e. dump) and not written by the zio -+ * pipeline. - */ -- compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : -- ZIO_COMPRESS_LZJB; -+ compress = ZIO_COMPRESS_OFF; -+ checksum = ZIO_CHECKSUM_OFF; - } else { - compress = zio_compress_select(dn->dn_compress, compress); -- } - -- /* -- * Determine dedup setting. If we are in dmu_sync(), we won't -- * actually dedup now because that's all done in syncing context; -- * but we do want to use the dedup checkum. If the checksum is not -- * strong enough to ensure unique signatures, force dedup_verify. -- */ -- dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); -- if (dedup) { -- checksum = dedup_checksum; -- if (!zio_checksum_table[checksum].ci_dedup) -- dedup_verify = 1; -- } -+ checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? -+ zio_checksum_select(dn->dn_checksum, checksum) : -+ dedup_checksum; - -- if (wp & WP_DMU_SYNC) -- dedup = 0; -+ /* -+ * Determine dedup setting. If we are in dmu_sync(), -+ * we won't actually dedup now because that's all -+ * done in syncing context; but we do want to use the -+ * dedup checkum. If the checksum is not strong -+ * enough to ensure unique signatures, force -+ * dedup_verify. 
-+ */ -+ if (dedup_checksum != ZIO_CHECKSUM_OFF) { -+ dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; -+ if (!zio_checksum_table[checksum].ci_dedup) -+ dedup_verify = B_TRUE; -+ } - -- if (wp & WP_NOFILL) { -- ASSERT(!ismd && level == 0); -- checksum = ZIO_CHECKSUM_OFF; -- compress = ZIO_COMPRESS_OFF; -- dedup = B_FALSE; -+ /* -+ * Enable nopwrite if we have a cryptographically secure -+ * checksum that has no known collisions (i.e. SHA-256) -+ * and compression is enabled. We don't enable nopwrite if -+ * dedup is enabled as the two features are mutually exclusive. -+ */ -+ nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup && -+ compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); - } -@@ -1783,2 +1781,3 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) - zp->zp_dedup_verify = dedup && dedup_verify; -+ zp->zp_nopwrite = nopwrite; - } -@@ -1817,12 +1816,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) - void --dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) -+__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) - { -- dnode_phys_t *dnp; -+ dnode_phys_t *dnp = dn->dn_phys; - int i; - -- rw_enter(&dn->dn_struct_rwlock, RW_READER); -- mutex_enter(&dn->dn_mtx); -- -- dnp = dn->dn_phys; -- - doi->doi_data_block_size = dn->dn_datablksz; -@@ -1837,3 +1831,3 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) - doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; -- doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; -+ doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; - doi->doi_fill_count = 0; -@@ -1841,2 +1835,11 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) - doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; -+} -+ -+void -+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) -+{ -+ rw_enter(&dn->dn_struct_rwlock, RW_READER); -+ mutex_enter(&dn->dn_mtx); -+ -+ __dmu_object_info_from_dnode(dn, doi); - -@@ -1963,3 +1966,3 @@ dmu_fini(void) - { -- arc_fini(); -+ arc_fini(); /* arc depends on l2arc, so arc must go first */ - l2arc_fini(); -@@ -1982,3 +1985,3 @@ EXPORT_SYMBOL(dmu_free_range); - EXPORT_SYMBOL(dmu_free_long_range); --EXPORT_SYMBOL(dmu_free_object); -+EXPORT_SYMBOL(dmu_free_long_object); - EXPORT_SYMBOL(dmu_read); -@@ -2003,2 +2006,6 @@ module_param(zfs_mdcomp_disable, int, 0644); - MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression"); -+ -+module_param(zfs_nopwrite_enabled, int, 0644); -+MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes"); -+ - #endif -diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c -index dc23778..a2cb2fc 100644 ---- a/module/zfs/dmu_diff.c -+++ b/module/zfs/dmu_diff.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -114,3 +115,3 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - if (issig(JUSTLOOKING) && issig(FORREAL)) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - -@@ -137,3 +138,3 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - &aflags, zb) != 0) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -157,47 +158,45 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - int --dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) -+dmu_diff(const char *tosnap_name, const char *fromsnap_name, -+ struct vnode *vp, offset_t *offp) - { - struct diffarg da; -- dsl_dataset_t *ds = tosnap->os_dsl_dataset; -- dsl_dataset_t *fromds = fromsnap->os_dsl_dataset; -- dsl_dataset_t *findds; -- dsl_dataset_t *relds; -- int err = 0; -- -- /* make certain we are looking at snapshots */ -- if (!dsl_dataset_is_snapshot(ds) || !dsl_dataset_is_snapshot(fromds)) -- return (EINVAL); -- -- /* fromsnap must be earlier and from the same lineage as tosnap */ -- if (fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg) -- return (EXDEV); -- -- relds = NULL; -- findds = ds; -- -- while (fromds->ds_dir != findds->ds_dir) { -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- -- if (!dsl_dir_is_clone(findds->ds_dir)) { -- if (relds) -- dsl_dataset_rele(relds, FTAG); -- return (EXDEV); -- } -- -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dataset_hold_obj(dp, -- findds->ds_dir->dd_phys->dd_origin_obj, FTAG, &findds); -- rw_exit(&dp->dp_config_rwlock); -- -- if (relds) -- dsl_dataset_rele(relds, FTAG); -+ dsl_dataset_t *fromsnap; -+ dsl_dataset_t *tosnap; -+ dsl_pool_t *dp; -+ int error; -+ uint64_t fromtxg; -+ -+ if (strchr(tosnap_name, '@') == NULL || -+ strchr(fromsnap_name, '@') == NULL) -+ return (SET_ERROR(EINVAL)); -+ -+ error = dsl_pool_hold(tosnap_name, FTAG, &dp); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } - -- if (err) -- return (EXDEV); -+ error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); -+ if (error != 0) { -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } - -- relds = findds; -+ if (!dsl_dataset_is_before(tosnap, fromsnap)) { -+ dsl_dataset_rele(fromsnap, FTAG); -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (SET_ERROR(EXDEV)); - } - -- if (relds) -- dsl_dataset_rele(relds, FTAG); -+ fromtxg = fromsnap->ds_phys->ds_creation_txg; -+ dsl_dataset_rele(fromsnap, FTAG); -+ -+ dsl_dataset_long_hold(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); - -@@ -209,7 +208,7 @@ dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) - -- err = traverse_dataset(ds, fromds->ds_phys->ds_creation_txg, -+ error = traverse_dataset(tosnap, fromtxg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); - -- if (err) { -- da.da_err = err; -+ if (error != 0) { -+ da.da_err = error; - } else { -@@ -219,2 +218,5 @@ dmu_diff(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp, offset_t *offp) - -+ dsl_dataset_long_rele(tosnap, FTAG); -+ dsl_dataset_rele(tosnap, FTAG); -+ - return (da.da_err); -diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c -index 8bb3eb4..b6b82a2 100644 ---- a/module/zfs/dmu_object.c -+++ b/module/zfs/dmu_object.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -92,3 +93,3 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, - if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -114,3 +115,3 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - if (object == DMU_META_DNODE_OBJECT) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c -index 52d55d5..fc7c803 100644 ---- a/module/zfs/dmu_objset.c -+++ b/module/zfs/dmu_objset.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -@@ -46,2 +47,3 @@ - #include -+#include - -@@ -284,3 +286,3 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); -- if (err) { -+ if (err != 0) { - kmem_free(os, sizeof (objset_t)); -@@ -288,3 +290,3 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - if (err == ECKSUM) -- err = EIO; -+ err = SET_ERROR(EIO); - return (err); -@@ -324,30 +326,45 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - if (ds) { -- err = dsl_prop_register(ds, "primarycache", -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), - primary_cache_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "secondarycache", -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), - secondary_cache_changed_cb, os); -+ } - if (!dsl_dataset_is_snapshot(ds)) { -- if (err == 0) -- err = dsl_prop_register(ds, "checksum", -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), - checksum_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "compression", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), - compression_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "copies", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_COPIES), - copies_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "dedup", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_DEDUP), - dedup_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "logbias", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_LOGBIAS), - logbias_changed_cb, os); -- if (err == 0) -- err = dsl_prop_register(ds, "sync", -+ } -+ if (err == 0) { -+ err = dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_SYNC), - sync_changed_cb, os); -+ } - } -- if (err) { -+ if (err != 0) { - VERIFY(arc_buf_remove_ref(os->os_phys_buf, -- &os->os_phys_buf) == 1); -+ &os->os_phys_buf)); - kmem_free(os, sizeof (objset_t)); -@@ -429,3 +446,6 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) - --/* called from zpl */ -+/* -+ * Holds the pool while the objset is held. Therefore only one objset -+ * can be held at a time. 
-+ */ - int -@@ -433,2 +453,3 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp) - { -+ dsl_pool_t *dp; - dsl_dataset_t *ds; -@@ -436,9 +457,16 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp) - -- err = dsl_dataset_hold(name, tag, &ds); -- if (err) -+ err = dsl_pool_hold(name, tag, &dp); -+ if (err != 0) -+ return (err); -+ err = dsl_dataset_hold(dp, name, tag, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, tag); - return (err); -+ } - - err = dmu_objset_from_ds(ds, osp); -- if (err) -+ if (err != 0) { - dsl_dataset_rele(ds, tag); -+ dsl_pool_rele(dp, tag); -+ } - -@@ -447,3 +475,7 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp) - --/* called from zpl */ -+/* -+ * dsl_pool must not be held when this is called. -+ * Upon successful return, there will be a longhold on the dataset, -+ * and the dsl_pool will not be held. -+ */ - int -@@ -452,2 +484,3 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, - { -+ dsl_pool_t *dp; - dsl_dataset_t *ds; -@@ -455,15 +488,21 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, - -- err = dsl_dataset_own(name, B_FALSE, tag, &ds); -- if (err) -+ err = dsl_pool_hold(name, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ err = dsl_dataset_own(dp, name, tag, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, FTAG); - return (err); -+ } - - err = dmu_objset_from_ds(ds, osp); -- if (err) { -+ dsl_pool_rele(dp, FTAG); -+ if (err != 0) { - dsl_dataset_disown(ds, tag); - } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { -- dmu_objset_disown(*osp, tag); -- return (EINVAL); -+ dsl_dataset_disown(ds, tag); -+ return (SET_ERROR(EINVAL)); - } else if (!readonly && dsl_dataset_is_snapshot(ds)) { -- dmu_objset_disown(*osp, tag); -- return (EROFS); -+ dsl_dataset_disown(ds, tag); -+ return (SET_ERROR(EROFS)); - } -@@ -475,3 +514,37 @@ dmu_objset_rele(objset_t *os, void *tag) - { -+ dsl_pool_t *dp = dmu_objset_pool(os); - dsl_dataset_rele(os->os_dsl_dataset, tag); -+ dsl_pool_rele(dp, tag); -+} -+ -+/* -+ * When we are called, os MUST refer to an objset associated with a dataset -+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner -+ * == tag. We will then release and reacquire ownership of the dataset while -+ * holding the pool config_rwlock to avoid intervening namespace or ownership -+ * changes may occur. -+ * -+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to -+ * release the hold on its dataset and acquire a new one on the dataset of the -+ * same name so that it can be partially torn down and reconstructed. 
-+ */ -+void -+dmu_objset_refresh_ownership(objset_t *os, void *tag) -+{ -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds, *newds; -+ char name[MAXNAMELEN]; -+ -+ ds = os->os_dsl_dataset; -+ VERIFY3P(ds, !=, NULL); -+ VERIFY3P(ds->ds_owner, ==, tag); -+ VERIFY(dsl_dataset_long_held(ds)); -+ -+ dsl_dataset_name(ds, name); -+ dp = dmu_objset_pool(os); -+ dsl_pool_config_enter(dp, FTAG); -+ dmu_objset_disown(os, tag); -+ VERIFY0(dsl_dataset_own(dp, name, tag, &newds)); -+ VERIFY3P(newds, ==, os->os_dsl_dataset); -+ dsl_pool_config_exit(dp, FTAG); - } -@@ -484,3 +557,3 @@ dmu_objset_disown(objset_t *os, void *tag) - --int -+void - dmu_objset_evict_dbufs(objset_t *os) -@@ -519,5 +592,3 @@ dmu_objset_evict_dbufs(objset_t *os) - } -- dn = list_head(&os->os_dnodes); - mutex_exit(&os->os_lock); -- return (dn != DMU_META_DNODE(os)); - } -@@ -527,5 +598,6 @@ dmu_objset_evict(objset_t *os) - { -- dsl_dataset_t *ds = os->os_dsl_dataset; - int t; - -+ dsl_dataset_t *ds = os->os_dsl_dataset; -+ - for (t = 0; t < TXG_SIZE; t++) -@@ -535,18 +607,26 @@ dmu_objset_evict(objset_t *os) - if (!dsl_dataset_is_snapshot(ds)) { -- VERIFY(0 == dsl_prop_unregister(ds, "checksum", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), - checksum_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "compression", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), - compression_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "copies", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_COPIES), - copies_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "dedup", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_DEDUP), - dedup_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "logbias", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_LOGBIAS), - logbias_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "sync", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_SYNC), - sync_changed_cb, os)); - } -- VERIFY(0 == dsl_prop_unregister(ds, "primarycache", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), - primary_cache_changed_cb, os)); -- VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", -+ VERIFY0(dsl_prop_unregister(ds, -+ zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), - secondary_cache_changed_cb, os)); -@@ -557,7 +637,3 @@ dmu_objset_evict(objset_t *os) - -- /* -- * We should need only a single pass over the dnode list, since -- * nothing can be added to the list at this point. 
-- */ -- (void) dmu_objset_evict_dbufs(os); -+ dmu_objset_evict_dbufs(os); - -@@ -572,3 +648,3 @@ dmu_objset_evict(objset_t *os) - -- VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); -+ VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); - -@@ -604,6 +680,7 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - ASSERT(dmu_tx_is_syncing(tx)); -+ - if (ds != NULL) -- VERIFY(0 == dmu_objset_from_ds(ds, &os)); -+ VERIFY0(dmu_objset_from_ds(ds, &os)); - else -- VERIFY(0 == dmu_objset_open_impl(spa, NULL, bp, &os)); -+ VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); - -@@ -655,11 +732,11 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - --struct oscarg { -- void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -- void *userarg; -- dsl_dataset_t *clone_origin; -- const char *lastname; -- dmu_objset_type_t type; -- uint64_t flags; -- cred_t *cr; --}; -+typedef struct dmu_objset_create_arg { -+ const char *doca_name; -+ cred_t *doca_cred; -+ void (*doca_userfunc)(objset_t *os, void *arg, -+ cred_t *cr, dmu_tx_t *tx); -+ void *doca_userarg; -+ dmu_objset_type_t doca_type; -+ uint64_t doca_flags; -+} dmu_objset_create_arg_t; - -@@ -667,24 +744,21 @@ struct oscarg { - static int --dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_objset_create_check(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- struct oscarg *oa = arg2; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -- int err; -- uint64_t ddobj; -- -- err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, -- oa->lastname, sizeof (uint64_t), 1, &ddobj); -- if (err != ENOENT) -- return (err ? err : EEXIST); -+ dmu_objset_create_arg_t *doca = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *pdd; -+ const char *tail; -+ int error; - -- if (oa->clone_origin != NULL) { -- /* You can't clone across pools. */ -- if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) -- return (EXDEV); -+ if (strchr(doca->doca_name, '@') != NULL) -+ return (SET_ERROR(EINVAL)); - -- /* You can only clone snapshots, not the head datasets. 
*/ -- if (!dsl_dataset_is_snapshot(oa->clone_origin)) -- return (EINVAL); -+ error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); -+ if (error != 0) -+ return (error); -+ if (tail == NULL) { -+ dsl_dir_rele(pdd, FTAG); -+ return (SET_ERROR(EEXIST)); - } -+ dsl_dir_rele(pdd, FTAG); - -@@ -694,32 +768,31 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --dmu_objset_create_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_objset_create_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- spa_t *spa = dd->dd_pool->dp_spa; -- struct oscarg *oa = arg2; -+ dmu_objset_create_arg_t *doca = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *pdd; -+ const char *tail; -+ dsl_dataset_t *ds; - uint64_t obj; -+ blkptr_t *bp; -+ objset_t *os; - -- ASSERT(dmu_tx_is_syncing(tx)); -- -- obj = dsl_dataset_create_sync(dd, oa->lastname, -- oa->clone_origin, oa->flags, oa->cr, tx); -- -- if (oa->clone_origin == NULL) { -- dsl_pool_t *dp = dd->dd_pool; -- dsl_dataset_t *ds; -- blkptr_t *bp; -- objset_t *os; -+ VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); -- bp = dsl_dataset_get_blkptr(ds); -- ASSERT(BP_IS_HOLE(bp)); -+ obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, -+ doca->doca_cred, tx); - -- os = dmu_objset_create_impl(spa, ds, bp, oa->type, tx); -+ VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); -+ bp = dsl_dataset_get_blkptr(ds); -+ os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, -+ ds, bp, doca->doca_type, tx); - -- if (oa->userfunc) -- oa->userfunc(os, oa->userarg, oa->cr, tx); -- dsl_dataset_rele(ds, FTAG); -+ if (doca->doca_userfunc != NULL) { -+ doca->doca_userfunc(os, doca->doca_userarg, -+ doca->doca_cred, tx); - } - -- spa_history_log_internal(LOG_DS_CREATE, spa, tx, "dataset = %llu", obj); -+ spa_history_log_internal_ds(ds, "create", tx, ""); -+ dsl_dataset_rele(ds, FTAG); -+ dsl_dir_rele(pdd, FTAG); - } -@@ -730,125 +803,67 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - { -- dsl_dir_t *pdd; -- const char *tail; -- int err = 0; -- struct oscarg oa = { 0 }; -+ dmu_objset_create_arg_t doca; - -- ASSERT(strchr(name, '@') == NULL); -- err = dsl_dir_open(name, FTAG, &pdd, &tail); -- if (err) -- return (err); -- if (tail == NULL) { -- dsl_dir_close(pdd, FTAG); -- return (EEXIST); -- } -- -- oa.userfunc = func; -- oa.userarg = arg; -- oa.lastname = tail; -- oa.type = type; -- oa.flags = flags; -- oa.cr = CRED(); -+ doca.doca_name = name; -+ doca.doca_cred = CRED(); -+ doca.doca_flags = flags; -+ doca.doca_userfunc = func; -+ doca.doca_userarg = arg; -+ doca.doca_type = type; - -- err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, -- dmu_objset_create_sync, pdd, &oa, 5); -- dsl_dir_close(pdd, FTAG); -- return (err); -+ return (dsl_sync_task(name, -+ dmu_objset_create_check, dmu_objset_create_sync, &doca, 5)); - } - --int --dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) -+typedef struct dmu_objset_clone_arg { -+ const char *doca_clone; -+ const char *doca_origin; -+ cred_t *doca_cred; -+} dmu_objset_clone_arg_t; -+ -+/*ARGSUSED*/ -+static int -+dmu_objset_clone_check(void *arg, dmu_tx_t *tx) - { -+ dmu_objset_clone_arg_t *doca = arg; - dsl_dir_t *pdd; - const char *tail; -- int err = 0; -- struct oscarg oa = { 0 }; -+ int error; -+ dsl_dataset_t *origin; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - -- ASSERT(strchr(name, '@') == NULL); -- err = dsl_dir_open(name, FTAG, 
&pdd, &tail); -- if (err) -- return (err); -+ if (strchr(doca->doca_clone, '@') != NULL) -+ return (SET_ERROR(EINVAL)); -+ -+ error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); -+ if (error != 0) -+ return (error); - if (tail == NULL) { -- dsl_dir_close(pdd, FTAG); -- return (EEXIST); -+ dsl_dir_rele(pdd, FTAG); -+ return (SET_ERROR(EEXIST)); - } -- -- oa.lastname = tail; -- oa.clone_origin = clone_origin; -- oa.flags = flags; -- oa.cr = CRED(); -- -- err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, -- dmu_objset_create_sync, pdd, &oa, 5); -- dsl_dir_close(pdd, FTAG); -- return (err); --} -- --int --dmu_objset_destroy(const char *name, boolean_t defer) --{ -- dsl_dataset_t *ds; -- int error; -- -- error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); -- if (error == 0) { -- error = dsl_dataset_destroy(ds, FTAG, defer); -- /* dsl_dataset_destroy() closes the ds. */ -+ /* You can't clone across pools. */ -+ if (pdd->dd_pool != dp) { -+ dsl_dir_rele(pdd, FTAG); -+ return (SET_ERROR(EXDEV)); - } -+ dsl_dir_rele(pdd, FTAG); - -- return (error); --} -- --struct snaparg { -- dsl_sync_task_group_t *dstg; -- char *snapname; -- char *htag; -- char failed[MAXPATHLEN]; -- boolean_t recursive; -- boolean_t needsuspend; -- boolean_t temporary; -- nvlist_t *props; -- struct dsl_ds_holdarg *ha; /* only needed in the temporary case */ -- dsl_dataset_t *newds; --}; -- --static int --snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- objset_t *os = arg1; -- struct snaparg *sn = arg2; -- int error; -- -- /* The props have already been checked by zfs_check_userprops(). */ -- -- error = dsl_dataset_snapshot_check(os->os_dsl_dataset, -- sn->snapname, tx); -- if (error) -+ error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); -+ if (error != 0) - return (error); - -- if (sn->temporary) { -- /* -- * Ideally we would just call -- * dsl_dataset_user_hold_check() and -- * dsl_dataset_destroy_check() here. However the -- * dataset we want to hold and destroy is the snapshot -- * that we just confirmed we can create, but it won't -- * exist until after these checks are run. Do any -- * checks we can here and if more checks are added to -- * those routines in the future, similar checks may be -- * necessary here. -- */ -- if (spa_version(os->os_spa) < SPA_VERSION_USERREFS) -- return (ENOTSUP); -- /* -- * Not checking number of tags because the tag will be -- * unique, as it will be the only tag. -- */ -- if (strlen(sn->htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) -- return (E2BIG); -+ /* You can't clone across pools. */ -+ if (origin->ds_dir->dd_pool != dp) { -+ dsl_dataset_rele(origin, FTAG); -+ return (SET_ERROR(EXDEV)); -+ } - -- sn->ha = kmem_alloc(sizeof(struct dsl_ds_holdarg), KM_PUSHPAGE); -- sn->ha->temphold = B_TRUE; -- sn->ha->htag = sn->htag; -+ /* You can only clone snapshots, not the head datasets. 
*/ -+ if (!dsl_dataset_is_snapshot(origin)) { -+ dsl_dataset_rele(origin, FTAG); -+ return (SET_ERROR(EINVAL)); - } -- return (error); -+ dsl_dataset_rele(origin, FTAG); -+ -+ return (0); - } -@@ -856,85 +871,38 @@ snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) - { -- objset_t *os = arg1; -- dsl_dataset_t *ds = os->os_dsl_dataset; -- struct snaparg *sn = arg2; -- -- dsl_dataset_snapshot_sync(ds, sn->snapname, tx); -- -- if (sn->props) { -- dsl_props_arg_t pa; -- pa.pa_props = sn->props; -- pa.pa_source = ZPROP_SRC_LOCAL; -- dsl_props_set_sync(ds->ds_prev, &pa, tx); -- } -+ dmu_objset_clone_arg_t *doca = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *pdd; -+ const char *tail; -+ dsl_dataset_t *origin, *ds; -+ uint64_t obj; -+ char namebuf[MAXNAMELEN]; - -- if (sn->temporary) { -- struct dsl_ds_destroyarg da; -+ VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); -+ VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); - -- dsl_dataset_user_hold_sync(ds->ds_prev, sn->ha, tx); -- kmem_free(sn->ha, sizeof (struct dsl_ds_holdarg)); -- sn->ha = NULL; -- sn->newds = ds->ds_prev; -+ obj = dsl_dataset_create_sync(pdd, tail, origin, 0, -+ doca->doca_cred, tx); - -- da.ds = ds->ds_prev; -- da.defer = B_TRUE; -- dsl_dataset_destroy_sync(&da, FTAG, tx); -- } -+ VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); -+ dsl_dataset_name(origin, namebuf); -+ spa_history_log_internal_ds(ds, "clone", tx, -+ "origin=%s (%llu)", namebuf, origin->ds_object); -+ dsl_dataset_rele(ds, FTAG); -+ dsl_dataset_rele(origin, FTAG); -+ dsl_dir_rele(pdd, FTAG); - } - --static int --dmu_objset_snapshot_one(const char *name, void *arg) -+int -+dmu_objset_clone(const char *clone, const char *origin) - { -- struct snaparg *sn = arg; -- objset_t *os; -- int err; -- char *cp; -- -- /* -- * If the objset starts with a '%', then ignore it unless it was -- * explicitly named (ie, not recursive). These hidden datasets -- * are always inconsistent, and by not opening them here, we can -- * avoid a race with dsl_dir_destroy_check(). -- */ -- cp = strrchr(name, '/'); -- if (cp && cp[1] == '%' && sn->recursive) -- return (0); -- -- (void) strcpy(sn->failed, name); -+ dmu_objset_clone_arg_t doca; - -- /* -- * Check permissions if we are doing a recursive snapshot. The -- * permission checks for the starting dataset have already been -- * performed in zfs_secpolicy_snapshot() -- */ -- if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) -- return (err); -+ doca.doca_clone = clone; -+ doca.doca_origin = origin; -+ doca.doca_cred = CRED(); - -- err = dmu_objset_hold(name, sn, &os); -- if (err != 0) -- return (err); -- -- /* -- * If the objset is in an inconsistent state (eg, in the process -- * of being destroyed), don't snapshot it. As with %hidden -- * datasets, we return EBUSY if this name was explicitly -- * requested (ie, not recursive), and otherwise ignore it. -- */ -- if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { -- dmu_objset_rele(os, sn); -- return (sn->recursive ? 
0 : EBUSY); -- } -- -- if (sn->needsuspend) { -- err = zil_suspend(dmu_objset_zil(os)); -- if (err) { -- dmu_objset_rele(os, sn); -- return (err); -- } -- } -- dsl_sync_task_create(sn->dstg, snapshot_check, snapshot_sync, -- os, sn, 3); -- -- return (0); -+ return (dsl_sync_task(clone, -+ dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5)); - } -@@ -942,72 +910,12 @@ dmu_objset_snapshot_one(const char *name, void *arg) - int --dmu_objset_snapshot(char *fsname, char *snapname, char *tag, -- nvlist_t *props, boolean_t recursive, boolean_t temporary, int cleanup_fd) -+dmu_objset_snapshot_one(const char *fsname, const char *snapname) - { -- dsl_sync_task_t *dst; -- struct snaparg *sn; -- spa_t *spa; -- minor_t minor; - int err; -+ char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); -+ nvlist_t *snaps = fnvlist_alloc(); - -- sn = kmem_alloc(sizeof (struct snaparg), KM_SLEEP); -- (void) strcpy(sn->failed, fsname); -- -- err = spa_open(fsname, &spa, FTAG); -- if (err) { -- kmem_free(sn, sizeof (struct snaparg)); -- return (err); -- } -- -- if (temporary) { -- if (cleanup_fd < 0) { -- spa_close(spa, FTAG); -- return (EINVAL); -- } -- if ((err = zfs_onexit_fd_hold(cleanup_fd, &minor)) != 0) { -- spa_close(spa, FTAG); -- return (err); -- } -- } -- -- sn->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- sn->snapname = snapname; -- sn->htag = tag; -- sn->props = props; -- sn->recursive = recursive; -- sn->needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); -- sn->temporary = temporary; -- sn->ha = NULL; -- sn->newds = NULL; -- -- if (recursive) { -- err = dmu_objset_find(fsname, -- dmu_objset_snapshot_one, sn, DS_FIND_CHILDREN); -- } else { -- err = dmu_objset_snapshot_one(fsname, sn); -- } -- -- if (err == 0) -- err = dsl_sync_task_group_wait(sn->dstg); -- -- for (dst = list_head(&sn->dstg->dstg_tasks); dst; -- dst = list_next(&sn->dstg->dstg_tasks, dst)) { -- objset_t *os = dst->dst_arg1; -- dsl_dataset_t *ds = os->os_dsl_dataset; -- if (dst->dst_err) { -- dsl_dataset_name(ds, sn->failed); -- } else if (temporary) { -- dsl_register_onexit_hold_cleanup(sn->newds, tag, minor); -- } -- if (sn->needsuspend) -- zil_resume(dmu_objset_zil(os)); -- dmu_objset_rele(os, sn); -- } -- -- if (err) -- (void) strcpy(fsname, sn->failed); -- if (temporary) -- zfs_onexit_fd_rele(cleanup_fd); -- dsl_sync_task_group_destroy(sn->dstg); -- spa_close(spa, FTAG); -- kmem_free(sn, sizeof (struct snaparg)); -+ fnvlist_add_boolean(snaps, longsnap); -+ strfree(longsnap); -+ err = dsl_dataset_snapshot(snaps, NULL, NULL); -+ fnvlist_free(snaps); - return (err); -@@ -1052,5 +960,5 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) - -- ASSERT(bp == os->os_rootbp); -- ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); -- ASSERT(BP_GET_LEVEL(bp) == 0); -+ ASSERT3P(bp, ==, os->os_rootbp); -+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); -+ ASSERT0(BP_GET_LEVEL(bp)); - -@@ -1126,3 +1034,3 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) - DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, -- dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, -+ NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED, &zb); -@@ -1161,4 +1069,4 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; -- while ((dr = list_head(list)) != NULL) { -- ASSERT(dr->dr_dbuf->db_level == 0); -+ while ((dr = list_head(list))) { -+ ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); -@@ -1221,3 +1129,3 @@ 
dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) - -- while ((dn = list_head(list)) != NULL) { -+ while ((dn = list_head(list))) { - int flags; -@@ -1324,3 +1232,4 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) - dmu_buf_impl_t *db = NULL; -- uint64_t *user = NULL, *group = NULL; -+ uint64_t *user = NULL; -+ uint64_t *group = NULL; - int flags = dn->dn_id_flags; -@@ -1435,5 +1344,5 @@ dmu_objset_userspace_upgrade(objset_t *os) - if (!dmu_objset_userused_enabled(os)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (dmu_objset_is_snapshot(os)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1453,6 +1362,6 @@ dmu_objset_userspace_upgrade(objset_t *os) - if (issig(JUSTLOOKING) && issig(FORREAL)) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - - objerr = dmu_bonus_hold(os, obj, FTAG, &db); -- if (objerr) -+ if (objerr != 0) - continue; -@@ -1461,3 +1370,3 @@ dmu_objset_userspace_upgrade(objset_t *os) - objerr = dmu_tx_assign(tx, TXG_WAIT); -- if (objerr) { -+ if (objerr != 0) { - dmu_tx_abort(tx); -@@ -1529,3 +1438,3 @@ dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, - if (ds->ds_phys->ds_snapnames_zapobj == 0) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -1544,4 +1453,6 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - -+ ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); -+ - if (ds->ds_phys->ds_snapnames_zapobj == 0) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -1553,3 +1464,3 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - zap_cursor_fini(&cursor); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1558,3 +1469,3 @@ dmu_snapshot_list_next(objset_t *os, int namelen, char *name, - zap_cursor_fini(&cursor); -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - } -@@ -1576,3 +1487,3 @@ dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value) - { -- return dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value); -+ return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value)); - } -@@ -1590,3 +1501,3 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, - dd->dd_phys->dd_head_dataset_obj) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -1598,3 +1509,3 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, - zap_cursor_fini(&cursor); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1603,3 +1514,3 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, - zap_cursor_fini(&cursor); -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - } -@@ -1616,27 +1527,103 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, - --struct findarg { -- int (*func)(const char *, void *); -- void *arg; --}; -- --/* ARGSUSED */ --static int --findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) --{ -- struct findarg *fa = arg; -- return (fa->func(dsname, fa->arg)); --} -- - /* -- * Find all objsets under name, and for each, call 'func(child_name, arg)'. -- * Perhaps change all callers to use dmu_objset_find_spa()? -+ * Find objsets under and including ddobj, call func(ds) on each. 
- */ - int --dmu_objset_find(char *name, int func(const char *, void *), void *arg, -- int flags) -+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, -+ int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) - { -- struct findarg fa; -- fa.func = func; -- fa.arg = arg; -- return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); -+ dsl_dir_t *dd; -+ dsl_dataset_t *ds; -+ zap_cursor_t zc; -+ zap_attribute_t *attr; -+ uint64_t thisobj; -+ int err; -+ -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); -+ if (err != 0) -+ return (err); -+ -+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ -+ if (dd->dd_myname[0] == '$') { -+ dsl_dir_rele(dd, FTAG); -+ return (0); -+ } -+ -+ thisobj = dd->dd_phys->dd_head_dataset_obj; -+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); -+ -+ /* -+ * Iterate over all children. -+ */ -+ if (flags & DS_FIND_CHILDREN) { -+ for (zap_cursor_init(&zc, dp->dp_meta_objset, -+ dd->dd_phys->dd_child_dir_zapobj); -+ zap_cursor_retrieve(&zc, attr) == 0; -+ (void) zap_cursor_advance(&zc)) { -+ ASSERT3U(attr->za_integer_length, ==, -+ sizeof (uint64_t)); -+ ASSERT3U(attr->za_num_integers, ==, 1); -+ -+ err = dmu_objset_find_dp(dp, attr->za_first_integer, -+ func, arg, flags); -+ if (err != 0) -+ break; -+ } -+ zap_cursor_fini(&zc); -+ -+ if (err != 0) { -+ dsl_dir_rele(dd, FTAG); -+ kmem_free(attr, sizeof (zap_attribute_t)); -+ return (err); -+ } -+ } -+ -+ /* -+ * Iterate over all snapshots. -+ */ -+ if (flags & DS_FIND_SNAPSHOTS) { -+ dsl_dataset_t *ds; -+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); -+ -+ if (err == 0) { -+ uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; -+ dsl_dataset_rele(ds, FTAG); -+ -+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); -+ zap_cursor_retrieve(&zc, attr) == 0; -+ (void) zap_cursor_advance(&zc)) { -+ ASSERT3U(attr->za_integer_length, ==, -+ sizeof (uint64_t)); -+ ASSERT3U(attr->za_num_integers, ==, 1); -+ -+ err = dsl_dataset_hold_obj(dp, -+ attr->za_first_integer, FTAG, &ds); -+ if (err != 0) -+ break; -+ err = func(dp, ds, arg); -+ dsl_dataset_rele(ds, FTAG); -+ if (err != 0) -+ break; -+ } -+ zap_cursor_fini(&zc); -+ } -+ } -+ -+ dsl_dir_rele(dd, FTAG); -+ kmem_free(attr, sizeof (zap_attribute_t)); -+ -+ if (err != 0) -+ return (err); -+ -+ /* -+ * Apply to self. -+ */ -+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); -+ if (err != 0) -+ return (err); -+ err = func(dp, ds, arg); -+ dsl_dataset_rele(ds, FTAG); -+ return (err); - } -@@ -1644,10 +1631,14 @@ dmu_objset_find(char *name, int func(const char *, void *), void *arg, - /* -- * Find all objsets under name, call func on each -+ * Find all objsets under name, and for each, call 'func(child_name, arg)'. -+ * The dp_config_rwlock must not be held when this is called, and it -+ * will not be held when the callback is called. -+ * Therefore this function should only be used when the pool is not changing -+ * (e.g. in syncing context), or the callback can deal with the possible races. 
- */ --int --dmu_objset_find_spa(spa_t *spa, const char *name, -- int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) -+static int -+dmu_objset_find_impl(spa_t *spa, const char *name, -+ int func(const char *, void *), void *arg, int flags) - { - dsl_dir_t *dd; -- dsl_pool_t *dp; -+ dsl_pool_t *dp = spa_get_dsl(spa); - dsl_dataset_t *ds; -@@ -1659,7 +1650,9 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - -- if (name == NULL) -- name = spa_name(spa); -- err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); -- if (err) -+ dsl_pool_config_enter(dp, FTAG); -+ -+ err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); -+ if (err != 0) { -+ dsl_pool_config_exit(dp, FTAG); - return (err); -+ } - -@@ -1667,3 +1660,4 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - if (dd->dd_myname[0] == '$') { -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ dsl_pool_config_exit(dp, FTAG); - return (0); -@@ -1673,3 +1667,2 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - attr = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); -- dp = dd->dd_pool; - -@@ -1683,9 +1676,13 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - (void) zap_cursor_advance(&zc)) { -- ASSERT(attr->za_integer_length == sizeof (uint64_t)); -- ASSERT(attr->za_num_integers == 1); -+ ASSERT3U(attr->za_integer_length, ==, -+ sizeof (uint64_t)); -+ ASSERT3U(attr->za_num_integers, ==, 1); - - child = kmem_asprintf("%s/%s", name, attr->za_name); -- err = dmu_objset_find_spa(spa, child, func, arg, flags); -+ dsl_pool_config_exit(dp, FTAG); -+ err = dmu_objset_find_impl(spa, child, -+ func, arg, flags); -+ dsl_pool_config_enter(dp, FTAG); - strfree(child); -- if (err) -+ if (err != 0) - break; -@@ -1694,4 +1691,5 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - -- if (err) { -- dsl_dir_close(dd, FTAG); -+ if (err != 0) { -+ dsl_dir_rele(dd, FTAG); -+ dsl_pool_config_exit(dp, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); -@@ -1705,7 +1703,3 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - if (flags & DS_FIND_SNAPSHOTS) { -- if (!dsl_pool_sync_context(dp)) -- rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); -- if (!dsl_pool_sync_context(dp)) -- rw_exit(&dp->dp_config_rwlock); - -@@ -1718,5 +1712,5 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - (void) zap_cursor_advance(&zc)) { -- ASSERT(attr->za_integer_length == -+ ASSERT3U(attr->za_integer_length, ==, - sizeof (uint64_t)); -- ASSERT(attr->za_num_integers == 1); -+ ASSERT3U(attr->za_num_integers, ==, 1); - -@@ -1724,6 +1718,7 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - name, attr->za_name); -- err = func(spa, attr->za_first_integer, -- child, arg); -+ dsl_pool_config_exit(dp, FTAG); -+ err = func(child, arg); -+ dsl_pool_config_enter(dp, FTAG); - strfree(child); -- if (err) -+ if (err != 0) - break; -@@ -1734,44 +1729,29 @@ dmu_objset_find_spa(spa_t *spa, const char *name, - -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); -+ dsl_pool_config_exit(dp, FTAG); - -- if (err) -+ if (err != 0) - return (err); - -- /* -- * Apply to self if appropriate. -- */ -- err = func(spa, thisobj, name, arg); -- return (err); -+ /* Apply to self. */ -+ return (func(name, arg)); - } - --/* ARGSUSED */ -+/* -+ * See comment above dmu_objset_find_impl(). 
-+ */ - int --dmu_objset_prefetch(const char *name, void *arg) -+dmu_objset_find(char *name, int func(const char *, void *), void *arg, -+ int flags) - { -- dsl_dataset_t *ds; -- -- if (dsl_dataset_hold(name, FTAG, &ds)) -- return (0); -- -- if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { -- mutex_enter(&ds->ds_opening_lock); -- if (ds->ds_objset == NULL) { -- uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; -- zbookmark_t zb; -- -- SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, -- ZB_ROOT_LEVEL, ZB_ROOT_BLKID); -- -- (void) arc_read(NULL, dsl_dataset_get_spa(ds), -- &ds->ds_phys->ds_bp, NULL, NULL, -- ZIO_PRIORITY_ASYNC_READ, -- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, -- &aflags, &zb); -- } -- mutex_exit(&ds->ds_opening_lock); -- } -+ spa_t *spa; -+ int error; - -- dsl_dataset_rele(ds, FTAG); -- return (0); -+ error = spa_open(name, &spa, FTAG); -+ if (error != 0) -+ return (error); -+ error = dmu_objset_find_impl(spa, name, func, arg, flags); -+ spa_close(spa, FTAG); -+ return (error); - } -@@ -1792,2 +1772,18 @@ dmu_objset_get_user(objset_t *os) - -+/* -+ * Determine name of filesystem, given name of snapshot. -+ * buf must be at least MAXNAMELEN bytes -+ */ -+int -+dmu_fsname(const char *snapname, char *buf) -+{ -+ char *atp = strchr(snapname, '@'); -+ if (atp == NULL) -+ return (SET_ERROR(EINVAL)); -+ if (atp - snapname >= MAXNAMELEN) -+ return (SET_ERROR(ENAMETOOLONG)); -+ (void) strlcpy(buf, snapname, atp - snapname + 1); -+ return (0); -+} -+ - #if defined(_KERNEL) && defined(HAVE_SPL) -@@ -1805,4 +1801,2 @@ EXPORT_SYMBOL(dmu_objset_create); - EXPORT_SYMBOL(dmu_objset_clone); --EXPORT_SYMBOL(dmu_objset_destroy); --EXPORT_SYMBOL(dmu_objset_snapshot); - EXPORT_SYMBOL(dmu_objset_stats); -@@ -1813,4 +1807,2 @@ EXPORT_SYMBOL(dmu_objset_fsid_guid); - EXPORT_SYMBOL(dmu_objset_find); --EXPORT_SYMBOL(dmu_objset_find_spa); --EXPORT_SYMBOL(dmu_objset_prefetch); - EXPORT_SYMBOL(dmu_objset_byteswap); -diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c -index 54e7597..9264fbb 100644 ---- a/module/zfs/dmu_send.c -+++ b/module/zfs/dmu_send.c -@@ -24,4 +24,4 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -50,2 +50,4 @@ - #include -+#include -+#include - -@@ -55,2 +57,3 @@ int zfs_send_corrupt_data = B_FALSE; - static char *dmu_recv_tag = "dmu_recv_tag"; -+static const char *recv_clone_name = "%recv"; - -@@ -108,2 +111,28 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - -+ /* -+ * When we receive a free record, dbuf_free_range() assumes -+ * that the receiving system doesn't have any dbufs in the range -+ * being freed. This is always true because there is a one-record -+ * constraint: we only send one WRITE record for any given -+ * object+offset. We know that the one-record constraint is -+ * true because we always send data in increasing order by -+ * object,offset. -+ * -+ * If the increasing-order constraint ever changes, we should find -+ * another way to assert that the one-record constraint is still -+ * satisfied. -+ */ -+ ASSERT(object > dsp->dsa_last_data_object || -+ (object == dsp->dsa_last_data_object && -+ offset > dsp->dsa_last_data_offset)); -+ -+ /* -+ * If we are doing a non-incremental send, then there can't -+ * be any data in the dataset we're receiving into. Therefore -+ * a free record would simply be a no-op. Save space by not -+ * sending it to begin with. 
-+ */ -+ if (!dsp->dsa_incremental) -+ return (0); -+ - if (length != -1ULL && offset + length < offset) -@@ -122,3 +151,3 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -146,3 +175,3 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -160,3 +189,3 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - } else { -@@ -174,2 +203,11 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, - -+ /* -+ * We send data in increasing object, offset order. -+ * See comment in dump_free() for details. -+ */ -+ ASSERT(object > dsp->dsa_last_data_object || -+ (object == dsp->dsa_last_data_object && -+ offset > dsp->dsa_last_data_offset)); -+ dsp->dsa_last_data_object = object; -+ dsp->dsa_last_data_offset = offset + blksz - 1; - -@@ -184,3 +222,3 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -204,5 +242,5 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - if (dump_bytes(dsp, data, blksz) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - return (0); -@@ -218,3 +256,3 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -230,5 +268,5 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - if (dump_bytes(dsp, data, blksz)) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - return (0); -@@ -241,2 +279,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) - -+ /* See comment in dump_free(). */ -+ if (!dsp->dsa_incremental) -+ return (0); -+ - /* -@@ -252,3 +294,3 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -267,3 +309,3 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -295,3 +337,3 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) - sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - dsp->dsa_pending_op = PENDING_NONE; -@@ -312,13 +354,13 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) - if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - - if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - -- /* free anything past the end of the file */ -+ /* Free anything past the end of the file. 
*/ - if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * -- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) -- return (EINTR); -- if (dsp->dsa_err) -- return (EINTR); -+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) -+ return (SET_ERROR(EINTR)); -+ if (dsp->dsa_err != 0) -+ return (SET_ERROR(EINTR)); - return (0); -@@ -340,3 +382,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - if (issig(JUSTLOOKING) && issig(FORREAL)) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - -@@ -364,3 +406,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - &aflags, zb) != 0) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -371,3 +413,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - err = dump_dnode(dsp, dnobj, blk+i); -- if (err) -+ if (err != 0) - break; -@@ -383,3 +425,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - &aflags, zb) != 0) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -405,3 +447,3 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - } else { -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -418,8 +460,10 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - --int --dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, -- int outfd, vnode_t *vp, offset_t *off) -+/* -+ * Releases dp, ds, and fromds, using the specified tag. -+ */ -+static int -+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, -+ dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off) - { -- dsl_dataset_t *ds = tosnap->os_dsl_dataset; -- dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; -+ objset_t *os; - dmu_replay_record_t *drr; -@@ -429,29 +473,17 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - -- /* tosnap must be a snapshot */ -- if (ds->ds_phys->ds_next_snap_obj == 0) -- return (EINVAL); -- -- /* fromsnap must be an earlier snapshot from the same fs as tosnap */ -- if (fromds && (ds->ds_dir != fromds->ds_dir || -- fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) -- return (EXDEV); -- -- if (fromorigin) { -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- -- if (fromsnap) -- return (EINVAL); -- -- if (dsl_dir_is_clone(ds->ds_dir)) { -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dataset_hold_obj(dp, -- ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); -- rw_exit(&dp->dp_config_rwlock); -- if (err) -- return (err); -- } else { -- fromorigin = B_FALSE; -- } -+ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) { -+ dsl_dataset_rele(fromds, tag); -+ dsl_dataset_rele(ds, tag); -+ dsl_pool_rele(dp, tag); -+ return (SET_ERROR(EXDEV)); - } - -+ err = dmu_objset_from_ds(ds, &os); -+ if (err != 0) { -+ if (fromds != NULL) -+ dsl_dataset_rele(fromds, tag); -+ dsl_dataset_rele(ds, tag); -+ dsl_pool_rele(dp, tag); -+ return (err); -+ } - -@@ -464,9 +496,13 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - #ifdef _KERNEL -- if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { -+ if (dmu_objset_type(os) == DMU_OST_ZFS) { - uint64_t version; -- if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) { -+ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); -- return (EINVAL); -+ if (fromds != NULL) -+ dsl_dataset_rele(fromds, tag); -+ dsl_dataset_rele(ds, tag); -+ dsl_pool_rele(dp, tag); -+ return (SET_ERROR(EINVAL)); - } -- if (version == ZPL_VERSION_SA) { -+ if (version >= ZPL_VERSION_SA) { - DMU_SET_FEATUREFLAGS( -@@ -480,4 +516,4 @@ 
dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - ds->ds_phys->ds_creation_time; -- drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; -- if (fromorigin) -+ drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); -+ if (fromds != NULL && ds->ds_dir != fromds->ds_dir) - drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; -@@ -487,3 +523,3 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - -- if (fromds) -+ if (fromds != NULL) - drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; -@@ -491,6 +527,7 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - -- if (fromds) -+ if (fromds != NULL) { - fromtxg = fromds->ds_phys->ds_creation_txg; -- if (fromorigin) -- dsl_dataset_rele(fromds, FTAG); -+ dsl_dataset_rele(fromds, tag); -+ fromds = NULL; -+ } - -@@ -502,3 +539,3 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - dsp->dsa_proc = curproc; -- dsp->dsa_os = tosnap; -+ dsp->dsa_os = os; - dsp->dsa_off = off; -@@ -507,2 +544,3 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - dsp->dsa_pending_op = PENDING_NONE; -+ dsp->dsa_incremental = (fromtxg != 0); - -@@ -512,2 +550,5 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - -+ dsl_dataset_long_hold(ds, FTAG); -+ dsl_pool_rele(dp, tag); -+ - if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { -@@ -522,6 +563,6 @@ dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) -- err = EINTR; -+ err = SET_ERROR(EINTR); - -- if (err) { -- if (err == EINTR && dsp->dsa_err) -+ if (err != 0) { -+ if (err == EINTR && dsp->dsa_err != 0) - err = dsp->dsa_err; -@@ -548,2 +589,5 @@ out: - -+ dsl_dataset_long_rele(ds, FTAG); -+ dsl_dataset_rele(ds, tag); -+ - return (err); -@@ -552,35 +596,86 @@ out: - int --dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, -- uint64_t *sizep) -+dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, -+ int outfd, vnode_t *vp, offset_t *off) - { -- dsl_dataset_t *ds = tosnap->os_dsl_dataset; -- dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds; -+ dsl_dataset_t *fromds = NULL; - int err; -- uint64_t size, recordsize; - -- /* tosnap must be a snapshot */ -- if (ds->ds_phys->ds_next_snap_obj == 0) -- return (EINVAL); -- -- /* fromsnap must be an earlier snapshot from the same fs as tosnap */ -- if (fromds && (ds->ds_dir != fromds->ds_dir || -- fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) -- return (EXDEV); -- -- if (fromorigin) { -- if (fromsnap) -- return (EINVAL); -- -- if (dsl_dir_is_clone(ds->ds_dir)) { -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dataset_hold_obj(dp, -- ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); -- rw_exit(&dp->dp_config_rwlock); -- if (err) -- return (err); -- } else { -- fromorigin = B_FALSE; -+ err = dsl_pool_hold(pool, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ -+ err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (err); -+ } -+ -+ if (fromsnap != 0) { -+ err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); -+ if (err != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (err); -+ } -+ } -+ -+ return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off)); -+} -+ -+int -+dmu_send(const char *tosnap, const char *fromsnap, -+ int outfd, vnode_t *vp, offset_t *off) -+{ -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds; -+ dsl_dataset_t *fromds = NULL; -+ int err; -+ -+ if (strchr(tosnap, '@') == NULL) -+ return (SET_ERROR(EINVAL)); -+ if (fromsnap != NULL && strchr(fromsnap, '@') == NULL) -+ return (SET_ERROR(EINVAL)); -+ -+ err = dsl_pool_hold(tosnap, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ -+ err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (err); -+ } -+ -+ if (fromsnap != NULL) { -+ err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); -+ if (err != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (err); - } - } -+ return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off)); -+} -+ -+int -+dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) -+{ -+ int err; -+ uint64_t size, recordsize; -+ ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); -+ -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ /* tosnap must be a snapshot */ -+ if (!dsl_dataset_is_snapshot(ds)) -+ return (SET_ERROR(EINVAL)); -+ -+ /* -+ * fromsnap must be an earlier snapshot from the same fs as tosnap, -+ * or the origin's fs. 
-+ */ -+ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) -+ return (SET_ERROR(EXDEV)); - -@@ -593,5 +688,3 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - &used, &comp, &size); -- if (fromorigin) -- dsl_dataset_rele(fromds, FTAG); -- if (err) -+ if (err != 0) - return (err); -@@ -614,7 +707,4 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - */ -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_prop_get_ds(ds, "recordsize", -- sizeof (recordsize), 1, &recordsize, NULL); -- rw_exit(&dp->dp_config_rwlock); -- if (err) -+ err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); -+ if (err != 0) - return (err); -@@ -630,40 +720,73 @@ dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, - --struct recvbeginsyncarg { -- const char *tofs; -- const char *tosnap; -- dsl_dataset_t *origin; -- uint64_t fromguid; -- dmu_objset_type_t type; -- void *tag; -- boolean_t force; -- uint64_t dsflags; -- char clonelastname[MAXNAMELEN]; -- dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ -- cred_t *cr; --}; -+typedef struct dmu_recv_begin_arg { -+ const char *drba_origin; -+ dmu_recv_cookie_t *drba_cookie; -+ cred_t *drba_cred; -+ uint64_t drba_snapobj; -+} dmu_recv_begin_arg_t; - --/* ARGSUSED */ - static int --recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) -+recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, -+ uint64_t fromguid) - { -- dsl_dir_t *dd = arg1; -- struct recvbeginsyncarg *rbsa = arg2; -- objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val; -- int err; -+ int error; -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ -+ /* temporary clone name must not exist */ -+ error = zap_lookup(dp->dp_meta_objset, -+ ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name, -+ 8, 1, &val); -+ if (error != ENOENT) -+ return (error == 0 ? EBUSY : error); - -- err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, -- strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); -+ /* new snapshot name must not exist */ -+ error = zap_lookup(dp->dp_meta_objset, -+ ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, -+ 8, 1, &val); -+ if (error != ENOENT) -+ return (error == 0 ? EEXIST : error); -+ -+ if (fromguid != 0) { -+ dsl_dataset_t *snap; -+ uint64_t obj = ds->ds_phys->ds_prev_snap_obj; -+ -+ /* Find snapshot in this dir that matches fromguid. */ -+ while (obj != 0) { -+ error = dsl_dataset_hold_obj(dp, obj, FTAG, -+ &snap); -+ if (error != 0) -+ return (SET_ERROR(ENODEV)); -+ if (snap->ds_dir != ds->ds_dir) { -+ dsl_dataset_rele(snap, FTAG); -+ return (SET_ERROR(ENODEV)); -+ } -+ if (snap->ds_phys->ds_guid == fromguid) -+ break; -+ obj = snap->ds_phys->ds_prev_snap_obj; -+ dsl_dataset_rele(snap, FTAG); -+ } -+ if (obj == 0) -+ return (SET_ERROR(ENODEV)); - -- if (err != ENOENT) -- return (err ? err : EEXIST); -+ if (drba->drba_cookie->drc_force) { -+ drba->drba_snapobj = obj; -+ } else { -+ /* -+ * If we are not forcing, there must be no -+ * changes since fromsnap. 
-+ */ -+ if (dsl_dataset_modified_since_snap(ds, snap)) { -+ dsl_dataset_rele(snap, FTAG); -+ return (SET_ERROR(ETXTBSY)); -+ } -+ drba->drba_snapobj = ds->ds_prev->ds_object; -+ } - -- if (rbsa->origin) { -- /* make sure it's a snap in the same pool */ -- if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) -- return (EXDEV); -- if (!dsl_dataset_is_snapshot(rbsa->origin)) -- return (EINVAL); -- if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) -- return (ENODEV); -+ dsl_dataset_rele(snap, FTAG); -+ } else { -+ /* if full, most recent snapshot must be $ORIGIN */ -+ if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) -+ return (SET_ERROR(ENODEV)); -+ drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj; - } -@@ -671,113 +794,141 @@ recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) - return (0); -+ - } - --static void --recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+static int -+dmu_recv_begin_check(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- struct recvbeginsyncarg *rbsa = arg2; -- uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; -- uint64_t dsobj; -- -- /* Create and open new dataset. */ -- dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, -- rbsa->origin, flags, rbsa->cr, tx); -- VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, -- B_TRUE, dmu_recv_tag, &rbsa->ds)); -+ dmu_recv_begin_arg_t *drba = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb; -+ uint64_t fromguid = drrb->drr_fromguid; -+ int flags = drrb->drr_flags; -+ int error; -+ dsl_dataset_t *ds; -+ const char *tofs = drba->drba_cookie->drc_tofs; - -- if (rbsa->origin == NULL) { -- (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, -- rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); -- } -+ /* already checked */ -+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - -- spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, -- dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); --} -+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == -+ DMU_COMPOUNDSTREAM || -+ drrb->drr_type >= DMU_OST_NUMTYPES || -+ ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) -+ return (SET_ERROR(EINVAL)); - --/* ARGSUSED */ --static int --recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- struct recvbeginsyncarg *rbsa = arg2; -- int err; -- uint64_t val; -+ /* Verify pool version supports SA if SA_SPILL feature set */ -+ if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & -+ DMU_BACKUP_FEATURE_SA_SPILL) && -+ spa_version(dp->dp_spa) < SPA_VERSION_SA) { -+ return (SET_ERROR(ENOTSUP)); -+ } - -- /* must not have any changes since most recent snapshot */ -- if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) -- return (ETXTBSY); -+ error = dsl_dataset_hold(dp, tofs, FTAG, &ds); -+ if (error == 0) { -+ /* target fs already exists; recv into temp clone */ - -- /* new snapshot name must not exist */ -- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, -- ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); -- if (err == 0) -- return (EEXIST); -- if (err != ENOENT) -- return (err); -+ /* Can't recv a clone into an existing fs */ -+ if (flags & DRR_FLAG_CLONE) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } - -- if (rbsa->fromguid) { -- /* if incremental, most recent snapshot must match fromguid */ -- if (ds->ds_prev == NULL) -- return (ENODEV); -+ error = recv_begin_check_existing_impl(drba, ds, fromguid); -+ dsl_dataset_rele(ds, FTAG); -+ } else if (error == ENOENT) { -+ /* 
target fs does not exist; must be a full backup or clone */ -+ char buf[MAXNAMELEN]; - - /* -- * most recent snapshot must match fromguid, or there are no -- * changes since the fromguid one -+ * If it's a non-clone incremental, we are missing the -+ * target fs, so fail the recv. - */ -- if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { -- uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; -- uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; -- while (obj != 0) { -- dsl_dataset_t *snap; -- err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -- obj, FTAG, &snap); -- if (err) -- return (ENODEV); -- if (snap->ds_phys->ds_creation_txg < birth) { -- dsl_dataset_rele(snap, FTAG); -- return (ENODEV); -- } -- if (snap->ds_phys->ds_guid == rbsa->fromguid) { -- dsl_dataset_rele(snap, FTAG); -- break; /* it's ok */ -- } -- obj = snap->ds_phys->ds_prev_snap_obj; -- dsl_dataset_rele(snap, FTAG); -+ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) -+ return (SET_ERROR(ENOENT)); -+ -+ /* Open the parent of tofs */ -+ ASSERT3U(strlen(tofs), <, MAXNAMELEN); -+ (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); -+ error = dsl_dataset_hold(dp, buf, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ -+ if (drba->drba_origin != NULL) { -+ dsl_dataset_t *origin; -+ error = dsl_dataset_hold(dp, drba->drba_origin, -+ FTAG, &origin); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } -+ if (!dsl_dataset_is_snapshot(origin)) { -+ dsl_dataset_rele(origin, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } -+ if (origin->ds_phys->ds_guid != fromguid) { -+ dsl_dataset_rele(origin, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENODEV)); - } -- if (obj == 0) -- return (ENODEV); -+ dsl_dataset_rele(origin, FTAG); - } -- } else { -- /* if full, most recent snapshot must be $ORIGIN */ -- if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) -- return (ENODEV); -+ dsl_dataset_rele(ds, FTAG); -+ error = 0; - } -- -- /* temporary clone name must not exist */ -- err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, -- ds->ds_dir->dd_phys->dd_child_dir_zapobj, -- rbsa->clonelastname, 8, 1, &val); -- if (err == 0) -- return (EEXIST); -- if (err != ENOENT) -- return (err); -- -- return (0); -+ return (error); - } - --/* ARGSUSED */ - static void --recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ohds = arg1; -- struct recvbeginsyncarg *rbsa = arg2; -- dsl_pool_t *dp = ohds->ds_dir->dd_pool; -- dsl_dataset_t *cds; -- uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; -+ dmu_recv_begin_arg_t *drba = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb; -+ const char *tofs = drba->drba_cookie->drc_tofs; -+ dsl_dataset_t *ds, *newds; - uint64_t dsobj; -+ int error; -+ uint64_t crflags; -+ -+ crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 
-+ DS_FLAG_CI_DATASET : 0; -+ -+ error = dsl_dataset_hold(dp, tofs, FTAG, &ds); -+ if (error == 0) { -+ /* create temporary clone */ -+ dsl_dataset_t *snap = NULL; -+ if (drba->drba_snapobj != 0) { -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ drba->drba_snapobj, FTAG, &snap)); -+ } -+ dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, -+ snap, crflags, drba->drba_cred, tx); -+ dsl_dataset_rele(snap, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+ } else { -+ dsl_dir_t *dd; -+ const char *tail; -+ dsl_dataset_t *origin = NULL; -+ -+ VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); -+ -+ if (drba->drba_origin != NULL) { -+ VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, -+ FTAG, &origin)); -+ } -+ -+ /* Create new dataset. */ -+ dsobj = dsl_dataset_create_sync(dd, -+ strrchr(tofs, '/') + 1, -+ origin, crflags, drba->drba_cred, tx); -+ if (origin != NULL) -+ dsl_dataset_rele(origin, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ drba->drba_cookie->drc_newfs = B_TRUE; -+ } -+ VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); - -- /* create and open the temporary clone */ -- dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, -- ohds->ds_prev, flags, rbsa->cr, tx); -- VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); -+ dmu_buf_will_dirty(newds->ds_dbuf, tx); -+ newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - -@@ -787,23 +938,10 @@ recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { -+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { - (void) dmu_objset_create_impl(dp->dp_spa, -- cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); -+ newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); - } - -- rbsa->ds = cds; -+ drba->drba_cookie->drc_ds = newds; - -- spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, -- dp->dp_spa, tx, "dataset = %lld", dsobj); --} -- --static boolean_t --dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) --{ -- int featureflags; -- -- featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); -- -- /* Verify pool version supports SA if SA_SPILL feature set */ -- return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && -- (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); -+ spa_history_log_internal_ds(newds, "receive", tx, ""); - } -@@ -815,44 +953,7 @@ dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) - int --dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, -- boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) -+dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, -+ boolean_t force, char *origin, dmu_recv_cookie_t *drc) - { -- int err = 0; -- boolean_t byteswap; -- struct recvbeginsyncarg rbsa = { 0 }; -- uint64_t versioninfo; -- int flags; -- dsl_dataset_t *ds; -- -- if (drrb->drr_magic == DMU_BACKUP_MAGIC) -- byteswap = FALSE; -- else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) -- byteswap = TRUE; -- else -- return (EINVAL); -- -- rbsa.tofs = tofs; -- rbsa.tosnap = tosnap; -- rbsa.origin = origin ? 
origin->os_dsl_dataset : NULL; -- rbsa.fromguid = drrb->drr_fromguid; -- rbsa.type = drrb->drr_type; -- rbsa.tag = FTAG; -- rbsa.dsflags = 0; -- rbsa.cr = CRED(); -- versioninfo = drrb->drr_versioninfo; -- flags = drrb->drr_flags; -- -- if (byteswap) { -- rbsa.type = BSWAP_32(rbsa.type); -- rbsa.fromguid = BSWAP_64(rbsa.fromguid); -- versioninfo = BSWAP_64(versioninfo); -- flags = BSWAP_32(flags); -- } -- -- if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || -- rbsa.type >= DMU_OST_NUMTYPES || -- ((flags & DRR_FLAG_CLONE) && origin == NULL)) -- return (EINVAL); -- -- if (flags & DRR_FLAG_CI_DATA) -- rbsa.dsflags = DS_FLAG_CI_DATASET; -+ dmu_recv_begin_arg_t drba = { 0 }; -+ dmu_replay_record_t *drr; - -@@ -861,77 +962,37 @@ dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, - drc->drc_tosnap = tosnap; -- drc->drc_top_ds = top_ds; -+ drc->drc_tofs = tofs; - drc->drc_force = force; - -- /* -- * Process the begin in syncing context. -- */ -- -- /* open the dataset we are logically receiving into */ -- err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); -- if (err == 0) { -- if (dmu_recv_verify_features(ds, drrb)) { -- dsl_dataset_rele(ds, dmu_recv_tag); -- return (ENOTSUP); -- } -- /* target fs already exists; recv into temp clone */ -- -- /* Can't recv a clone into an existing fs */ -- if (flags & DRR_FLAG_CLONE) { -- dsl_dataset_rele(ds, dmu_recv_tag); -- return (EINVAL); -- } -- -- /* must not have an incremental recv already in progress */ -- if (!mutex_tryenter(&ds->ds_recvlock)) { -- dsl_dataset_rele(ds, dmu_recv_tag); -- return (EBUSY); -- } -- -- /* tmp clone name is: tofs/%tosnap" */ -- (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), -- "%%%s", tosnap); -- rbsa.force = force; -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- recv_existing_check, recv_existing_sync, ds, &rbsa, 5); -- if (err) { -- mutex_exit(&ds->ds_recvlock); -- dsl_dataset_rele(ds, dmu_recv_tag); -- return (err); -- } -- drc->drc_logical_ds = ds; -- drc->drc_real_ds = rbsa.ds; -- } else if (err == ENOENT) { -- /* target fs does not exist; must be a full backup or clone */ -- char *cp; -- -- /* -- * If it's a non-clone incremental, we are missing the -- * target fs, so fail the recv. 
-- */ -- if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) -- return (ENOENT); -- -- /* Open the parent of tofs */ -- cp = strrchr(tofs, '/'); -- *cp = '\0'; -- err = dsl_dataset_hold(tofs, FTAG, &ds); -- *cp = '/'; -- if (err) -- return (err); -+ if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) -+ drc->drc_byteswap = B_TRUE; -+ else if (drrb->drr_magic != DMU_BACKUP_MAGIC) -+ return (SET_ERROR(EINVAL)); - -- if (dmu_recv_verify_features(ds, drrb)) { -- dsl_dataset_rele(ds, FTAG); -- return (ENOTSUP); -- } -+ drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); -+ drr->drr_type = DRR_BEGIN; -+ drr->drr_u.drr_begin = *drc->drc_drrb; -+ if (drc->drc_byteswap) { -+ fletcher_4_incremental_byteswap(drr, -+ sizeof (dmu_replay_record_t), &drc->drc_cksum); -+ } else { -+ fletcher_4_incremental_native(drr, -+ sizeof (dmu_replay_record_t), &drc->drc_cksum); -+ } -+ kmem_free(drr, sizeof (dmu_replay_record_t)); - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); -- dsl_dataset_rele(ds, FTAG); -- if (err) -- return (err); -- drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; -- drc->drc_newfs = B_TRUE; -+ if (drc->drc_byteswap) { -+ drrb->drr_magic = BSWAP_64(drrb->drr_magic); -+ drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); -+ drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); -+ drrb->drr_type = BSWAP_32(drrb->drr_type); -+ drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); -+ drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); - } - -- return (err); -+ drba.drba_origin = origin; -+ drba.drba_cookie = drc; -+ drba.drba_cred = CRED(); -+ -+ return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, -+ &drba, 5)); - } -@@ -940,3 +1001,3 @@ struct restorearg { - int err; -- int byteswap; -+ boolean_t byteswap; - vnode_t *vp; -@@ -976,3 +1037,4 @@ free_guid_map_onexit(void *arg) - while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { -- dsl_dataset_rele(gmep->gme_ds, ca); -+ dsl_dataset_long_rele(gmep->gme_ds, gmep); -+ dsl_dataset_rele(gmep->gme_ds, gmep); - kmem_free(gmep, sizeof (guid_map_entry_t)); -@@ -1001,6 +1063,6 @@ restore_read(struct restorearg *ra, int len) - if (resid == len - done) -- ra->err = EINVAL; -+ ra->err = SET_ERROR(EINVAL); - ra->voff += len - done - resid; - done = len - resid; -- if (ra->err) -+ if (ra->err != 0) - return (NULL); -@@ -1115,3 +1177,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - drro->drr_bonuslen > DN_MAX_BONUSLEN) { -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1121,3 +1183,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - if (err != 0 && err != ENOENT) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1125,3 +1187,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); -- if (ra->err) -+ if (ra->err != 0) - return (ra->err); -@@ -1134,3 +1196,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_tx_abort(tx); -@@ -1148,4 +1210,4 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - } -- if (err) { -- return (EINVAL); -+ if (err != 0) { -+ return (SET_ERROR(EINVAL)); - } -@@ -1155,3 +1217,3 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_tx_abort(tx); -@@ 
-1192,3 +1254,3 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, - if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1202,4 +1264,4 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, - -- err = dmu_free_object(os, obj); -- if (err) -+ err = dmu_free_long_object(os, obj); -+ if (err != 0) - return (err); -@@ -1219,3 +1281,3 @@ restore_write(struct restorearg *ra, objset_t *os, - !DMU_OT_IS_VALID(drrw->drr_type)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1226,3 +1288,3 @@ restore_write(struct restorearg *ra, objset_t *os, - if (dmu_object_info(os, drrw->drr_object, NULL) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1233,3 +1295,3 @@ restore_write(struct restorearg *ra, objset_t *os, - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_tx_abort(tx); -@@ -1268,3 +1330,3 @@ restore_write_byref(struct restorearg *ra, objset_t *os, - if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1278,6 +1340,6 @@ restore_write_byref(struct restorearg *ra, objset_t *os, - &where)) == NULL) { -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } - if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else { -@@ -1296,3 +1358,3 @@ restore_write_byref(struct restorearg *ra, objset_t *os, - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_tx_abort(tx); -@@ -1317,3 +1379,3 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) - drrs->drr_length > SPA_MAXBLOCKSIZE) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1324,3 +1386,3 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) - if (dmu_object_info(os, drrs->drr_object, NULL) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1337,3 +1399,3 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) - err = dmu_tx_assign(tx, TXG_WAIT); -- if (err) { -+ if (err != 0) { - dmu_buf_rele(db, FTAG); -@@ -1366,6 +1428,6 @@ restore_free(struct restorearg *ra, objset_t *os, - drrf->drr_offset + drrf->drr_length < drrf->drr_offset) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (dmu_object_info(os, drrf->drr_object, NULL) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1376,2 +1438,12 @@ restore_free(struct restorearg *ra, objset_t *os, - -+/* used to destroy the drc_ds on error */ -+static void -+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) -+{ -+ char name[MAXNAMELEN]; -+ dsl_dataset_name(drc->drc_ds, name); -+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); -+ (void) dsl_destroy_head(name); -+} -+ - /* -@@ -1389,32 +1461,4 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - -- if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) -- ra.byteswap = TRUE; -- -- { -- /* compute checksum of drr_begin record */ -- dmu_replay_record_t *drr; -- drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); -- -- drr->drr_type = DRR_BEGIN; -- drr->drr_u.drr_begin = *drc->drc_drrb; -- if (ra.byteswap) { -- fletcher_4_incremental_byteswap(drr, -- sizeof (dmu_replay_record_t), &ra.cksum); -- } else { -- fletcher_4_incremental_native(drr, -- sizeof (dmu_replay_record_t), &ra.cksum); -- } -- kmem_free(drr, sizeof (dmu_replay_record_t)); -- } -- -- if (ra.byteswap) { -- struct drr_begin *drrb = drc->drc_drrb; -- drrb->drr_magic = 
BSWAP_64(drrb->drr_magic); -- drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); -- drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); -- drrb->drr_type = BSWAP_32(drrb->drr_type); -- drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); -- drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); -- } -- -+ ra.byteswap = drc->drc_byteswap; -+ ra.cksum = drc->drc_cksum; - ra.vp = vp; -@@ -1425,5 +1469,5 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - /* these were verified in dmu_recv_begin */ -- ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == -+ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, - DMU_SUBSTREAM); -- ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); -+ ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); - -@@ -1432,5 +1476,5 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - */ -- VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); -+ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); - -- ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); -+ ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); - -@@ -1443,3 +1487,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - if (cleanup_fd == -1) { -- ra.err = EBADF; -+ ra.err = SET_ERROR(EBADF); - goto out; -@@ -1447,3 +1491,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); -- if (ra.err) { -+ if (ra.err != 0) { - cleanup_fd = -1; -@@ -1461,3 +1505,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - action_handlep); -- if (ra.err) -+ if (ra.err != 0) - goto out; -@@ -1466,3 +1510,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - (void **)&ra.guid_to_ds_map); -- if (ra.err) -+ if (ra.err != 0) - goto out; -@@ -1480,3 +1524,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - if (issig(JUSTLOOKING) && issig(FORREAL)) { -- ra.err = EINTR; -+ ra.err = SET_ERROR(EINTR); - goto out; -@@ -1534,3 +1578,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) -- ra.err = ECKSUM; -+ ra.err = SET_ERROR(ECKSUM); - goto out; -@@ -1544,3 +1588,3 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, - default: -- ra.err = EINVAL; -+ ra.err = SET_ERROR(EINVAL); - goto out; -@@ -1560,10 +1604,3 @@ out: - */ -- txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); -- -- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, -- B_FALSE); -- if (drc->drc_real_ds != drc->drc_logical_ds) { -- mutex_exit(&drc->drc_logical_ds->ds_recvlock); -- dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); -- } -+ dmu_recv_cleanup_ds(drc); - } -@@ -1575,15 +1612,61 @@ out: - --struct recvendsyncarg { -- char *tosnap; -- uint64_t creation_time; -- uint64_t toguid; --}; -- - static int --recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_recv_end_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- struct recvendsyncarg *resa = arg2; -+ dmu_recv_cookie_t *drc = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ int error; -+ -+ ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); -+ -+ if (!drc->drc_newfs) { -+ dsl_dataset_t *origin_head; -+ -+ error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); -+ if (error != 0) -+ return (error); -+ if (drc->drc_force) { -+ /* -+ * We will destroy any snapshots in tofs (i.e. 
before -+ * origin_head) that are after the origin (which is -+ * the snap before drc_ds, because drc_ds can not -+ * have any snaps of its own). -+ */ -+ uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; -+ while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { -+ dsl_dataset_t *snap; -+ error = dsl_dataset_hold_obj(dp, obj, FTAG, -+ &snap); -+ if (error != 0) -+ return (error); -+ if (snap->ds_dir != origin_head->ds_dir) -+ error = SET_ERROR(EINVAL); -+ if (error == 0) { -+ error = dsl_destroy_snapshot_check_impl( -+ snap, B_FALSE); -+ } -+ obj = snap->ds_phys->ds_prev_snap_obj; -+ dsl_dataset_rele(snap, FTAG); -+ if (error != 0) -+ return (error); -+ } -+ } -+ error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, -+ origin_head, drc->drc_force, drc->drc_owner, tx); -+ if (error != 0) { -+ dsl_dataset_rele(origin_head, FTAG); -+ return (error); -+ } -+ error = dsl_dataset_snapshot_check_impl(origin_head, -+ drc->drc_tosnap, tx, B_TRUE); -+ dsl_dataset_rele(origin_head, FTAG); -+ if (error != 0) -+ return (error); - -- return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); -+ error = dsl_destroy_head_check_impl(drc->drc_ds, 1); -+ } else { -+ error = dsl_dataset_snapshot_check_impl(drc->drc_ds, -+ drc->drc_tosnap, tx, B_TRUE); -+ } -+ return (error); - } -@@ -1591,17 +1674,81 @@ recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dmu_recv_end_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- struct recvendsyncarg *resa = arg2; -+ dmu_recv_cookie_t *drc = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ -+ spa_history_log_internal_ds(drc->drc_ds, "finish receiving", -+ tx, "snap=%s", drc->drc_tosnap); -+ -+ if (!drc->drc_newfs) { -+ dsl_dataset_t *origin_head; -+ -+ VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, -+ &origin_head)); -+ -+ if (drc->drc_force) { -+ /* -+ * Destroy any snapshots of drc_tofs (origin_head) -+ * after the origin (the snap before drc_ds). 
-+ */ -+ uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; -+ while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { -+ dsl_dataset_t *snap; -+ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, -+ &snap)); -+ ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); -+ obj = snap->ds_phys->ds_prev_snap_obj; -+ dsl_destroy_snapshot_sync_impl(snap, -+ B_FALSE, tx); -+ dsl_dataset_rele(snap, FTAG); -+ } -+ } -+ VERIFY3P(drc->drc_ds->ds_prev, ==, -+ origin_head->ds_prev); -+ -+ dsl_dataset_clone_swap_sync_impl(drc->drc_ds, -+ origin_head, tx); -+ dsl_dataset_snapshot_sync_impl(origin_head, -+ drc->drc_tosnap, tx); -+ -+ /* set snapshot's creation time and guid */ -+ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); -+ origin_head->ds_prev->ds_phys->ds_creation_time = -+ drc->drc_drrb->drr_creation_time; -+ origin_head->ds_prev->ds_phys->ds_guid = -+ drc->drc_drrb->drr_toguid; -+ origin_head->ds_prev->ds_phys->ds_flags &= -+ ~DS_FLAG_INCONSISTENT; -+ -+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx); -+ origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -+ -+ dsl_dataset_rele(origin_head, FTAG); -+ dsl_destroy_head_sync_impl(drc->drc_ds, tx); -+ -+ if (drc->drc_owner != NULL) -+ VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); -+ } else { -+ dsl_dataset_t *ds = drc->drc_ds; - -- dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); -+ dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); - -- /* set snapshot's creation time and guid */ -- dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); -- ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; -- ds->ds_prev->ds_phys->ds_guid = resa->toguid; -- ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -+ /* set snapshot's creation time and guid */ -+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); -+ ds->ds_prev->ds_phys->ds_creation_time = -+ drc->drc_drrb->drr_creation_time; -+ ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid; -+ ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; - -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; -+ } -+ drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj; -+ /* -+ * Release the hold from dmu_recv_begin. This must be done before -+ * we return to open context, so that when we free the dataset's dnode, -+ * we can evict its bonus buffer. 
-+ */ -+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); -+ drc->drc_ds = NULL; - } -@@ -1609,6 +1756,5 @@ recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) - static int --add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) -+add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) - { -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; -+ dsl_pool_t *dp; - dsl_dataset_t *snapds; -@@ -1619,6 +1765,8 @@ add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); -+ err = dsl_pool_hold(name, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); -+ err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); - if (err == 0) { -- gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); - gmep->guid = snapds->ds_phys->ds_guid; -@@ -1626,5 +1774,8 @@ add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) - avl_add(guid_map, gmep); -+ dsl_dataset_long_hold(snapds, gmep); -+ } else { -+ kmem_free(gmep, sizeof (*gmep)); - } - -- rw_exit(&dp->dp_config_rwlock); -+ dsl_pool_rele(dp, FTAG); - return (err); -@@ -1632,2 +1783,4 @@ add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) - -+static int dmu_recv_end_modified_blocks = 3; -+ - static int -@@ -1635,38 +1788,24 @@ dmu_recv_existing_end(dmu_recv_cookie_t *drc) - { -- struct recvendsyncarg resa; -- dsl_dataset_t *ds = drc->drc_logical_ds; -- int err, myerr; -- -- if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { -- err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, -- drc->drc_force); -- if (err) -- goto out; -- } else { -- mutex_exit(&ds->ds_recvlock); -- dsl_dataset_rele(ds, dmu_recv_tag); -- (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, -- B_FALSE); -- return (EBUSY); -- } -+ int error; -+ -+#ifdef _KERNEL -+ char *name; - -- resa.creation_time = drc->drc_drrb->drr_creation_time; -- resa.toguid = drc->drc_drrb->drr_toguid; -- resa.tosnap = drc->drc_tosnap; -+ /* -+ * We will be destroying the ds; make sure its origin is unmounted if -+ * necessary. -+ */ -+ name = kmem_alloc(MAXNAMELEN, KM_SLEEP); -+ dsl_dataset_name(drc->drc_ds, name); -+ zfs_destroy_unmount_origin(name); -+ kmem_free(name, MAXNAMELEN); -+#endif - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- recv_end_check, recv_end_sync, ds, &resa, 3); -- if (err) { -- /* swap back */ -- (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); -- } -+ error = dsl_sync_task(drc->drc_tofs, -+ dmu_recv_end_check, dmu_recv_end_sync, drc, -+ dmu_recv_end_modified_blocks); - --out: -- mutex_exit(&ds->ds_recvlock); -- if (err == 0 && drc->drc_guid_to_ds_map != NULL) -- (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); -- dsl_dataset_disown(ds, dmu_recv_tag); -- myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); -- ASSERT0(myerr); -- return (err); -+ if (error != 0) -+ dmu_recv_cleanup_ds(drc); -+ return (error); - } -@@ -1676,29 +1815,16 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc) - { -- struct recvendsyncarg resa; -- dsl_dataset_t *ds = drc->drc_logical_ds; -- int err; -- -- /* -- * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() -- * expects it to have a ds_user_ptr (and zil), but clone_swap() -- * can close it. 
-- */ -- txg_wait_synced(ds->ds_dir->dd_pool, 0); -- -- resa.creation_time = drc->drc_drrb->drr_creation_time; -- resa.toguid = drc->drc_drrb->drr_toguid; -- resa.tosnap = drc->drc_tosnap; -- -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- recv_end_check, recv_end_sync, ds, &resa, 3); -- if (err) { -- /* clean up the fs we just recv'd into */ -- (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); -- } else { -- if (drc->drc_guid_to_ds_map != NULL) -- (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); -- /* release the hold from dmu_recv_begin */ -- dsl_dataset_disown(ds, dmu_recv_tag); -+ int error; -+ -+ error = dsl_sync_task(drc->drc_tofs, -+ dmu_recv_end_check, dmu_recv_end_sync, drc, -+ dmu_recv_end_modified_blocks); -+ -+ if (error != 0) { -+ dmu_recv_cleanup_ds(drc); -+ } else if (drc->drc_guid_to_ds_map != NULL) { -+ (void) add_ds_to_guidmap(drc->drc_tofs, -+ drc->drc_guid_to_ds_map, -+ drc->drc_newsnapobj); - } -- return (err); -+ return (error); - } -@@ -1706,8 +1832,25 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc) - int --dmu_recv_end(dmu_recv_cookie_t *drc) -+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) - { -- if (drc->drc_logical_ds != drc->drc_real_ds) -- return (dmu_recv_existing_end(drc)); -- else -+ drc->drc_owner = owner; -+ -+ if (drc->drc_newfs) - return (dmu_recv_new_end(drc)); -+ else -+ return (dmu_recv_existing_end(drc)); - } -+ -+/* -+ * Return TRUE if this objset is currently being received into. -+ */ -+boolean_t -+dmu_objset_is_receiving(objset_t *os) -+{ -+ return (os->os_dsl_dataset != NULL && -+ os->os_dsl_dataset->ds_owner == dmu_recv_tag); -+} -+ -+#if defined(_KERNEL) -+module_param(zfs_send_corrupt_data, int, 0644); -+MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data"); -+#endif -diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c -index 1c39723..bd291c6 100644 ---- a/module/zfs/dmu_traverse.c -+++ b/module/zfs/dmu_traverse.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -63,2 +63,4 @@ typedef struct traverse_data { - -+#define TD_HARD(td) (td->td_flags & TRAVERSE_HARD) -+ - static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, -@@ -210,7 +212,4 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - { -- zbookmark_t czb; - int err = 0, lasterr = 0; - arc_buf_t *buf = NULL; -- prefetch_data_t *pd = td->td_pfd; -- boolean_t hard = td->td_flags & TRAVERSE_HARD; - boolean_t pause = B_FALSE; -@@ -236,12 +235,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - -- if (pd && !pd->pd_exited && -- ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || -+ if (td->td_pfd && !td->td_pfd->pd_exited && -+ ((td->td_pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || - BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { -- mutex_enter(&pd->pd_mtx); -- ASSERT(pd->pd_blks_fetched >= 0); -- while (pd->pd_blks_fetched == 0 && !pd->pd_exited) -- cv_wait(&pd->pd_cv, &pd->pd_mtx); -- pd->pd_blks_fetched--; -- cv_broadcast(&pd->pd_cv); -- mutex_exit(&pd->pd_mtx); -+ mutex_enter(&td->td_pfd->pd_mtx); -+ ASSERT(td->td_pfd->pd_blks_fetched >= 0); -+ while (td->td_pfd->pd_blks_fetched == 0 && -+ !td->td_pfd->pd_exited) -+ cv_wait(&td->td_pfd->pd_cv, &td->td_pfd->pd_mtx); -+ td->td_pfd->pd_blks_fetched--; -+ cv_broadcast(&td->td_pfd->pd_cv); -+ mutex_exit(&td->td_pfd->pd_mtx); - } -@@ -261,5 +261,5 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - uint32_t flags = ARC_WAIT; -- int i; -- blkptr_t *cbp; -- int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; -+ int32_t i; -+ int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; -+ zbookmark_t *czb; - -@@ -267,11 +267,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); -- if (err) -+ if (err != 0) - return (err); -- cbp = buf->b_data; -+ -+ czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE); - - for (i = 0; i < epb; i++) { -- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, -+ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); -- traverse_prefetch_metadata(td, &cbp[i], &czb); -+ traverse_prefetch_metadata(td, -+ &((blkptr_t *)buf->b_data)[i], czb); - } -@@ -280,8 +282,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - for (i = 0; i < epb; i++) { -- SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, -+ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); -- err = traverse_visitbp(td, dnp, &cbp[i], &czb); -- if (err) { -- if (!hard) -+ err = traverse_visitbp(td, dnp, -+ &((blkptr_t *)buf->b_data)[i], czb); -+ if (err != 0) { -+ if (!TD_HARD(td)) - break; -@@ -290,6 +293,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - } -+ -+ kmem_free(czb, sizeof (zbookmark_t)); -+ - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; -- int i; -- int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; -+ int32_t i; -+ int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - -@@ -297,3 +303,3 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); -- if (err) -+ if (err != 0) - return (err); -@@ -310,4 +316,4 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - zb->zb_blkid * epb + i); -- if (err) { -- if (!hard) -+ if (err != 0) { -+ if (!TD_HARD(td)) - break; -@@ -323,3 +329,3 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); -- if (err) -+ if 
(err != 0) - return (err); -@@ -331,5 +337,5 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { -- prefetch_dnode_metadata(td, &osp->os_userused_dnode, -- zb->zb_objset, DMU_USERUSED_OBJECT); - prefetch_dnode_metadata(td, &osp->os_groupused_dnode, -+ zb->zb_objset, DMU_GROUPUSED_OBJECT); -+ prefetch_dnode_metadata(td, &osp->os_userused_dnode, - zb->zb_objset, DMU_USERUSED_OBJECT); -@@ -339,3 +345,3 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - DMU_META_DNODE_OBJECT); -- if (err && hard) { -+ if (err && TD_HARD(td)) { - lasterr = err; -@@ -344,7 +350,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { -- dnp = &osp->os_userused_dnode; -+ dnp = &osp->os_groupused_dnode; - err = traverse_dnode(td, dnp, zb->zb_objset, -- DMU_USERUSED_OBJECT); -+ DMU_GROUPUSED_OBJECT); - } -- if (err && hard) { -+ if (err && TD_HARD(td)) { - lasterr = err; -@@ -353,5 +359,5 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { -- dnp = &osp->os_groupused_dnode; -+ dnp = &osp->os_userused_dnode; - err = traverse_dnode(td, dnp, zb->zb_objset, -- DMU_GROUPUSED_OBJECT); -+ DMU_USERUSED_OBJECT); - } -@@ -363,3 +369,3 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, - post: -- if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { -+ if (err == 0 && (td->td_flags & TRAVERSE_POST)) { - err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); -@@ -371,3 +377,3 @@ post: - ASSERT3U(err, ==, ERESTART); -- ASSERT(!hard); -+ ASSERT(!TD_HARD(td)); - traverse_pause(td, zb); -@@ -402,3 +408,2 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - zbookmark_t czb; -- boolean_t hard = (td->td_flags & TRAVERSE_HARD); - -@@ -407,4 +412,4 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); -- if (err) { -- if (!hard) -+ if (err != 0) { -+ if (!TD_HARD(td)) - break; -@@ -417,4 +422,4 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, - err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); -- if (err) { -- if (!hard) -+ if (err != 0) { -+ if (!TD_HARD(td)) - return (err); -@@ -436,3 +441,3 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - if (pfd->pd_cancel) -- return (EINTR); -+ return (SET_ERROR(EINTR)); - -@@ -500,5 +505,5 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - -- td = kmem_alloc(sizeof(traverse_data_t), KM_PUSHPAGE); -- pd = kmem_zalloc(sizeof(prefetch_data_t), KM_PUSHPAGE); -- czb = kmem_alloc(sizeof(zbookmark_t), KM_PUSHPAGE); -+ td = kmem_alloc(sizeof (traverse_data_t), KM_PUSHPAGE); -+ pd = kmem_zalloc(sizeof (prefetch_data_t), KM_PUSHPAGE); -+ czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE); - -@@ -519,11 +524,20 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - -+ SET_BOOKMARK(czb, td->td_objset, -+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); -+ - /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ -- if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { -- objset_t *os; -+ if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { -+ uint32_t flags = ARC_WAIT; -+ objset_phys_t *osp; -+ arc_buf_t *buf; - -- err = dmu_objset_from_ds(ds, &os); -- if (err) -+ err = arc_read(NULL, td->td_spa, rootbp, -+ arc_getbuf_func, &buf, -+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb); -+ if (err != 0) - return (err); - -- traverse_zil(td, &os->os_zil_header); -+ osp = buf->b_data; -+ traverse_zil(td, &osp->os_zil_header); -+ (void) arc_buf_remove_ref(buf, &buf); - } -@@ -535,4 +549,2 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - -- SET_BOOKMARK(czb, td->td_objset, -- ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - err = traverse_visitbp(td, NULL, rootbp, czb); -@@ -549,5 +561,5 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, - -- kmem_free(czb, sizeof(zbookmark_t)); -- kmem_free(pd, sizeof(struct prefetch_data)); -- kmem_free(td, sizeof(struct traverse_data)); -+ kmem_free(czb, sizeof (zbookmark_t)); -+ kmem_free(pd, sizeof (struct prefetch_data)); -+ kmem_free(td, sizeof (struct traverse_data)); - -@@ -593,3 +605,3 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - txg_start, NULL, flags, func, arg); -- if (err) -+ if (err != 0) - return (err); -@@ -602,3 +614,3 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - err = dmu_object_info(mos, obj, &doi); -- if (err) { -+ if (err != 0) { - if (!hard) -@@ -613,6 +625,6 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ dsl_pool_config_enter(dp, FTAG); - err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- if (err) { -+ dsl_pool_config_exit(dp, FTAG); -+ if (err != 0) { - if (!hard) -@@ -626,3 +638,3 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, - dsl_dataset_rele(ds, FTAG); -- if (err) { -+ if (err != 0) { - if (!hard) -diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c -index fd71413..4f2fae2 100644 ---- a/module/zfs/dmu_tx.c -+++ b/module/zfs/dmu_tx.c -@@ -50,8 +50,7 @@ dmu_tx_stats_t dmu_tx_stats = { - { "dmu_tx_group", KSTAT_DATA_UINT64 }, -- { "dmu_tx_how", KSTAT_DATA_UINT64 }, - { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, - { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, -- { "dmu_tx_memory_inflight", KSTAT_DATA_UINT64 }, - { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, -- { "dmu_tx_write_limit", KSTAT_DATA_UINT64 }, -+ { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, -+ { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, - { "dmu_tx_quota", KSTAT_DATA_UINT64 }, -@@ -66,3 +65,3 @@ dmu_tx_create_dd(dsl_dir_t *dd) - tx->tx_dir = dd; -- if (dd) -+ if (dd != NULL) - tx->tx_pool = dd->dd_pool; -@@ -72,2 +71,3 @@ dmu_tx_create_dd(dsl_dir_t *dd) - offsetof(dmu_tx_callback_t, dcb_node)); -+ tx->tx_start = gethrtime(); - #ifdef DEBUG_DMU_TX -@@ -178,3 +178,3 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) - if (db == NULL) -- return (EIO); -+ return (SET_ERROR(EIO)); - err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); -@@ -389,3 +389,3 @@ out: - 2 * DMU_MAX_ACCESS) -- err = EFBIG; -+ err = SET_ERROR(EFBIG); - -@@ -467,3 +467,3 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) - -- if (blkid >= dn->dn_maxblkid) { -+ if (blkid > dn->dn_maxblkid) { - rw_exit(&dn->dn_struct_rwlock); -@@ -472,3 +472,3 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) - if (blkid 
+ nblks > dn->dn_maxblkid) -- nblks = dn->dn_maxblkid - blkid; -+ nblks = dn->dn_maxblkid - blkid + 1; - -@@ -606,4 +606,3 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) - dnode_t *dn; -- uint64_t start, end, i; -- int err, shift; -+ int err; - zio_t *zio; -@@ -617,10 +616,2 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) - dn = txh->txh_dnode; -- -- /* first block */ -- if (off != 0) -- dmu_tx_count_write(txh, off, 1); -- /* last block */ -- if (len != DMU_OBJECT_END) -- dmu_tx_count_write(txh, off+len, 1); -- - dmu_tx_count_dnode(txh); -@@ -632,12 +623,44 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) - -+ dmu_tx_count_dnode(txh); -+ -+ /* -+ * For i/o error checking, we read the first and last level-0 -+ * blocks if they are not aligned, and all the level-1 blocks. -+ * -+ * Note: dbuf_free_range() assumes that we have not instantiated -+ * any level-0 dbufs that will be completely freed. Therefore we must -+ * exercise care to not read or count the first and last blocks -+ * if they are blocksize-aligned. -+ */ -+ if (dn->dn_datablkshift == 0) { -+ if (off != 0 || len < dn->dn_datablksz) -+ dmu_tx_count_write(txh, 0, dn->dn_datablksz); -+ } else { -+ /* first block will be modified if it is not aligned */ -+ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) -+ dmu_tx_count_write(txh, off, 1); -+ /* last block will be modified if it is not aligned */ -+ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) -+ dmu_tx_count_write(txh, off+len, 1); -+ } -+ - /* -- * For i/o error checking, read the first and last level-0 -- * blocks, and all the level-1 blocks. The above count_write's -- * have already taken care of the level-0 blocks. -+ * Check level-1 blocks. - */ - if (dn->dn_nlevels > 1) { -- shift = dn->dn_datablkshift + dn->dn_indblkshift - -+ int shift = dn->dn_datablkshift + dn->dn_indblkshift - - SPA_BLKPTRSHIFT; -- start = off >> shift; -- end = dn->dn_datablkshift ? ((off+len) >> shift) : 0; -+ uint64_t start = off >> shift; -+ uint64_t end = (off + len) >> shift; -+ uint64_t i; -+ -+ ASSERT(dn->dn_indblkshift != 0); -+ -+ /* -+ * dnode_reallocate() can result in an object with indirect -+ * blocks having an odd data block size. In this case, -+ * just check the single block. -+ */ -+ if (dn->dn_datablkshift == 0) -+ start = end = 0; - -@@ -918,4 +941,140 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) - -+/* -+ * If we can't do 10 iops, something is wrong. Let us go ahead -+ * and hit zfs_dirty_data_max. -+ */ -+hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ -+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ -+ -+/* -+ * We delay transactions when we've determined that the backend storage -+ * isn't able to accommodate the rate of incoming writes. -+ * -+ * If there is already a transaction waiting, we delay relative to when -+ * that transaction finishes waiting. This way the calculated min_time -+ * is independent of the number of threads concurrently executing -+ * transactions. -+ * -+ * If we are the only waiter, wait relative to when the transaction -+ * started, rather than the current time. This credits the transaction for -+ * "time already served", e.g. reading indirect blocks. -+ * -+ * The minimum time for a transaction to take is calculated as: -+ * min_time = scale * (dirty - min) / (max - dirty) -+ * min_time is then capped at zfs_delay_max_ns. -+ * -+ * The delay has two degrees of freedom that can be adjusted via tunables. 
-+ * The percentage of dirty data at which we start to delay is defined by -+ * zfs_delay_min_dirty_percent. This should typically be at or above -+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to -+ * delay after writing at full speed has failed to keep up with the incoming -+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly -+ * speaking, this variable determines the amount of delay at the midpoint of -+ * the curve. -+ * -+ * delay -+ * 10ms +-------------------------------------------------------------*+ -+ * | *| -+ * 9ms + *+ -+ * | *| -+ * 8ms + *+ -+ * | * | -+ * 7ms + * + -+ * | * | -+ * 6ms + * + -+ * | * | -+ * 5ms + * + -+ * | * | -+ * 4ms + * + -+ * | * | -+ * 3ms + * + -+ * | * | -+ * 2ms + (midpoint) * + -+ * | | ** | -+ * 1ms + v *** + -+ * | zfs_delay_scale ----------> ******** | -+ * 0 +-------------------------------------*********----------------+ -+ * 0% <- zfs_dirty_data_max -> 100% -+ * -+ * Note that since the delay is added to the outstanding time remaining on the -+ * most recent transaction, the delay is effectively the inverse of IOPS. -+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve -+ * was chosen such that small changes in the amount of accumulated dirty data -+ * in the first 3/4 of the curve yield relatively small differences in the -+ * amount of delay. -+ * -+ * The effects can be easier to understand when the amount of delay is -+ * represented on a log scale: -+ * -+ * delay -+ * 100ms +-------------------------------------------------------------++ -+ * + + -+ * | | -+ * + *+ -+ * 10ms + *+ -+ * + ** + -+ * | (midpoint) ** | -+ * + | ** + -+ * 1ms + v **** + -+ * + zfs_delay_scale ----------> ***** + -+ * | **** | -+ * + **** + -+ * 100us + ** + -+ * + * + -+ * | * | -+ * + * + -+ * 10us + * + -+ * + + -+ * | | -+ * + + -+ * +--------------------------------------------------------------+ -+ * 0% <- zfs_dirty_data_max -> 100% -+ * -+ * Note here that only as the amount of dirty data approaches its limit does -+ * the delay start to increase rapidly. The goal of a properly tuned system -+ * should be to keep the amount of dirty data out of that range by first -+ * ensuring that the appropriate limits are set for the I/O scheduler to reach -+ * optimal throughput on the backend storage, and then by changing the value -+ * of zfs_delay_scale to increase the steepness of the curve. -+ */ -+static void -+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) -+{ -+ dsl_pool_t *dp = tx->tx_pool; -+ uint64_t delay_min_bytes = -+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; -+ hrtime_t wakeup, min_tx_time, now; -+ -+ if (dirty <= delay_min_bytes) -+ return; -+ -+ /* -+ * The caller has already waited until we are under the max. -+ * We make them pass us the amount of dirty data so we don't -+ * have to handle the case of it being >= the max, which could -+ * cause a divide-by-zero if it's == the max. 
-+ */ -+ ASSERT3U(dirty, <, zfs_dirty_data_max); -+ -+ now = gethrtime(); -+ min_tx_time = zfs_delay_scale * -+ (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); -+ min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); -+ if (now > tx->tx_start + min_tx_time) -+ return; -+ -+ DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, -+ uint64_t, min_tx_time); -+ -+ mutex_enter(&dp->dp_lock); -+ wakeup = MAX(tx->tx_start + min_tx_time, -+ dp->dp_last_wakeup + min_tx_time); -+ dp->dp_last_wakeup = wakeup; -+ mutex_exit(&dp->dp_lock); -+ -+ zfs_sleep_until(wakeup); -+} -+ - static int --dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) -+dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) - { -@@ -947,4 +1106,11 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) - txg_how != TXG_WAIT) -- return (EIO); -+ return (SET_ERROR(EIO)); -+ -+ return (SET_ERROR(ERESTART)); -+ } - -+ if (!tx->tx_waited && -+ dsl_pool_need_dirty_delay(tx->tx_pool)) { -+ tx->tx_wait_dirty = B_TRUE; -+ DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); - return (ERESTART); -@@ -971,3 +1137,3 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) - DMU_TX_STAT_BUMP(dmu_tx_group); -- return (ERESTART); -+ return (SET_ERROR(ERESTART)); - } -@@ -988,11 +1154,2 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) - /* -- * NB: This check must be after we've held the dnodes, so that -- * the dmu_tx_unassign() logic will work properly -- */ -- if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) { -- DMU_TX_STAT_BUMP(dmu_tx_how); -- return (ERESTART); -- } -- -- /* - * If a snapshot has been taken since we made our estimates, -@@ -1051,2 +1208,6 @@ dmu_tx_unassign(dmu_tx_t *tx) - -+ /* -+ * Walk the transaction's hold list, removing the hold on the -+ * associated dnode, and notifying waiters if the refcount drops to 0. -+ */ - for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; -@@ -1078,3 +1239,3 @@ dmu_tx_unassign(dmu_tx_t *tx) - * a new one. This should be used when you're not holding locks. -- * If will only fail if we're truly out of space (or over quota). -+ * It will only fail if we're truly out of space (or over quota). - * -@@ -1085,10 +1246,9 @@ dmu_tx_unassign(dmu_tx_t *tx) - * -- * (3) A specific txg. Use this if you need to ensure that multiple -- * transactions all sync in the same txg. Like TXG_NOWAIT, it -- * returns ERESTART if it can't assign you into the requested txg. -+ * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait() -+ * has already been called on behalf of this operation (though -+ * most likely on a different tx). - */ - int --dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) -+dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) - { -- hrtime_t before, after; - int err; -@@ -1096,6 +1256,11 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) - ASSERT(tx->tx_txg == 0); -- ASSERT(txg_how != 0); -+ ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || -+ txg_how == TXG_WAITED); - ASSERT(!dsl_pool_sync_context(tx->tx_pool)); - -- before = gethrtime(); -+ if (txg_how == TXG_WAITED) -+ tx->tx_waited = B_TRUE; -+ -+ /* If we might wait, we must not hold the config lock. 
*/ -+ ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); - -@@ -1112,7 +1277,2 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) - -- after = gethrtime(); -- -- dsl_pool_tx_assign_add_usecs(tx->tx_pool, -- (after - before) / NSEC_PER_USEC); -- - return (0); -@@ -1124,12 +1284,46 @@ dmu_tx_wait(dmu_tx_t *tx) - spa_t *spa = tx->tx_pool->dp_spa; -+ dsl_pool_t *dp = tx->tx_pool; -+ hrtime_t before; - - ASSERT(tx->tx_txg == 0); -+ ASSERT(!dsl_pool_config_held(tx->tx_pool)); - -- /* -- * It's possible that the pool has become active after this thread -- * has tried to obtain a tx. If that's the case then his -- * tx_lasttried_txg would not have been assigned. -- */ -- if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { -- txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); -+ before = gethrtime(); -+ -+ if (tx->tx_wait_dirty) { -+ uint64_t dirty; -+ -+ /* -+ * dmu_tx_try_assign() has determined that we need to wait -+ * because we've consumed much or all of the dirty buffer -+ * space. -+ */ -+ mutex_enter(&dp->dp_lock); -+ if (dp->dp_dirty_total >= zfs_dirty_data_max) -+ DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); -+ while (dp->dp_dirty_total >= zfs_dirty_data_max) -+ cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); -+ dirty = dp->dp_dirty_total; -+ mutex_exit(&dp->dp_lock); -+ -+ dmu_tx_delay(tx, dirty); -+ -+ tx->tx_wait_dirty = B_FALSE; -+ -+ /* -+ * Note: setting tx_waited only has effect if the caller -+ * used TX_WAIT. Otherwise they are going to destroy -+ * this tx and try again. The common case, zfs_write(), -+ * uses TX_WAIT. -+ */ -+ tx->tx_waited = B_TRUE; -+ } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { -+ /* -+ * If the pool is suspended we need to wait until it -+ * is resumed. Note that it's possible that the pool -+ * has become active after this thread has tried to -+ * obtain a tx. If that's the case then tx_lasttried_txg -+ * would not have been set. -+ */ -+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); - } else if (tx->tx_needassign_txh) { -@@ -1143,4 +1337,10 @@ dmu_tx_wait(dmu_tx_t *tx) - } else { -+ /* -+ * A dnode is assigned to the quiescing txg. Wait for its -+ * transaction to complete. -+ */ - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); - } -+ -+ spa_tx_assign_add_nsecs(spa, gethrtime() - before); - } -@@ -1171,2 +1371,6 @@ dmu_tx_commit(dmu_tx_t *tx) - -+ /* -+ * Go through the transaction's hold list and remove holds on -+ * associated dnodes, notifying waiters if no holds remain. -+ */ - while ((txh = list_head(&tx->tx_holds))) { -@@ -1252,2 +1456,9 @@ dmu_tx_get_txg(dmu_tx_t *tx) - -+dsl_pool_t * -+dmu_tx_pool(dmu_tx_t *tx) -+{ -+ ASSERT(tx->tx_pool != NULL); -+ return (tx->tx_pool); -+} -+ - void -diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c -index 1763bae..9bc9191 100644 ---- a/module/zfs/dmu_zfetch.c -+++ b/module/zfs/dmu_zfetch.c -@@ -25,2 +25,6 @@ - -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
-+ */ -+ - #include -@@ -50,3 +54,3 @@ unsigned long zfetch_array_rd_sz = 1024 * 1024; - /* forward decls for static routines */ --static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); -+static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *); - static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); -@@ -54,3 +58,3 @@ static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); - static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); --static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); -+static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int); - static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); -@@ -106,5 +110,5 @@ kstat_t *zfetch_ksp; - * -- * If no co-linear streams are found, return NULL. -+ * Returns whether co-linear streams were found. - */ --static int -+static boolean_t - dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) -@@ -136,3 +140,4 @@ dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) - z_walk->zst_offset = zh->zst_offset; -- z_walk->zst_direction = diff < 0 ? -1 : 1; -+ z_walk->zst_direction = diff < 0 ? -+ ZFETCH_BACKWARD : ZFETCH_FORWARD; - z_walk->zst_stride = -@@ -154,3 +159,4 @@ dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) - z_walk->zst_offset = zh->zst_offset; -- z_walk->zst_direction = diff < 0 ? -1 : 1; -+ z_walk->zst_direction = diff < 0 ? -+ ZFETCH_BACKWARD : ZFETCH_FORWARD; - z_walk->zst_stride = -@@ -289,3 +295,3 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) - for (i = 0; i < fetchsz; i++) { -- dbuf_prefetch(dn, blkid + i); -+ dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); - } -@@ -328,3 +334,3 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) - */ --static int -+static boolean_t - dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) -@@ -641,3 +647,3 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) - zstream_t *newstream; -- int fetched; -+ boolean_t fetched; - int inserted; -@@ -701,3 +707,4 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) - } -- newstream = kmem_zalloc(sizeof (zstream_t), KM_PUSHPAGE); -+ newstream = -+ kmem_zalloc(sizeof (zstream_t), KM_PUSHPAGE); - } -@@ -741,2 +748 @@ MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read"); - #endif -- -diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c -index d8d6651..25c7775 100644 ---- a/module/zfs/dnode.c -+++ b/module/zfs/dnode.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -76,3 +76,7 @@ dnode_cons(void *arg, void *unused, int kmflag) - -- refcount_create(&dn->dn_holds); -+ /* -+ * Every dbuf has a reference, and dropping a tracked reference is -+ * O(number of references), so don't track dn_holds. 
-+ */ -+ refcount_create_untracked(&dn->dn_holds); - refcount_create(&dn->dn_tx_holds); -@@ -115,2 +119,3 @@ dnode_cons(void *arg, void *unused, int kmflag) - dn->dn_dbufs_count = 0; -+ dn->dn_unlisted_l0_blkid = 0; - list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), -@@ -167,2 +172,3 @@ dnode_dest(void *arg, void *unused) - ASSERT0(dn->dn_dbufs_count); -+ ASSERT0(dn->dn_unlisted_l0_blkid); - list_destroy(&dn->dn_dbufs); -@@ -470,2 +476,3 @@ dnode_destroy(dnode_t *dn) - dn->dn_id_flags = 0; -+ dn->dn_unlisted_l0_blkid = 0; - -@@ -701,2 +708,3 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) - ndn->dn_dbufs_count = odn->dn_dbufs_count; -+ ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid; - ndn->dn_bonus = odn->dn_bonus; -@@ -735,2 +743,3 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) - odn->dn_dbufs_count = 0; -+ odn->dn_unlisted_l0_blkid = 0; - odn->dn_bonus = NULL; -@@ -1029,8 +1038,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, - if (dn == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - type = dn->dn_type; - if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - DNODE_VERIFY(dn); -@@ -1042,3 +1051,3 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, - if (object == 0 || object >= DN_MAX_OBJECT) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1060,3 +1069,3 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, - if (db == NULL) -- return (EIO); -+ return (SET_ERROR(EIO)); - err = dbuf_read(db, NULL, DB_RF_CANFAIL); -@@ -1369,3 +1378,3 @@ fail: - rw_exit(&dn->dn_struct_rwlock); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1522,3 +1531,3 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) - -- if (len == -1ULL) { -+ if (len == DMU_OBJECT_END) { - len = UINT64_MAX - off; -@@ -1782,5 +1791,4 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) - /* -- * Call when we think we're going to write/free space in open context. -- * Be conservative (ie. OK to write less than this or free more than -- * this, but don't write more or free less). -+ * Call when we think we're going to write/free space in open context to track -+ * the amount of memory in use by the currently open txg. - */ -@@ -1791,10 +1799,10 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) - dsl_dataset_t *ds = os->os_dsl_dataset; -+ int64_t aspace = spa_get_asize(os->os_spa, space); - -- if (space > 0) -- space = spa_get_asize(os->os_spa, space); -- -- if (ds) -- dsl_dir_willuse_space(ds->ds_dir, space, tx); -+ if (ds != NULL) { -+ dsl_dir_willuse_space(ds->ds_dir, aspace, tx); -+ dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); -+ } - -- dmu_tx_willuse_space(tx, space); -+ dmu_tx_willuse_space(tx, aspace); - } -@@ -1802,10 +1810,12 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) - /* -- * This function scans a block at the indicated "level" looking for -- * a hole or data (depending on 'flags'). If level > 0, then we are -- * scanning an indirect block looking at its pointers. If level == 0, -- * then we are looking at a block of dnodes. If we don't find what we -- * are looking for in the block, we return ESRCH. Otherwise, return -- * with *offset pointing to the beginning (if searching forwards) or -- * end (if searching backwards) of the range covered by the block -- * pointer we matched on (or dnode). 
-+ * Scans a block at the indicated "level" looking for a hole or data, -+ * depending on 'flags'. -+ * -+ * If level > 0, then we are scanning an indirect block looking at its -+ * pointers. If level == 0, then we are looking at a block of dnodes. -+ * -+ * If we don't find what we are looking for in the block, we return ESRCH. -+ * Otherwise, return with *offset pointing to the beginning (if searching -+ * forwards) or end (if searching backwards) of the range covered by the -+ * block pointer we matched on (or dnode). - * -@@ -1855,3 +1865,3 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - */ -- return (ESRCH); -+ return (SET_ERROR(ESRCH)); - } -@@ -1871,3 +1881,3 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - */ -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } else if (lvl == 0) { -@@ -1884,3 +1894,3 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - if (i < 0 || i == blkfill) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } else { -@@ -1916,3 +1926,3 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - if (i < 0 || i >= epb) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } -@@ -1960,3 +1970,3 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, - if (dn->dn_phys->dn_nlevels == 0) { -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - goto out; -@@ -1969,3 +1979,3 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, - } else { -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } -@@ -1990,3 +2000,3 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, - initial_offset < *offset : initial_offset > *offset)) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - out: -diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c -index 76e6037..0ff25d2 100644 ---- a/module/zfs/dnode_sync.c -+++ b/module/zfs/dnode_sync.c -@@ -304,3 +304,3 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, - /* -- * free_range: Traverse the indicated range of the provided file -+ * Traverse the indicated range of the provided file - * and "free" all the blocks contained there. -@@ -372,3 +372,3 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) - /* -- * Try to kick all the dnodes dbufs out of the cache... -+ * Try to kick all the dnode's dbufs out of the cache... - */ -@@ -483,2 +483,3 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); -+ ASSERT3P(dn->dn_bonus, ==, NULL); - -diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c -index d9e8bd3..5eb8c01 100644 ---- a/module/zfs/dsl_dataset.c -+++ b/module/zfs/dsl_dataset.c -@@ -22,4 +22,5 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2014 RackTop Systems. 
- */ -@@ -47,8 +48,4 @@ - #include -- --static char *dsl_reaper = "the grim reaper"; -- --static dsl_checkfunc_t dsl_dataset_destroy_begin_check; --static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; --static dsl_syncfunc_t dsl_dataset_set_reservation_sync; -+#include -+#include - -@@ -65,5 +62,2 @@ static dsl_syncfunc_t dsl_dataset_set_reservation_sync; - --#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) -- -- - /* -@@ -111,5 +105,4 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) - } -- dmu_buf_will_dirty(ds->ds_dbuf, tx); - -- mutex_enter(&ds->ds_dir->dd_lock); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_lock); -@@ -125,3 +118,2 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) - DD_USED_REFRSRV, DD_USED_HEAD, tx); -- mutex_exit(&ds->ds_dir->dd_lock); - } -@@ -162,3 +154,2 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, - -- mutex_enter(&ds->ds_dir->dd_lock); - mutex_enter(&ds->ds_lock); -@@ -173,3 +164,2 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, - DD_USED_REFRSRV, DD_USED_HEAD, tx); -- mutex_exit(&ds->ds_dir->dd_lock); - } else { -@@ -258,3 +248,3 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - -- ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); -+ ASSERT(ds->ds_owner == NULL); - -@@ -266,3 +256,3 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - if (ds->ds_prev) { -- dsl_dataset_drop_ref(ds->ds_prev, ds); -+ dsl_dataset_rele(ds->ds_prev, ds); - ds->ds_prev = NULL; -@@ -271,10 +261,6 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - bplist_destroy(&ds->ds_pending_deadlist); -- if (db != NULL) { -+ if (ds->ds_phys->ds_deadlist_obj != 0) - dsl_deadlist_close(&ds->ds_deadlist); -- } else { -- ASSERT(ds->ds_deadlist.dl_dbuf == NULL); -- ASSERT(!ds->ds_deadlist.dl_oldfmt); -- } - if (ds->ds_dir) -- dsl_dir_close(ds->ds_dir, ds); -+ dsl_dir_rele(ds->ds_dir, ds); - -@@ -283,6 +269,4 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - mutex_destroy(&ds->ds_lock); -- mutex_destroy(&ds->ds_recvlock); - mutex_destroy(&ds->ds_opening_lock); -- rw_destroy(&ds->ds_rwlock); -- cv_destroy(&ds->ds_exclusive_cv); -+ refcount_destroy(&ds->ds_longholds); - -@@ -291,3 +275,3 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) - --static int -+int - dsl_dataset_get_snapname(dsl_dataset_t *ds) -@@ -307,3 +291,3 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) - FTAG, &headdbuf); -- if (err) -+ if (err != 0) - return (err); -@@ -336,4 +320,4 @@ dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) - --static int --dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) -+int -+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) - { -@@ -357,4 +341,4 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) - --static int --dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, -+int -+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **dsp) -@@ -367,7 +351,6 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - -- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || -- dsl_pool_sync_context(dp)); -+ ASSERT(dsl_pool_config_held(dp)); - - err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); -- if (err) -+ if (err != 0) - return (err); -@@ -376,4 +359,6 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dmu_object_info_from_db(dbuf, &doi); -- if (doi.doi_type != DMU_OT_DSL_DATASET) -- return (EINVAL); -+ if 
(doi.doi_type != DMU_OT_DSL_DATASET) { -+ dmu_buf_rele(dbuf, tag); -+ return (SET_ERROR(EINVAL)); -+ } - -@@ -390,8 +375,5 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); -- mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); -- -- rw_init(&ds->ds_rwlock, NULL, RW_DEFAULT, NULL); -- cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); -+ refcount_create(&ds->ds_longholds); - -@@ -405,11 +387,9 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - if (err == 0) { -- err = dsl_dir_open_obj(dp, -+ err = dsl_dir_hold_obj(dp, - ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); - } -- if (err) { -+ if (err != 0) { - mutex_destroy(&ds->ds_lock); -- mutex_destroy(&ds->ds_recvlock); - mutex_destroy(&ds->ds_opening_lock); -- rw_destroy(&ds->ds_rwlock); -- cv_destroy(&ds->ds_exclusive_cv); -+ refcount_destroy(&ds->ds_longholds); - bplist_destroy(&ds->ds_pending_deadlist); -@@ -423,4 +403,4 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - ds->ds_snapname[0] = '\0'; -- if (ds->ds_phys->ds_prev_snap_obj) { -- err = dsl_dataset_get_ref(dp, -+ if (ds->ds_phys->ds_prev_snap_obj != 0) { -+ err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, -@@ -440,25 +420,10 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - if (err == 0 && !dsl_dataset_is_snapshot(ds)) { -- /* -- * In sync context, we're called with either no lock -- * or with the write lock. If we're not syncing, -- * we're always called with the read lock held. -- */ -- boolean_t need_lock = -- !RW_WRITE_HELD(&dp->dp_config_rwlock) && -- dsl_pool_sync_context(dp); -- -- if (need_lock) -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- -- err = dsl_prop_get_ds(ds, -- "refreservation", sizeof (uint64_t), 1, -- &ds->ds_reserved, NULL); -+ err = dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), -+ &ds->ds_reserved); - if (err == 0) { -- err = dsl_prop_get_ds(ds, -- "refquota", sizeof (uint64_t), 1, -- &ds->ds_quota, NULL); -+ err = dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), -+ &ds->ds_quota); - } -- -- if (need_lock) -- rw_exit(&dp->dp_config_rwlock); - } else { -@@ -467,7 +432,4 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - -- if (err == 0) { -- winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, -- dsl_dataset_evict); -- } -- if (err || winner) { -+ if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds, -+ &ds->ds_phys, dsl_dataset_evict)) != NULL) { - bplist_destroy(&ds->ds_pending_deadlist); -@@ -475,11 +437,9 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - if (ds->ds_prev) -- dsl_dataset_drop_ref(ds->ds_prev, ds); -- dsl_dir_close(ds->ds_dir, ds); -+ dsl_dataset_rele(ds->ds_prev, ds); -+ dsl_dir_rele(ds->ds_dir, ds); - mutex_destroy(&ds->ds_lock); -- mutex_destroy(&ds->ds_recvlock); - mutex_destroy(&ds->ds_opening_lock); -- rw_destroy(&ds->ds_rwlock); -- cv_destroy(&ds->ds_exclusive_cv); -+ refcount_destroy(&ds->ds_longholds); - kmem_free(ds, sizeof (dsl_dataset_t)); -- if (err) { -+ if (err != 0) { - dmu_buf_rele(dbuf, tag); -@@ -498,9 +458,2 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); -- mutex_enter(&ds->ds_lock); -- if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { -- mutex_exit(&ds->ds_lock); -- 
dmu_buf_rele(ds->ds_dbuf, tag); -- return (ENOENT); -- } -- mutex_exit(&ds->ds_lock); - *dsp = ds; -@@ -509,87 +462,7 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, - --static int --dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) --{ -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- -- /* -- * In syncing context we don't want the rwlock lock: there -- * may be an existing writer waiting for sync phase to -- * finish. We don't need to worry about such writers, since -- * sync phase is single-threaded, so the writer can't be -- * doing anything while we are active. -- */ -- if (dsl_pool_sync_context(dp)) { -- ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); -- return (0); -- } -- -- /* -- * Normal users will hold the ds_rwlock as a READER until they -- * are finished (i.e., call dsl_dataset_rele()). "Owners" will -- * drop their READER lock after they set the ds_owner field. -- * -- * If the dataset is being destroyed, the destroy thread will -- * obtain a WRITER lock for exclusive access after it's done its -- * open-context work and then change the ds_owner to -- * dsl_reaper once destruction is assured. So threads -- * may block here temporarily, until the "destructability" of -- * the dataset is determined. -- */ -- ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); -- mutex_enter(&ds->ds_lock); -- while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { -- rw_exit(&dp->dp_config_rwlock); -- cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); -- if (DSL_DATASET_IS_DESTROYED(ds)) { -- mutex_exit(&ds->ds_lock); -- dsl_dataset_drop_ref(ds, tag); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- return (ENOENT); -- } -- /* -- * The dp_config_rwlock lives above the ds_lock. And -- * we need to check DSL_DATASET_IS_DESTROYED() while -- * holding the ds_lock, so we have to drop and reacquire -- * the ds_lock here. 
-- */ -- mutex_exit(&ds->ds_lock); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- mutex_enter(&ds->ds_lock); -- } -- mutex_exit(&ds->ds_lock); -- return (0); --} -- --int --dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, -- dsl_dataset_t **dsp) --{ -- int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); -- -- if (err) -- return (err); -- return (dsl_dataset_hold_ref(*dsp, tag)); --} -- - int --dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, -+dsl_dataset_hold(dsl_pool_t *dp, const char *name, - void *tag, dsl_dataset_t **dsp) - { -- int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); -- if (err) -- return (err); -- if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { -- dsl_dataset_rele(*dsp, tag); -- *dsp = NULL; -- return (EBUSY); -- } -- return (0); --} -- --int --dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) --{ - dsl_dir_t *dd; -- dsl_pool_t *dp; - const char *snapname; -@@ -598,17 +471,12 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - -- err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); -- if (err) -+ err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); -+ if (err != 0) - return (err); - -- dp = dd->dd_pool; -+ ASSERT(dsl_pool_config_held(dp)); - obj = dd->dd_phys->dd_head_dataset_obj; -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- if (obj) -- err = dsl_dataset_get_ref(dp, obj, tag, dsp); -+ if (obj != 0) -+ err = dsl_dataset_hold_obj(dp, obj, tag, dsp); - else -- err = ENOENT; -- if (err) -- goto out; -- -- err = dsl_dataset_hold_ref(*dsp, tag); -+ err = SET_ERROR(ENOENT); - -@@ -616,3 +484,3 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - if (err == 0 && snapname != NULL) { -- dsl_dataset_t *ds = NULL; -+ dsl_dataset_t *ds; - -@@ -620,4 +488,4 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - dsl_dataset_rele(*dsp, tag); -- err = ENOENT; -- goto out; -+ dsl_dir_rele(dd, FTAG); -+ return (SET_ERROR(ENOENT)); - } -@@ -627,8 +495,6 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - if (err == 0) -- err = dsl_dataset_get_ref(dp, obj, tag, &ds); -+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds); - dsl_dataset_rele(*dsp, tag); - -- ASSERT3U((err == 0), ==, (ds != NULL)); -- -- if (ds) { -+ if (err == 0) { - mutex_enter(&ds->ds_lock); -@@ -638,9 +504,7 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) - mutex_exit(&ds->ds_lock); -- err = dsl_dataset_hold_ref(ds, tag); -- *dsp = err ? 
NULL : ds; -+ *dsp = ds; - } - } --out: -- rw_exit(&dp->dp_config_rwlock); -- dsl_dir_close(dd, FTAG); -+ -+ dsl_dir_rele(dd, FTAG); - return (err); -@@ -649,11 +513,12 @@ out: - int --dsl_dataset_own(const char *name, boolean_t inconsistentok, -+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, - void *tag, dsl_dataset_t **dsp) - { -- int err = dsl_dataset_hold(name, tag, dsp); -- if (err) -+ int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); -+ if (err != 0) - return (err); -- if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { -+ if (!dsl_dataset_tryown(*dsp, tag)) { - dsl_dataset_rele(*dsp, tag); -- return (EBUSY); -+ *dsp = NULL; -+ return (SET_ERROR(EBUSY)); - } -@@ -662,2 +527,45 @@ dsl_dataset_own(const char *name, boolean_t inconsistentok, - -+int -+dsl_dataset_own(dsl_pool_t *dp, const char *name, -+ void *tag, dsl_dataset_t **dsp) -+{ -+ int err = dsl_dataset_hold(dp, name, tag, dsp); -+ if (err != 0) -+ return (err); -+ if (!dsl_dataset_tryown(*dsp, tag)) { -+ dsl_dataset_rele(*dsp, tag); -+ return (SET_ERROR(EBUSY)); -+ } -+ return (0); -+} -+ -+/* -+ * See the comment above dsl_pool_hold() for details. In summary, a long -+ * hold is used to prevent destruction of a dataset while the pool hold -+ * is dropped, allowing other concurrent operations (e.g. spa_sync()). -+ * -+ * The dataset and pool must be held when this function is called. After it -+ * is called, the pool hold may be released while the dataset is still held -+ * and accessed. -+ */ -+void -+dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) -+{ -+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); -+ (void) refcount_add(&ds->ds_longholds, tag); -+} -+ -+void -+dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) -+{ -+ (void) refcount_remove(&ds->ds_longholds, tag); -+} -+ -+/* Return B_TRUE if there are any long holds on this dataset. 
*/ -+boolean_t -+dsl_dataset_long_held(dsl_dataset_t *ds) -+{ -+ return (!refcount_is_zero(&ds->ds_longholds)); -+} -+ - void -@@ -669,3 +577,3 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) - dsl_dir_name(ds->ds_dir, name); -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -+ VERIFY0(dsl_dataset_get_snapname(ds)); - if (ds->ds_snapname[0]) { -@@ -687,33 +595,2 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) - --static int --dsl_dataset_namelen(dsl_dataset_t *ds) --{ -- int result; -- -- if (ds == NULL) { -- result = 3; /* "mos" */ -- } else { -- result = dsl_dir_namelen(ds->ds_dir); -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -- if (ds->ds_snapname[0]) { -- ++result; /* adding one for the @-sign */ -- if (!MUTEX_HELD(&ds->ds_lock)) { -- mutex_enter(&ds->ds_lock); -- result += strlen(ds->ds_snapname); -- mutex_exit(&ds->ds_lock); -- } else { -- result += strlen(ds->ds_snapname); -- } -- } -- } -- -- return (result); --} -- --void --dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) --{ -- dmu_buf_rele(ds->ds_dbuf, tag); --} -- - void -@@ -721,6 +598,3 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag) - { -- if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { -- rw_exit(&ds->ds_rwlock); -- } -- dsl_dataset_drop_ref(ds, tag); -+ dmu_buf_rele(ds->ds_dbuf, tag); - } -@@ -730,4 +604,3 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag) - { -- ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || -- (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); -+ ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL); - -@@ -735,9 +608,6 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag) - ds->ds_owner = NULL; -- if (RW_WRITE_HELD(&ds->ds_rwlock)) { -- rw_exit(&ds->ds_rwlock); -- cv_broadcast(&ds->ds_exclusive_cv); -- } - mutex_exit(&ds->ds_lock); -- if (ds->ds_dbuf) -- dsl_dataset_drop_ref(ds, tag); -+ dsl_dataset_long_rele(ds, tag); -+ if (ds->ds_dbuf != NULL) -+ dsl_dataset_rele(ds, tag); - else -@@ -747,3 +617,3 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag) - boolean_t --dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) -+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) - { -@@ -752,7 +622,5 @@ dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) - mutex_enter(&ds->ds_lock); -- if (ds->ds_owner == NULL && -- (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { -+ if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { - ds->ds_owner = tag; -- if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) -- rw_exit(&ds->ds_rwlock); -+ dsl_dataset_long_hold(ds, tag); - gotit = TRUE; -@@ -763,10 +631,2 @@ dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) - --void --dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) --{ -- ASSERT3P(owner, ==, ds->ds_owner); -- if (!RW_WRITE_HELD(&ds->ds_rwlock)) -- rw_enter(&ds->ds_rwlock, RW_WRITER); --} -- - uint64_t -@@ -791,3 +651,3 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); -- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); -+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); -@@ -809,3 +669,3 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - } else { -- dsl_dataset_t *ohds; -+ dsl_dataset_t *ohds; /* head of the origin snapshot */ - -@@ -826,3 +686,3 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, -+ VERIFY0(dsl_dataset_hold_obj(dp, - origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); -@@ 
-838,5 +698,4 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - } -- VERIFY(0 == zap_add_int(mos, -- origin->ds_phys->ds_next_clones_obj, -- dsobj, tx)); -+ VERIFY0(zap_add_int(mos, -+ origin->ds_phys->ds_next_clones_obj, dsobj, tx)); - } -@@ -852,3 +711,3 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - } -- VERIFY3U(0, ==, zap_add_int(mos, -+ VERIFY0(zap_add_int(mos, - origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); -@@ -868,2 +727,12 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - -+static void -+dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) -+{ -+ objset_t *os; -+ -+ VERIFY0(dmu_objset_from_ds(ds, &os)); -+ bzero(&os->os_zil_header, sizeof (os->os_zil_header)); -+ dsl_dataset_dirty(ds, tx); -+} -+ - uint64_t -@@ -876,2 +745,3 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - -+ ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(lastname[0] != '@'); -@@ -879,5 +749,6 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); -- VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); -+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); - -- dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); -+ dsobj = dsl_dataset_create_sync_dd(dd, origin, -+ flags & ~DS_CREATE_FLAG_NODIRTY, tx); - -@@ -885,3 +756,3 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); - -@@ -891,10 +762,7 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - */ -- if (origin != NULL) { -+ if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { - dsl_dataset_t *ds; -- objset_t *os; - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -- VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); -- bzero(&os->os_zil_header, sizeof (os->os_zil_header)); -- dsl_dataset_dirty(ds, tx); -+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -+ dsl_dataset_zero_zil(ds, tx); - dsl_dataset_rele(ds, FTAG); -@@ -906,331 +774,2 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - /* -- * The snapshots must all be in the same pool. 
-- */ --int --dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed) --{ -- int err; -- dsl_sync_task_t *dst; -- spa_t *spa; -- nvpair_t *pair; -- dsl_sync_task_group_t *dstg; -- -- pair = nvlist_next_nvpair(snaps, NULL); -- if (pair == NULL) -- return (0); -- -- err = spa_open(nvpair_name(pair), &spa, FTAG); -- if (err) -- return (err); -- dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- -- for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -- pair = nvlist_next_nvpair(snaps, pair)) { -- dsl_dataset_t *ds; -- -- err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds); -- if (err == 0) { -- struct dsl_ds_destroyarg *dsda; -- -- dsl_dataset_make_exclusive(ds, dstg); -- dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), -- KM_SLEEP); -- dsda->ds = ds; -- dsda->defer = defer; -- dsl_sync_task_create(dstg, dsl_dataset_destroy_check, -- dsl_dataset_destroy_sync, dsda, dstg, 0); -- } else if (err == ENOENT) { -- err = 0; -- } else { -- (void) strcpy(failed, nvpair_name(pair)); -- break; -- } -- } -- -- if (err == 0) -- err = dsl_sync_task_group_wait(dstg); -- -- for (dst = list_head(&dstg->dstg_tasks); dst; -- dst = list_next(&dstg->dstg_tasks, dst)) { -- struct dsl_ds_destroyarg *dsda = dst->dst_arg1; -- dsl_dataset_t *ds = dsda->ds; -- -- /* -- * Return the file system name that triggered the error -- */ -- if (dst->dst_err) { -- dsl_dataset_name(ds, failed); -- } -- ASSERT3P(dsda->rm_origin, ==, NULL); -- dsl_dataset_disown(ds, dstg); -- kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); -- } -- -- dsl_sync_task_group_destroy(dstg); -- spa_close(spa, FTAG); -- return (err); -- --} -- --static boolean_t --dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) --{ -- boolean_t might_destroy = B_FALSE; -- -- mutex_enter(&ds->ds_lock); -- if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && -- DS_IS_DEFER_DESTROY(ds)) -- might_destroy = B_TRUE; -- mutex_exit(&ds->ds_lock); -- -- return (might_destroy); --} -- --/* -- * If we're removing a clone, and these three conditions are true: -- * 1) the clone's origin has no other children -- * 2) the clone's origin has no user references -- * 3) the clone's origin has been marked for deferred destruction -- * Then, prepare to remove the origin as part of this sync task group. -- */ --static int --dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) --{ -- dsl_dataset_t *ds = dsda->ds; -- dsl_dataset_t *origin = ds->ds_prev; -- -- if (dsl_dataset_might_destroy_origin(origin)) { -- char *name; -- int namelen; -- int error; -- -- namelen = dsl_dataset_namelen(origin) + 1; -- name = kmem_alloc(namelen, KM_SLEEP); -- dsl_dataset_name(origin, name); --#ifdef _KERNEL -- error = zfs_unmount_snap(name, NULL); -- if (error) { -- kmem_free(name, namelen); -- return (error); -- } --#endif -- error = dsl_dataset_own(name, B_TRUE, tag, &origin); -- kmem_free(name, namelen); -- if (error) -- return (error); -- dsda->rm_origin = origin; -- dsl_dataset_make_exclusive(origin, tag); -- } -- -- return (0); --} -- --/* -- * ds must be opened as OWNER. On return (whether successful or not), -- * ds will be closed and caller can no longer dereference it. 
-- */ --int --dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) --{ -- int err; -- dsl_sync_task_group_t *dstg; -- objset_t *os; -- dsl_dir_t *dd; -- uint64_t obj; -- struct dsl_ds_destroyarg dsda = { 0 }; -- dsl_dataset_t *dummy_ds; -- -- dsda.ds = ds; -- -- if (dsl_dataset_is_snapshot(ds)) { -- /* Destroying a snapshot is simpler */ -- dsl_dataset_make_exclusive(ds, tag); -- -- dsda.defer = defer; -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_destroy_check, dsl_dataset_destroy_sync, -- &dsda, tag, 0); -- ASSERT3P(dsda.rm_origin, ==, NULL); -- goto out; -- } else if (defer) { -- err = EINVAL; -- goto out; -- } -- -- dd = ds->ds_dir; -- dummy_ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); -- dummy_ds->ds_dir = dd; -- dummy_ds->ds_object = ds->ds_object; -- -- if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds), -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { -- /* -- * Check for errors and mark this ds as inconsistent, in -- * case we crash while freeing the objects. -- */ -- err = dsl_sync_task_do(dd->dd_pool, -- dsl_dataset_destroy_begin_check, -- dsl_dataset_destroy_begin_sync, ds, NULL, 0); -- if (err) -- goto out_free; -- -- err = dmu_objset_from_ds(ds, &os); -- if (err) -- goto out_free; -- -- /* -- * Remove all objects while in the open context so that -- * there is less work to do in the syncing context. -- */ -- for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, -- ds->ds_phys->ds_prev_snap_txg)) { -- /* -- * Ignore errors, if there is not enough disk space -- * we will deal with it in dsl_dataset_destroy_sync(). -- */ -- (void) dmu_free_object(os, obj); -- } -- if (err != ESRCH) -- goto out_free; -- -- /* -- * Sync out all in-flight IO. -- */ -- txg_wait_synced(dd->dd_pool, 0); -- -- /* -- * If we managed to free all the objects in open -- * context, the user space accounting should be zero. -- */ -- if (ds->ds_phys->ds_bp.blk_fill == 0 && -- dmu_objset_userused_enabled(os)) { -- ASSERTV(uint64_t count); -- -- ASSERT(zap_count(os, DMU_USERUSED_OBJECT, -- &count) != 0 || count == 0); -- ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, -- &count) != 0 || count == 0); -- } -- } -- -- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); -- err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); -- rw_exit(&dd->dd_pool->dp_config_rwlock); -- -- if (err) -- goto out_free; -- -- /* -- * Blow away the dsl_dir + head dataset. -- */ -- dsl_dataset_make_exclusive(ds, tag); -- /* -- * If we're removing a clone, we might also need to remove its -- * origin. -- */ -- do { -- dsda.need_prep = B_FALSE; -- if (dsl_dir_is_clone(dd)) { -- err = dsl_dataset_origin_rm_prep(&dsda, tag); -- if (err) { -- dsl_dir_close(dd, FTAG); -- goto out_free; -- } -- } -- -- dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); -- dsl_sync_task_create(dstg, dsl_dataset_destroy_check, -- dsl_dataset_destroy_sync, &dsda, tag, 0); -- dsl_sync_task_create(dstg, dsl_dir_destroy_check, -- dsl_dir_destroy_sync, dummy_ds, FTAG, 0); -- err = dsl_sync_task_group_wait(dstg); -- dsl_sync_task_group_destroy(dstg); -- -- /* -- * We could be racing against 'zfs release' or 'zfs destroy -d' -- * on the origin snap, in which case we can get EBUSY if we -- * needed to destroy the origin snap but were not ready to -- * do so. 
-- */ -- if (dsda.need_prep) { -- ASSERT(err == EBUSY); -- ASSERT(dsl_dir_is_clone(dd)); -- ASSERT(dsda.rm_origin == NULL); -- } -- } while (dsda.need_prep); -- -- if (dsda.rm_origin != NULL) -- dsl_dataset_disown(dsda.rm_origin, tag); -- -- /* if it is successful, dsl_dir_destroy_sync will close the dd */ -- if (err) -- dsl_dir_close(dd, FTAG); -- --out_free: -- kmem_free(dummy_ds, sizeof (dsl_dataset_t)); --out: -- dsl_dataset_disown(ds, tag); -- return (err); --} -- --blkptr_t * --dsl_dataset_get_blkptr(dsl_dataset_t *ds) --{ -- return (&ds->ds_phys->ds_bp); --} -- --void --dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) --{ -- ASSERT(dmu_tx_is_syncing(tx)); -- /* If it's the meta-objset, set dp_meta_rootbp */ -- if (ds == NULL) { -- tx->tx_pool->dp_meta_rootbp = *bp; -- } else { -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_phys->ds_bp = *bp; -- } --} -- --spa_t * --dsl_dataset_get_spa(dsl_dataset_t *ds) --{ -- return (ds->ds_dir->dd_pool->dp_spa); --} -- --void --dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) --{ -- dsl_pool_t *dp; -- -- if (ds == NULL) /* this is the meta-objset */ -- return; -- -- ASSERT(ds->ds_objset != NULL); -- -- if (ds->ds_phys->ds_next_snap_obj != 0) -- panic("dirtying snapshot!"); -- -- dp = ds->ds_dir->dd_pool; -- -- if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { -- /* up the hold count until we can be written out */ -- dmu_buf_add_ref(ds->ds_dbuf, ds); -- } --} -- --boolean_t --dsl_dataset_is_dirty(dsl_dataset_t *ds) --{ -- int t; -- -- for (t = 0; t < TXG_SIZE; t++) { -- if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, -- ds, t)) -- return (B_TRUE); -- } -- return (B_FALSE); --} -- --/* - * The unique space in the head dataset can be calculated by subtracting -@@ -1242,3 +781,3 @@ dsl_dataset_is_dirty(dsl_dataset_t *ds) - */ --static void -+void - dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) -@@ -1266,232 +805,6 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) - --struct killarg { -- dsl_dataset_t *ds; -- dmu_tx_t *tx; --}; -- --/* ARGSUSED */ --static int --kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, -- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) --{ -- struct killarg *ka = arg; -- dmu_tx_t *tx = ka->tx; -- -- if (bp == NULL) -- return (0); -- -- if (zb->zb_level == ZB_ZIL_LEVEL) { -- ASSERT(zilog != NULL); -- /* -- * It's a block in the intent log. It has no -- * accounting, so just free it. -- */ -- dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); -- } else { -- ASSERT(zilog == NULL); -- ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); -- (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); -- } -- -- return (0); --} -- --/* ARGSUSED */ --static int --dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -- uint64_t count; -- int err; -- -- /* -- * Can't delete a head dataset if there are snapshots of it. -- * (Except if the only snapshots are from the branch we cloned -- * from.) -- */ -- if (ds->ds_prev != NULL && -- ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) -- return (EBUSY); -- -- /* -- * This is really a dsl_dir thing, but check it here so that -- * we'll be less likely to leave this dataset inconsistent & -- * nearly destroyed. 
-- */ -- err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); -- if (err) -- return (err); -- if (count != 0) -- return (EEXIST); -- -- return (0); --} -- --/* ARGSUSED */ --static void --dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- -- /* Mark it as inconsistent on-disk, in case we crash */ -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; -- -- spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, -- "dataset = %llu", ds->ds_object); --} -- --static int --dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, -+void -+dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, - dmu_tx_t *tx) - { -- dsl_dataset_t *ds = dsda->ds; -- dsl_dataset_t *ds_prev = ds->ds_prev; -- -- if (dsl_dataset_might_destroy_origin(ds_prev)) { -- struct dsl_ds_destroyarg ndsda = {0}; -- -- /* -- * If we're not prepared to remove the origin, don't remove -- * the clone either. -- */ -- if (dsda->rm_origin == NULL) { -- dsda->need_prep = B_TRUE; -- return (EBUSY); -- } -- -- ndsda.ds = ds_prev; -- ndsda.is_origin_rm = B_TRUE; -- return (dsl_dataset_destroy_check(&ndsda, tag, tx)); -- } -- -- /* -- * If we're not going to remove the origin after all, -- * undo the open context setup. -- */ -- if (dsda->rm_origin != NULL) { -- dsl_dataset_disown(dsda->rm_origin, tag); -- dsda->rm_origin = NULL; -- } -- -- return (0); --} -- --/* -- * If you add new checks here, you may need to add -- * additional checks to the "temporary" case in -- * snapshot_check() in dmu_objset.c. -- */ --/* ARGSUSED */ --int --dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- struct dsl_ds_destroyarg *dsda = arg1; -- dsl_dataset_t *ds = dsda->ds; -- -- /* we have an owner hold, so noone else can destroy us */ -- ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); -- -- /* -- * Only allow deferred destroy on pools that support it. -- * NOTE: deferred destroy is only supported on snapshots. -- */ -- if (dsda->defer) { -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < -- SPA_VERSION_USERREFS) -- return (ENOTSUP); -- ASSERT(dsl_dataset_is_snapshot(ds)); -- return (0); -- } -- -- /* -- * Can't delete a head dataset if there are snapshots of it. -- * (Except if the only snapshots are from the branch we cloned -- * from.) -- */ -- if (ds->ds_prev != NULL && -- ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) -- return (EBUSY); -- -- /* -- * If we made changes this txg, traverse_dsl_dataset won't find -- * them. Try again. -- */ -- if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) -- return (EAGAIN); -- -- if (dsl_dataset_is_snapshot(ds)) { -- /* -- * If this snapshot has an elevated user reference count, -- * we can't destroy it yet. -- */ -- if (ds->ds_userrefs > 0 && !dsda->releasing) -- return (EBUSY); -- -- mutex_enter(&ds->ds_lock); -- /* -- * Can't delete a branch point. However, if we're destroying -- * a clone and removing its origin due to it having a user -- * hold count of 0 and having been marked for deferred destroy, -- * it's OK for the origin to have a single clone. -- */ -- if (ds->ds_phys->ds_num_children > -- (dsda->is_origin_rm ? 2 : 1)) { -- mutex_exit(&ds->ds_lock); -- return (EEXIST); -- } -- mutex_exit(&ds->ds_lock); -- } else if (dsl_dir_is_clone(ds->ds_dir)) { -- return (dsl_dataset_origin_check(dsda, arg2, tx)); -- } -- -- /* XXX we should do some i/o error checking... 
*/ -- return (0); --} -- --struct refsarg { -- kmutex_t lock; -- boolean_t gone; -- kcondvar_t cv; --}; -- --/* ARGSUSED */ --static void --dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) --{ -- struct refsarg *arg = argv; -- -- mutex_enter(&arg->lock); -- arg->gone = TRUE; -- cv_signal(&arg->cv); -- mutex_exit(&arg->lock); --} -- --static void --dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) --{ -- struct refsarg arg; -- -- mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); -- cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); -- arg.gone = FALSE; -- (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, -- dsl_dataset_refs_gone); -- dmu_buf_rele(ds->ds_dbuf, tag); -- mutex_enter(&arg.lock); -- while (!arg.gone) -- cv_wait(&arg.cv, &arg.lock); -- ASSERT(arg.gone); -- mutex_exit(&arg.lock); -- ds->ds_dbuf = NULL; -- ds->ds_phys = NULL; -- mutex_destroy(&arg.lock); -- cv_destroy(&arg.cv); --} -- --static void --remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) --{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -@@ -1512,6 +825,5 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) - */ -- if (err != ENOENT) { -+ if (err != ENOENT) - VERIFY0(err); -- } -- ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, -+ ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, - &count)); -@@ -1520,121 +832,26 @@ remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) - --static void --dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) --{ -- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -- zap_cursor_t zc; -- zap_attribute_t za; -- -- /* -- * If it is the old version, dd_clones doesn't exist so we can't -- * find the clones, but deadlist_remove_key() is a no-op so it -- * doesn't matter. 
-- */ -- if (ds->ds_dir->dd_phys->dd_clones == 0) -- return; -- -- for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); -- zap_cursor_retrieve(&zc, &za) == 0; -- zap_cursor_advance(&zc)) { -- dsl_dataset_t *clone; - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -- za.za_first_integer, FTAG, &clone)); -- if (clone->ds_dir->dd_origin_txg > mintxg) { -- dsl_deadlist_remove_key(&clone->ds_deadlist, -- mintxg, tx); -- dsl_dataset_remove_clones_key(clone, mintxg, tx); -- } -- dsl_dataset_rele(clone, FTAG); -- } -- zap_cursor_fini(&zc); -+blkptr_t * -+dsl_dataset_get_blkptr(dsl_dataset_t *ds) -+{ -+ return (&ds->ds_phys->ds_bp); - } - --struct process_old_arg { -- dsl_dataset_t *ds; -- dsl_dataset_t *ds_prev; -- boolean_t after_branch_point; -- zio_t *pio; -- uint64_t used, comp, uncomp; --}; -- --static int --process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -+void -+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) - { -- struct process_old_arg *poa = arg; -- dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; -- -- if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { -- dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); -- if (poa->ds_prev && !poa->after_branch_point && -- bp->blk_birth > -- poa->ds_prev->ds_phys->ds_prev_snap_txg) { -- poa->ds_prev->ds_phys->ds_unique_bytes += -- bp_get_dsize_sync(dp->dp_spa, bp); -- } -+ ASSERT(dmu_tx_is_syncing(tx)); -+ /* If it's the meta-objset, set dp_meta_rootbp */ -+ if (ds == NULL) { -+ tx->tx_pool->dp_meta_rootbp = *bp; - } else { -- poa->used += bp_get_dsize_sync(dp->dp_spa, bp); -- poa->comp += BP_GET_PSIZE(bp); -- poa->uncomp += BP_GET_UCSIZE(bp); -- dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_bp = *bp; - } -- return (0); - } - --static void --process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, -- dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) --{ -- struct process_old_arg poa = { 0 }; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- -- ASSERT(ds->ds_deadlist.dl_oldfmt); -- ASSERT(ds_next->ds_deadlist.dl_oldfmt); -- -- poa.ds = ds; -- poa.ds_prev = ds_prev; -- poa.after_branch_point = after_branch_point; -- poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -- VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, -- process_old_cb, &poa, tx)); -- VERIFY0(zio_wait(poa.pio)); -- ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); -- -- /* change snapused */ -- dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -- -poa.used, -poa.comp, -poa.uncomp, tx); -- -- /* swap next's deadlist to our deadlist */ -- dsl_deadlist_close(&ds->ds_deadlist); -- dsl_deadlist_close(&ds_next->ds_deadlist); -- SWITCH64(ds_next->ds_phys->ds_deadlist_obj, -- ds->ds_phys->ds_deadlist_obj); -- dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); -- dsl_deadlist_open(&ds_next->ds_deadlist, mos, -- ds_next->ds_phys->ds_deadlist_obj); --} -- --static int --old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) -+spa_t * -+dsl_dataset_get_spa(dsl_dataset_t *ds) - { -- int err; -- struct killarg ka; -- -- /* -- * Free everything that we point to (that's born after -- * the previous snapshot, if we are a clone) -- * -- * NB: this should be very quick, because we already -- * freed all the objects in open context. 
-- */ -- ka.ds = ds; -- ka.tx = tx; -- err = traverse_dataset(ds, -- ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, -- kill_blkptr, &ka); -- ASSERT0(err); -- ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); -- -- return (err); -+ return (ds->ds_dir->dd_pool->dp_spa); - } -@@ -1642,355 +859,33 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) - void --dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) -+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) - { -- struct dsl_ds_destroyarg *dsda = arg1; -- dsl_dataset_t *ds = dsda->ds; -- int err = 0; -- int after_branch_point = FALSE; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- dsl_dataset_t *ds_prev = NULL; -- boolean_t wont_destroy; -- uint64_t obj; -- -- wont_destroy = (dsda->defer && -- (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); -- -- ASSERT(ds->ds_owner || wont_destroy); -- ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); -- ASSERT(ds->ds_prev == NULL || -- ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); -- ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); -+ dsl_pool_t *dp; - -- if (wont_destroy) { -- ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; -+ if (ds == NULL) /* this is the meta-objset */ - return; -- } -- -- /* signal any waiters that this dataset is going away */ -- mutex_enter(&ds->ds_lock); -- ds->ds_owner = dsl_reaper; -- cv_broadcast(&ds->ds_exclusive_cv); -- mutex_exit(&ds->ds_lock); -- -- /* Remove our reservation */ -- if (ds->ds_reserved != 0) { -- dsl_prop_setarg_t psa; -- uint64_t value = 0; -- -- dsl_prop_setarg_init_uint64(&psa, "refreservation", -- (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), -- &value); -- psa.psa_effective_value = 0; /* predict default value */ -- -- dsl_dataset_set_reservation_sync(ds, &psa, tx); -- ASSERT0(ds->ds_reserved); -- } -- -- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); -- -- dsl_scan_ds_destroyed(ds, tx); -- -- obj = ds->ds_object; -- -- if (ds->ds_phys->ds_prev_snap_obj != 0) { -- if (ds->ds_prev) { -- ds_prev = ds->ds_prev; -- } else { -- VERIFY(0 == dsl_dataset_hold_obj(dp, -- ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); -- } -- after_branch_point = -- (ds_prev->ds_phys->ds_next_snap_obj != obj); -- -- dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); -- if (after_branch_point && -- ds_prev->ds_phys->ds_next_clones_obj != 0) { -- remove_from_next_clones(ds_prev, obj, tx); -- if (ds->ds_phys->ds_next_snap_obj != 0) { -- VERIFY(0 == zap_add_int(mos, -- ds_prev->ds_phys->ds_next_clones_obj, -- ds->ds_phys->ds_next_snap_obj, tx)); -- } -- } -- if (after_branch_point && -- ds->ds_phys->ds_next_snap_obj == 0) { -- /* This clone is toast. */ -- ASSERT(ds_prev->ds_phys->ds_num_children > 1); -- ds_prev->ds_phys->ds_num_children--; -- -- /* -- * If the clone's origin has no other clones, no -- * user holds, and has been marked for deferred -- * deletion, then we should have done the necessary -- * destroy setup for it. 
-- */ -- if (ds_prev->ds_phys->ds_num_children == 1 && -- ds_prev->ds_userrefs == 0 && -- DS_IS_DEFER_DESTROY(ds_prev)) { -- ASSERT3P(dsda->rm_origin, !=, NULL); -- } else { -- ASSERT3P(dsda->rm_origin, ==, NULL); -- } -- } else if (!after_branch_point) { -- ds_prev->ds_phys->ds_next_snap_obj = -- ds->ds_phys->ds_next_snap_obj; -- } -- } -- -- if (dsl_dataset_is_snapshot(ds)) { -- dsl_dataset_t *ds_next; -- uint64_t old_unique; -- uint64_t used = 0, comp = 0, uncomp = 0; -- -- VERIFY(0 == dsl_dataset_hold_obj(dp, -- ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); -- ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); -- -- old_unique = ds_next->ds_phys->ds_unique_bytes; -- -- dmu_buf_will_dirty(ds_next->ds_dbuf, tx); -- ds_next->ds_phys->ds_prev_snap_obj = -- ds->ds_phys->ds_prev_snap_obj; -- ds_next->ds_phys->ds_prev_snap_txg = -- ds->ds_phys->ds_prev_snap_txg; -- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, -- ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); -- -- -- if (ds_next->ds_deadlist.dl_oldfmt) { -- process_old_deadlist(ds, ds_prev, ds_next, -- after_branch_point, tx); -- } else { -- /* Adjust prev's unique space. */ -- if (ds_prev && !after_branch_point) { -- dsl_deadlist_space_range(&ds_next->ds_deadlist, -- ds_prev->ds_phys->ds_prev_snap_txg, -- ds->ds_phys->ds_prev_snap_txg, -- &used, &comp, &uncomp); -- ds_prev->ds_phys->ds_unique_bytes += used; -- } -- -- /* Adjust snapused. */ -- dsl_deadlist_space_range(&ds_next->ds_deadlist, -- ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, -- &used, &comp, &uncomp); -- dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -- -used, -comp, -uncomp, tx); -- -- /* Move blocks to be freed to pool's free list. */ -- dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, -- &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, -- tx); -- dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, -- DD_USED_HEAD, used, comp, uncomp, tx); -- -- /* Merge our deadlist into next's and free it. */ -- dsl_deadlist_merge(&ds_next->ds_deadlist, -- ds->ds_phys->ds_deadlist_obj, tx); -- } -- dsl_deadlist_close(&ds->ds_deadlist); -- dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); -- -- /* Collapse range in clone heads */ -- dsl_dataset_remove_clones_key(ds, -- ds->ds_phys->ds_creation_txg, tx); -- -- if (dsl_dataset_is_snapshot(ds_next)) { -- dsl_dataset_t *ds_nextnext; -- dsl_dataset_t *hds; -- -- /* -- * Update next's unique to include blocks which -- * were previously shared by only this snapshot -- * and it. Those blocks will be born after the -- * prev snap and before this snap, and will have -- * died after the next snap and before the one -- * after that (ie. be on the snap after next's -- * deadlist). -- */ -- VERIFY(0 == dsl_dataset_hold_obj(dp, -- ds_next->ds_phys->ds_next_snap_obj, -- FTAG, &ds_nextnext)); -- dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, -- ds->ds_phys->ds_prev_snap_txg, -- ds->ds_phys->ds_creation_txg, -- &used, &comp, &uncomp); -- ds_next->ds_phys->ds_unique_bytes += used; -- dsl_dataset_rele(ds_nextnext, FTAG); -- ASSERT3P(ds_next->ds_prev, ==, NULL); -- -- /* Collapse range in this head. 
*/ -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, -- ds->ds_dir->dd_phys->dd_head_dataset_obj, -- FTAG, &hds)); -- dsl_deadlist_remove_key(&hds->ds_deadlist, -- ds->ds_phys->ds_creation_txg, tx); -- dsl_dataset_rele(hds, FTAG); -- -- } else { -- ASSERT3P(ds_next->ds_prev, ==, ds); -- dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); -- ds_next->ds_prev = NULL; -- if (ds_prev) { -- VERIFY(0 == dsl_dataset_get_ref(dp, -- ds->ds_phys->ds_prev_snap_obj, -- ds_next, &ds_next->ds_prev)); -- } -- -- dsl_dataset_recalc_head_uniq(ds_next); -- -- /* -- * Reduce the amount of our unconsmed refreservation -- * being charged to our parent by the amount of -- * new unique data we have gained. -- */ -- if (old_unique < ds_next->ds_reserved) { -- int64_t mrsdelta; -- uint64_t new_unique = -- ds_next->ds_phys->ds_unique_bytes; -- -- ASSERT(old_unique <= new_unique); -- mrsdelta = MIN(new_unique - old_unique, -- ds_next->ds_reserved - old_unique); -- dsl_dir_diduse_space(ds->ds_dir, -- DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); -- } -- } -- dsl_dataset_rele(ds_next, FTAG); -- } else { -- zfeature_info_t *async_destroy = -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; -- objset_t *os; -- -- /* -- * There's no next snapshot, so this is a head dataset. -- * Destroy the deadlist. Unless it's a clone, the -- * deadlist should be empty. (If it's a clone, it's -- * safe to ignore the deadlist contents.) -- */ -- dsl_deadlist_close(&ds->ds_deadlist); -- dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); -- ds->ds_phys->ds_deadlist_obj = 0; -- -- VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); -- -- if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { -- err = old_synchronous_dataset_destroy(ds, tx); -- } else { -- /* -- * Move the bptree into the pool's list of trees to -- * clean up and update space accounting information. -- */ -- uint64_t used, comp, uncomp; -- -- zil_destroy_sync(dmu_objset_zil(os), tx); -- -- if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { -- spa_feature_incr(dp->dp_spa, async_destroy, tx); -- dp->dp_bptree_obj = bptree_alloc(mos, tx); -- VERIFY(zap_add(mos, -- DMU_POOL_DIRECTORY_OBJECT, -- DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, -- &dp->dp_bptree_obj, tx) == 0); -- } - -- used = ds->ds_dir->dd_phys->dd_used_bytes; -- comp = ds->ds_dir->dd_phys->dd_compressed_bytes; -- uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; -- -- ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || -- ds->ds_phys->ds_unique_bytes == used); -- -- bptree_add(mos, dp->dp_bptree_obj, -- &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, -- used, comp, uncomp, tx); -- dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, -- -used, -comp, -uncomp, tx); -- dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -- used, comp, uncomp, tx); -- } -+ ASSERT(ds->ds_objset != NULL); - -- if (ds->ds_prev != NULL) { -- if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { -- VERIFY3U(0, ==, zap_remove_int(mos, -- ds->ds_prev->ds_dir->dd_phys->dd_clones, -- ds->ds_object, tx)); -- } -- dsl_dataset_rele(ds->ds_prev, ds); -- ds->ds_prev = ds_prev = NULL; -- } -- } -+ if (ds->ds_phys->ds_next_snap_obj != 0) -+ panic("dirtying snapshot!"); - -- /* -- * This must be done after the dsl_traverse(), because it will -- * re-open the objset. 
-- */ -- if (ds->ds_objset) { -- dmu_objset_evict(ds->ds_objset); -- ds->ds_objset = NULL; -- } -+ dp = ds->ds_dir->dd_pool; - -- if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { -- /* Erase the link in the dir */ -- dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); -- ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; -- ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); -- err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); -- ASSERT(err == 0); -- } else { -- /* remove from snapshot namespace */ -- dsl_dataset_t *ds_head; -- ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); -- VERIFY(0 == dsl_dataset_hold_obj(dp, -- ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); -- VERIFY(0 == dsl_dataset_get_snapname(ds)); --#ifdef ZFS_DEBUG -- { -- uint64_t val; -- -- err = dsl_dataset_snap_lookup(ds_head, -- ds->ds_snapname, &val); -- ASSERT0(err); -- ASSERT3U(val, ==, obj); -- } --#endif -- err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); -- ASSERT(err == 0); -- dsl_dataset_rele(ds_head, FTAG); -+ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { -+ /* up the hold count until we can be written out */ -+ dmu_buf_add_ref(ds->ds_dbuf, ds); - } -+} - -- if (ds_prev && ds->ds_prev != ds_prev) -- dsl_dataset_rele(ds_prev, FTAG); -- -- spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); -- spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, -- "dataset = %llu", ds->ds_object); -- -- if (ds->ds_phys->ds_next_clones_obj != 0) { -- ASSERTV(uint64_t count); -- ASSERT(0 == zap_count(mos, -- ds->ds_phys->ds_next_clones_obj, &count) && count == 0); -- VERIFY(0 == dmu_object_free(mos, -- ds->ds_phys->ds_next_clones_obj, tx)); -- } -- if (ds->ds_phys->ds_props_obj != 0) -- VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); -- if (ds->ds_phys->ds_userrefs_obj != 0) -- VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); -- dsl_dir_close(ds->ds_dir, ds); -- ds->ds_dir = NULL; -- dsl_dataset_drain_refs(ds, tag); -- VERIFY(0 == dmu_object_free(mos, obj, tx)); -- -- if (dsda->rm_origin) { -- /* -- * Remove the origin of the clone we just destroyed. -- */ -- struct dsl_ds_destroyarg ndsda = {0}; -+boolean_t -+dsl_dataset_is_dirty(dsl_dataset_t *ds) -+{ -+ int t; - -- ndsda.ds = dsda->rm_origin; -- dsl_dataset_destroy_sync(&ndsda, tag, tx); -+ for (t = 0; t < TXG_SIZE; t++) { -+ if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, -+ ds, t)) -+ return (B_TRUE); - } -+ return (B_FALSE); - } -@@ -2013,6 +908,6 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) - if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - - /* -- * Propogate any reserved space for this snapshot to other -+ * Propagate any reserved space for this snapshot to other - * snapshot checks in this sync group. 
-@@ -2025,10 +920,20 @@ dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) - -+typedef struct dsl_dataset_snapshot_arg { -+ nvlist_t *ddsa_snaps; -+ nvlist_t *ddsa_props; -+ nvlist_t *ddsa_errors; -+} dsl_dataset_snapshot_arg_t; -+ - int --dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, -+ dmu_tx_t *tx, boolean_t recv) - { -- dsl_dataset_t *ds = arg1; -- const char *snapname = arg2; -- int err; -+ int error; - uint64_t value; - -+ ds->ds_trysnap_txg = tx->tx_txg; -+ -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); -+ - /* -@@ -2038,25 +943,29 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) - if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - - /* -- * Check for conflicting name snapshot name. -+ * Check for conflicting snapshot name. - */ -- err = dsl_dataset_snap_lookup(ds, snapname, &value); -- if (err == 0) -- return (EEXIST); -- if (err != ENOENT) -- return (err); -+ error = dsl_dataset_snap_lookup(ds, snapname, &value); -+ if (error == 0) -+ return (SET_ERROR(EEXIST)); -+ if (error != ENOENT) -+ return (error); - - /* -- * Check that the dataset's name is not too long. Name consists -- * of the dataset's length + 1 for the @-sign + snapshot name's length -+ * We don't allow taking snapshots of inconsistent datasets, such as -+ * those into which we are currently receiving. However, if we are -+ * creating this snapshot as part of a receive, this check will be -+ * executed atomically with respect to the completion of the receive -+ * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this -+ * case we ignore this, knowing it will be fixed up for us shortly in -+ * dmu_recv_end_sync(). 
- */ -- if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ if (!recv && DS_IS_INCONSISTENT(ds)) -+ return (SET_ERROR(EBUSY)); - -- err = dsl_dataset_snapshot_reserve_space(ds, tx); -- if (err) -- return (err); -+ error = dsl_dataset_snapshot_reserve_space(ds, tx); -+ if (error != 0) -+ return (error); - -- ds->ds_trysnap_txg = tx->tx_txg; - return (0); -@@ -2064,7 +973,50 @@ dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) - -+static int -+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_snapshot_arg_t *ddsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ int rv = 0; -+ -+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { -+ int error = 0; -+ dsl_dataset_t *ds; -+ char *name, *atp; -+ char dsname[MAXNAMELEN]; -+ -+ name = nvpair_name(pair); -+ if (strlen(name) >= MAXNAMELEN) -+ error = SET_ERROR(ENAMETOOLONG); -+ if (error == 0) { -+ atp = strchr(name, '@'); -+ if (atp == NULL) -+ error = SET_ERROR(EINVAL); -+ if (error == 0) -+ (void) strlcpy(dsname, name, atp - name + 1); -+ } -+ if (error == 0) -+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds); -+ if (error == 0) { -+ error = dsl_dataset_snapshot_check_impl(ds, -+ atp + 1, tx, B_FALSE); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ -+ if (error != 0) { -+ if (ddsa->ddsa_errors != NULL) { -+ fnvlist_add_int32(ddsa->ddsa_errors, -+ name, error); -+ } -+ rv = error; -+ } -+ } -+ return (rv); -+} -+ - void --dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, -+ dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- const char *snapname = arg2; - dsl_pool_t *dp = ds->ds_dir->dd_pool; -@@ -2074,5 +1026,16 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - objset_t *mos = dp->dp_meta_objset; -- int err; -+ ASSERTV(static zil_header_t zero_zil); -+ ASSERTV(objset_t *os); -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ /* -+ * If we are on an old pool, the zil must not be active, in which -+ * case it will be zeroed. Usually zil_suspend() accomplishes this. 
-+ */ -+ ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || -+ dmu_objset_from_ds(ds, &os) != 0 || -+ bcmp(&os->os_phys->os_zil_header, &zero_zil, -+ sizeof (zero_zil)) == 0); - -- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - -@@ -2088,3 +1051,3 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); -- VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); -+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); - dmu_buf_will_dirty(dbuf, tx); -@@ -2123,5 +1086,5 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } else if (next_clones_obj != 0) { -- remove_from_next_clones(ds->ds_prev, -+ dsl_dataset_remove_from_next_clones(ds->ds_prev, - dsphys->ds_next_snap_obj, tx); -- VERIFY3U(0, ==, zap_add_int(mos, -+ VERIFY0(zap_add_int(mos, - next_clones_obj, dsobj, tx)); -@@ -2144,5 +1107,2 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - dmu_buf_will_dirty(ds->ds_dbuf, tx); -- zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", -- ds->ds_dir->dd_myname, snapname, dsobj, -- ds->ds_phys->ds_prev_snap_txg); - ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, -@@ -2161,9 +1121,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, -- snapname, 8, 1, &dsobj, tx); -- ASSERT(err == 0); -+ VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, -+ snapname, 8, 1, &dsobj, tx)); - - if (ds->ds_prev) -- dsl_dataset_drop_ref(ds->ds_prev, ds); -- VERIFY(0 == dsl_dataset_get_ref(dp, -+ dsl_dataset_rele(ds->ds_prev, ds); -+ VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); -@@ -2174,6 +1133,208 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, -- "dataset = %llu", dsobj); -+ spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); - } - -+static void -+dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_snapshot_arg_t *ddsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ -+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { -+ dsl_dataset_t *ds; -+ char *name, *atp; -+ char dsname[MAXNAMELEN]; -+ -+ name = nvpair_name(pair); -+ atp = strchr(name, '@'); -+ (void) strlcpy(dsname, name, atp - name + 1); -+ VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); -+ -+ dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); -+ if (ddsa->ddsa_props != NULL) { -+ dsl_props_set_sync_impl(ds->ds_prev, -+ ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); -+ } -+ dsl_dataset_rele(ds, FTAG); -+ } -+} -+ -+/* -+ * The snapshots must all be in the same pool. -+ * All-or-nothing: if there are any failures, nothing will be modified. 
-+ */ -+int -+dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) -+{ -+ dsl_dataset_snapshot_arg_t ddsa; -+ nvpair_t *pair; -+ boolean_t needsuspend; -+ int error; -+ spa_t *spa; -+ char *firstname; -+ nvlist_t *suspended = NULL; -+ -+ pair = nvlist_next_nvpair(snaps, NULL); -+ if (pair == NULL) -+ return (0); -+ firstname = nvpair_name(pair); -+ -+ error = spa_open(firstname, &spa, FTAG); -+ if (error != 0) -+ return (error); -+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); -+ spa_close(spa, FTAG); -+ -+ if (needsuspend) { -+ suspended = fnvlist_alloc(); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { -+ char fsname[MAXNAMELEN]; -+ char *snapname = nvpair_name(pair); -+ char *atp; -+ void *cookie; -+ -+ atp = strchr(snapname, '@'); -+ if (atp == NULL) { -+ error = SET_ERROR(EINVAL); -+ break; -+ } -+ (void) strlcpy(fsname, snapname, atp - snapname + 1); -+ -+ error = zil_suspend(fsname, &cookie); -+ if (error != 0) -+ break; -+ fnvlist_add_uint64(suspended, fsname, -+ (uintptr_t)cookie); -+ } -+ } -+ -+ ddsa.ddsa_snaps = snaps; -+ ddsa.ddsa_props = props; -+ ddsa.ddsa_errors = errors; -+ -+ if (error == 0) { -+ error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, -+ dsl_dataset_snapshot_sync, &ddsa, -+ fnvlist_num_pairs(snaps) * 3); -+ } -+ -+ if (suspended != NULL) { -+ for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(suspended, pair)) { -+ zil_resume((void *)(uintptr_t) -+ fnvpair_value_uint64(pair)); -+ } -+ fnvlist_free(suspended); -+ } -+ -+#ifdef _KERNEL -+ if (error == 0) { -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { -+ char *snapname = nvpair_name(pair); -+ zvol_create_minors(snapname); -+ } -+ } -+#endif -+ -+ return (error); -+} -+ -+typedef struct dsl_dataset_snapshot_tmp_arg { -+ const char *ddsta_fsname; -+ const char *ddsta_snapname; -+ minor_t ddsta_cleanup_minor; -+ const char *ddsta_htag; -+} dsl_dataset_snapshot_tmp_arg_t; -+ -+static int -+dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ -+ error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, -+ tx, B_FALSE); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } -+ -+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENOTSUP)); -+ } -+ error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, -+ B_TRUE, tx); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } -+ -+ dsl_dataset_rele(ds, FTAG); -+ return (0); -+} -+ -+static void -+dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); -+ -+ dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); -+ dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, -+ ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); -+ dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); -+ -+ dsl_dataset_rele(ds, FTAG); -+} -+ -+int -+dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, -+ minor_t cleanup_minor, const char 
*htag) -+{ -+ dsl_dataset_snapshot_tmp_arg_t ddsta; -+ int error; -+ spa_t *spa; -+ boolean_t needsuspend; -+ void *cookie; -+ -+ ddsta.ddsta_fsname = fsname; -+ ddsta.ddsta_snapname = snapname; -+ ddsta.ddsta_cleanup_minor = cleanup_minor; -+ ddsta.ddsta_htag = htag; -+ -+ error = spa_open(fsname, &spa, FTAG); -+ if (error != 0) -+ return (error); -+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); -+ spa_close(spa, FTAG); -+ -+ if (needsuspend) { -+ error = zil_suspend(fsname, &cookie); -+ if (error != 0) -+ return (error); -+ } -+ -+ error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, -+ dsl_dataset_snapshot_tmp_sync, &ddsta, 3); -+ -+ if (needsuspend) -+ zil_resume(cookie); -+ return (error); -+} -+ -+ - void -@@ -2202,11 +1363,9 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) - zap_attribute_t za; -- nvlist_t *propval; -- nvlist_t *val; -+ nvlist_t *propval = fnvlist_alloc(); -+ nvlist_t *val = fnvlist_alloc(); - -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); -- VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0); -+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - - /* -- * There may me missing entries in ds_next_clones_obj -+ * There may be missing entries in ds_next_clones_obj - * due to a bug in a previous version of the code. -@@ -2215,8 +1374,7 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) - if (ds->ds_phys->ds_next_clones_obj != 0) { -- ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, -+ VERIFY0(zap_count(mos, ds->ds_phys->ds_next_clones_obj, - &count)); - } -- if (count != ds->ds_phys->ds_num_children - 1) { -+ if (count != ds->ds_phys->ds_num_children - 1) - goto fail; -- } - for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj); -@@ -2226,20 +1384,6 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) - char buf[ZFS_MAXNAMELEN]; -- /* -- * Even though we hold the dp_config_rwlock, the dataset -- * may fail to open, returning ENOENT. If there is a -- * thread concurrently attempting to destroy this -- * dataset, it will have the ds_rwlock held for -- * RW_WRITER. Our call to dsl_dataset_hold_obj() -> -- * dsl_dataset_hold_ref() will fail its -- * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the -- * dp_config_rwlock, and wait for the destroy progress -- * and signal ds_exclusive_cv. If the destroy was -- * successful, we will see that -- * DSL_DATASET_IS_DESTROYED(), and return ENOENT. 
-- */ -- if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -- za.za_first_integer, FTAG, &clone) != 0) -- continue; -+ VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -+ za.za_first_integer, FTAG, &clone)); - dsl_dir_name(clone->ds_dir, buf); -- VERIFY(nvlist_add_boolean(val, buf) == 0); -+ fnvlist_add_boolean(val, buf); - dsl_dataset_rele(clone, FTAG); -@@ -2247,5 +1391,4 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) - zap_cursor_fini(&zc); -- VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0); -- VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), -- propval) == 0); -+ fnvlist_add_nvlist(propval, ZPROP_VALUE, val); -+ fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval); - fail: -@@ -2253,3 +1396,2 @@ fail: - nvlist_free(propval); -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - } -@@ -2260,4 +1402,22 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) - uint64_t refd, avail, uobjs, aobjs, ratio; -+ ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); - -- dsl_dir_stats(ds->ds_dir, nv); -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : -+ (ds->ds_phys->ds_uncompressed_bytes * 100 / -+ ds->ds_phys->ds_compressed_bytes); -+ -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, -+ ds->ds_phys->ds_uncompressed_bytes); -+ -+ if (dsl_dataset_is_snapshot(ds)) { -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, -+ ds->ds_phys->ds_unique_bytes); -+ get_clones_stat(ds, nv); -+ } else { -+ dsl_dir_stats(ds->ds_dir, nv); -+ } - -@@ -2292,6 +1452,4 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) - -- rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); -- rw_exit(&dp->dp_config_rwlock); - if (err == 0) { -@@ -2307,18 +1465,2 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) - -- ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : -- (ds->ds_phys->ds_uncompressed_bytes * 100 / -- ds->ds_phys->ds_compressed_bytes); -- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); -- -- if (ds->ds_phys->ds_next_snap_obj) { -- /* -- * This is a snapshot; override the dd's space used with -- * our unique space and compression ratio. -- */ -- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, -- ds->ds_phys->ds_unique_bytes); -- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); -- -- get_clones_stat(ds, nv); -- } - } -@@ -2328,2 +1470,5 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) - { -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ ASSERT(dsl_pool_config_held(dp)); -+ - stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; -@@ -2331,3 +1476,4 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) - stat->dds_guid = ds->ds_phys->ds_guid; -- if (ds->ds_phys->ds_next_snap_obj) { -+ stat->dds_origin[0] = '\0'; -+ if (dsl_dataset_is_snapshot(ds)) { - stat->dds_is_snapshot = B_TRUE; -@@ -2337,17 +1483,12 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) - stat->dds_num_clones = 0; -- } - -- /* clone origin is really a dsl_dir thing... 
*/ -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- if (dsl_dir_is_clone(ds->ds_dir)) { -- dsl_dataset_t *ods; -+ if (dsl_dir_is_clone(ds->ds_dir)) { -+ dsl_dataset_t *ods; - -- VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, -- ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); -- dsl_dataset_name(ods, stat->dds_origin); -- dsl_dataset_drop_ref(ods, FTAG); -- } else { -- stat->dds_origin[0] = '\0'; -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); -+ dsl_dataset_name(ods, stat->dds_origin); -+ dsl_dataset_rele(ods, FTAG); -+ } - } -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - } -@@ -2384,13 +1525,10 @@ dsl_dataset_space(dsl_dataset_t *ds, - boolean_t --dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) -+dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) - { -- ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool); -- -- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || -- dsl_pool_sync_context(dp)); -- if (ds->ds_prev == NULL) -+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); -+ if (snap == NULL) - return (B_FALSE); - if (ds->ds_phys->ds_bp.blk_birth > -- ds->ds_prev->ds_phys->ds_creation_txg) { -- objset_t *os, *os_prev; -+ snap->ds_phys->ds_creation_txg) { -+ objset_t *os, *os_snap; - /* -@@ -2402,6 +1540,6 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) - return (B_TRUE); -- if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) -+ if (dmu_objset_from_ds(snap, &os_snap) != 0) - return (B_TRUE); - return (bcmp(&os->os_phys->os_meta_dnode, -- &os_prev->os_phys->os_meta_dnode, -+ &os_snap->os_phys->os_meta_dnode, - sizeof (os->os_phys->os_meta_dnode)) != 0); -@@ -2411,61 +1549,128 @@ dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) - -+typedef struct dsl_dataset_rename_snapshot_arg { -+ const char *ddrsa_fsname; -+ const char *ddrsa_oldsnapname; -+ const char *ddrsa_newsnapname; -+ boolean_t ddrsa_recursive; -+ dmu_tx_t *ddrsa_tx; -+} dsl_dataset_rename_snapshot_arg_t; -+ - /* ARGSUSED */ - static int --dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, -+ dsl_dataset_t *hds, void *arg) - { -- dsl_dataset_t *ds = arg1; -- char *newsnapname = arg2; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_dataset_t *hds; -+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; -+ int error; - uint64_t val; -- int err; - -- err = dsl_dataset_hold_obj(dd->dd_pool, -- dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); -- if (err) -- return (err); -- -- /* new name better not be in use */ -- err = dsl_dataset_snap_lookup(hds, newsnapname, &val); -- dsl_dataset_rele(hds, FTAG); -+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); -+ if (error != 0) { -+ /* ignore nonexistent snapshots */ -+ return (error == ENOENT ? 
0 : error); -+ } - -- if (err == 0) -- err = EEXIST; -- else if (err == ENOENT) -- err = 0; -+ /* new name should not exist */ -+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); -+ if (error == 0) -+ error = SET_ERROR(EEXIST); -+ else if (error == ENOENT) -+ error = 0; - - /* dataset name + 1 for the "@" + the new snapshot name must fit */ -- if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) -- err = ENAMETOOLONG; -+ if (dsl_dir_namelen(hds->ds_dir) + 1 + -+ strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN) -+ error = SET_ERROR(ENAMETOOLONG); - -- return (err); -+ return (error); - } - --static void --dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+static int -+dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- const char *newsnapname = arg2; -- dsl_dir_t *dd = ds->ds_dir; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *hds; -- int err; -+ int error; - -- ASSERT(ds->ds_phys->ds_next_snap_obj != 0); -+ error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); -+ if (error != 0) -+ return (error); - -- VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, -- dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); -+ if (ddrsa->ddrsa_recursive) { -+ error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, -+ dsl_dataset_rename_snapshot_check_impl, ddrsa, -+ DS_FIND_CHILDREN); -+ } else { -+ error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); -+ } -+ dsl_dataset_rele(hds, FTAG); -+ return (error); -+} - -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -- err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); -- ASSERT0(err); -+static int -+dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, -+ dsl_dataset_t *hds, void *arg) -+{ -+#ifdef _KERNEL -+ char *oldname, *newname; -+#endif -+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; -+ dsl_dataset_t *ds; -+ uint64_t val; -+ dmu_tx_t *tx = ddrsa->ddrsa_tx; -+ int error; -+ -+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); -+ ASSERT(error == 0 || error == ENOENT); -+ if (error == ENOENT) { -+ /* ignore nonexistent snapshots */ -+ return (0); -+ } -+ -+ VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); -+ -+ /* log before we change the name */ -+ spa_history_log_internal_ds(ds, "rename", tx, -+ "-> @%s", ddrsa->ddrsa_newsnapname); -+ -+ VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx)); - mutex_enter(&ds->ds_lock); -- (void) strcpy(ds->ds_snapname, newsnapname); -+ (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); - mutex_exit(&ds->ds_lock); -- err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, -- ds->ds_snapname, 8, 1, &ds->ds_object, tx); -- ASSERT0(err); -+ VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, -+ ds->ds_snapname, 8, 1, &ds->ds_object, tx)); -+ -+#ifdef _KERNEL -+ oldname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); -+ newname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); -+ snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, -+ ddrsa->ddrsa_oldsnapname); -+ snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, -+ ddrsa->ddrsa_newsnapname); -+ zvol_rename_minors(oldname, newname); -+ kmem_free(newname, MAXPATHLEN); -+ kmem_free(oldname, MAXPATHLEN); -+#endif - -- spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, -- "dataset = %llu", ds->ds_object); -+ dsl_dataset_rele(ds, FTAG); -+ return (0); -+} -+ -+static void 
-+dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *hds; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); -+ ddrsa->ddrsa_tx = tx; -+ if (ddrsa->ddrsa_recursive) { -+ VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, -+ dsl_dataset_rename_snapshot_sync_impl, ddrsa, -+ DS_FIND_CHILDREN)); -+ } else { -+ VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); -+ } - dsl_dataset_rele(hds, FTAG); -@@ -2473,43 +1678,44 @@ dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - --struct renamesnaparg { -- dsl_sync_task_group_t *dstg; -- char failed[MAXPATHLEN]; -- char *oldsnap; -- char *newsnap; --}; -+int -+dsl_dataset_rename_snapshot(const char *fsname, -+ const char *oldsnapname, const char *newsnapname, boolean_t recursive) -+{ -+ dsl_dataset_rename_snapshot_arg_t ddrsa; -+ -+ ddrsa.ddrsa_fsname = fsname; -+ ddrsa.ddrsa_oldsnapname = oldsnapname; -+ ddrsa.ddrsa_newsnapname = newsnapname; -+ ddrsa.ddrsa_recursive = recursive; -+ -+ return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, -+ dsl_dataset_rename_snapshot_sync, &ddrsa, 1)); -+} - -+/* -+ * If we're doing an ownership handoff, we need to make sure that there is -+ * only one long hold on the dataset. We're not allowed to change anything here -+ * so we don't permanently release the long hold or regular hold here. We want -+ * to do this only when syncing to avoid the dataset unexpectedly going away -+ * when we release the long hold. -+ */ - static int --dsl_snapshot_rename_one(const char *name, void *arg) -+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) - { -- struct renamesnaparg *ra = arg; -- dsl_dataset_t *ds = NULL; -- char *snapname; -- int err; -+ boolean_t held; - -- snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); -- (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); - -- /* -- * For recursive snapshot renames the parent won't be changing -- * so we just pass name for both the to/from argument. -- */ -- err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); -- if (err != 0) { -- strfree(snapname); -- return (err == ENOENT ? 0 : err); -+ if (owner != NULL) { -+ VERIFY3P(ds->ds_owner, ==, owner); -+ dsl_dataset_long_rele(ds, owner); - } - --#ifdef _KERNEL -- /* -- * For all filesystems undergoing rename, we'll need to unmount it. -- */ -- (void) zfs_unmount_snap(snapname, NULL); --#endif -- err = dsl_dataset_hold(snapname, ra->dstg, &ds); -- strfree(snapname); -- if (err != 0) -- return (err == ENOENT ? 
0 : err); -+ held = dsl_dataset_long_held(ds); - -- dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, -- dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); -+ if (owner != NULL) -+ dsl_dataset_long_hold(ds, owner); -+ -+ if (held) -+ return (SET_ERROR(EBUSY)); - -@@ -2518,65 +1724,67 @@ dsl_snapshot_rename_one(const char *name, void *arg) - -+typedef struct dsl_dataset_rollback_arg { -+ const char *ddra_fsname; -+ void *ddra_owner; -+ nvlist_t *ddra_result; -+} dsl_dataset_rollback_arg_t; -+ - static int --dsl_recursive_rename(char *oldname, const char *newname) -+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) - { -- int err; -- struct renamesnaparg *ra; -- dsl_sync_task_t *dst; -- spa_t *spa; -- char *cp, *fsname = spa_strdup(oldname); -- int len = strlen(oldname) + 1; -+ dsl_dataset_rollback_arg_t *ddra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int64_t unused_refres_delta; -+ int error; - -- /* truncate the snapshot name to get the fsname */ -- cp = strchr(fsname, '@'); -- *cp = '\0'; -+ error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds); -+ if (error != 0) -+ return (error); - -- err = spa_open(fsname, &spa, FTAG); -- if (err) { -- kmem_free(fsname, len); -- return (err); -+ /* must not be a snapshot */ -+ if (dsl_dataset_is_snapshot(ds)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); - } -- ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); -- ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- -- ra->oldsnap = strchr(oldname, '@') + 1; -- ra->newsnap = strchr(newname, '@') + 1; -- *ra->failed = '\0'; - -- err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, -- DS_FIND_CHILDREN); -- kmem_free(fsname, len); -- -- if (err == 0) { -- err = dsl_sync_task_group_wait(ra->dstg); -+ /* must have a most recent snapshot */ -+ if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); - } - -- for (dst = list_head(&ra->dstg->dstg_tasks); dst; -- dst = list_next(&ra->dstg->dstg_tasks, dst)) { -- dsl_dataset_t *ds = dst->dst_arg1; -- if (dst->dst_err) { -- dsl_dir_name(ds->ds_dir, ra->failed); -- (void) strlcat(ra->failed, "@", sizeof (ra->failed)); -- (void) strlcat(ra->failed, ra->newsnap, -- sizeof (ra->failed)); -- } -- dsl_dataset_rele(ds, ra->dstg); -+ error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); - } - -- if (err) -- (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); -- -- dsl_sync_task_group_destroy(ra->dstg); -- kmem_free(ra, sizeof (struct renamesnaparg)); -- spa_close(spa, FTAG); -- return (err); --} -+ /* -+ * Check if the snap we are rolling back to uses more than -+ * the refquota. -+ */ -+ if (ds->ds_quota != 0 && -+ ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EDQUOT)); -+ } - --static int --dsl_valid_rename(const char *oldname, void *arg) --{ -- int delta = *(int *)arg; -+ /* -+ * When we do the clone swap, we will temporarily use more space -+ * due to the refreservation (the head will no longer have any -+ * unique space, so the entire amount of the refreservation will need -+ * to be free). We will immediately destroy the clone, freeing -+ * this space, but the freeing happens over many txg's. 
-+ */ -+ unused_refres_delta = (int64_t)MIN(ds->ds_reserved, -+ ds->ds_phys->ds_unique_bytes); - -- if (strlen(oldname) + delta >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ if (unused_refres_delta > 0 && -+ unused_refres_delta > -+ dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENOSPC)); -+ } - -+ dsl_dataset_rele(ds, FTAG); - return (0); -@@ -2584,60 +1792,54 @@ dsl_valid_rename(const char *oldname, void *arg) - --#pragma weak dmu_objset_rename = dsl_dataset_rename --int --dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) -+static void -+dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd; -- dsl_dataset_t *ds; -- const char *tail; -- int err; -- -- err = dsl_dir_open(oldname, FTAG, &dd, &tail); -- if (err) -- return (err); -+ dsl_dataset_rollback_arg_t *ddra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds, *clone; -+ uint64_t cloneobj; -+ char namebuf[ZFS_MAXNAMELEN]; - -- if (tail == NULL) { -- int delta = strlen(newname) - strlen(oldname); -+ VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); - -- /* if we're growing, validate child name lengths */ -- if (delta > 0) -- err = dmu_objset_find(oldname, dsl_valid_rename, -- &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -+ dsl_dataset_name(ds->ds_prev, namebuf); -+ fnvlist_add_string(ddra->ddra_result, "target", namebuf); - -- if (err == 0) -- err = dsl_dir_rename(dd, newname); -- dsl_dir_close(dd, FTAG); -- return (err); -- } -+ cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", -+ ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); - -- if (tail[0] != '@') { -- /* the name ended in a nonexistent component */ -- dsl_dir_close(dd, FTAG); -- return (ENOENT); -- } -+ VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); - -- dsl_dir_close(dd, FTAG); -+ dsl_dataset_clone_swap_sync_impl(clone, ds, tx); -+ dsl_dataset_zero_zil(ds, tx); - -- /* new name must be snapshot in same filesystem */ -- tail = strchr(newname, '@'); -- if (tail == NULL) -- return (EINVAL); -- tail++; -- if (strncmp(oldname, newname, tail - newname) != 0) -- return (EXDEV); -+ dsl_destroy_head_sync_impl(clone, tx); - -- if (recursive) { -- err = dsl_recursive_rename(oldname, newname); -- } else { -- err = dsl_dataset_hold(oldname, FTAG, &ds); -- if (err) -- return (err); -+ dsl_dataset_rele(clone, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+} - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_snapshot_rename_check, -- dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); -+/* -+ * Rolls back the given filesystem or volume to the most recent snapshot. -+ * The name of the most recent snapshot will be returned under key "target" -+ * in the result nvlist. -+ * -+ * If owner != NULL: -+ * - The existing dataset MUST be owned by the specified owner at entry -+ * - Upon return, dataset will still be held by the same owner, whether we -+ * succeed or not. -+ * -+ * This mode is required any time the existing filesystem is mounted. See -+ * notes above zfs_suspend_fs() for further details. 
-+ */ -+int -+dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result) -+{ -+ dsl_dataset_rollback_arg_t ddra; - -- dsl_dataset_rele(ds, FTAG); -- } -+ ddra.ddra_fsname = fsname; -+ ddra.ddra_owner = owner; -+ ddra.ddra_result = result; - -- return (err); -+ return (dsl_sync_task(fsname, dsl_dataset_rollback_check, -+ dsl_dataset_rollback_sync, &ddra, 1)); - } -@@ -2649,18 +1851,24 @@ struct promotenode { - --struct promotearg { -+typedef struct dsl_dataset_promote_arg { -+ const char *ddpa_clonename; -+ dsl_dataset_t *ddpa_clone; - list_t shared_snaps, origin_snaps, clone_snaps; -- dsl_dataset_t *origin_origin; -+ dsl_dataset_t *origin_origin; /* origin of the origin */ - uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; - char *err_ds; --}; -+} dsl_dataset_promote_arg_t; - - static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); -+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, -+ void *tag); -+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); - - static int --dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *hds = arg1; -- struct promotearg *pa = arg2; -- struct promotenode *snap = list_head(&pa->shared_snaps); -- dsl_dataset_t *origin_ds = snap->ds; -+ dsl_dataset_promote_arg_t *ddpa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *hds; -+ struct promotenode *snap; -+ dsl_dataset_t *origin_ds; - int err; -@@ -2668,15 +1876,27 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - -- /* Check that it is a real clone */ -- if (!dsl_dir_is_clone(hds->ds_dir)) -- return (EINVAL); -+ err = promote_hold(ddpa, dp, FTAG); -+ if (err != 0) -+ return (err); -+ -+ hds = ddpa->ddpa_clone; - -- /* Since this is so expensive, don't do the preliminary check */ -- if (!dmu_tx_is_syncing(tx)) -+ if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) { -+ promote_rele(ddpa, FTAG); -+ return (SET_ERROR(EXDEV)); -+ } -+ -+ /* -+ * Compute and check the amount of space to transfer. Since this is -+ * so expensive, don't do the preliminary check. -+ */ -+ if (!dmu_tx_is_syncing(tx)) { -+ promote_rele(ddpa, FTAG); - return (0); -+ } - -- if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) -- return (EXDEV); -+ snap = list_head(&ddpa->shared_snaps); -+ origin_ds = snap->ds; - - /* compute origin's new unique space */ -- snap = list_tail(&pa->clone_snaps); -+ snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); -@@ -2684,3 +1904,3 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, -- &pa->unique, &unused, &unused); -+ &ddpa->unique, &unused, &unused); - -@@ -2690,3 +1910,3 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - * Compute space to transfer. 
Consider the incremental changes -- * to used for each snapshot: -+ * to used by each snapshot: - * (my used) = (prev's used) + (blocks born) - (blocks killed) -@@ -2701,7 +1921,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- pa->used = origin_ds->ds_phys->ds_referenced_bytes; -- pa->comp = origin_ds->ds_phys->ds_compressed_bytes; -- pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; -- for (snap = list_head(&pa->shared_snaps); snap; -- snap = list_next(&pa->shared_snaps, snap)) { -+ ddpa->used = origin_ds->ds_phys->ds_referenced_bytes; -+ ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes; -+ ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; -+ for (snap = list_head(&ddpa->shared_snaps); snap; -+ snap = list_next(&ddpa->shared_snaps, snap)) { - uint64_t val, dlused, dlcomp, dluncomp; -@@ -2709,7 +1929,17 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - -+ /* -+ * If there are long holds, we won't be able to evict -+ * the objset. -+ */ -+ if (dsl_dataset_long_held(ds)) { -+ err = SET_ERROR(EBUSY); -+ goto out; -+ } -+ - /* Check that the snapshot name does not conflict */ -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -+ VERIFY0(dsl_dataset_get_snapname(ds)); - err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); - if (err == 0) { -- err = EEXIST; -+ (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname); -+ err = SET_ERROR(EEXIST); - goto out; -@@ -2725,5 +1955,5 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - &dlused, &dlcomp, &dluncomp); -- pa->used += dlused; -- pa->comp += dlcomp; -- pa->uncomp += dluncomp; -+ ddpa->used += dlused; -+ ddpa->comp += dlcomp; -+ ddpa->uncomp += dluncomp; - } -@@ -2734,6 +1964,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (pa->origin_origin) { -- pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes; -- pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; -- pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; -+ if (ddpa->origin_origin) { -+ ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes; -+ ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes; -+ ddpa->uncomp -= -+ ddpa->origin_origin->ds_phys->ds_uncompressed_bytes; - } -@@ -2742,5 +1973,5 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, -- pa->used); -- if (err) -- return (err); -+ ddpa->used); -+ if (err != 0) -+ goto out; - -@@ -2762,24 +1993,23 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- snap = list_head(&pa->origin_snaps); -- err = snaplist_space(&pa->shared_snaps, -- snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); -- if (err) -- return (err); -+ snap = list_head(&ddpa->origin_snaps); -+ err = snaplist_space(&ddpa->shared_snaps, -+ snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); -+ if (err != 0) -+ goto out; - -- err = snaplist_space(&pa->clone_snaps, -+ err = snaplist_space(&ddpa->clone_snaps, - snap->ds->ds_dir->dd_origin_txg, &space); -- if (err) -- return (err); -- pa->cloneusedsnap += space; -+ if (err != 0) -+ goto out; -+ ddpa->cloneusedsnap += space; - } - if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { -- err = snaplist_space(&pa->origin_snaps, -- origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); -- if (err) -- return (err); -+ err = snaplist_space(&ddpa->origin_snaps, -+ origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap); -+ if (err != 0) -+ goto out; 
- } - -- return (0); - out: -- pa->err_ds = snap->ds->ds_snapname; -+ promote_rele(ddpa, FTAG); - return (err); -@@ -2788,11 +2018,11 @@ out: - static void --dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *hds = arg1; -- struct promotearg *pa = arg2; -- struct promotenode *snap = list_head(&pa->shared_snaps); -- dsl_dataset_t *origin_ds = snap->ds; -+ dsl_dataset_promote_arg_t *ddpa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *hds; -+ struct promotenode *snap; -+ dsl_dataset_t *origin_ds; - dsl_dataset_t *origin_head; -- dsl_dir_t *dd = hds->ds_dir; -- dsl_pool_t *dp = hds->ds_dir->dd_pool; -+ dsl_dir_t *dd; - dsl_dir_t *odd = NULL; -@@ -2801,5 +2031,12 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); -+ VERIFY0(promote_hold(ddpa, dp, FTAG)); -+ hds = ddpa->ddpa_clone; -+ -+ ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE); - -- snap = list_head(&pa->origin_snaps); -+ snap = list_head(&ddpa->shared_snaps); -+ origin_ds = snap->ds; -+ dd = hds->ds_dir; -+ -+ snap = list_head(&ddpa->origin_snaps); - origin_head = snap->ds; -@@ -2810,3 +2047,3 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, -+ VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, - NULL, FTAG, &odd)); -@@ -2816,3 +2053,3 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; -- snap = list_tail(&pa->clone_snaps); -+ snap = list_tail(&ddpa->clone_snaps); - ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); -@@ -2822,4 +2059,5 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (origin_ds->ds_phys->ds_next_clones_obj) { -- remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); -- VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, -+ dsl_dataset_remove_from_next_clones(origin_ds, -+ snap->ds->ds_object, tx); -+ VERIFY0(zap_add_int(dp->dp_meta_objset, - origin_ds->ds_phys->ds_next_clones_obj, -@@ -2840,10 +2078,10 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { -- VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, -+ VERIFY0(zap_remove_int(dp->dp_meta_objset, - odd->dd_phys->dd_clones, hds->ds_object, tx)); -- VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, -- pa->origin_origin->ds_dir->dd_phys->dd_clones, -+ VERIFY0(zap_add_int(dp->dp_meta_objset, -+ ddpa->origin_origin->ds_dir->dd_phys->dd_clones, - hds->ds_object, tx)); - -- VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, -- pa->origin_origin->ds_dir->dd_phys->dd_clones, -+ VERIFY0(zap_remove_int(dp->dp_meta_objset, -+ ddpa->origin_origin->ds_dir->dd_phys->dd_clones, - origin_head->ds_object, tx)); -@@ -2853,5 +2091,4 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } -- VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, -+ VERIFY0(zap_add_int(dp->dp_meta_objset, - dd->dd_phys->dd_clones, origin_head->ds_object, tx)); -- - } -@@ -2859,7 +2096,11 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - /* move snapshots to this dir */ -- for (snap = list_head(&pa->shared_snaps); snap; -- snap = list_next(&pa->shared_snaps, snap)) { -+ for (snap = list_head(&ddpa->shared_snaps); snap; -+ snap = list_next(&ddpa->shared_snaps, snap)) { - dsl_dataset_t *ds = snap->ds; - -- /* unregister 
props as dsl_dir is changing */ -+ /* -+ * Property callbacks are registered to a particular -+ * dsl_dir. Since ours is changing, evict the objset -+ * so that they will be unregistered from the old dsl_dir. -+ */ - if (ds->ds_objset) { -@@ -2868,7 +2109,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } -+ - /* move snap name entry */ -- VERIFY(0 == dsl_dataset_get_snapname(ds)); -- VERIFY(0 == dsl_dataset_snap_remove(origin_head, -+ VERIFY0(dsl_dataset_get_snapname(ds)); -+ VERIFY0(dsl_dataset_snap_remove(origin_head, - ds->ds_snapname, tx)); -- VERIFY(0 == zap_add(dp->dp_meta_objset, -+ VERIFY0(zap_add(dp->dp_meta_objset, - hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, -@@ -2881,4 +2123,4 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - ASSERT3P(ds->ds_dir, ==, odd); -- dsl_dir_close(ds->ds_dir, ds); -- VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, -+ dsl_dir_rele(ds->ds_dir, ds); -+ VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, - NULL, ds, &ds->ds_dir)); -@@ -2906,3 +2148,3 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, -+ VERIFY0(dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &cnds)); -@@ -2910,6 +2152,6 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- VERIFY3U(zap_remove_int(dp->dp_meta_objset, -- odd->dd_phys->dd_clones, o, tx), ==, 0); -- VERIFY3U(zap_add_int(dp->dp_meta_objset, -- dd->dd_phys->dd_clones, o, tx), ==, 0); -+ VERIFY0(zap_remove_int(dp->dp_meta_objset, -+ odd->dd_phys->dd_clones, o, tx)); -+ VERIFY0(zap_add_int(dp->dp_meta_objset, -+ dd->dd_phys->dd_clones, o, tx)); - dsl_dataset_rele(cnds, FTAG); -@@ -2919,3 +2161,3 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- ASSERT0(dsl_prop_numcb(ds)); -+ ASSERT(!dsl_prop_hascb(ds)); - } -@@ -2929,28 +2171,27 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- delta = pa->cloneusedsnap - -+ delta = ddpa->cloneusedsnap - - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, >=, 0); -- ASSERT3U(pa->used, >=, delta); -+ ASSERT3U(ddpa->used, >=, delta); - dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(dd, DD_USED_HEAD, -- pa->used - delta, pa->comp, pa->uncomp, tx); -+ ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); - -- delta = pa->originusedsnap - -+ delta = ddpa->originusedsnap - - odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; - ASSERT3S(delta, <=, 0); -- ASSERT3U(pa->used, >=, -delta); -+ ASSERT3U(ddpa->used, >=, -delta); - dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); - dsl_dir_diduse_space(odd, DD_USED_HEAD, -- -pa->used - delta, -pa->comp, -pa->uncomp, tx); -+ -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); - -- origin_ds->ds_phys->ds_unique_bytes = pa->unique; -+ origin_ds->ds_phys->ds_unique_bytes = ddpa->unique; - - /* log history record */ -- spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, -- "dataset = %llu", hds->ds_object); -+ spa_history_log_internal_ds(hds, "promote", tx, ""); - -- dsl_dir_close(odd, FTAG); -+ dsl_dir_rele(odd, FTAG); -+ promote_rele(ddpa, FTAG); - } - --static char *snaplist_tag = "snaplist"; - /* -@@ -2962,4 +2203,4 @@ static char *snaplist_tag = "snaplist"; - static int --snaplist_make(dsl_pool_t *dp, boolean_t own, -- uint64_t first_obj, uint64_t last_obj, list_t *l) -+snaplist_make(dsl_pool_t *dp, -+ uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) - { -@@ -2967,4 +2208,2 @@ snaplist_make(dsl_pool_t *dp, boolean_t own, 
- -- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); -- - list_create(l, sizeof (struct promotenode), -@@ -2977,19 +2216,6 @@ snaplist_make(dsl_pool_t *dp, boolean_t own, - -- if (own) { -- err = dsl_dataset_own_obj(dp, obj, -- 0, snaplist_tag, &ds); -- if (err == 0) -- dsl_dataset_make_exclusive(ds, snaplist_tag); -- } else { -- err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); -- } -- if (err == ENOENT) { -- /* lost race with snapshot destroy */ -- struct promotenode *last = list_tail(l); -- ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); -- obj = last->ds->ds_phys->ds_prev_snap_obj; -- continue; -- } else if (err) { -+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds); -+ ASSERT(err != ENOENT); -+ if (err != 0) - return (err); -- } - -@@ -2998,3 +2224,3 @@ snaplist_make(dsl_pool_t *dp, boolean_t own, - -- snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); -+ snap = kmem_alloc(sizeof (*snap), KM_PUSHPAGE); - snap->ds = ds; -@@ -3023,3 +2249,3 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) - static void --snaplist_destroy(list_t *l, boolean_t own) -+snaplist_destroy(list_t *l, void *tag) - { -@@ -3027,3 +2253,3 @@ snaplist_destroy(list_t *l, boolean_t own) - -- if (!l || !list_link_active(&l->list_head)) -+ if (l == NULL || !list_link_active(&l->list_head)) - return; -@@ -3032,7 +2258,4 @@ snaplist_destroy(list_t *l, boolean_t own) - list_remove(l, snap); -- if (own) -- dsl_dataset_disown(snap->ds, snaplist_tag); -- else -- dsl_dataset_rele(snap->ds, snaplist_tag); -- kmem_free(snap, sizeof (struct promotenode)); -+ dsl_dataset_rele(snap->ds, tag); -+ kmem_free(snap, sizeof (*snap)); - } -@@ -3041,63 +2264,37 @@ snaplist_destroy(list_t *l, boolean_t own) - --/* -- * Promote a clone. Nomenclature note: -- * "clone" or "cds": the original clone which is being promoted -- * "origin" or "ods": the snapshot which is originally clone's origin -- * "origin head" or "ohds": the dataset which is the head -- * (filesystem/volume) for the origin -- * "origin origin": the origin of the origin's filesystem (typically -- * NULL, indicating that the clone is not a clone of a clone). -- */ --int --dsl_dataset_promote(const char *name, char *conflsnap) -+static int -+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) - { -- dsl_dataset_t *ds; -+ int error; - dsl_dir_t *dd; -- dsl_pool_t *dp; -- dmu_object_info_t doi; -- struct promotearg pa; - struct promotenode *snap; -- int err; - -- bzero(&pa, sizeof(struct promotearg)); -- err = dsl_dataset_hold(name, FTAG, &ds); -- if (err) -- return (err); -- dd = ds->ds_dir; -- dp = dd->dd_pool; -- -- err = dmu_object_info(dp->dp_meta_objset, -- ds->ds_phys->ds_snapnames_zapobj, &doi); -- if (err) { -- dsl_dataset_rele(ds, FTAG); -- return (err); -- } -+ error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, -+ &ddpa->ddpa_clone); -+ if (error != 0) -+ return (error); -+ dd = ddpa->ddpa_clone->ds_dir; - -- if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { -- dsl_dataset_rele(ds, FTAG); -- return (EINVAL); -+ if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) || -+ !dsl_dir_is_clone(dd)) { -+ dsl_dataset_rele(ddpa->ddpa_clone, tag); -+ return (SET_ERROR(EINVAL)); - } - -- /* -- * We are going to inherit all the snapshots taken before our -- * origin (i.e., our new origin will be our parent's origin). -- * Take ownership of them so that we can rename them into our -- * namespace. 
-- */ -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- -- err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, -- &pa.shared_snaps); -- if (err != 0) -+ error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj, -+ &ddpa->shared_snaps, tag); -+ if (error != 0) - goto out; - -- err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); -- if (err != 0) -+ error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, -+ &ddpa->clone_snaps, tag); -+ if (error != 0) - goto out; - -- snap = list_head(&pa.shared_snaps); -+ snap = list_head(&ddpa->shared_snaps); - ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); -- err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, -- snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); -- if (err != 0) -+ error = snaplist_make(dp, dd->dd_phys->dd_origin_obj, -+ snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, -+ &ddpa->origin_snaps, tag); -+ if (error != 0) - goto out; -@@ -3105,86 +2302,109 @@ dsl_dataset_promote(const char *name, char *conflsnap) - if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { -- err = dsl_dataset_hold_obj(dp, -+ error = dsl_dataset_hold_obj(dp, - snap->ds->ds_dir->dd_phys->dd_origin_obj, -- FTAG, &pa.origin_origin); -- if (err != 0) -+ tag, &ddpa->origin_origin); -+ if (error != 0) - goto out; - } -- - out: -- rw_exit(&dp->dp_config_rwlock); -+ if (error != 0) -+ promote_rele(ddpa, tag); -+ return (error); -+} -+ -+static void -+promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) -+{ -+ snaplist_destroy(&ddpa->shared_snaps, tag); -+ snaplist_destroy(&ddpa->clone_snaps, tag); -+ snaplist_destroy(&ddpa->origin_snaps, tag); -+ if (ddpa->origin_origin != NULL) -+ dsl_dataset_rele(ddpa->origin_origin, tag); -+ dsl_dataset_rele(ddpa->ddpa_clone, tag); -+} -+ -+/* -+ * Promote a clone. -+ * -+ * If it fails due to a conflicting snapshot name, "conflsnap" will be filled -+ * in with the name. (It must be at least MAXNAMELEN bytes long.) -+ */ -+int -+dsl_dataset_promote(const char *name, char *conflsnap) -+{ -+ dsl_dataset_promote_arg_t ddpa = { 0 }; -+ uint64_t numsnaps; -+ int error; -+ objset_t *os; - - /* -- * Add in 128x the snapnames zapobj size, since we will be moving -- * a bunch of snapnames to the promoted ds, and dirtying their -- * bonus buffers. -+ * We will modify space proportional to the number of -+ * snapshots. Compute numsnaps. 
- */ -- if (err == 0) { -- err = dsl_sync_task_do(dp, dsl_dataset_promote_check, -- dsl_dataset_promote_sync, ds, &pa, -- 2 + 2 * doi.doi_physical_blocks_512); -- if (err && pa.err_ds && conflsnap) -- (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); -- } -+ error = dmu_objset_hold(name, FTAG, &os); -+ if (error != 0) -+ return (error); -+ error = zap_count(dmu_objset_pool(os)->dp_meta_objset, -+ dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps); -+ dmu_objset_rele(os, FTAG); -+ if (error != 0) -+ return (error); - -- snaplist_destroy(&pa.shared_snaps, B_TRUE); -- snaplist_destroy(&pa.clone_snaps, B_FALSE); -- snaplist_destroy(&pa.origin_snaps, B_FALSE); -- if (pa.origin_origin) -- dsl_dataset_rele(pa.origin_origin, FTAG); -- dsl_dataset_rele(ds, FTAG); -- return (err); --} -+ ddpa.ddpa_clonename = name; -+ ddpa.err_ds = conflsnap; - --struct cloneswaparg { -- dsl_dataset_t *cds; /* clone dataset */ -- dsl_dataset_t *ohds; /* origin's head dataset */ -- boolean_t force; -- int64_t unused_refres_delta; /* change in unconsumed refreservation */ --}; -+ return (dsl_sync_task(name, dsl_dataset_promote_check, -+ dsl_dataset_promote_sync, &ddpa, 2 + numsnaps)); -+} - --/* ARGSUSED */ --static int --dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) -+int -+dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, -+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) - { -- struct cloneswaparg *csa = arg1; -+ int64_t unused_refres_delta; - - /* they should both be heads */ -- if (dsl_dataset_is_snapshot(csa->cds) || -- dsl_dataset_is_snapshot(csa->ohds)) -- return (EINVAL); -+ if (dsl_dataset_is_snapshot(clone) || -+ dsl_dataset_is_snapshot(origin_head)) -+ return (SET_ERROR(EINVAL)); - -- /* the branch point should be just before them */ -- if (csa->cds->ds_prev != csa->ohds->ds_prev) -- return (EINVAL); -+ /* if we are not forcing, the branch point should be just before them */ -+ if (!force && clone->ds_prev != origin_head->ds_prev) -+ return (SET_ERROR(EINVAL)); - -- /* cds should be the clone (unless they are unrelated) */ -- if (csa->cds->ds_prev != NULL && -- csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && -- csa->ohds->ds_object != -- csa->cds->ds_prev->ds_phys->ds_next_snap_obj) -- return (EINVAL); -+ /* clone should be the clone (unless they are unrelated) */ -+ if (clone->ds_prev != NULL && -+ clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && -+ origin_head->ds_dir != clone->ds_prev->ds_dir) -+ return (SET_ERROR(EINVAL)); - - /* the clone should be a child of the origin */ -- if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) -- return (EINVAL); -- -- /* ohds shouldn't be modified unless 'force' */ -- if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) -- return (ETXTBSY); -- -- /* adjust amount of any unconsumed refreservation */ -- csa->unused_refres_delta = -- (int64_t)MIN(csa->ohds->ds_reserved, -- csa->ohds->ds_phys->ds_unique_bytes) - -- (int64_t)MIN(csa->ohds->ds_reserved, -- csa->cds->ds_phys->ds_unique_bytes); -- -- if (csa->unused_refres_delta > 0 && -- csa->unused_refres_delta > -- dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) -- return (ENOSPC); -- -- if (csa->ohds->ds_quota != 0 && -- csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) -- return (EDQUOT); -+ if (clone->ds_dir->dd_parent != origin_head->ds_dir) -+ return (SET_ERROR(EINVAL)); -+ -+ /* origin_head shouldn't be modified unless 'force' */ -+ if (!force && -+ dsl_dataset_modified_since_snap(origin_head, 
origin_head->ds_prev)) -+ return (SET_ERROR(ETXTBSY)); -+ -+ /* origin_head should have no long holds (e.g. is not mounted) */ -+ if (dsl_dataset_handoff_check(origin_head, owner, tx)) -+ return (SET_ERROR(EBUSY)); -+ -+ /* check amount of any unconsumed refreservation */ -+ unused_refres_delta = -+ (int64_t)MIN(origin_head->ds_reserved, -+ origin_head->ds_phys->ds_unique_bytes) - -+ (int64_t)MIN(origin_head->ds_reserved, -+ clone->ds_phys->ds_unique_bytes); -+ -+ if (unused_refres_delta > 0 && -+ unused_refres_delta > -+ dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) -+ return (SET_ERROR(ENOSPC)); -+ -+ /* clone can't be over the head's refquota */ -+ if (origin_head->ds_quota != 0 && -+ clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota) -+ return (SET_ERROR(EDQUOT)); - -@@ -3193,26 +2413,33 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) - --/* ARGSUSED */ --static void --dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+void -+dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, -+ dsl_dataset_t *origin_head, dmu_tx_t *tx) - { -- struct cloneswaparg *csa = arg1; -- dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ int64_t unused_refres_delta; - -- ASSERT(csa->cds->ds_reserved == 0); -- ASSERT(csa->ohds->ds_quota == 0 || -- csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); -+ ASSERT(clone->ds_reserved == 0); -+ ASSERT(origin_head->ds_quota == 0 || -+ clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota); -+ ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); - -- dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); -- dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); -+ dmu_buf_will_dirty(clone->ds_dbuf, tx); -+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx); - -- if (csa->cds->ds_objset != NULL) { -- dmu_objset_evict(csa->cds->ds_objset); -- csa->cds->ds_objset = NULL; -+ if (clone->ds_objset != NULL) { -+ dmu_objset_evict(clone->ds_objset); -+ clone->ds_objset = NULL; - } - -- if (csa->ohds->ds_objset != NULL) { -- dmu_objset_evict(csa->ohds->ds_objset); -- csa->ohds->ds_objset = NULL; -+ if (origin_head->ds_objset != NULL) { -+ dmu_objset_evict(origin_head->ds_objset); -+ origin_head->ds_objset = NULL; - } - -+ unused_refres_delta = -+ (int64_t)MIN(origin_head->ds_reserved, -+ origin_head->ds_phys->ds_unique_bytes) - -+ (int64_t)MIN(origin_head->ds_reserved, -+ clone->ds_phys->ds_unique_bytes); -+ - /* -@@ -3220,4 +2447,4 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (csa->cds->ds_prev) { -- dsl_dataset_t *origin = csa->cds->ds_prev; -+ if (clone->ds_prev) { -+ dsl_dataset_t *origin = clone->ds_prev; - uint64_t comp, uncomp; -@@ -3225,3 +2452,3 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - dmu_buf_will_dirty(origin->ds_dbuf, tx); -- dsl_deadlist_space_range(&csa->cds->ds_deadlist, -+ dsl_deadlist_space_range(&clone->ds_deadlist, - origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, -@@ -3233,5 +2460,5 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - blkptr_t tmp; -- tmp = csa->ohds->ds_phys->ds_bp; -- csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; -- csa->cds->ds_phys->ds_bp = tmp; -+ tmp = origin_head->ds_phys->ds_bp; -+ origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp; -+ clone->ds_phys->ds_bp = tmp; - } -@@ -3244,21 +2471,21 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- ASSERT3U(csa->cds->ds_dir->dd_phys-> -+ ASSERT3U(clone->ds_dir->dd_phys-> - 
dd_used_breakdown[DD_USED_SNAP], ==, 0); - -- dsl_deadlist_space(&csa->cds->ds_deadlist, -+ dsl_deadlist_space(&clone->ds_deadlist, - &cdl_used, &cdl_comp, &cdl_uncomp); -- dsl_deadlist_space(&csa->ohds->ds_deadlist, -+ dsl_deadlist_space(&origin_head->ds_deadlist, - &odl_used, &odl_comp, &odl_uncomp); - -- dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used - -- (csa->ohds->ds_phys->ds_referenced_bytes + odl_used); -- dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - -- (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); -- duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + -+ dused = clone->ds_phys->ds_referenced_bytes + cdl_used - -+ (origin_head->ds_phys->ds_referenced_bytes + odl_used); -+ dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp - -+ (origin_head->ds_phys->ds_compressed_bytes + odl_comp); -+ duncomp = clone->ds_phys->ds_uncompressed_bytes + - cdl_uncomp - -- (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); -+ (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp); - -- dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, -+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, - dused, dcomp, duncomp, tx); -- dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, -+ dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, - -dused, -dcomp, -duncomp, tx); -@@ -3271,9 +2498,9 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- dsl_deadlist_space_range(&csa->cds->ds_deadlist, -- csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, -+ dsl_deadlist_space_range(&clone->ds_deadlist, -+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX, - &cdl_used, &cdl_comp, &cdl_uncomp); -- dsl_deadlist_space_range(&csa->ohds->ds_deadlist, -- csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, -+ dsl_deadlist_space_range(&origin_head->ds_deadlist, -+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX, - &odl_used, &odl_comp, &odl_uncomp); -- dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, -+ dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, - DD_USED_HEAD, DD_USED_SNAP, tx); -@@ -3282,14 +2509,14 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - /* swap ds_*_bytes */ -- SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes, -- csa->cds->ds_phys->ds_referenced_bytes); -- SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, -- csa->cds->ds_phys->ds_compressed_bytes); -- SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, -- csa->cds->ds_phys->ds_uncompressed_bytes); -- SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, -- csa->cds->ds_phys->ds_unique_bytes); -+ SWITCH64(origin_head->ds_phys->ds_referenced_bytes, -+ clone->ds_phys->ds_referenced_bytes); -+ SWITCH64(origin_head->ds_phys->ds_compressed_bytes, -+ clone->ds_phys->ds_compressed_bytes); -+ SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes, -+ clone->ds_phys->ds_uncompressed_bytes); -+ SWITCH64(origin_head->ds_phys->ds_unique_bytes, -+ clone->ds_phys->ds_unique_bytes); - - /* apply any parent delta for change in unconsumed refreservation */ -- dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, -- csa->unused_refres_delta, 0, 0, tx); -+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, -+ unused_refres_delta, 0, 0, tx); - -@@ -3298,52 +2525,15 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- dsl_deadlist_close(&csa->cds->ds_deadlist); -- dsl_deadlist_close(&csa->ohds->ds_deadlist); -- SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, -- csa->cds->ds_phys->ds_deadlist_obj); -- dsl_deadlist_open(&csa->cds->ds_deadlist, 
dp->dp_meta_objset, -- csa->cds->ds_phys->ds_deadlist_obj); -- dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, -- csa->ohds->ds_phys->ds_deadlist_obj); -- -- dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); --} -+ dsl_deadlist_close(&clone->ds_deadlist); -+ dsl_deadlist_close(&origin_head->ds_deadlist); -+ SWITCH64(origin_head->ds_phys->ds_deadlist_obj, -+ clone->ds_phys->ds_deadlist_obj); -+ dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, -+ clone->ds_phys->ds_deadlist_obj); -+ dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, -+ origin_head->ds_phys->ds_deadlist_obj); - --/* -- * Swap 'clone' with its origin head datasets. Used at the end of "zfs -- * recv" into an existing fs to swizzle the file system to the new -- * version, and by "zfs rollback". Can also be used to swap two -- * independent head datasets if neither has any snapshots. -- */ --int --dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, -- boolean_t force) --{ -- struct cloneswaparg csa; -- int error; -+ dsl_scan_ds_clone_swapped(origin_head, clone, tx); - -- ASSERT(clone->ds_owner); -- ASSERT(origin_head->ds_owner); --retry: -- /* -- * Need exclusive access for the swap. If we're swapping these -- * datasets back after an error, we already hold the locks. -- */ -- if (!RW_WRITE_HELD(&clone->ds_rwlock)) -- rw_enter(&clone->ds_rwlock, RW_WRITER); -- if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && -- !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { -- rw_exit(&clone->ds_rwlock); -- rw_enter(&origin_head->ds_rwlock, RW_WRITER); -- if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { -- rw_exit(&origin_head->ds_rwlock); -- goto retry; -- } -- } -- csa.cds = clone; -- csa.ohds = origin_head; -- csa.force = force; -- error = dsl_sync_task_do(clone->ds_dir->dd_pool, -- dsl_dataset_clone_swap_check, -- dsl_dataset_clone_swap_sync, &csa, NULL, 9); -- return (error); -+ spa_history_log_internal_ds(clone, "clone swap", tx, -+ "parent=%s", origin_head->ds_dir->dd_myname); - } -@@ -3357,3 +2547,2 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) - { -- spa_t *spa; - dsl_pool_t *dp; -@@ -3362,7 +2551,8 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) - -- if ((error = spa_open(pname, &spa, FTAG)) != 0) -+ error = dsl_pool_hold(pname, FTAG, &dp); -+ if (error != 0) - return (error); -- dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { -+ -+ error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); -+ if (error == 0) { - dsl_dataset_name(ds, buf); -@@ -3370,4 +2560,3 @@ dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) - } -- rw_exit(&dp->dp_config_rwlock); -- spa_close(spa, FTAG); -+ dsl_pool_rele(dp, FTAG); - -@@ -3415,7 +2604,5 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - ds->ds_phys->ds_referenced_bytes < ds->ds_quota) -- error = ERESTART; -+ error = SET_ERROR(ERESTART); - else -- error = EDQUOT; -- -- DMU_TX_STAT_BUMP(dmu_tx_quota); -+ error = SET_ERROR(EDQUOT); - } -@@ -3426,23 +2613,51 @@ dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - -+typedef struct dsl_dataset_set_qr_arg { -+ const char *ddsqra_name; -+ zprop_source_t ddsqra_source; -+ uint64_t ddsqra_value; -+} dsl_dataset_set_qr_arg_t; -+ -+ - /* ARGSUSED */ - static int --dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = 
arg2; -- int err; -+ dsl_dataset_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ uint64_t newval; - -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) -- return (ENOTSUP); -+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) -+ return (SET_ERROR(ENOTSUP)); - -- if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) -- return (err); -+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); -+ if (error != 0) -+ return (error); - -- if (psa->psa_effective_value == 0) -+ if (dsl_dataset_is_snapshot(ds)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } -+ -+ error = dsl_prop_predict(ds->ds_dir, -+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } -+ -+ if (newval == 0) { -+ dsl_dataset_rele(ds, FTAG); - return (0); -+ } - -- if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes || -- psa->psa_effective_value < ds->ds_reserved) -- return (ENOSPC); -+ if (newval < ds->ds_phys->ds_referenced_bytes || -+ newval < ds->ds_reserved) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENOSPC)); -+ } - -+ dsl_dataset_rele(ds, FTAG); - return (0); -@@ -3450,18 +2665,25 @@ dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) - --extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); -- --void --dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+static void -+dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value = psa->psa_effective_value; -+ dsl_dataset_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ uint64_t newval; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); -+ -+ dsl_prop_set_sync_impl(ds, -+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), -+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, -+ &ddsqra->ddsqra_value, tx); - -- dsl_prop_set_sync(ds, psa, tx); -- DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); -+ VERIFY0(dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); - -- if (ds->ds_quota != effective_value) { -+ if (ds->ds_quota != newval) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); -- ds->ds_quota = effective_value; -+ ds->ds_quota = newval; - } -+ dsl_dataset_rele(ds, FTAG); - } -@@ -3469,26 +2691,13 @@ dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) - int --dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) -+dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, -+ uint64_t refquota) - { -- dsl_dataset_t *ds; -- dsl_prop_setarg_t psa; -- int err; -- -- dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); -- -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -+ dsl_dataset_set_qr_arg_t ddsqra; - -- /* -- * If someone removes a file, then tries to set the quota, we -- * want to make sure the file freeing takes effect. 
-- */ -- txg_wait_open(ds->ds_dir->dd_pool, 0); -+ ddsqra.ddsqra_name = dsname; -+ ddsqra.ddsqra_source = source; -+ ddsqra.ddsqra_value = refquota; - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, -- ds, &psa, 0); -- -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, -+ dsl_dataset_set_refquota_sync, &ddsqra, 0)); - } -@@ -3496,21 +2705,29 @@ dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) - static int --dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value; -- uint64_t unique; -- int err; -+ dsl_dataset_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ uint64_t newval, unique; - -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < -- SPA_VERSION_REFRESERVATION) -- return (ENOTSUP); -+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) -+ return (SET_ERROR(ENOTSUP)); - -- if (dsl_dataset_is_snapshot(ds)) -- return (EINVAL); -+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); -+ if (error != 0) -+ return (error); - -- if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) -- return (err); -+ if (dsl_dataset_is_snapshot(ds)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } - -- effective_value = psa->psa_effective_value; -+ error = dsl_prop_predict(ds->ds_dir, -+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } - -@@ -3520,4 +2737,6 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (!dmu_tx_is_syncing(tx)) -+ if (!dmu_tx_is_syncing(tx)) { -+ dsl_dataset_rele(ds, FTAG); - return (0); -+ } - -@@ -3529,13 +2748,15 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - -- if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { -- uint64_t delta = MAX(unique, effective_value) - -+ if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { -+ uint64_t delta = MAX(unique, newval) - - MAX(unique, ds->ds_reserved); - -- if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) -- return (ENOSPC); -- if (ds->ds_quota > 0 && -- effective_value > ds->ds_quota) -- return (ENOSPC); -+ if (delta > -+ dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || -+ (ds->ds_quota > 0 && newval > ds->ds_quota)) { -+ dsl_dataset_rele(ds, FTAG); -+ return (SET_ERROR(ENOSPC)); -+ } - } - -+ dsl_dataset_rele(ds, FTAG); - return (0); -@@ -3543,8 +2764,7 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - --static void --dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+void -+dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, -+ zprop_source_t source, uint64_t value, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value = psa->psa_effective_value; -+ uint64_t newval; - uint64_t unique; -@@ -3552,7 +2772,9 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- dsl_prop_set_sync(ds, psa, tx); -- DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); -+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), -+ source, sizeof (value), 1, &value, tx); - -- 
dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ VERIFY0(dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); - -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_dir->dd_lock); -@@ -3561,5 +2783,5 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - unique = ds->ds_phys->ds_unique_bytes; -- delta = MAX(0, (int64_t)(effective_value - unique)) - -+ delta = MAX(0, (int64_t)(newval - unique)) - - MAX(0, (int64_t)(ds->ds_reserved - unique)); -- ds->ds_reserved = effective_value; -+ ds->ds_reserved = newval; - mutex_exit(&ds->ds_lock); -@@ -3570,596 +2792,28 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - --int --dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, -- uint64_t reservation) --{ -- dsl_dataset_t *ds; -- dsl_prop_setarg_t psa; -- int err; -- -- dsl_prop_setarg_init_uint64(&psa, "refreservation", source, -- &reservation); -- -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -- -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_set_reservation_check, -- dsl_dataset_set_reservation_sync, ds, &psa, 0); -- -- dsl_dataset_rele(ds, FTAG); -- return (err); --} -- --typedef struct zfs_hold_cleanup_arg { -- dsl_pool_t *dp; -- uint64_t dsobj; -- char htag[MAXNAMELEN]; --} zfs_hold_cleanup_arg_t; -- - static void --dsl_dataset_user_release_onexit(void *arg) --{ -- zfs_hold_cleanup_arg_t *ca = arg; -- -- (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, -- B_TRUE); -- kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); --} -- --void --dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, -- minor_t minor) --{ -- zfs_hold_cleanup_arg_t *ca; -- -- ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); -- ca->dp = ds->ds_dir->dd_pool; -- ca->dsobj = ds->ds_object; -- (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); -- VERIFY3U(0, ==, zfs_onexit_add_cb(minor, -- dsl_dataset_user_release_onexit, ca, NULL)); --} -- --/* -- * If you add new checks here, you may need to add -- * additional checks to the "temporary" case in -- * snapshot_check() in dmu_objset.c. -- */ --static int --dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- struct dsl_ds_holdarg *ha = arg2; -- char *htag = ha->htag; -- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -- int error = 0; -- -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) -- return (ENOTSUP); -- -- if (!dsl_dataset_is_snapshot(ds)) -- return (EINVAL); -- -- /* tags must be unique */ -- mutex_enter(&ds->ds_lock); -- if (ds->ds_phys->ds_userrefs_obj) { -- error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, -- 8, 1, tx); -- if (error == 0) -- error = EEXIST; -- else if (error == ENOENT) -- error = 0; -- } -- mutex_exit(&ds->ds_lock); -- -- if (error == 0 && ha->temphold && -- strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) -- error = E2BIG; -- -- return (error); --} -- --void --dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- struct dsl_ds_holdarg *ha = arg2; -- char *htag = ha->htag; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- uint64_t now = gethrestime_sec(); -- uint64_t zapobj; -- -- mutex_enter(&ds->ds_lock); -- if (ds->ds_phys->ds_userrefs_obj == 0) { -- /* -- * This is the first user hold for this dataset. Create -- * the userrefs zap object. 
-- */ -- dmu_buf_will_dirty(ds->ds_dbuf, tx); -- zapobj = ds->ds_phys->ds_userrefs_obj = -- zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); -- } else { -- zapobj = ds->ds_phys->ds_userrefs_obj; -- } -- ds->ds_userrefs++; -- mutex_exit(&ds->ds_lock); -- -- VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); -- -- if (ha->temphold) { -- VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, -- htag, &now, tx)); -- } -- -- spa_history_log_internal(LOG_DS_USER_HOLD, -- dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, -- (int)ha->temphold, ds->ds_object); --} -- --static int --dsl_dataset_user_hold_one(const char *dsname, void *arg) --{ -- struct dsl_ds_holdarg *ha = arg; -+ dsl_dataset_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; -- int error; -- char *name; -- -- /* alloc a buffer to hold dsname@snapname plus terminating NULL */ -- name = kmem_asprintf("%s@%s", dsname, ha->snapname); -- error = dsl_dataset_hold(name, ha->dstg, &ds); -- strfree(name); -- if (error == 0) { -- ha->gotone = B_TRUE; -- dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, -- dsl_dataset_user_hold_sync, ds, ha, 0); -- } else if (error == ENOENT && ha->recursive) { -- error = 0; -- } else { -- (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); -- } -- return (error); --} -- --int --dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, -- boolean_t temphold) --{ -- struct dsl_ds_holdarg *ha; -- int error; -- -- ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); -- ha->htag = htag; -- ha->temphold = temphold; -- error = dsl_sync_task_do(ds->ds_dir->dd_pool, -- dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, -- ds, ha, 0); -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- -- return (error); --} -- --int --dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, -- boolean_t recursive, boolean_t temphold, int cleanup_fd) --{ -- struct dsl_ds_holdarg *ha; -- dsl_sync_task_t *dst; -- spa_t *spa; -- int error; -- minor_t minor = 0; -- -- if (cleanup_fd != -1) { -- /* Currently we only support cleanup-on-exit of tempholds. */ -- if (!temphold) -- return (EINVAL); -- error = zfs_onexit_fd_hold(cleanup_fd, &minor); -- if (error) -- return (error); -- } -- -- ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); -- -- (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); -- -- error = spa_open(dsname, &spa, FTAG); -- if (error) { -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- if (cleanup_fd != -1) -- zfs_onexit_fd_rele(cleanup_fd); -- return (error); -- } -- -- ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- ha->htag = htag; -- ha->snapname = snapname; -- ha->recursive = recursive; -- ha->temphold = temphold; -- -- if (recursive) { -- error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, -- ha, DS_FIND_CHILDREN); -- } else { -- error = dsl_dataset_user_hold_one(dsname, ha); -- } -- if (error == 0) -- error = dsl_sync_task_group_wait(ha->dstg); -- -- for (dst = list_head(&ha->dstg->dstg_tasks); dst; -- dst = list_next(&ha->dstg->dstg_tasks, dst)) { -- dsl_dataset_t *ds = dst->dst_arg1; - -- if (dst->dst_err) { -- dsl_dataset_name(ds, ha->failed); -- *strchr(ha->failed, '@') = '\0'; -- } else if (error == 0 && minor != 0 && temphold) { -- /* -- * If this hold is to be released upon process exit, -- * register that action now. 
-- */ -- dsl_register_onexit_hold_cleanup(ds, htag, minor); -- } -- dsl_dataset_rele(ds, ha->dstg); -- } -- -- if (error == 0 && recursive && !ha->gotone) -- error = ENOENT; -- -- if (error) -- (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); -- -- dsl_sync_task_group_destroy(ha->dstg); -- -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- spa_close(spa, FTAG); -- if (cleanup_fd != -1) -- zfs_onexit_fd_rele(cleanup_fd); -- return (error); --} -- --struct dsl_ds_releasearg { -- dsl_dataset_t *ds; -- const char *htag; -- boolean_t own; /* do we own or just hold ds? */ --}; -- --static int --dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, -- boolean_t *might_destroy) --{ -- objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -- uint64_t zapobj; -- uint64_t tmp; -- int error; -- -- *might_destroy = B_FALSE; -- -- mutex_enter(&ds->ds_lock); -- zapobj = ds->ds_phys->ds_userrefs_obj; -- if (zapobj == 0) { -- /* The tag can't possibly exist */ -- mutex_exit(&ds->ds_lock); -- return (ESRCH); -- } -- -- /* Make sure the tag exists */ -- error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); -- if (error) { -- mutex_exit(&ds->ds_lock); -- if (error == ENOENT) -- error = ESRCH; -- return (error); -- } -- -- if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && -- DS_IS_DEFER_DESTROY(ds)) -- *might_destroy = B_TRUE; -- -- mutex_exit(&ds->ds_lock); -- return (0); --} -- --static int --dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) --{ -- struct dsl_ds_releasearg *ra = arg1; -- dsl_dataset_t *ds = ra->ds; -- boolean_t might_destroy; -- int error; -- -- if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) -- return (ENOTSUP); -- -- error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); -- if (error) -- return (error); -- -- if (might_destroy) { -- struct dsl_ds_destroyarg dsda = {0}; -- -- if (dmu_tx_is_syncing(tx)) { -- /* -- * If we're not prepared to remove the snapshot, -- * we can't allow the release to happen right now. 
-- */ -- if (!ra->own) -- return (EBUSY); -- } -- dsda.ds = ds; -- dsda.releasing = B_TRUE; -- return (dsl_dataset_destroy_check(&dsda, tag, tx)); -- } -- -- return (0); --} -- --static void --dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) --{ -- struct dsl_ds_releasearg *ra = arg1; -- dsl_dataset_t *ds = ra->ds; -- dsl_pool_t *dp = ds->ds_dir->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- uint64_t zapobj; -- uint64_t dsobj = ds->ds_object; -- uint64_t refs; -- int error; -- -- mutex_enter(&ds->ds_lock); -- ds->ds_userrefs--; -- refs = ds->ds_userrefs; -- mutex_exit(&ds->ds_lock); -- error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); -- VERIFY(error == 0 || error == ENOENT); -- zapobj = ds->ds_phys->ds_userrefs_obj; -- VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); -- -- spa_history_log_internal(LOG_DS_USER_RELEASE, -- dp->dp_spa, tx, "<%s> %lld dataset = %llu", -- ra->htag, (longlong_t)refs, dsobj); -- -- if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && -- DS_IS_DEFER_DESTROY(ds)) { -- struct dsl_ds_destroyarg dsda = {0}; -- -- ASSERT(ra->own); -- dsda.ds = ds; -- dsda.releasing = B_TRUE; -- /* We already did the destroy_check */ -- dsl_dataset_destroy_sync(&dsda, tag, tx); -- } --} -- --static int --dsl_dataset_user_release_one(const char *dsname, void *arg) --{ -- struct dsl_ds_holdarg *ha = arg; -- struct dsl_ds_releasearg *ra; -- dsl_dataset_t *ds; -- int error; -- void *dtag = ha->dstg; -- char *name; -- boolean_t own = B_FALSE; -- boolean_t might_destroy; -- -- /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ -- name = kmem_asprintf("%s@%s", dsname, ha->snapname); -- error = dsl_dataset_hold(name, dtag, &ds); -- strfree(name); -- if (error == ENOENT && ha->recursive) -- return (0); -- (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); -- if (error) -- return (error); -- -- ha->gotone = B_TRUE; -- -- ASSERT(dsl_dataset_is_snapshot(ds)); -- -- error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); -- if (error) { -- dsl_dataset_rele(ds, dtag); -- return (error); -- } -- -- if (might_destroy) { --#ifdef _KERNEL -- name = kmem_asprintf("%s@%s", dsname, ha->snapname); -- error = zfs_unmount_snap(name, NULL); -- strfree(name); -- if (error) { -- dsl_dataset_rele(ds, dtag); -- return (error); -- } --#endif -- if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { -- dsl_dataset_rele(ds, dtag); -- return (EBUSY); -- } else { -- own = B_TRUE; -- dsl_dataset_make_exclusive(ds, dtag); -- } -- } -- -- ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); -- ra->ds = ds; -- ra->htag = ha->htag; -- ra->own = own; -- dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, -- dsl_dataset_user_release_sync, ra, dtag, 0); -- -- return (0); --} -- --int --dsl_dataset_user_release(char *dsname, char *snapname, char *htag, -- boolean_t recursive) --{ -- struct dsl_ds_holdarg *ha; -- dsl_sync_task_t *dst; -- spa_t *spa; -- int error; -- --top: -- ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); -- -- (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); -- -- error = spa_open(dsname, &spa, FTAG); -- if (error) { -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- return (error); -- } -- -- ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); -- ha->htag = htag; -- ha->snapname = snapname; -- ha->recursive = recursive; -- if (recursive) { -- error = dmu_objset_find(dsname, dsl_dataset_user_release_one, -- ha, DS_FIND_CHILDREN); -- } else { -- error = 
dsl_dataset_user_release_one(dsname, ha); -- } -- if (error == 0) -- error = dsl_sync_task_group_wait(ha->dstg); -- -- for (dst = list_head(&ha->dstg->dstg_tasks); dst; -- dst = list_next(&ha->dstg->dstg_tasks, dst)) { -- struct dsl_ds_releasearg *ra = dst->dst_arg1; -- dsl_dataset_t *ds = ra->ds; -- -- if (dst->dst_err) -- dsl_dataset_name(ds, ha->failed); -- -- if (ra->own) -- dsl_dataset_disown(ds, ha->dstg); -- else -- dsl_dataset_rele(ds, ha->dstg); -- -- kmem_free(ra, sizeof (struct dsl_ds_releasearg)); -- } -- -- if (error == 0 && recursive && !ha->gotone) -- error = ENOENT; -- -- if (error && error != EBUSY) -- (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); -- -- dsl_sync_task_group_destroy(ha->dstg); -- kmem_free(ha, sizeof (struct dsl_ds_holdarg)); -- spa_close(spa, FTAG); -- -- /* -- * We can get EBUSY if we were racing with deferred destroy and -- * dsl_dataset_user_release_check() hadn't done the necessary -- * open context setup. We can also get EBUSY if we're racing -- * with destroy and that thread is the ds_owner. Either way -- * the busy condition should be transient, and we should retry -- * the release operation. -- */ -- if (error == EBUSY) -- goto top; -- -- return (error); --} -- --/* -- * Called at spa_load time (with retry == B_FALSE) to release a stale -- * temporary user hold. Also called by the onexit code (with retry == B_TRUE). -- */ --int --dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, -- boolean_t retry) --{ -- dsl_dataset_t *ds; -- char *snap; -- char *name; -- int namelen; -- int error; -- -- do { -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- if (error) -- return (error); -- namelen = dsl_dataset_namelen(ds)+1; -- name = kmem_alloc(namelen, KM_SLEEP); -- dsl_dataset_name(ds, name); -- dsl_dataset_rele(ds, FTAG); -- -- snap = strchr(name, '@'); -- *snap = '\0'; -- ++snap; -- error = dsl_dataset_user_release(name, snap, htag, B_FALSE); -- kmem_free(name, namelen); -- -- /* -- * The object can't have been destroyed because we have a hold, -- * but it might have been renamed, resulting in ENOENT. Retry -- * if we've been requested to do so. -- * -- * It would be nice if we could use the dsobj all the way -- * through and avoid ENOENT entirely. But we might need to -- * unmount the snapshot, and there's currently no way to lookup -- * a vfsp using a ZFS object id. -- */ -- } while ((error == ENOENT) && retry); -- -- return (error); --} -- --int --dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) --{ -- dsl_dataset_t *ds; -- int err; -- -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -- -- VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); -- if (ds->ds_phys->ds_userrefs_obj != 0) { -- zap_attribute_t *za; -- zap_cursor_t zc; -- -- za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); -- for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, -- ds->ds_phys->ds_userrefs_obj); -- zap_cursor_retrieve(&zc, za) == 0; -- zap_cursor_advance(&zc)) { -- VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, -- za->za_first_integer)); -- } -- zap_cursor_fini(&zc); -- kmem_free(za, sizeof (zap_attribute_t)); -- } -+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); -+ dsl_dataset_set_refreservation_sync_impl(ds, -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); - dsl_dataset_rele(ds, FTAG); -- return (0); - } - --/* -- * Note, this function is used as the callback for dmu_objset_find(). 
We -- * always return 0 so that we will continue to find and process -- * inconsistent datasets, even if we encounter an error trying to -- * process one of them. -- */ --/* ARGSUSED */ - int --dsl_destroy_inconsistent(const char *dsname, void *arg) -+dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, -+ uint64_t refreservation) - { -- dsl_dataset_t *ds; -+ dsl_dataset_set_qr_arg_t ddsqra; - -- if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { -- if (DS_IS_INCONSISTENT(ds)) -- (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); -- else -- dsl_dataset_disown(ds, FTAG); -- } -- return (0); --} -+ ddsqra.ddsqra_name = dsname; -+ ddsqra.ddsqra_source = source; -+ ddsqra.ddsqra_value = refreservation; - -+ return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, -+ dsl_dataset_set_refreservation_sync, &ddsqra, 0)); -+} - -@@ -4190,2 +2844,4 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - -+ ASSERT(dsl_pool_config_held(dp)); -+ - *usedp = 0; -@@ -4202,3 +2858,2 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); - snapobj = new->ds_object; -@@ -4246,3 +2901,3 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - if (snapobj == 0) { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - break; -@@ -4251,3 +2906,2 @@ dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, - } -- rw_exit(&dp->dp_config_rwlock); - return (err); -@@ -4289,3 +2943,3 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - lastsnap->ds_phys->ds_creation_txg) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -4293,3 +2947,2 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); - snapobj = lastsnap->ds_phys->ds_next_snap_obj; -@@ -4314,3 +2967,2 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - } -- rw_exit(&dp->dp_config_rwlock); - return (err); -@@ -4318,4 +2970,40 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, - -+/* -+ * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. -+ * For example, they could both be snapshots of the same filesystem, and -+ * 'earlier' is before 'later'. Or 'earlier' could be the origin of -+ * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's -+ * filesystem. Or 'earlier' could be the origin's origin. 
-+ */ -+boolean_t -+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier) -+{ -+ dsl_pool_t *dp = later->ds_dir->dd_pool; -+ int error; -+ boolean_t ret; -+ dsl_dataset_t *origin; -+ -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ if (earlier->ds_phys->ds_creation_txg >= -+ later->ds_phys->ds_creation_txg) -+ return (B_FALSE); -+ -+ if (later->ds_dir == earlier->ds_dir) -+ return (B_TRUE); -+ if (!dsl_dir_is_clone(later->ds_dir)) -+ return (B_FALSE); -+ -+ if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) -+ return (B_TRUE); -+ error = dsl_dataset_hold_obj(dp, -+ later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin); -+ if (error != 0) -+ return (B_FALSE); -+ ret = dsl_dataset_is_before(origin, earlier); -+ dsl_dataset_rele(origin, FTAG); -+ return (ret); -+} -+ - #if defined(_KERNEL) && defined(HAVE_SPL) --EXPORT_SYMBOL(dmu_snapshots_destroy_nvl); - EXPORT_SYMBOL(dsl_dataset_hold); -@@ -4327,18 +3015,10 @@ EXPORT_SYMBOL(dsl_dataset_rele); - EXPORT_SYMBOL(dsl_dataset_disown); --EXPORT_SYMBOL(dsl_dataset_drop_ref); - EXPORT_SYMBOL(dsl_dataset_tryown); --EXPORT_SYMBOL(dsl_dataset_make_exclusive); - EXPORT_SYMBOL(dsl_dataset_create_sync); - EXPORT_SYMBOL(dsl_dataset_create_sync_dd); --EXPORT_SYMBOL(dsl_dataset_destroy); --EXPORT_SYMBOL(dsl_dataset_destroy_check); --EXPORT_SYMBOL(dsl_dataset_destroy_sync); - EXPORT_SYMBOL(dsl_dataset_snapshot_check); - EXPORT_SYMBOL(dsl_dataset_snapshot_sync); --EXPORT_SYMBOL(dsl_dataset_rename); - EXPORT_SYMBOL(dsl_dataset_promote); --EXPORT_SYMBOL(dsl_dataset_clone_swap); - EXPORT_SYMBOL(dsl_dataset_user_hold); - EXPORT_SYMBOL(dsl_dataset_user_release); --EXPORT_SYMBOL(dsl_dataset_user_release_tmp); - EXPORT_SYMBOL(dsl_dataset_get_holds); -@@ -4347,3 +3027,3 @@ EXPORT_SYMBOL(dsl_dataset_set_blkptr); - EXPORT_SYMBOL(dsl_dataset_get_spa); --EXPORT_SYMBOL(dsl_dataset_modified_since_lastsnap); -+EXPORT_SYMBOL(dsl_dataset_modified_since_snap); - EXPORT_SYMBOL(dsl_dataset_space_written); -@@ -4362,6 +3042,4 @@ EXPORT_SYMBOL(dsl_dsobj_to_dsname); - EXPORT_SYMBOL(dsl_dataset_check_quota); --EXPORT_SYMBOL(dsl_dataset_set_quota); --EXPORT_SYMBOL(dsl_dataset_set_quota_sync); --EXPORT_SYMBOL(dsl_dataset_set_reservation); --EXPORT_SYMBOL(dsl_destroy_inconsistent); -+EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl); -+EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl); - #endif -diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c -index 294932c..99670df 100644 ---- a/module/zfs/dsl_deleg.c -+++ b/module/zfs/dsl_deleg.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -109,3 +109,3 @@ dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) - if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -141,6 +141,6 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) - type != ZFS_DELEG_USER_SETS) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - - if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -149,11 +149,20 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) - -+typedef struct dsl_deleg_arg { -+ const char *dda_name; -+ nvlist_t *dda_nvlist; -+} dsl_deleg_arg_t; -+ - static void --dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- nvlist_t *nvp = arg2; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -+ dsl_deleg_arg_t *dda = arg; -+ dsl_dir_t *dd; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; - nvpair_t *whopair = NULL; -- uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; -+ uint64_t zapobj; -+ -+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); - -+ zapobj = dd->dd_phys->dd_deleg_zapobj; - if (zapobj == 0) { -@@ -164,3 +173,3 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- while ((whopair = nvlist_next_nvpair(nvp, whopair))) { -+ while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) { - const char *whokey = nvpair_name(whopair); -@@ -170,3 +179,3 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); -+ perms = fnvpair_value_nvlist(whopair); - -@@ -183,8 +192,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - perm, 8, 1, &n, tx) == 0); -- spa_history_log_internal(LOG_DS_PERM_UPDATE, -- dd->dd_pool->dp_spa, tx, -- "%s %s dataset = %llu", whokey, perm, -- dd->dd_phys->dd_head_dataset_obj); -+ spa_history_log_internal_dd(dd, "permission update", tx, -+ "%s %s", whokey, perm); - } - } -+ dsl_dir_rele(dd, FTAG); - } -@@ -192,14 +200,19 @@ dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - static void --dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- nvlist_t *nvp = arg2; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -+ dsl_deleg_arg_t *dda = arg; -+ dsl_dir_t *dd; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; - nvpair_t *whopair = NULL; -- uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; -+ uint64_t zapobj; - -- if (zapobj == 0) -+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); -+ zapobj = dd->dd_phys->dd_deleg_zapobj; -+ if (zapobj == 0) { -+ dsl_dir_rele(dd, FTAG); - return; -+ } - -- while ((whopair = nvlist_next_nvpair(nvp, whopair))) { -+ while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) { - const char *whokey = nvpair_name(whopair); -@@ -215,6 +228,4 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } -- spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE, -- dd->dd_pool->dp_spa, tx, -- "%s dataset = %llu", whokey, -- dd->dd_phys->dd_head_dataset_obj); -+ spa_history_log_internal_dd(dd, "permission who remove", -+ tx, "%s", whokey); - continue; -@@ -236,37 +247,40 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx) - } -- spa_history_log_internal(LOG_DS_PERM_REMOVE, -- dd->dd_pool->dp_spa, tx, -- "%s %s dataset = %llu", whokey, perm, -- dd->dd_phys->dd_head_dataset_obj); -+ spa_history_log_internal_dd(dd, "permission 
remove", tx, -+ "%s %s", whokey, perm); - } - } -+ dsl_dir_rele(dd, FTAG); - } - --int --dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) -+static int -+dsl_deleg_check(void *arg, dmu_tx_t *tx) - { -+ dsl_deleg_arg_t *dda = arg; - dsl_dir_t *dd; - int error; -- nvpair_t *whopair = NULL; -- int blocks_modified = 0; - -- error = dsl_dir_open(ddname, FTAG, &dd, NULL); -- if (error) -- return (error); -- -- if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < -+ if (spa_version(dmu_tx_pool(tx)->dp_spa) < - SPA_VERSION_DELEGATED_PERMS) { -- dsl_dir_close(dd, FTAG); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } - -- while ((whopair = nvlist_next_nvpair(nvp, whopair))) -- blocks_modified++; -+ error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL); -+ if (error == 0) -+ dsl_dir_rele(dd, FTAG); -+ return (error); -+} - -- error = dsl_sync_task_do(dd->dd_pool, NULL, -- unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, -- dd, nvp, blocks_modified); -- dsl_dir_close(dd, FTAG); -+int -+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) -+{ -+ dsl_deleg_arg_t dda; - -- return (error); -+ /* nvp must already have been verified to be valid */ -+ -+ dda.dda_name = ddname; -+ dda.dda_nvlist = nvp; -+ -+ return (dsl_sync_task(ddname, dsl_deleg_check, -+ unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, -+ &dda, fnvlist_num_pairs(nvp))); - } -@@ -301,5 +315,11 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- error = dsl_dir_open(ddname, FTAG, &startdd, NULL); -- if (error) -+ error = dsl_pool_hold(ddname, FTAG, &dp); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); - return (error); -+ } - -@@ -308,6 +328,6 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); -- za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); -- basezc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); -- baseza = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); -+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); -+ basezc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); -+ baseza = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - source = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, KM_SLEEP); -@@ -315,3 +335,2 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- rw_enter(&dp->dp_config_rwlock, RW_READER); - for (dd = startdd; dd != NULL; dd = dd->dd_parent) { -@@ -320,11 +339,8 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- if (dd->dd_phys->dd_deleg_zapobj && -- (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, -- &n) == 0) && n) { -- VERIFY(nvlist_alloc(&sp_nvp, -- NV_UNIQUE_NAME, KM_SLEEP) == 0); -- } else { -+ if (dd->dd_phys->dd_deleg_zapobj == 0 || -+ zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 || -+ n == 0) - continue; -- } - -+ sp_nvp = fnvlist_alloc(); - for (zap_cursor_init(basezc, mos, -@@ -338,4 +354,3 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - -- VERIFY(nvlist_alloc(&perms_nvp, -- NV_UNIQUE_NAME, KM_SLEEP) == 0); -+ perms_nvp = fnvlist_alloc(); - for (zap_cursor_init(zc, mos, baseza->za_first_integer); -@@ -343,9 +358,7 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - zap_cursor_advance(zc)) { -- VERIFY(nvlist_add_boolean(perms_nvp, -- za->za_name) == 0); -+ fnvlist_add_boolean(perms_nvp, za->za_name); - } - zap_cursor_fini(zc); -- VERIFY(nvlist_add_nvlist(sp_nvp, baseza->za_name, -- perms_nvp) == 0); 
-- nvlist_free(perms_nvp); -+ fnvlist_add_nvlist(sp_nvp, baseza->za_name, perms_nvp); -+ fnvlist_free(perms_nvp); - } -@@ -355,14 +368,14 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) - dsl_dir_name(dd, source); -- VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); -+ fnvlist_add_nvlist(*nvp, source, sp_nvp); - nvlist_free(sp_nvp); - } -- rw_exit(&dp->dp_config_rwlock); - - kmem_free(source, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); -- kmem_free(baseza, sizeof(zap_attribute_t)); -- kmem_free(basezc, sizeof(zap_cursor_t)); -- kmem_free(za, sizeof(zap_attribute_t)); -- kmem_free(zc, sizeof(zap_cursor_t)); -+ kmem_free(baseza, sizeof (zap_attribute_t)); -+ kmem_free(basezc, sizeof (zap_cursor_t)); -+ kmem_free(za, sizeof (zap_attribute_t)); -+ kmem_free(zc, sizeof (zap_cursor_t)); - -- dsl_dir_close(startdd, FTAG); -+ dsl_dir_rele(startdd, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (0); -@@ -415,3 +428,3 @@ dsl_check_access(objset_t *mos, uint64_t zapobj, - if (error == ENOENT) -- error = EPERM; -+ error = SET_ERROR(EPERM); - } -@@ -460,3 +473,3 @@ dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, - -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -535,8 +548,6 @@ dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, - /* -- * Check if user has requested permission. If descendent is set, must have -- * descendent perms. -+ * Check if user has requested permission. - */ - int --dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, -- cred_t *cr) -+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) - { -@@ -555,3 +566,3 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, - if (dsl_delegation_on(mos) == B_FALSE) -- return (ECANCELED); -+ return (SET_ERROR(ECANCELED)); - -@@ -559,5 +570,5 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, - SPA_VERSION_DELEGATED_PERMS) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -- if (dsl_dataset_is_snapshot(ds) || descendent) { -+ if (dsl_dataset_is_snapshot(ds)) { - /* -@@ -574,3 +585,3 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, boolean_t descendent, const char *perm, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ ASSERT(dsl_pool_config_held(dp)); - for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, -@@ -633,5 +644,4 @@ again: - } -- error = EPERM; -+ error = SET_ERROR(EPERM); - success: -- rw_exit(&dp->dp_config_rwlock); - -@@ -647,2 +657,3 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) - { -+ dsl_pool_t *dp; - dsl_dataset_t *ds; -@@ -650,8 +661,11 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) - -- error = dsl_dataset_hold(dsname, FTAG, &ds); -- if (error) -+ error = dsl_pool_hold(dsname, FTAG, &dp); -+ if (error != 0) - return (error); -- -- error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); -- dsl_dataset_rele(ds, FTAG); -+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds); -+ if (error == 0) { -+ error = dsl_deleg_access_impl(ds, perm, cr); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ dsl_pool_rele(dp, FTAG); - -diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c -new file mode 100644 -index 0000000..351165d ---- /dev/null -+++ b/module/zfs/dsl_destroy.c -@@ -0,0 +1,950 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. 
-+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+typedef struct dmu_snapshots_destroy_arg { -+ nvlist_t *dsda_snaps; -+ nvlist_t *dsda_successful_snaps; -+ boolean_t dsda_defer; -+ nvlist_t *dsda_errlist; -+} dmu_snapshots_destroy_arg_t; -+ -+int -+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) -+{ -+ if (!dsl_dataset_is_snapshot(ds)) -+ return (SET_ERROR(EINVAL)); -+ -+ if (dsl_dataset_long_held(ds)) -+ return (SET_ERROR(EBUSY)); -+ -+ /* -+ * Only allow deferred destroy on pools that support it. -+ * NOTE: deferred destroy is only supported on snapshots. -+ */ -+ if (defer) { -+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) < -+ SPA_VERSION_USERREFS) -+ return (SET_ERROR(ENOTSUP)); -+ return (0); -+ } -+ -+ /* -+ * If this snapshot has an elevated user reference count, -+ * we can't destroy it yet. -+ */ -+ if (ds->ds_userrefs > 0) -+ return (SET_ERROR(EBUSY)); -+ -+ /* -+ * Can't delete a branch point. -+ */ -+ if (ds->ds_phys->ds_num_children > 1) -+ return (SET_ERROR(EEXIST)); -+ -+ return (0); -+} -+ -+static int -+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) -+{ -+ dmu_snapshots_destroy_arg_t *dsda = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ int error = 0; -+ -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); -+ -+ for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) { -+ dsl_dataset_t *ds; -+ -+ error = dsl_dataset_hold(dp, nvpair_name(pair), -+ FTAG, &ds); -+ -+ /* -+ * If the snapshot does not exist, silently ignore it -+ * (it's "already destroyed"). 
-+ */ -+ if (error == ENOENT) -+ continue; -+ -+ if (error == 0) { -+ error = dsl_destroy_snapshot_check_impl(ds, -+ dsda->dsda_defer); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ -+ if (error == 0) { -+ fnvlist_add_boolean(dsda->dsda_successful_snaps, -+ nvpair_name(pair)); -+ } else { -+ fnvlist_add_int32(dsda->dsda_errlist, -+ nvpair_name(pair), error); -+ } -+ } -+ -+ pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL); -+ if (pair != NULL) -+ return (fnvpair_value_int32(pair)); -+ -+ return (0); -+} -+ -+struct process_old_arg { -+ dsl_dataset_t *ds; -+ dsl_dataset_t *ds_prev; -+ boolean_t after_branch_point; -+ zio_t *pio; -+ uint64_t used, comp, uncomp; -+}; -+ -+static int -+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) -+{ -+ struct process_old_arg *poa = arg; -+ dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; -+ -+ if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { -+ dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); -+ if (poa->ds_prev && !poa->after_branch_point && -+ bp->blk_birth > -+ poa->ds_prev->ds_phys->ds_prev_snap_txg) { -+ poa->ds_prev->ds_phys->ds_unique_bytes += -+ bp_get_dsize_sync(dp->dp_spa, bp); -+ } -+ } else { -+ poa->used += bp_get_dsize_sync(dp->dp_spa, bp); -+ poa->comp += BP_GET_PSIZE(bp); -+ poa->uncomp += BP_GET_UCSIZE(bp); -+ dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); -+ } -+ return (0); -+} -+ -+static void -+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, -+ dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) -+{ -+ struct process_old_arg poa = { 0 }; -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ objset_t *mos = dp->dp_meta_objset; -+ uint64_t deadlist_obj; -+ -+ ASSERT(ds->ds_deadlist.dl_oldfmt); -+ ASSERT(ds_next->ds_deadlist.dl_oldfmt); -+ -+ poa.ds = ds; -+ poa.ds_prev = ds_prev; -+ poa.after_branch_point = after_branch_point; -+ poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -+ VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, -+ process_old_cb, &poa, tx)); -+ VERIFY0(zio_wait(poa.pio)); -+ ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); -+ -+ /* change snapused */ -+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -+ -poa.used, -poa.comp, -poa.uncomp, tx); -+ -+ /* swap next's deadlist to our deadlist */ -+ dsl_deadlist_close(&ds->ds_deadlist); -+ dsl_deadlist_close(&ds_next->ds_deadlist); -+ deadlist_obj = ds->ds_phys->ds_deadlist_obj; -+ ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj; -+ ds_next->ds_phys->ds_deadlist_obj = deadlist_obj; -+ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); -+ dsl_deadlist_open(&ds_next->ds_deadlist, mos, -+ ds_next->ds_phys->ds_deadlist_obj); -+} -+ -+static void -+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) -+{ -+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -+ zap_cursor_t *zc; -+ zap_attribute_t *za; -+ -+ /* -+ * If it is the old version, dd_clones doesn't exist so we can't -+ * find the clones, but dsl_deadlist_remove_key() is a no-op so it -+ * doesn't matter. 
-+ */ -+ if (ds->ds_dir->dd_phys->dd_clones == 0) -+ return; -+ -+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_PUSHPAGE); -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); -+ -+ for (zap_cursor_init(zc, mos, ds->ds_dir->dd_phys->dd_clones); -+ zap_cursor_retrieve(zc, za) == 0; -+ zap_cursor_advance(zc)) { -+ dsl_dataset_t *clone; -+ -+ VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, -+ za->za_first_integer, FTAG, &clone)); -+ if (clone->ds_dir->dd_origin_txg > mintxg) { -+ dsl_deadlist_remove_key(&clone->ds_deadlist, -+ mintxg, tx); -+ dsl_dataset_remove_clones_key(clone, mintxg, tx); -+ } -+ dsl_dataset_rele(clone, FTAG); -+ } -+ zap_cursor_fini(zc); -+ -+ kmem_free(za, sizeof (zap_attribute_t)); -+ kmem_free(zc, sizeof (zap_cursor_t)); -+} -+ -+void -+dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) -+{ -+#ifdef ZFS_DEBUG -+ int err; -+#endif -+ int after_branch_point = FALSE; -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ objset_t *mos = dp->dp_meta_objset; -+ dsl_dataset_t *ds_prev = NULL; -+ uint64_t obj, old_unique, used = 0, comp = 0, uncomp = 0; -+ dsl_dataset_t *ds_next, *ds_head, *hds; -+ -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); -+ ASSERT(refcount_is_zero(&ds->ds_longholds)); -+ -+ if (defer && -+ (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) { -+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; -+ spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); -+ return; -+ } -+ -+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); -+ -+ /* We need to log before removing it from the namespace. */ -+ spa_history_log_internal_ds(ds, "destroy", tx, ""); -+ -+ dsl_scan_ds_destroyed(ds, tx); -+ -+ obj = ds->ds_object; -+ -+ if (ds->ds_phys->ds_prev_snap_obj != 0) { -+ ASSERT3P(ds->ds_prev, ==, NULL); -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); -+ after_branch_point = -+ (ds_prev->ds_phys->ds_next_snap_obj != obj); -+ -+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); -+ if (after_branch_point && -+ ds_prev->ds_phys->ds_next_clones_obj != 0) { -+ dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); -+ if (ds->ds_phys->ds_next_snap_obj != 0) { -+ VERIFY0(zap_add_int(mos, -+ ds_prev->ds_phys->ds_next_clones_obj, -+ ds->ds_phys->ds_next_snap_obj, tx)); -+ } -+ } -+ if (!after_branch_point) { -+ ds_prev->ds_phys->ds_next_snap_obj = -+ ds->ds_phys->ds_next_snap_obj; -+ } -+ } -+ -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); -+ ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); -+ -+ old_unique = ds_next->ds_phys->ds_unique_bytes; -+ -+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx); -+ ds_next->ds_phys->ds_prev_snap_obj = -+ ds->ds_phys->ds_prev_snap_obj; -+ ds_next->ds_phys->ds_prev_snap_txg = -+ ds->ds_phys->ds_prev_snap_txg; -+ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, -+ ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); -+ -+ if (ds_next->ds_deadlist.dl_oldfmt) { -+ process_old_deadlist(ds, ds_prev, ds_next, -+ after_branch_point, tx); -+ } else { -+ /* Adjust prev's unique space. */ -+ if (ds_prev && !after_branch_point) { -+ dsl_deadlist_space_range(&ds_next->ds_deadlist, -+ ds_prev->ds_phys->ds_prev_snap_txg, -+ ds->ds_phys->ds_prev_snap_txg, -+ &used, &comp, &uncomp); -+ ds_prev->ds_phys->ds_unique_bytes += used; -+ } -+ -+ /* Adjust snapused. 
*/ -+ dsl_deadlist_space_range(&ds_next->ds_deadlist, -+ ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, -+ &used, &comp, &uncomp); -+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -+ -used, -comp, -uncomp, tx); -+ -+ /* Move blocks to be freed to pool's free list. */ -+ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, -+ &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, -+ tx); -+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, -+ DD_USED_HEAD, used, comp, uncomp, tx); -+ -+ /* Merge our deadlist into next's and free it. */ -+ dsl_deadlist_merge(&ds_next->ds_deadlist, -+ ds->ds_phys->ds_deadlist_obj, tx); -+ } -+ dsl_deadlist_close(&ds->ds_deadlist); -+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_deadlist_obj = 0; -+ -+ /* Collapse range in clone heads */ -+ dsl_dataset_remove_clones_key(ds, -+ ds->ds_phys->ds_creation_txg, tx); -+ -+ if (dsl_dataset_is_snapshot(ds_next)) { -+ dsl_dataset_t *ds_nextnext; -+ -+ /* -+ * Update next's unique to include blocks which -+ * were previously shared by only this snapshot -+ * and it. Those blocks will be born after the -+ * prev snap and before this snap, and will have -+ * died after the next snap and before the one -+ * after that (ie. be on the snap after next's -+ * deadlist). -+ */ -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext)); -+ dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, -+ ds->ds_phys->ds_prev_snap_txg, -+ ds->ds_phys->ds_creation_txg, -+ &used, &comp, &uncomp); -+ ds_next->ds_phys->ds_unique_bytes += used; -+ dsl_dataset_rele(ds_nextnext, FTAG); -+ ASSERT3P(ds_next->ds_prev, ==, NULL); -+ -+ /* Collapse range in this head. */ -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds)); -+ dsl_deadlist_remove_key(&hds->ds_deadlist, -+ ds->ds_phys->ds_creation_txg, tx); -+ dsl_dataset_rele(hds, FTAG); -+ -+ } else { -+ ASSERT3P(ds_next->ds_prev, ==, ds); -+ dsl_dataset_rele(ds_next->ds_prev, ds_next); -+ ds_next->ds_prev = NULL; -+ if (ds_prev) { -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_phys->ds_prev_snap_obj, -+ ds_next, &ds_next->ds_prev)); -+ } -+ -+ dsl_dataset_recalc_head_uniq(ds_next); -+ -+ /* -+ * Reduce the amount of our unconsumed refreservation -+ * being charged to our parent by the amount of -+ * new unique data we have gained. -+ */ -+ if (old_unique < ds_next->ds_reserved) { -+ int64_t mrsdelta; -+ uint64_t new_unique = -+ ds_next->ds_phys->ds_unique_bytes; -+ -+ ASSERT(old_unique <= new_unique); -+ mrsdelta = MIN(new_unique - old_unique, -+ ds_next->ds_reserved - old_unique); -+ dsl_dir_diduse_space(ds->ds_dir, -+ DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); -+ } -+ } -+ dsl_dataset_rele(ds_next, FTAG); -+ -+ /* -+ * This must be done after the dsl_traverse(), because it will -+ * re-open the objset. 
-+ */ -+ if (ds->ds_objset) { -+ dmu_objset_evict(ds->ds_objset); -+ ds->ds_objset = NULL; -+ } -+ -+ /* remove from snapshot namespace */ -+ ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); -+ VERIFY0(dsl_dataset_hold_obj(dp, -+ ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); -+ VERIFY0(dsl_dataset_get_snapname(ds)); -+#ifdef ZFS_DEBUG -+ { -+ uint64_t val; -+ -+ err = dsl_dataset_snap_lookup(ds_head, -+ ds->ds_snapname, &val); -+ ASSERT0(err); -+ ASSERT3U(val, ==, obj); -+ } -+#endif -+ VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx)); -+ dsl_dataset_rele(ds_head, FTAG); -+ -+ if (ds_prev != NULL) -+ dsl_dataset_rele(ds_prev, FTAG); -+ -+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); -+ -+ if (ds->ds_phys->ds_next_clones_obj != 0) { -+ ASSERTV(uint64_t count); -+ ASSERT0(zap_count(mos, -+ ds->ds_phys->ds_next_clones_obj, &count) && count == 0); -+ VERIFY0(dmu_object_free(mos, -+ ds->ds_phys->ds_next_clones_obj, tx)); -+ } -+ if (ds->ds_phys->ds_props_obj != 0) -+ VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); -+ if (ds->ds_phys->ds_userrefs_obj != 0) -+ VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); -+ dsl_dir_rele(ds->ds_dir, ds); -+ ds->ds_dir = NULL; -+ VERIFY0(dmu_object_free(mos, obj, tx)); -+} -+ -+static void -+dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) -+{ -+ dmu_snapshots_destroy_arg_t *dsda = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ -+ for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL); -+ pair != NULL; -+ pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) { -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); -+ -+ dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx); -+ dsl_dataset_rele(ds, FTAG); -+ } -+} -+ -+/* -+ * The semantics of this function are described in the comment above -+ * lzc_destroy_snaps(). To summarize: -+ * -+ * The snapshots must all be in the same pool. -+ * -+ * Snapshots that don't exist will be silently ignored (considered to be -+ * "already deleted"). -+ * -+ * On success, all snaps will be destroyed and this will return 0. -+ * On failure, no snaps will be destroyed, the errlist will be filled in, -+ * and this will return an errno. 
-+ */ -+int -+dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, -+ nvlist_t *errlist) -+{ -+ dmu_snapshots_destroy_arg_t dsda; -+ int error; -+ nvpair_t *pair; -+ -+ pair = nvlist_next_nvpair(snaps, NULL); -+ if (pair == NULL) -+ return (0); -+ -+ dsda.dsda_snaps = snaps; -+ VERIFY0(nvlist_alloc(&dsda.dsda_successful_snaps, -+ NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ dsda.dsda_defer = defer; -+ dsda.dsda_errlist = errlist; -+ -+ error = dsl_sync_task(nvpair_name(pair), -+ dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync, -+ &dsda, 0); -+ fnvlist_free(dsda.dsda_successful_snaps); -+ -+ return (error); -+} -+ -+int -+dsl_destroy_snapshot(const char *name, boolean_t defer) -+{ -+ int error; -+ nvlist_t *nvl; -+ nvlist_t *errlist; -+ -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ VERIFY0(nvlist_alloc(&errlist, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ -+ fnvlist_add_boolean(nvl, name); -+ error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); -+ fnvlist_free(errlist); -+ fnvlist_free(nvl); -+ return (error); -+} -+ -+struct killarg { -+ dsl_dataset_t *ds; -+ dmu_tx_t *tx; -+}; -+ -+/* ARGSUSED */ -+static int -+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, -+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) -+{ -+ struct killarg *ka = arg; -+ dmu_tx_t *tx = ka->tx; -+ -+ if (bp == NULL) -+ return (0); -+ -+ if (zb->zb_level == ZB_ZIL_LEVEL) { -+ ASSERT(zilog != NULL); -+ /* -+ * It's a block in the intent log. It has no -+ * accounting, so just free it. -+ */ -+ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); -+ } else { -+ ASSERT(zilog == NULL); -+ ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); -+ (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); -+ } -+ -+ return (0); -+} -+ -+static void -+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) -+{ -+ struct killarg ka; -+ -+ /* -+ * Free everything that we point to (that's born after -+ * the previous snapshot, if we are a clone) -+ * -+ * NB: this should be very quick, because we already -+ * freed all the objects in open context. -+ */ -+ ka.ds = ds; -+ ka.tx = tx; -+ VERIFY0(traverse_dataset(ds, -+ ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, -+ kill_blkptr, &ka)); -+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); -+} -+ -+typedef struct dsl_destroy_head_arg { -+ const char *ddha_name; -+} dsl_destroy_head_arg_t; -+ -+int -+dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) -+{ -+ int error; -+ uint64_t count; -+ objset_t *mos; -+ -+ if (dsl_dataset_is_snapshot(ds)) -+ return (SET_ERROR(EINVAL)); -+ -+ if (refcount_count(&ds->ds_longholds) != expected_holds) -+ return (SET_ERROR(EBUSY)); -+ -+ mos = ds->ds_dir->dd_pool->dp_meta_objset; -+ -+ /* -+ * Can't delete a head dataset if there are snapshots of it. -+ * (Except if the only snapshots are from the branch we cloned -+ * from.) -+ */ -+ if (ds->ds_prev != NULL && -+ ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) -+ return (SET_ERROR(EBUSY)); -+ -+ /* -+ * Can't delete if there are children of this fs. -+ */ -+ error = zap_count(mos, -+ ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); -+ if (error != 0) -+ return (error); -+ if (count != 0) -+ return (SET_ERROR(EEXIST)); -+ -+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && -+ ds->ds_prev->ds_phys->ds_num_children == 2 && -+ ds->ds_prev->ds_userrefs == 0) { -+ /* We need to remove the origin snapshot as well. 
*/ -+ if (!refcount_is_zero(&ds->ds_prev->ds_longholds)) -+ return (SET_ERROR(EBUSY)); -+ } -+ return (0); -+} -+ -+static int -+dsl_destroy_head_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_destroy_head_arg_t *ddha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ -+ error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_destroy_head_check_impl(ds, 0); -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+} -+ -+static void -+dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) -+{ -+ dsl_dir_t *dd; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; -+ dd_used_t t; -+ -+ ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); -+ -+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); -+ -+ ASSERT0(dd->dd_phys->dd_head_dataset_obj); -+ -+ /* -+ * Remove our reservation. The impl() routine avoids setting the -+ * actual property, which would require the (already destroyed) ds. -+ */ -+ dsl_dir_set_reservation_sync_impl(dd, 0, tx); -+ -+ ASSERT0(dd->dd_phys->dd_used_bytes); -+ ASSERT0(dd->dd_phys->dd_reserved); -+ for (t = 0; t < DD_USED_NUM; t++) -+ ASSERT0(dd->dd_phys->dd_used_breakdown[t]); -+ -+ VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); -+ VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); -+ VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); -+ VERIFY0(zap_remove(mos, -+ dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); -+ -+ dsl_dir_rele(dd, FTAG); -+ VERIFY0(dmu_object_free(mos, ddobj, tx)); -+} -+ -+void -+dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) -+{ -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; -+ uint64_t obj, ddobj, prevobj = 0; -+ boolean_t rmorigin; -+ zfeature_info_t *async_destroy; -+ objset_t *os; -+ -+ ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); -+ ASSERT(ds->ds_prev == NULL || -+ ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); -+ ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ /* We need to log before removing it from the namespace. */ -+ spa_history_log_internal_ds(ds, "destroy", tx, ""); -+ -+ rmorigin = (dsl_dir_is_clone(ds->ds_dir) && -+ DS_IS_DEFER_DESTROY(ds->ds_prev) && -+ ds->ds_prev->ds_phys->ds_num_children == 2 && -+ ds->ds_prev->ds_userrefs == 0); -+ -+ /* Remove our reservation */ -+ if (ds->ds_reserved != 0) { -+ dsl_dataset_set_refreservation_sync_impl(ds, -+ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), -+ 0, tx); -+ ASSERT0(ds->ds_reserved); -+ } -+ -+ dsl_scan_ds_destroyed(ds, tx); -+ -+ obj = ds->ds_object; -+ -+ if (ds->ds_phys->ds_prev_snap_obj != 0) { -+ /* This is a clone */ -+ ASSERT(ds->ds_prev != NULL); -+ ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj); -+ ASSERT0(ds->ds_phys->ds_next_snap_obj); -+ -+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); -+ if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) { -+ dsl_dataset_remove_from_next_clones(ds->ds_prev, -+ obj, tx); -+ } -+ -+ ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1); -+ ds->ds_prev->ds_phys->ds_num_children--; -+ } -+ -+ async_destroy = -+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]; -+ -+ /* -+ * Destroy the deadlist. Unless it's a clone, the -+ * deadlist should be empty. (If it's a clone, it's -+ * safe to ignore the deadlist contents.) 
-+ */ -+ dsl_deadlist_close(&ds->ds_deadlist); -+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_deadlist_obj = 0; -+ -+ VERIFY0(dmu_objset_from_ds(ds, &os)); -+ -+ if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) { -+ old_synchronous_dataset_destroy(ds, tx); -+ } else { -+ /* -+ * Move the bptree into the pool's list of trees to -+ * clean up and update space accounting information. -+ */ -+ uint64_t used, comp, uncomp; -+ -+ zil_destroy_sync(dmu_objset_zil(os), tx); -+ -+ if (!spa_feature_is_active(dp->dp_spa, async_destroy)) { -+ dsl_scan_t *scn = dp->dp_scan; -+ -+ spa_feature_incr(dp->dp_spa, async_destroy, tx); -+ dp->dp_bptree_obj = bptree_alloc(mos, tx); -+ VERIFY0(zap_add(mos, -+ DMU_POOL_DIRECTORY_OBJECT, -+ DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, -+ &dp->dp_bptree_obj, tx)); -+ ASSERT(!scn->scn_async_destroying); -+ scn->scn_async_destroying = B_TRUE; -+ } -+ -+ used = ds->ds_dir->dd_phys->dd_used_bytes; -+ comp = ds->ds_dir->dd_phys->dd_compressed_bytes; -+ uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes; -+ -+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || -+ ds->ds_phys->ds_unique_bytes == used); -+ -+ bptree_add(mos, dp->dp_bptree_obj, -+ &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg, -+ used, comp, uncomp, tx); -+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, -+ -used, -comp, -uncomp, tx); -+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -+ used, comp, uncomp, tx); -+ } -+ -+ if (ds->ds_prev != NULL) { -+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { -+ VERIFY0(zap_remove_int(mos, -+ ds->ds_prev->ds_dir->dd_phys->dd_clones, -+ ds->ds_object, tx)); -+ } -+ prevobj = ds->ds_prev->ds_object; -+ dsl_dataset_rele(ds->ds_prev, ds); -+ ds->ds_prev = NULL; -+ } -+ -+ /* -+ * This must be done after the dsl_traverse(), because it will -+ * re-open the objset. 
-+ */ -+ if (ds->ds_objset) { -+ dmu_objset_evict(ds->ds_objset); -+ ds->ds_objset = NULL; -+ } -+ -+ /* Erase the link in the dir */ -+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); -+ ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; -+ ddobj = ds->ds_dir->dd_object; -+ ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); -+ VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx)); -+ -+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); -+ -+ ASSERT0(ds->ds_phys->ds_next_clones_obj); -+ ASSERT0(ds->ds_phys->ds_props_obj); -+ ASSERT0(ds->ds_phys->ds_userrefs_obj); -+ dsl_dir_rele(ds->ds_dir, ds); -+ ds->ds_dir = NULL; -+ VERIFY0(dmu_object_free(mos, obj, tx)); -+ -+ dsl_dir_destroy_sync(ddobj, tx); -+ -+ if (rmorigin) { -+ dsl_dataset_t *prev; -+ VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); -+ dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); -+ dsl_dataset_rele(prev, FTAG); -+ } -+} -+ -+static void -+dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_destroy_head_arg_t *ddha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); -+ dsl_destroy_head_sync_impl(ds, tx); -+ dsl_dataset_rele(ds, FTAG); -+} -+ -+static void -+dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_destroy_head_arg_t *ddha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); -+ -+ /* Mark it as inconsistent on-disk, in case we crash */ -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; -+ -+ spa_history_log_internal_ds(ds, "destroy begin", tx, ""); -+ dsl_dataset_rele(ds, FTAG); -+} -+ -+int -+dsl_destroy_head(const char *name) -+{ -+ dsl_destroy_head_arg_t ddha; -+ int error; -+ spa_t *spa; -+ boolean_t isenabled; -+ -+#ifdef _KERNEL -+ zfs_destroy_unmount_origin(name); -+#endif -+ -+ error = spa_open(name, &spa, FTAG); -+ if (error != 0) -+ return (error); -+ isenabled = spa_feature_is_enabled(spa, -+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]); -+ spa_close(spa, FTAG); -+ -+ ddha.ddha_name = name; -+ -+ if (!isenabled) { -+ objset_t *os; -+ -+ error = dsl_sync_task(name, dsl_destroy_head_check, -+ dsl_destroy_head_begin_sync, &ddha, 0); -+ if (error != 0) -+ return (error); -+ -+ /* -+ * Head deletion is processed in one txg on old pools; -+ * remove the objects from open context so that the txg sync -+ * is not too long. -+ */ -+ error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); -+ if (error == 0) { -+ uint64_t obj; -+ uint64_t prev_snap_txg = -+ dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg; -+ for (obj = 0; error == 0; -+ error = dmu_object_next(os, &obj, FALSE, -+ prev_snap_txg)) -+ (void) dmu_free_long_object(os, obj); -+ /* sync out all frees */ -+ txg_wait_synced(dmu_objset_pool(os), 0); -+ dmu_objset_disown(os, FTAG); -+ } -+ } -+ -+ return (dsl_sync_task(name, dsl_destroy_head_check, -+ dsl_destroy_head_sync, &ddha, 0)); -+} -+ -+/* -+ * Note, this function is used as the callback for dmu_objset_find(). We -+ * always return 0 so that we will continue to find and process -+ * inconsistent datasets, even if we encounter an error trying to -+ * process one of them. 
-+ */ -+/* ARGSUSED */ -+int -+dsl_destroy_inconsistent(const char *dsname, void *arg) -+{ -+ objset_t *os; -+ -+ if (dmu_objset_hold(dsname, FTAG, &os) == 0) { -+ boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os)); -+ dmu_objset_rele(os, FTAG); -+ if (inconsistent) -+ (void) dsl_destroy_head(dsname); -+ } -+ return (0); -+} -+ -+ -+#if defined(_KERNEL) && defined(HAVE_SPL) -+EXPORT_SYMBOL(dsl_destroy_head); -+EXPORT_SYMBOL(dsl_destroy_head_sync_impl); -+EXPORT_SYMBOL(dsl_dataset_user_hold_check_one); -+EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl); -+EXPORT_SYMBOL(dsl_destroy_inconsistent); -+EXPORT_SYMBOL(dsl_dataset_user_release_tmp); -+EXPORT_SYMBOL(dsl_destroy_head_check_impl); -+#endif -diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c -index 69f68c2..fb7cd2c 100644 ---- a/module/zfs/dsl_dir.c -+++ b/module/zfs/dsl_dir.c -@@ -22,2 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Martin Matuska. All rights reserved. - */ -@@ -38,2 +40,3 @@ - #include -+#include - #include "zfs_namecheck.h" -@@ -41,4 +44,2 @@ - static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); --static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx); -- - -@@ -49,4 +50,4 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) - dsl_dir_t *dd = arg; -- ASSERTV(dsl_pool_t *dp = dd->dd_pool;) - int t; -+ ASSERTV(dsl_pool_t *dp = dd->dd_pool); - -@@ -59,3 +60,3 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) - if (dd->dd_parent) -- dsl_dir_close(dd->dd_parent, dd); -+ dsl_dir_rele(dd->dd_parent, dd); - -@@ -73,3 +74,3 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) - int --dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, -+dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **ddp) -@@ -80,7 +81,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - -- ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || -- dsl_pool_sync_context(dp)); -+ ASSERT(dsl_pool_config_held(dp)); - - err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); -- if (err) -+ if (err != 0) - return (err); -@@ -111,5 +111,5 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - if (dd->dd_phys->dd_parent_obj) { -- err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, -+ err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj, - NULL, dd, &dd->dd_parent); -- if (err) -+ if (err != 0) - goto errout; -@@ -130,3 +130,3 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - } -- if (err) -+ if (err != 0) - goto errout; -@@ -147,3 +147,3 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); -- if (err) -+ if (err != 0) - goto errout; -@@ -159,3 +159,3 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - if (dd->dd_parent) -- dsl_dir_close(dd->dd_parent, dd); -+ dsl_dir_rele(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); -@@ -186,3 +186,3 @@ errout: - if (dd->dd_parent) -- dsl_dir_close(dd->dd_parent, dd); -+ dsl_dir_rele(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); -@@ -194,3 +194,3 @@ errout: - void --dsl_dir_close(dsl_dir_t *dd, void *tag) -+dsl_dir_rele(dsl_dir_t *dd, void *tag) - { -@@ -251,4 +251,5 @@ getcomponent(const char *path, char *component, const char **nextp) - char *p; -+ - if ((path == NULL) || (path[0] == '\0')) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - /* This would be a good place to reserve some namespace... 
*/ -@@ -257,3 +258,3 @@ getcomponent(const char *path, char *component, const char **nextp) - /* two separators in a row */ -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -267,5 +268,5 @@ getcomponent(const char *path, char *component, const char **nextp) - (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - if (strlen(path) >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - (void) strcpy(component, path); -@@ -273,6 +274,6 @@ getcomponent(const char *path, char *component, const char **nextp) - } else if (p[0] == '/') { -- if (p-path >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ if (p - path >= MAXNAMELEN) -+ return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); -- component[p-path] = '\0'; -+ component[p - path] = '\0'; - p++; -@@ -284,9 +285,9 @@ getcomponent(const char *path, char *component, const char **nextp) - if (strchr(path, '/')) -- return (EINVAL); -- if (p-path >= MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(EINVAL)); -+ if (p - path >= MAXNAMELEN) -+ return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); -- component[p-path] = '\0'; -+ component[p - path] = '\0'; - } else { -- ASSERT(!"invalid p"); -+ panic("invalid p=%p", (void *)p); - } -@@ -297,7 +298,10 @@ getcomponent(const char *path, char *component, const char **nextp) - /* -- * same as dsl_dir_open, ignore the first component of name and use the -- * spa instead -+ * Return the dsl_dir_t, and possibly the last component which couldn't -+ * be found in *tail. The name must be in the specified dsl_pool_t. This -+ * thread must hold the dp_config_rwlock for the pool. Returns NULL if the -+ * path is bogus, or if tail==NULL and we couldn't parse the whole name. -+ * (*tail)[0] == '@' means that the last component is a snapshot. - */ - int --dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, -+dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, - dsl_dir_t **ddp, const char **tailp) -@@ -305,10 +309,6 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - char *buf; -- const char *next, *nextnext = NULL; -+ const char *spaname, *next, *nextnext = NULL; - int err; - dsl_dir_t *dd; -- dsl_pool_t *dp; - uint64_t ddobj; -- int openedspa = FALSE; -- -- dprintf("%s\n", name); - -@@ -316,24 +316,16 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - err = getcomponent(name, buf, &next); -- if (err) -+ if (err != 0) - goto error; -- if (spa == NULL) { -- err = spa_open(buf, &spa, FTAG); -- if (err) { -- dprintf("spa_open(%s) failed\n", buf); -- goto error; -- } -- openedspa = TRUE; - -- /* XXX this assertion belongs in spa_open */ -- ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); -+ /* Make sure the name is in the specified pool. 
*/ -+ spaname = spa_name(dp->dp_spa); -+ if (strcmp(buf, spaname) != 0) { -+ err = SET_ERROR(EINVAL); -+ goto error; - } - -- dp = spa_get_dsl(spa); -+ ASSERT(dsl_pool_config_held(dp)); - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); -- if (err) { -- rw_exit(&dp->dp_config_rwlock); -- if (openedspa) -- spa_close(spa, FTAG); -+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); -+ if (err != 0) { - goto error; -@@ -344,3 +336,3 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - err = getcomponent(next, buf, &nextnext); -- if (err) -+ if (err != 0) - break; -@@ -355,3 +347,3 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - buf, sizeof (ddobj), 1, &ddobj); -- if (err) { -+ if (err != 0) { - if (err == ENOENT) -@@ -361,6 +353,6 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - -- err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); -- if (err) -+ err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); -+ if (err != 0) - break; -- dsl_dir_close(dd, tag); -+ dsl_dir_rele(dd, tag); - dd = child_ds; -@@ -368,8 +360,5 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - } -- rw_exit(&dp->dp_config_rwlock); - -- if (err) { -- dsl_dir_close(dd, tag); -- if (openedspa) -- spa_close(spa, FTAG); -+ if (err != 0) { -+ dsl_dir_rele(dd, tag); - goto error; -@@ -384,10 +373,8 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, - /* bad path name */ -- dsl_dir_close(dd, tag); -+ dsl_dir_rele(dd, tag); - dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } -- if (tailp) -+ if (tailp != NULL) - *tailp = next; -- if (openedspa) -- spa_close(spa, FTAG); - *ddp = dd; -@@ -398,14 +385,2 @@ error: - --/* -- * Return the dsl_dir_t, and possibly the last component which couldn't -- * be found in *tail. Return NULL if the path is bogus, or if -- * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' -- * means that the last component is a snapshot. -- */ --int --dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) --{ -- return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); --} -- - uint64_t -@@ -447,73 +422,2 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, - --/* ARGSUSED */ --int --dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_pool_t *dp = dd->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- int err; -- uint64_t count; -- -- /* -- * There should be exactly two holds, both from -- * dsl_dataset_destroy: one on the dd directory, and one on its -- * head ds. If there are more holds, then a concurrent thread is -- * performing a lookup inside this dir while we're trying to destroy -- * it. To minimize this possibility, we perform this check only -- * in syncing context and fail the operation if we encounter -- * additional holds. The dp_config_rwlock ensures that nobody else -- * opens it after we check. 
-- */ -- if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2) -- return (EBUSY); -- -- err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); -- if (err) -- return (err); -- if (count != 0) -- return (EEXIST); -- -- return (0); --} -- --void --dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) --{ -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- objset_t *mos = dd->dd_pool->dp_meta_objset; -- dsl_prop_setarg_t psa; -- uint64_t value = 0; -- uint64_t obj; -- dd_used_t t; -- -- ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); -- ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); -- -- /* Remove our reservation. */ -- dsl_prop_setarg_init_uint64(&psa, "reservation", -- (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), -- &value); -- psa.psa_effective_value = 0; /* predict default value */ -- -- dsl_dir_set_reservation_sync(ds, &psa, tx); -- -- ASSERT0(dd->dd_phys->dd_used_bytes); -- ASSERT0(dd->dd_phys->dd_reserved); -- for (t = 0; t < DD_USED_NUM; t++) -- ASSERT0(dd->dd_phys->dd_used_breakdown[t]); -- -- VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); -- VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); -- VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); -- VERIFY(0 == zap_remove(mos, -- dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); -- -- obj = dd->dd_object; -- dsl_dir_close(dd, tag); -- VERIFY(0 == dmu_object_free(mos, obj, tx)); --} -- - boolean_t -@@ -540,2 +444,4 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) - dd->dd_phys->dd_compressed_bytes)); -+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, -+ dd->dd_phys->dd_uncompressed_bytes); - if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { -@@ -553,3 +459,2 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) - -- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - if (dsl_dir_is_clone(dd)) { -@@ -558,3 +463,3 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) - -- VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, -+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_origin_obj, FTAG, &ds)); -@@ -564,3 +469,2 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) - } -- rw_exit(&dd->dd_pool->dp_config_rwlock); - } -@@ -574,3 +478,3 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) - -- if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { -+ if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { - /* up the hold count until we can be written out */ -@@ -688,3 +592,2 @@ struct tempreserve { - list_node_t tr_node; -- dsl_pool_t *tr_dp; - dsl_dir_t *tr_ds; -@@ -734,2 +637,3 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, - mutex_exit(&dd->dd_lock); -+ DMU_TX_STAT_BUMP(dmu_tx_quota); - return (error); -@@ -782,3 +686,4 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, - mutex_exit(&dd->dd_lock); -- return (retval); -+ DMU_TX_STAT_BUMP(dmu_tx_quota); -+ return (SET_ERROR(retval)); - } -@@ -839,10 +744,17 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - list_insert_tail(tr_list, tr); -- -- err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); - } else { - if (err == EAGAIN) { -- txg_delay(dd->dd_pool, tx->tx_txg, 1); -- err = ERESTART; -+ /* -+ * If arc_memory_throttle() detected that pageout -+ * is running and we are low on memory, we delay new -+ * non-pageout transactions to give pageout an -+ * advantage. -+ * -+ * It is unfortunate to be delaying while the caller's -+ * locks are held. 
-+ */ -+ txg_delay(dd->dd_pool, tx->tx_txg, -+ MSEC2NSEC(10), MSEC2NSEC(10)); -+ err = SET_ERROR(ERESTART); - } -- dsl_pool_memory_pressure(dd->dd_pool); - } -@@ -850,9 +762,2 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - if (err == 0) { -- struct tempreserve *tr; -- -- tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE); -- tr->tr_dp = dd->dd_pool; -- tr->tr_size = asize; -- list_insert_tail(tr_list, tr); -- - err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, -@@ -861,3 +766,3 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - -- if (err) -+ if (err != 0) - dsl_dir_tempreserve_clear(tr_list, tx); -@@ -885,6 +790,4 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) - -- while ((tr = list_head(tr_list))) { -- if (tr->tr_dp) { -- dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx); -- } else if (tr->tr_ds) { -+ while ((tr = list_head(tr_list)) != NULL) { -+ if (tr->tr_ds) { - mutex_enter(&tr->tr_ds->dd_lock); -@@ -904,4 +807,14 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) - --static void --dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) -+/* -+ * This should be called from open context when we think we're going to write -+ * or free space, for example when dirtying data. Be conservative; it's okay -+ * to write less space or free more, but we don't want to write more or free -+ * less than the amount specified. -+ * -+ * NOTE: The behavior of this function is identical to the Illumos / FreeBSD -+ * version however it has been adjusted to use an iterative rather then -+ * recursive algorithm to minimize stack usage. -+ */ -+void -+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) - { -@@ -910,28 +823,18 @@ dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) - -- mutex_enter(&dd->dd_lock); -- if (space > 0) -- dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; -- -- est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; -- parent_space = parent_delta(dd, est_used, space); -- mutex_exit(&dd->dd_lock); -+ do { -+ mutex_enter(&dd->dd_lock); -+ if (space > 0) -+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - -- /* Make sure that we clean up dd_space_to* */ -- dsl_dir_dirty(dd, tx); -+ est_used = dsl_dir_space_towrite(dd) + -+ dd->dd_phys->dd_used_bytes; -+ parent_space = parent_delta(dd, est_used, space); -+ mutex_exit(&dd->dd_lock); - -- /* XXX this is potentially expensive and unnecessary... */ -- if (parent_space && dd->dd_parent) -- dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx); --} -+ /* Make sure that we clean up dd_space_to* */ -+ dsl_dir_dirty(dd, tx); - --/* -- * Call in open context when we think we're going to write/free space, -- * eg. when dirtying data. Be conservative (ie. OK to write less than -- * this or free more than this, but don't write more or free less). -- */ --void --dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) --{ -- dsl_pool_willuse_space(dd->dd_pool, space, tx); -- dsl_dir_willuse_space_impl(dd, space, tx); -+ dd = dd->dd_parent; -+ space = parent_space; -+ } while (space && dd); - } -@@ -944,2 +847,10 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - int64_t accounted_delta; -+ -+ /* -+ * dsl_dataset_set_refreservation_sync_impl() calls this with -+ * dd_lock held, so that it can atomically update -+ * ds->ds_reserved and the dsl_dir accounting, so that -+ * dsl_dataset_check_quota() can see dataset and dir accounting -+ * consistently. 
-+ */ - boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); -@@ -949,2 +860,4 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - -+ dmu_buf_will_dirty(dd->dd_dbuf, tx); -+ - if (needlock) -@@ -957,3 +870,2 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, - dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); -- dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_used_bytes += used; -@@ -992,4 +904,2 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - { -- boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); -- - ASSERT(dmu_tx_is_syncing(tx)); -@@ -1001,4 +911,4 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - -- if (needlock) -- mutex_enter(&dd->dd_lock); -+ dmu_buf_will_dirty(dd->dd_dbuf, tx); -+ mutex_enter(&dd->dd_lock); - ASSERT(delta > 0 ? -@@ -1007,25 +917,39 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, - ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); -- dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_used_breakdown[oldtype] -= delta; - dd->dd_phys->dd_used_breakdown[newtype] += delta; -- if (needlock) -- mutex_exit(&dd->dd_lock); -+ mutex_exit(&dd->dd_lock); - } - -+typedef struct dsl_dir_set_qr_arg { -+ const char *ddsqra_name; -+ zprop_source_t ddsqra_source; -+ uint64_t ddsqra_value; -+} dsl_dir_set_qr_arg_t; -+ - static int --dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_prop_setarg_t *psa = arg2; -- int err; -- uint64_t towrite; -+ dsl_dir_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ int error; -+ uint64_t towrite, newval; - -- if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) -- return (err); -+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_prop_predict(ds->ds_dir, "quota", -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } - -- if (psa->psa_effective_value == 0) -+ if (newval == 0) { -+ dsl_dataset_rele(ds, FTAG); - return (0); -+ } - -- mutex_enter(&dd->dd_lock); -+ mutex_enter(&ds->ds_dir->dd_lock); - /* -@@ -1036,30 +960,41 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- towrite = dsl_dir_space_towrite(dd); -+ towrite = dsl_dir_space_towrite(ds->ds_dir); - if ((dmu_tx_is_syncing(tx) || towrite == 0) && -- (psa->psa_effective_value < dd->dd_phys->dd_reserved || -- psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { -- err = ENOSPC; -+ (newval < ds->ds_dir->dd_phys->dd_reserved || -+ newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) { -+ error = SET_ERROR(ENOSPC); - } -- mutex_exit(&dd->dd_lock); -- return (err); -+ mutex_exit(&ds->ds_dir->dd_lock); -+ dsl_dataset_rele(ds, FTAG); -+ return (error); - } - --extern dsl_syncfunc_t dsl_prop_set_sync; -- - static void --dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value = psa->psa_effective_value; -+ dsl_dir_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ uint64_t newval; - -- dsl_prop_set_sync(ds, psa, tx); -- DSL_PROP_CHECK_PREDICTION(dd, psa); -+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - -- dmu_buf_will_dirty(dd->dd_dbuf, tx); -+ if 
(spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { -+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), -+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, -+ &ddsqra->ddsqra_value, tx); - -- mutex_enter(&dd->dd_lock); -- dd->dd_phys->dd_quota = effective_value; -- mutex_exit(&dd->dd_lock); -+ VERIFY0(dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); -+ } else { -+ newval = ddsqra->ddsqra_value; -+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", -+ zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); -+ } -+ -+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); -+ mutex_enter(&ds->ds_dir->dd_lock); -+ ds->ds_dir->dd_phys->dd_quota = newval; -+ mutex_exit(&ds->ds_dir->dd_lock); -+ dsl_dataset_rele(ds, FTAG); - } -@@ -1069,33 +1004,10 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) - { -- dsl_dir_t *dd; -- dsl_dataset_t *ds; -- dsl_prop_setarg_t psa; -- int err; -+ dsl_dir_set_qr_arg_t ddsqra; - -- dsl_prop_setarg_init_uint64(&psa, "quota", source, "a); -+ ddsqra.ddsqra_name = ddname; -+ ddsqra.ddsqra_source = source; -+ ddsqra.ddsqra_value = quota; - -- err = dsl_dataset_hold(ddname, FTAG, &ds); -- if (err) -- return (err); -- -- err = dsl_dir_open(ddname, FTAG, &dd, NULL); -- if (err) { -- dsl_dataset_rele(ds, FTAG); -- return (err); -- } -- -- ASSERT(ds->ds_dir == dd); -- -- /* -- * If someone removes a file, then tries to set the quota, we want to -- * make sure the file freeing takes effect. -- */ -- txg_wait_open(dd->dd_pool, 0); -- -- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, -- dsl_dir_set_quota_sync, ds, &psa, 0); -- -- dsl_dir_close(dd, FTAG); -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ return (dsl_sync_task(ddname, dsl_dir_set_quota_check, -+ dsl_dir_set_quota_sync, &ddsqra, 0)); - } -@@ -1103,15 +1015,15 @@ dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) - int --dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value; -- uint64_t used, avail; -- int err; -- -- if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) -- return (err); -+ dsl_dir_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; -+ dsl_dir_t *dd; -+ uint64_t newval, used, avail; -+ int error; - -- effective_value = psa->psa_effective_value; -+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); -+ if (error != 0) -+ return (error); -+ dd = ds->ds_dir; - -@@ -1121,4 +1033,14 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- if (!dmu_tx_is_syncing(tx)) -+ if (!dmu_tx_is_syncing(tx)) { -+ dsl_dataset_rele(ds, FTAG); - return (0); -+ } -+ -+ error = dsl_prop_predict(ds->ds_dir, -+ zfs_prop_to_name(ZFS_PROP_RESERVATION), -+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); -+ if (error != 0) { -+ dsl_dataset_rele(ds, FTAG); -+ return (error); -+ } - -@@ -1135,23 +1057,19 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) - -- if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { -- uint64_t delta = MAX(used, effective_value) - -+ if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) { -+ uint64_t delta = MAX(used, newval) - - MAX(used, dd->dd_phys->dd_reserved); - -- if (delta > avail) -- return (ENOSPC); -- if (dd->dd_phys->dd_quota > 0 && -- effective_value > 
dd->dd_phys->dd_quota) -- return (ENOSPC); -+ if (delta > avail || -+ (dd->dd_phys->dd_quota > 0 && -+ newval > dd->dd_phys->dd_quota)) -+ error = SET_ERROR(ENOSPC); - } - -- return (0); -+ dsl_dataset_rele(ds, FTAG); -+ return (error); - } - --static void --dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+void -+dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_dir_t *dd = ds->ds_dir; -- dsl_prop_setarg_t *psa = arg2; -- uint64_t effective_value = psa->psa_effective_value; - uint64_t used; -@@ -1159,5 +1077,2 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- dsl_prop_set_sync(ds, psa, tx); -- DSL_PROP_CHECK_PREDICTION(dd, psa); -- - dmu_buf_will_dirty(dd->dd_dbuf, tx); -@@ -1166,5 +1081,4 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - used = dd->dd_phys->dd_used_bytes; -- delta = MAX(used, effective_value) - -- MAX(used, dd->dd_phys->dd_reserved); -- dd->dd_phys->dd_reserved = effective_value; -+ delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved); -+ dd->dd_phys->dd_reserved = value; - -@@ -1178,31 +1092,43 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) - --int --dsl_dir_set_reservation(const char *ddname, zprop_source_t source, -- uint64_t reservation) -+static void -+dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd; -+ dsl_dir_set_qr_arg_t *ddsqra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; -- dsl_prop_setarg_t psa; -- int err; -+ uint64_t newval; - -- dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); -+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - -- err = dsl_dataset_hold(ddname, FTAG, &ds); -- if (err) -- return (err); -+ if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { -+ dsl_prop_set_sync_impl(ds, -+ zfs_prop_to_name(ZFS_PROP_RESERVATION), -+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, -+ &ddsqra->ddsqra_value, tx); - -- err = dsl_dir_open(ddname, FTAG, &dd, NULL); -- if (err) { -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ VERIFY0(dsl_prop_get_int_ds(ds, -+ zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); -+ } else { -+ newval = ddsqra->ddsqra_value; -+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", -+ zfs_prop_to_name(ZFS_PROP_RESERVATION), -+ (longlong_t)newval); - } - -- ASSERT(ds->ds_dir == dd); -+ dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); -+ dsl_dataset_rele(ds, FTAG); -+} -+ -+int -+dsl_dir_set_reservation(const char *ddname, zprop_source_t source, -+ uint64_t reservation) -+{ -+ dsl_dir_set_qr_arg_t ddsqra; - -- err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, -- dsl_dir_set_reservation_sync, ds, &psa, 0); -+ ddsqra.ddsqra_name = ddname; -+ ddsqra.ddsqra_source = source; -+ ddsqra.ddsqra_value = reservation; - -- dsl_dir_close(dd, FTAG); -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, -+ dsl_dir_set_reservation_sync, &ddsqra, 0)); - } -@@ -1238,36 +1164,70 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) - --struct renamearg { -- dsl_dir_t *newparent; -- const char *mynewname; --}; -+typedef struct dsl_dir_rename_arg { -+ const char *ddra_oldname; -+ const char *ddra_newname; -+} dsl_dir_rename_arg_t; - -+/* ARGSUSED */ - static int --dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) - { -- 
dsl_dir_t *dd = arg1; -- struct renamearg *ra = arg2; -- dsl_pool_t *dp = dd->dd_pool; -- objset_t *mos = dp->dp_meta_objset; -- int err; -- uint64_t val; -+ int *deltap = arg; -+ char namebuf[MAXNAMELEN]; - -- /* -- * There should only be one reference, from dmu_objset_rename(). -- * Fleeting holds are also possible (eg, from "zfs list" getting -- * stats), but any that are present in open context will likely -- * be gone by syncing context, so only fail from syncing -- * context. -- */ -- if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1) -- return (EBUSY); -- -- /* check for existing name */ -- err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, -- ra->mynewname, 8, 1, &val); -- if (err == 0) -- return (EEXIST); -- if (err != ENOENT) -- return (err); -+ dsl_dataset_name(ds, namebuf); - -- if (ra->newparent != dd->dd_parent) { -+ if (strlen(namebuf) + *deltap >= MAXNAMELEN) -+ return (SET_ERROR(ENAMETOOLONG)); -+ return (0); -+} -+ -+static int -+dsl_dir_rename_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dir_rename_arg_t *ddra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *dd, *newparent; -+ const char *mynewname; -+ int error; -+ int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); -+ -+ /* target dir should exist */ -+ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); -+ if (error != 0) -+ return (error); -+ -+ /* new parent should exist */ -+ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, -+ &newparent, &mynewname); -+ if (error != 0) { -+ dsl_dir_rele(dd, FTAG); -+ return (error); -+ } -+ -+ /* can't rename to different pool */ -+ if (dd->dd_pool != newparent->dd_pool) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (SET_ERROR(ENXIO)); -+ } -+ -+ /* new name should not already exist */ -+ if (mynewname == NULL) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (SET_ERROR(EEXIST)); -+ } -+ -+ /* if the name length is growing, validate child name lengths */ -+ if (delta > 0) { -+ error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, -+ &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -+ if (error != 0) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (error); -+ } -+ } -+ -+ if (newparent != dd->dd_parent) { - /* is there enough space? 
*/ -@@ -1277,10 +1237,19 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) - /* no rename into our descendant */ -- if (closest_common_ancestor(dd, ra->newparent) == dd) -- return (EINVAL); -+ if (closest_common_ancestor(dd, newparent) == dd) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } - -- if ((err = dsl_dir_transfer_possible(dd->dd_parent, -- ra->newparent, myspace))) -- return (err); -+ error = dsl_dir_transfer_possible(dd->dd_parent, -+ newparent, myspace); -+ if (error != 0) { -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); -+ return (error); -+ } - } - -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); - return (0); -@@ -1289,13 +1258,20 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) - { -- dsl_dir_t *dd = arg1; -- struct renamearg *ra = arg2; -- dsl_pool_t *dp = dd->dd_pool; -+ dsl_dir_rename_arg_t *ddra = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dir_t *dd, *newparent; -+ const char *mynewname; -+ int error; - objset_t *mos = dp->dp_meta_objset; -- int err; - -- ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); -+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); -+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, -+ &mynewname)); - -- if (ra->newparent != dd->dd_parent) { -+ /* Log this before we change the name. */ -+ spa_history_log_internal_dd(dd, "rename", tx, -+ "-> %s", ddra->ddra_newname); -+ -+ if (newparent != dd->dd_parent) { - dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -@@ -1304,3 +1280,3 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -dd->dd_phys->dd_uncompressed_bytes, tx); -- dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, -+ dsl_dir_diduse_space(newparent, DD_USED_CHILD, - dd->dd_phys->dd_used_bytes, -@@ -1315,3 +1291,3 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -unused_rsrv, 0, 0, tx); -- dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, -+ dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, - unused_rsrv, 0, 0, tx); -@@ -1323,19 +1299,24 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - /* remove from old parent zapobj */ -- err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, -+ error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, - dd->dd_myname, tx); -- ASSERT0(err); -+ ASSERT0(error); - -- (void) strcpy(dd->dd_myname, ra->mynewname); -- dsl_dir_close(dd->dd_parent, dd); -- dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; -- VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, -- ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); -+ (void) strcpy(dd->dd_myname, mynewname); -+ dsl_dir_rele(dd->dd_parent, dd); -+ dd->dd_phys->dd_parent_obj = newparent->dd_object; -+ VERIFY0(dsl_dir_hold_obj(dp, -+ newparent->dd_object, NULL, dd, &dd->dd_parent)); - - /* add to new parent zapobj */ -- err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, -- dd->dd_myname, 8, 1, &dd->dd_object, tx); -- ASSERT0(err); -+ VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj, -+ dd->dd_myname, 8, 1, &dd->dd_object, tx)); -+ -+#ifdef _KERNEL -+ zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); -+#endif - -- spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, -- tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj); -+ dsl_prop_notify_all(dd); -+ -+ dsl_dir_rele(newparent, FTAG); -+ dsl_dir_rele(dd, FTAG); 
- } -@@ -1343,30 +1324,11 @@ dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) - int --dsl_dir_rename(dsl_dir_t *dd, const char *newname) -+dsl_dir_rename(const char *oldname, const char *newname) - { -- struct renamearg ra; -- int err; -- -- /* new parent should exist */ -- err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); -- if (err) -- return (err); -- -- /* can't rename to different pool */ -- if (dd->dd_pool != ra.newparent->dd_pool) { -- err = ENXIO; -- goto out; -- } -- -- /* new name should not already exist */ -- if (ra.mynewname == NULL) { -- err = EEXIST; -- goto out; -- } -+ dsl_dir_rename_arg_t ddra; - -- err = dsl_sync_task_do(dd->dd_pool, -- dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); -+ ddra.ddra_oldname = oldname; -+ ddra.ddra_newname = newname; - --out: -- dsl_dir_close(ra.newparent, FTAG); -- return (err); -+ return (dsl_sync_task(oldname, -+ dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3)); - } -@@ -1384,3 +1346,3 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) - if (avail < space) -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - -@@ -1415,4 +1377,2 @@ EXPORT_SYMBOL(dsl_dir_set_quota); - EXPORT_SYMBOL(dsl_dir_set_reservation); --EXPORT_SYMBOL(dsl_dir_open); --EXPORT_SYMBOL(dsl_dir_close); - #endif -diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c -index 7795d80..0ef5071 100644 ---- a/module/zfs/dsl_pool.c -+++ b/module/zfs/dsl_pool.c -@@ -22,3 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. - */ -@@ -45,210 +46,86 @@ - #include -+#include - --int zfs_no_write_throttle = 0; --int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ --int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ --int zfs_txg_history = 60; /* statistics for the last N txgs */ -- --unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ --unsigned long zfs_write_limit_max = 0; /* max data payload per txg */ --unsigned long zfs_write_limit_inflated = 0; --unsigned long zfs_write_limit_override = 0; -- --kmutex_t zfs_write_limit_lock; -- --static pgcnt_t old_physmem = 0; -- --static void --dsl_pool_tx_assign_init(dsl_pool_t *dp, unsigned int ndata) --{ -- kstat_named_t *ks; -- char name[KSTAT_STRLEN]; -- int i, data_size = ndata * sizeof(kstat_named_t); -- -- (void) snprintf(name, KSTAT_STRLEN, "dmu_tx_assign-%s", -- spa_name(dp->dp_spa)); -- -- dp->dp_tx_assign_size = ndata; -- -- if (data_size) -- dp->dp_tx_assign_buckets = kmem_alloc(data_size, KM_SLEEP); -- else -- dp->dp_tx_assign_buckets = NULL; -- -- for (i = 0; i < dp->dp_tx_assign_size; i++) { -- ks = &dp->dp_tx_assign_buckets[i]; -- ks->data_type = KSTAT_DATA_UINT64; -- ks->value.ui64 = 0; -- (void) snprintf(ks->name, KSTAT_STRLEN, "%u us", 1 << i); -- } -- -- dp->dp_tx_assign_kstat = kstat_create("zfs", 0, name, "misc", -- KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); -- -- if (dp->dp_tx_assign_kstat) { -- dp->dp_tx_assign_kstat->ks_data = dp->dp_tx_assign_buckets; -- dp->dp_tx_assign_kstat->ks_ndata = dp->dp_tx_assign_size; -- dp->dp_tx_assign_kstat->ks_data_size = data_size; -- kstat_install(dp->dp_tx_assign_kstat); -- } --} -- --static void --dsl_pool_tx_assign_destroy(dsl_pool_t *dp) --{ -- if (dp->dp_tx_assign_buckets) -- kmem_free(dp->dp_tx_assign_buckets, -- dp->dp_tx_assign_size * 
sizeof(kstat_named_t)); -- -- if (dp->dp_tx_assign_kstat) -- kstat_delete(dp->dp_tx_assign_kstat); --} -- --void --dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs) --{ -- uint64_t idx = 0; -- -- while (((1 << idx) < usecs) && (idx < dp->dp_tx_assign_size - 1)) -- idx++; -- -- atomic_inc_64(&dp->dp_tx_assign_buckets[idx].value.ui64); --} -- --static int --dsl_pool_txg_history_update(kstat_t *ksp, int rw) --{ -- dsl_pool_t *dp = ksp->ks_private; -- txg_history_t *th; -- int i = 0; -- -- if (rw == KSTAT_WRITE) -- return (EACCES); -- -- if (ksp->ks_data) -- kmem_free(ksp->ks_data, ksp->ks_data_size); -- -- mutex_enter(&dp->dp_lock); -- -- ksp->ks_ndata = dp->dp_txg_history_size; -- ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t); -- if (ksp->ks_data_size > 0) -- ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE); -- -- /* Traversed oldest to youngest for the most readable kstat output */ -- for (th = list_tail(&dp->dp_txg_history); th != NULL; -- th = list_prev(&dp->dp_txg_history, th)) { -- mutex_enter(&th->th_lock); -- ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size); -- memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t)); -- i += sizeof(kstat_txg_t); -- mutex_exit(&th->th_lock); -- } -- -- mutex_exit(&dp->dp_lock); -- -- return (0); --} -- --static void --dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg) --{ -- char name[KSTAT_STRLEN]; -- -- list_create(&dp->dp_txg_history, sizeof (txg_history_t), -- offsetof(txg_history_t, th_link)); -- dsl_pool_txg_history_add(dp, txg); -- -- (void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa)); -- dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc", -- KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL); -- if (dp->dp_txg_kstat) { -- dp->dp_txg_kstat->ks_data = NULL; -- dp->dp_txg_kstat->ks_private = dp; -- dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update; -- kstat_install(dp->dp_txg_kstat); -- } --} -- --static void --dsl_pool_txg_history_destroy(dsl_pool_t *dp) --{ -- txg_history_t *th; -- -- if (dp->dp_txg_kstat) { -- if (dp->dp_txg_kstat->ks_data) -- kmem_free(dp->dp_txg_kstat->ks_data, -- dp->dp_txg_kstat->ks_data_size); -- -- kstat_delete(dp->dp_txg_kstat); -- } -- -- mutex_enter(&dp->dp_lock); -- while ((th = list_remove_head(&dp->dp_txg_history))) { -- dp->dp_txg_history_size--; -- mutex_destroy(&th->th_lock); -- kmem_free(th, sizeof(txg_history_t)); -- } -- -- ASSERT3U(dp->dp_txg_history_size, ==, 0); -- list_destroy(&dp->dp_txg_history); -- mutex_exit(&dp->dp_lock); --} -- --txg_history_t * --dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg) --{ -- txg_history_t *th, *rm; -- -- th = kmem_zalloc(sizeof(txg_history_t), KM_PUSHPAGE); -- mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL); -- th->th_kstat.txg = txg; -- th->th_kstat.state = TXG_STATE_OPEN; -- th->th_kstat.birth = gethrtime(); -- -- mutex_enter(&dp->dp_lock); -- -- list_insert_head(&dp->dp_txg_history, th); -- dp->dp_txg_history_size++; -- -- while (dp->dp_txg_history_size > zfs_txg_history) { -- dp->dp_txg_history_size--; -- rm = list_remove_tail(&dp->dp_txg_history); -- mutex_destroy(&rm->th_lock); -- kmem_free(rm, sizeof(txg_history_t)); -- } -- -- mutex_exit(&dp->dp_lock); -+/* -+ * ZFS Write Throttle -+ * ------------------ -+ * -+ * ZFS must limit the rate of incoming writes to the rate at which it is able -+ * to sync data modifications to the backend storage. 
Throttling by too much -+ * creates an artificial limit; throttling by too little can only be sustained -+ * for short periods and would lead to highly lumpy performance. On a per-pool -+ * basis, ZFS tracks the amount of modified (dirty) data. As operations change -+ * data, the amount of dirty data increases; as ZFS syncs out data, the amount -+ * of dirty data decreases. When the amount of dirty data exceeds a -+ * predetermined threshold further modifications are blocked until the amount -+ * of dirty data decreases (as data is synced out). -+ * -+ * The limit on dirty data is tunable, and should be adjusted according to -+ * both the IO capacity and available memory of the system. The larger the -+ * window, the more ZFS is able to aggregate and amortize metadata (and data) -+ * changes. However, memory is a limited resource, and allowing for more dirty -+ * data comes at the cost of keeping other useful data in memory (for example -+ * ZFS data cached by the ARC). -+ * -+ * Implementation -+ * -+ * As buffers are modified dsl_pool_willuse_space() increments both the per- -+ * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of -+ * dirty space used; dsl_pool_dirty_space() decrements those values as data -+ * is synced out from dsl_pool_sync(). While only the poolwide value is -+ * relevant, the per-txg value is useful for debugging. The tunable -+ * zfs_dirty_data_max determines the dirty space limit. Once that value is -+ * exceeded, new writes are halted until space frees up. -+ * -+ * The zfs_dirty_data_sync tunable dictates the threshold at which we -+ * ensure that there is a txg syncing (see the comment in txg.c for a full -+ * description of transaction group stages). -+ * -+ * The IO scheduler uses both the dirty space limit and current amount of -+ * dirty data as inputs. Those values affect the number of concurrent IOs ZFS -+ * issues. See the comment in vdev_queue.c for details of the IO scheduler. -+ * -+ * The delay is also calculated based on the amount of dirty data. See the -+ * comment above dmu_tx_delay() for details. -+ */ - -- return (th); --} -+/* -+ * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, -+ * capped at zfs_dirty_data_max_max. It can also be overridden with a module -+ * parameter. -+ */ -+unsigned long zfs_dirty_data_max = 0; -+unsigned long zfs_dirty_data_max_max = 0; -+int zfs_dirty_data_max_percent = 10; -+int zfs_dirty_data_max_max_percent = 25; - - /* -- * Traversed youngest to oldest because lookups are only done for open -- * or syncing txgs which are guaranteed to be at the head of the list. -- * The txg_history_t structure will be returned locked. -+ * If there is at least this much dirty data, push out a txg. - */ --txg_history_t * --dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg) --{ -- txg_history_t *th; -+unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024; - -- mutex_enter(&dp->dp_lock); -- for (th = list_head(&dp->dp_txg_history); th != NULL; -- th = list_next(&dp->dp_txg_history, th)) { -- if (th->th_kstat.txg == txg) { -- mutex_enter(&th->th_lock); -- break; -- } -- } -- mutex_exit(&dp->dp_lock); -+/* -+ * Once there is this amount of dirty data, the dmu_tx_delay() will kick in -+ * and delay each transaction. -+ * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. -+ */ -+int zfs_delay_min_dirty_percent = 60; - -- return (th); --} -+/* -+ * This controls how quickly the delay approaches infinity. 
-+ * Larger values cause it to delay more for a given amount of dirty data. -+ * Therefore larger values will cause there to be less dirty data for a -+ * given throughput. -+ * -+ * For the smoothest delay, this value should be about 1 billion divided -+ * by the maximum number of operations per second. This will smoothly -+ * handle between 10x and 1/10th this number. -+ * -+ * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the -+ * multiply in dmu_tx_delay(). -+ */ -+unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; - --void --dsl_pool_txg_history_put(txg_history_t *th) --{ -- mutex_exit(&th->th_lock); --} -+hrtime_t zfs_throttle_delay = MSEC2NSEC(10); -+hrtime_t zfs_throttle_resolution = MSEC2NSEC(10); - -@@ -266,3 +143,3 @@ dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) - -- return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); -+ return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); - } -@@ -278,4 +155,3 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) - dp->dp_meta_rootbp = *bp; -- rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); -- dp->dp_write_limit = zfs_write_limit_min; -+ rrw_init(&dp->dp_config_rwlock, B_TRUE); - txg_init(dp, txg); -@@ -289,5 +165,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) - txg_list_create(&dp->dp_sync_tasks, -- offsetof(dsl_sync_task_group_t, dstg_node)); -+ offsetof(dsl_sync_task_t, dst_node)); - - mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); -+ cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); - -@@ -296,5 +173,2 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) - -- dsl_pool_txg_history_init(dp, txg); -- dsl_pool_tx_assign_init(dp, 32); -- - return (dp); -@@ -326,3 +200,3 @@ dsl_pool_open(dsl_pool_t *dp) - -- rw_enter(&dp->dp_config_rwlock, RW_WRITER); -+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, -@@ -333,3 +207,3 @@ dsl_pool_open(dsl_pool_t *dp) - -- err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, -+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir); -@@ -354,3 +228,3 @@ dsl_pool_open(dsl_pool_t *dp) - } -- dsl_dir_close(dd, dp); -+ dsl_dir_rele(dd, dp); - if (err) -@@ -369,3 +243,3 @@ dsl_pool_open(dsl_pool_t *dp) - goto out; -- VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, -+ VERIFY0(bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); -@@ -402,3 +276,3 @@ dsl_pool_open(dsl_pool_t *dp) - out: -- rw_exit(&dp->dp_config_rwlock); -+ rrw_exit(&dp->dp_config_rwlock, FTAG); - return (err); -@@ -409,5 +283,5 @@ dsl_pool_close(dsl_pool_t *dp) - { -- /* drop our references from dsl_pool_open() */ -- - /* -+ * Drop our references from dsl_pool_open(). 
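[Editorial aside, not part of the patch being removed.] The hunk above introduces the write-throttle tunables (zfs_dirty_data_max, zfs_dirty_data_sync, zfs_delay_min_dirty_percent, zfs_delay_scale, zfs_throttle_delay/resolution). The stand-alone C sketch below restates the threshold arithmetic those comments describe, using the delay_min_bytes relationship that dsl_pool_need_dirty_delay() computes further down in this same series of hunks. The concrete zfs_dirty_data_max and dp_dirty_total values are made-up examples; everything here is illustrative only and is not code from the patch.

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-ins for the module tunables in the hunk above; values are examples. */
    static uint64_t zfs_dirty_data_max  = 4ULL << 30;  /* assumed 4 GiB for illustration */
    static uint64_t zfs_dirty_data_sync = 64ULL << 20; /* 64 MiB, as in the hunk */
    static int zfs_delay_min_dirty_percent = 60;       /* as in the hunk */

    int
    main(void)
    {
            /* Dirty-data level at which transaction delays begin. */
            uint64_t delay_min_bytes =
                zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

            /* A hypothetical amount of pool-wide dirty data. */
            uint64_t dp_dirty_total = 3ULL << 30;

            if (dp_dirty_total > zfs_dirty_data_sync)
                    printf("enough dirty data to push out a txg\n");
            if (dp_dirty_total > delay_min_bytes)
                    printf("new transactions would start to be delayed\n");
            else
                    printf("below the delay threshold; writes proceed undelayed\n");
            return (0);
    }

[End of editorial aside; the removed patch content continues below.]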
-+ * - * Since we held the origin_snap from "syncing" context (which -@@ -417,9 +291,9 @@ dsl_pool_close(dsl_pool_t *dp) - if (dp->dp_origin_snap) -- dsl_dataset_drop_ref(dp->dp_origin_snap, dp); -+ dsl_dataset_rele(dp->dp_origin_snap, dp); - if (dp->dp_mos_dir) -- dsl_dir_close(dp->dp_mos_dir, dp); -+ dsl_dir_rele(dp->dp_mos_dir, dp); - if (dp->dp_free_dir) -- dsl_dir_close(dp->dp_free_dir, dp); -+ dsl_dir_rele(dp->dp_free_dir, dp); - if (dp->dp_root_dir) -- dsl_dir_close(dp->dp_root_dir, dp); -+ dsl_dir_rele(dp->dp_root_dir, dp); - -@@ -439,5 +313,3 @@ dsl_pool_close(dsl_pool_t *dp) - dsl_scan_fini(dp); -- dsl_pool_tx_assign_destroy(dp); -- dsl_pool_txg_history_destroy(dp); -- rw_destroy(&dp->dp_config_rwlock); -+ rrw_destroy(&dp->dp_config_rwlock); - mutex_destroy(&dp->dp_lock); -@@ -459,2 +331,4 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - -+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); -+ - /* create and open the MOS (meta-objset) */ -@@ -469,3 +343,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - /* Initialize scan structures */ -- VERIFY3U(0, ==, dsl_scan_init(dp, txg)); -+ VERIFY0(dsl_scan_init(dp, txg)); - -@@ -473,3 +347,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); -- VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, -+ VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, - NULL, dp, &dp->dp_root_dir)); -@@ -478,3 +352,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); -- VERIFY(0 == dsl_pool_open_special_dir(dp, -+ VERIFY0(dsl_pool_open_special_dir(dp, - MOS_DIR_NAME, &dp->dp_mos_dir)); -@@ -485,3 +359,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - FREE_DIR_NAME, tx); -- VERIFY(0 == dsl_pool_open_special_dir(dp, -+ VERIFY0(dsl_pool_open_special_dir(dp, - FREE_DIR_NAME, &dp->dp_free_dir)); -@@ -492,3 +366,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); -- VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, -+ VERIFY0(bpobj_open(&dp->dp_free_bpobj, - dp->dp_meta_objset, obj)); -@@ -503,3 +377,3 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - /* create the root objset */ -- VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); -+ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); - VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds, -@@ -513,2 +387,4 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) - -+ rrw_exit(&dp->dp_config_rwlock, FTAG); -+ - return (dp); -@@ -535,6 +411,3 @@ deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - dsl_deadlist_t *dl = arg; -- dsl_pool_t *dp = dmu_objset_pool(dl->dl_os); -- rw_enter(&dp->dp_config_rwlock, RW_READER); - dsl_deadlist_insert(dl, bp, tx); -- rw_exit(&dp->dp_config_rwlock); - return (0); -@@ -542,2 +415,30 @@ deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - -+static void -+dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) -+{ -+ zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -+ dmu_objset_sync(dp->dp_meta_objset, zio, tx); -+ VERIFY0(zio_wait(zio)); -+ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); -+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); -+} -+ -+static void -+dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) -+{ -+ ASSERT(MUTEX_HELD(&dp->dp_lock)); -+ -+ if (delta < 0) -+ ASSERT3U(-delta, <=, 
dp->dp_dirty_total); -+ -+ dp->dp_dirty_total += delta; -+ -+ /* -+ * Note: we signal even when increasing dp_dirty_total. -+ * This ensures forward progress -- each thread wakes the next waiter. -+ */ -+ if (dp->dp_dirty_total <= zfs_dirty_data_max) -+ cv_signal(&dp->dp_spaceavail_cv); -+} -+ - void -@@ -550,5 +451,2 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - objset_t *mos = dp->dp_meta_objset; -- hrtime_t start, write_time; -- uint64_t data_written; -- int err; - list_t synced_datasets; -@@ -558,17 +456,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - -- /* -- * We need to copy dp_space_towrite() before doing -- * dsl_sync_task_group_sync(), because -- * dsl_dataset_snapshot_reserve_space() will increase -- * dp_space_towrite but not actually write anything. -- */ -- data_written = dp->dp_space_towrite[txg & TXG_MASK]; -- - tx = dmu_tx_create_assigned(dp, txg); - -- dp->dp_read_overhead = 0; -- start = gethrtime(); -- -+ /* -+ * Write out all dirty blocks of dirty datasets. -+ */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -- while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) { -+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { - /* -@@ -582,8 +472,12 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - } -- DTRACE_PROBE(pool_sync__1setup); -- err = zio_wait(zio); -+ VERIFY0(zio_wait(zio)); - -- write_time = gethrtime() - start; -- ASSERT(err == 0); -- DTRACE_PROBE(pool_sync__2rootzio); -+ /* -+ * We have written all of the accounted dirty data, so our -+ * dp_space_towrite should now be zero. However, some seldom-used -+ * code paths do not adhere to this (e.g. dbuf_undirty(), also -+ * rounding error in dbuf_write_physdone). -+ * Shore up the accounting of any dirtied space now. -+ */ -+ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); - -@@ -593,5 +487,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - */ -- for (ds = list_head(&synced_datasets); ds; -- ds = list_next(&synced_datasets, ds)) -+ for (ds = list_head(&synced_datasets); ds != NULL; -+ ds = list_next(&synced_datasets, ds)) { - dmu_objset_do_userquota_updates(ds->ds_objset, tx); -+ } - -@@ -605,3 +500,3 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -- while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) { -+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { - ASSERT(list_link_active(&ds->ds_synced_link)); -@@ -610,3 +505,3 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - } -- err = zio_wait(zio); -+ VERIFY0(zio_wait(zio)); - -@@ -617,6 +512,5 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - * - move dead blocks from the pending deadlist to the on-disk deadlist -- * - clean up zil records - * - release hold from dsl_dataset_dirty() - */ -- while ((ds = list_remove_head(&synced_datasets))) { -+ while ((ds = list_remove_head(&synced_datasets)) != NULL) { - ASSERTV(objset_t *os = ds->ds_objset); -@@ -628,6 +522,5 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - -- start = gethrtime(); -- while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg))) -+ while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { - dsl_dir_sync(dd, tx); -- write_time += gethrtime() - start; -+ } - -@@ -649,16 +542,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - -- start = gethrtime(); - if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || - list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { -- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); -- 
dmu_objset_sync(mos, zio, tx); -- err = zio_wait(zio); -- ASSERT(err == 0); -- dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); -- spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); -+ dsl_pool_sync_mos(dp, tx); - } -- write_time += gethrtime() - start; -- DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, -- hrtime_t, dp->dp_read_overhead); -- write_time -= dp->dp_read_overhead; - -@@ -673,5 +556,4 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - */ -- DTRACE_PROBE(pool_sync__3task); - if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { -- dsl_sync_task_group_t *dstg; -+ dsl_sync_task_t *dst; - /* -@@ -680,5 +562,5 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - */ -- ASSERT(spa_sync_pass(dp->dp_spa) == 1); -- while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) -- dsl_sync_task_group_sync(dstg, tx); -+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); -+ while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) -+ dsl_sync_task_sync(dst, tx); - } -@@ -687,43 +569,3 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) - -- dp->dp_space_towrite[txg & TXG_MASK] = 0; -- ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); -- -- /* -- * If the write limit max has not been explicitly set, set it -- * to a fraction of available physical memory (default 1/8th). -- * Note that we must inflate the limit because the spa -- * inflates write sizes to account for data replication. -- * Check this each sync phase to catch changing memory size. -- */ -- if (physmem != old_physmem && zfs_write_limit_shift) { -- mutex_enter(&zfs_write_limit_lock); -- old_physmem = physmem; -- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; -- zfs_write_limit_inflated = MAX(zfs_write_limit_min, -- spa_get_asize(dp->dp_spa, zfs_write_limit_max)); -- mutex_exit(&zfs_write_limit_lock); -- } -- -- /* -- * Attempt to keep the sync time consistent by adjusting the -- * amount of write traffic allowed into each transaction group. -- * Weight the throughput calculation towards the current value: -- * thru = 3/4 old_thru + 1/4 new_thru -- * -- * Note: write_time is in nanosecs, so write_time/MICROSEC -- * yields millisecs -- */ -- ASSERT(zfs_write_limit_min > 0); -- if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { -- uint64_t throughput = data_written / (write_time / MICROSEC); -- -- if (dp->dp_throughput) -- dp->dp_throughput = throughput / 4 + -- 3 * dp->dp_throughput / 4; -- else -- dp->dp_throughput = throughput; -- dp->dp_write_limit = MIN(zfs_write_limit_inflated, -- MAX(zfs_write_limit_min, -- dp->dp_throughput * zfs_txg_synctime_ms)); -- } -+ DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); - } -@@ -734,6 +576,5 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) - zilog_t *zilog; -- dsl_dataset_t *ds; - - while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) { -- ds = dmu_objset_ds(zilog->zl_os); -+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - zil_clean(zilog, txg); -@@ -779,44 +620,15 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) - --int --dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) -+boolean_t -+dsl_pool_need_dirty_delay(dsl_pool_t *dp) - { -- uint64_t reserved = 0; -- uint64_t write_limit = (zfs_write_limit_override ? -- zfs_write_limit_override : dp->dp_write_limit); -- -- if (zfs_no_write_throttle) { -- atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -- space); -- return (0); -- } -- -- /* -- * Check to see if we have exceeded the maximum allowed IO for -- * this transaction group. 
We can do this without locks since -- * a little slop here is ok. Note that we do the reserved check -- * with only half the requested reserve: this is because the -- * reserve requests are worst-case, and we really don't want to -- * throttle based off of worst-case estimates. -- */ -- if (write_limit > 0) { -- reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] -- + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; -- -- if (reserved && reserved > write_limit) { -- DMU_TX_STAT_BUMP(dmu_tx_write_limit); -- return (ERESTART); -- } -- } -- -- atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); -- -- /* -- * If this transaction group is over 7/8ths capacity, delay -- * the caller 1 clock tick. This will slow down the "fill" -- * rate until the sync process can catch up with us. -- */ -- if (reserved && reserved > (write_limit - (write_limit >> 3))) -- txg_delay(dp, tx->tx_txg, 1); -+ uint64_t delay_min_bytes = -+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; -+ boolean_t rv; - -- return (0); -+ mutex_enter(&dp->dp_lock); -+ if (dp->dp_dirty_total > zfs_dirty_data_sync) -+ txg_kick(dp); -+ rv = (dp->dp_dirty_total > delay_min_bytes); -+ mutex_exit(&dp->dp_lock); -+ return (rv); - } -@@ -824,6 +636,10 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) - void --dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) -+dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) - { -- ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); -- atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); -+ if (space > 0) { -+ mutex_enter(&dp->dp_lock); -+ dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; -+ dsl_pool_dirty_delta(dp, space); -+ mutex_exit(&dp->dp_lock); -+ } - } -@@ -831,26 +647,18 @@ dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) - void --dsl_pool_memory_pressure(dsl_pool_t *dp) -+dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) - { -- uint64_t space_inuse = 0; -- int i; -- -- if (dp->dp_write_limit == zfs_write_limit_min) -+ ASSERT3S(space, >=, 0); -+ if (space == 0) - return; - -- for (i = 0; i < TXG_SIZE; i++) { -- space_inuse += dp->dp_space_towrite[i]; -- space_inuse += dp->dp_tempreserved[i]; -- } -- dp->dp_write_limit = MAX(zfs_write_limit_min, -- MIN(dp->dp_write_limit, space_inuse / 4)); --} -- --void --dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) --{ -- if (space > 0) { -- mutex_enter(&dp->dp_lock); -- dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; -- mutex_exit(&dp->dp_lock); -+ mutex_enter(&dp->dp_lock); -+ if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { -+ /* XXX writing something we didn't dirty? 
*/ -+ space = dp->dp_dirty_pertxg[txg & TXG_MASK]; - } -+ ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); -+ dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; -+ ASSERT3U(dp->dp_dirty_total, >=, space); -+ dsl_pool_dirty_delta(dp, -space); -+ mutex_exit(&dp->dp_lock); - } -@@ -859,3 +667,3 @@ dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) - static int --upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -+upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) - { -@@ -864,5 +672,4 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - int err; -- dsl_pool_t *dp = spa_get_dsl(spa); - -- err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); -+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) -@@ -892,3 +699,3 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - */ -- ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); -+ ASSERT0(prev->ds_phys->ds_bp.blk_birth); - -@@ -912,3 +719,3 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - ASSERT(ds->ds_prev == NULL); -- VERIFY(0 == dsl_dataset_hold_obj(dp, -+ VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); -@@ -917,4 +724,4 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - -- ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); -- ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); -+ ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object); -+ ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object); - -@@ -926,3 +733,3 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - } -- VERIFY(0 == zap_add_int(dp->dp_meta_objset, -+ VERIFY0(zap_add_int(dp->dp_meta_objset, - prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); -@@ -941,3 +748,3 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) - -- VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, -+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, - tx, DS_FIND_CHILDREN)); -@@ -947,15 +754,11 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) - static int --upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -+upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) - { - dmu_tx_t *tx = arg; -- dsl_dataset_t *ds; -- dsl_pool_t *dp = spa_get_dsl(spa); - objset_t *mos = dp->dp_meta_objset; - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -- -- if (ds->ds_dir->dd_phys->dd_origin_obj) { -+ if (ds->ds_dir->dd_phys->dd_origin_obj != 0) { - dsl_dataset_t *origin; - -- VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, -+ VERIFY0(dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); -@@ -968,4 +771,4 @@ upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - -- VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, -- origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); -+ VERIFY0(zap_add_int(dp->dp_meta_objset, -+ origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx)); - -@@ -973,4 +776,2 @@ upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - } -- -- dsl_dataset_rele(ds, FTAG); - return (0); -@@ -986,3 +787,3 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) - (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); -- VERIFY(0 == dsl_pool_open_special_dir(dp, -+ VERIFY0(dsl_pool_open_special_dir(dp, - FREE_DIR_NAME, 
&dp->dp_free_dir)); -@@ -996,8 +797,7 @@ dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) - SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); -- VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, -+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); -- VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, -- dp->dp_meta_objset, obj)); -+ VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); - -- VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, -+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); -@@ -1013,13 +813,12 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) - ASSERT(dp->dp_origin_snap == NULL); -+ ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); - - /* create the origin dir, ds, & snap-ds */ -- rw_enter(&dp->dp_config_rwlock, RW_WRITER); - dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, - NULL, 0, kcred, tx); -- VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -- dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx); -- VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, -+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); -+ dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); -+ VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - dp, &dp->dp_origin_snap)); - dsl_dataset_rele(ds, FTAG); -- rw_exit(&dp->dp_config_rwlock); - } -@@ -1043,2 +842,3 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) - uint64_t zapobj = dp->dp_tmp_userrefs_obj; -+ nvlist_t *holds; - -@@ -1048,2 +848,4 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) - -+ holds = fnvlist_alloc(); -+ - for (zap_cursor_init(&zc, mos, zapobj); -@@ -1052,3 +854,3 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) - char *htag; -- uint64_t dsobj; -+ nvlist_t *tags; - -@@ -1057,5 +859,13 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) - ++htag; -- dsobj = strtonum(za.za_name, NULL); -- (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE); -+ if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { -+ tags = fnvlist_alloc(); -+ fnvlist_add_boolean(tags, htag); -+ fnvlist_add_nvlist(holds, za.za_name, tags); -+ fnvlist_free(tags); -+ } else { -+ fnvlist_add_boolean(tags, htag); -+ } - } -+ dsl_dataset_user_release_tmp(dp, holds); -+ fnvlist_free(holds); - zap_cursor_fini(&zc); -@@ -1080,3 +890,3 @@ static int - dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, -- const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) -+ const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) - { -@@ -1099,3 +909,3 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, - } else { -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1105,3 +915,3 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, - if (holding) -- error = zap_add(mos, zapobj, name, 8, 1, now, tx); -+ error = zap_add(mos, zapobj, name, 8, 1, &now, tx); - else -@@ -1118,3 +928,3 @@ int - dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, -- uint64_t *now, dmu_tx_t *tx) -+ uint64_t now, dmu_tx_t *tx) - { -@@ -1130,3 +940,3 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - { -- return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, -+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, - tx, B_FALSE)); -@@ -1134,26 +944,134 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - -+/* -+ * DSL Pool Configuration Lock -+ * 
-+ * The dp_config_rwlock protects against changes to DSL state (e.g. dataset -+ * creation / destruction / rename / property setting). It must be held for -+ * read to hold a dataset or dsl_dir. I.e. you must call -+ * dsl_pool_config_enter() or dsl_pool_hold() before calling -+ * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock -+ * must be held continuously until all datasets and dsl_dirs are released. -+ * -+ * The only exception to this rule is that if a "long hold" is placed on -+ * a dataset, then the dp_config_rwlock may be dropped while the dataset -+ * is still held. The long hold will prevent the dataset from being -+ * destroyed -- the destroy will fail with EBUSY. A long hold can be -+ * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset -+ * (by calling dsl_{dataset,objset}_{try}own{_obj}). -+ * -+ * Legitimate long-holders (including owners) should be long-running, cancelable -+ * tasks that should cause "zfs destroy" to fail. This includes DMU -+ * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), -+ * "zfs send", and "zfs diff". There are several other long-holders whose -+ * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). -+ * -+ * The usual formula for long-holding would be: -+ * dsl_pool_hold() -+ * dsl_dataset_hold() -+ * ... perform checks ... -+ * dsl_dataset_long_hold() -+ * dsl_pool_rele() -+ * ... perform long-running task ... -+ * dsl_dataset_long_rele() -+ * dsl_dataset_rele() -+ * -+ * Note that when the long hold is released, the dataset is still held but -+ * the pool is not held. The dataset may change arbitrarily during this time -+ * (e.g. it could be destroyed). Therefore you shouldn't do anything to the -+ * dataset except release it. -+ * -+ * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only -+ * or modifying operations. -+ * -+ * Modifying operations should generally use dsl_sync_task(). The synctask -+ * infrastructure enforces proper locking strategy with respect to the -+ * dp_config_rwlock. See the comment above dsl_sync_task() for details. -+ * -+ * Read-only operations will manually hold the pool, then the dataset, obtain -+ * information from the dataset, then release the pool and dataset. -+ * dmu_objset_{hold,rele}() are convenience routines that also do the pool -+ * hold/rele. -+ */ -+ -+int -+dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) -+{ -+ spa_t *spa; -+ int error; -+ -+ error = spa_open(name, &spa, tag); -+ if (error == 0) { -+ *dp = spa_get_dsl(spa); -+ dsl_pool_config_enter(*dp, tag); -+ } -+ return (error); -+} -+ -+void -+dsl_pool_rele(dsl_pool_t *dp, void *tag) -+{ -+ dsl_pool_config_exit(dp, tag); -+ spa_close(dp->dp_spa, tag); -+} -+ -+void -+dsl_pool_config_enter(dsl_pool_t *dp, void *tag) -+{ -+ /* -+ * We use a "reentrant" reader-writer lock, but not reentrantly. -+ * -+ * The rrwlock can (with the track_all flag) track all reading threads, -+ * which is very useful for debugging which code path failed to release -+ * the lock, and for verifying that the *current* thread does hold -+ * the lock. -+ * -+ * (Unlike a rwlock, which knows that N threads hold it for -+ * read, but not *which* threads, so rw_held(RW_READER) returns TRUE -+ * if any thread holds it for read, even if this thread doesn't). 
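As an illustration of the long-hold formula spelled out in the comment above, here is a minimal sketch of a read-only consumer following that sequence. It is an editorial example, not part of the patch: the function name and the placeholder steps are hypothetical, the include list is abbreviated, and the dsl_dataset_long_hold()/dsl_dataset_long_rele() signatures are assumed to take (ds, tag), which this hunk does not show.

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>

/* Minimal sketch (illustrative only) of the documented long-hold sequence. */
static int
example_long_task(const char *dsname, void *tag)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	error = dsl_pool_hold(dsname, tag, &dp);	/* takes dp_config_rwlock as reader */
	if (error != 0)
		return (error);

	error = dsl_dataset_hold(dp, dsname, tag, &ds);
	if (error != 0) {
		dsl_pool_rele(dp, tag);
		return (error);
	}

	/* ... perform checks ... */

	dsl_dataset_long_hold(ds, tag);		/* "zfs destroy" now fails with EBUSY */
	dsl_pool_rele(dp, tag);			/* the config lock may be dropped */

	/* ... perform long-running task ... */

	dsl_dataset_long_rele(ds, tag);
	dsl_dataset_rele(ds, tag);		/* the only safe thing left to do with ds */
	return (0);
}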
-+ */ -+ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); -+ rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); -+} -+ -+void -+dsl_pool_config_exit(dsl_pool_t *dp, void *tag) -+{ -+ rrw_exit(&dp->dp_config_rwlock, tag); -+} -+ -+boolean_t -+dsl_pool_config_held(dsl_pool_t *dp) -+{ -+ return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); -+} -+ - #if defined(_KERNEL) && defined(HAVE_SPL) --module_param(zfs_no_write_throttle, int, 0644); --MODULE_PARM_DESC(zfs_no_write_throttle, "Disable write throttling"); -+EXPORT_SYMBOL(dsl_pool_config_enter); -+EXPORT_SYMBOL(dsl_pool_config_exit); - --module_param(zfs_write_limit_shift, int, 0444); --MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg"); -+/* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ -+module_param(zfs_dirty_data_max_percent, int, 0444); -+MODULE_PARM_DESC(zfs_dirty_data_max_percent, "percent of ram can be dirty"); - --module_param(zfs_txg_synctime_ms, int, 0644); --MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync"); -+/* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */ -+module_param(zfs_dirty_data_max_max_percent, int, 0444); -+MODULE_PARM_DESC(zfs_dirty_data_max_max_percent, -+ "zfs_dirty_data_max upper bound as % of RAM"); - --module_param(zfs_txg_history, int, 0644); --MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); -+module_param(zfs_delay_min_dirty_percent, int, 0644); -+MODULE_PARM_DESC(zfs_delay_min_dirty_percent, "transaction delay threshold"); - --module_param(zfs_write_limit_min, ulong, 0444); --MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit"); -+module_param(zfs_dirty_data_max, ulong, 0644); -+MODULE_PARM_DESC(zfs_dirty_data_max, "determines the dirty space limit"); - --module_param(zfs_write_limit_max, ulong, 0444); --MODULE_PARM_DESC(zfs_write_limit_max, "Max txg write limit"); -+/* zfs_dirty_data_max_max only applied at module load in arc_init(). */ -+module_param(zfs_dirty_data_max_max, ulong, 0444); -+MODULE_PARM_DESC(zfs_dirty_data_max_max, -+ "zfs_dirty_data_max upper bound in bytes"); - --module_param(zfs_write_limit_inflated, ulong, 0444); --MODULE_PARM_DESC(zfs_write_limit_inflated, "Inflated txg write limit"); -+module_param(zfs_dirty_data_sync, ulong, 0644); -+MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data"); - --module_param(zfs_write_limit_override, ulong, 0444); --MODULE_PARM_DESC(zfs_write_limit_override, "Override txg write limit"); -+module_param(zfs_delay_scale, ulong, 0644); -+MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity"); - #endif -diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c -index e44a948..079ef97 100644 ---- a/module/zfs/dsl_prop.c -+++ b/module/zfs/dsl_prop.c -@@ -22,2 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Martin Matuska. All rights reserved. 
- */ -@@ -53,3 +55,3 @@ dodefault(const char *propname, int intsz, int numints, void *buf) - (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -57,3 +59,3 @@ dodefault(const char *propname, int intsz, int numints, void *buf) - if (intsz != 1) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - (void) strncpy(buf, zfs_prop_default_string(prop), -@@ -62,3 +64,3 @@ dodefault(const char *propname, int intsz, int numints, void *buf) - if (intsz != 8 || numints < 1) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - -@@ -83,3 +85,3 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - -- ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); -+ ASSERT(dsl_pool_config_held(dd->dd_pool)); - -@@ -98,4 +100,2 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - for (; dd != NULL; dd = dd->dd_parent) { -- ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); -- - if (dd != target || snapshot) { -@@ -147,3 +147,3 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - */ -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } -@@ -168,3 +168,3 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, - -- ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); -+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); -@@ -232,3 +232,2 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, - dsl_dir_t *dd = ds->ds_dir; -- dsl_pool_t *dp = dd->dd_pool; - uint64_t value; -@@ -236,14 +235,9 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, - int err; -- int need_rwlock; -+ ASSERTV(dsl_pool_t *dp = dd->dd_pool); - -- need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock); -- if (need_rwlock) -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ ASSERT(dsl_pool_config_held(dp)); - -- err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL); -- if (err != 0) { -- if (need_rwlock) -- rw_exit(&dp->dp_config_rwlock); -+ err = dsl_prop_get_int_ds(ds, propname, &value); -+ if (err != 0) - return (err); -- } - -@@ -260,5 +254,2 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, - cbr->cbr_func(cbr->cbr_arg, value); -- -- if (need_rwlock) -- rw_exit(&dp->dp_config_rwlock); - return (0); -@@ -270,15 +261,14 @@ dsl_prop_get(const char *dsname, const char *propname, - { -- dsl_dataset_t *ds; -- int err; -+ objset_t *os; -+ int error; - -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -+ error = dmu_objset_hold(dsname, FTAG, &os); -+ if (error != 0) -+ return (error); - -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint); -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); -+ error = dsl_prop_get_ds(dmu_objset_ds(os), propname, -+ intsz, numints, buf, setpoint); - -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ dmu_objset_rele(os, FTAG); -+ return (error); - } -@@ -300,13 +290,7 @@ dsl_prop_get_integer(const char *ddname, const char *propname, - --void --dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, -- zprop_source_t source, uint64_t *value) -+int -+dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, -+ uint64_t *valuep) - { -- psa->psa_name = propname; -- psa->psa_source = source; -- psa->psa_intsz = 8; -- psa->psa_numints = 1; -- psa->psa_value = value; -- -- psa->psa_effective_value = -1ULL; -+ return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); - } -@@ -324,7 +308,6 @@ 
dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, - int --dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) -+dsl_prop_predict(dsl_dir_t *dd, const char *propname, -+ zprop_source_t source, uint64_t value, uint64_t *newvalp) - { -- const char *propname = psa->psa_name; - zfs_prop_t prop = zfs_name_to_prop(propname); -- zprop_source_t source = psa->psa_source; - objset_t *mos; -@@ -360,9 +343,8 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) - /* Revert to the received value, if any. */ -- err = zap_lookup(mos, zapobj, recvdstr, 8, 1, -- &psa->psa_effective_value); -+ err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); - if (err == ENOENT) -- psa->psa_effective_value = 0; -+ *newvalp = 0; - break; - case ZPROP_SRC_LOCAL: -- psa->psa_effective_value = *(uint64_t *)psa->psa_value; -+ *newvalp = value; - break; -@@ -373,6 +355,5 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) - */ -- err = zap_lookup(mos, zapobj, propname, 8, 1, -- &psa->psa_effective_value); -+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); - if (err == ENOENT) -- psa->psa_effective_value = *(uint64_t *)psa->psa_value; -+ *newvalp = value; - break; -@@ -383,9 +364,8 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) - */ -- err = zap_lookup(mos, zapobj, propname, 8, 1, -- &psa->psa_effective_value); -+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); - if (err == ENOENT) -- psa->psa_effective_value = 0; -+ *newvalp = 0; - break; - default: -- cmn_err(CE_PANIC, "unexpected property source: %d", source); -+ panic("unexpected property source: %d", source); - } -@@ -400,38 +380,5 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) - --#ifdef ZFS_DEBUG --void --dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa) --{ -- zfs_prop_t prop = zfs_name_to_prop(psa->psa_name); -- uint64_t intval; -- char setpoint[MAXNAMELEN]; -- uint64_t version = spa_version(dd->dd_pool->dp_spa); -- int err; -- -- if (version < SPA_VERSION_RECVD_PROPS) { -- switch (prop) { -- case ZFS_PROP_QUOTA: -- case ZFS_PROP_RESERVATION: -- return; -- default: -- break; -- } -- } -- -- err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval, -- setpoint, B_FALSE); -- if (err == 0 && intval != psa->psa_effective_value) { -- cmn_err(CE_PANIC, "%s property, source: %x, " -- "predicted effective value: %llu, " -- "actual effective value: %llu (setpoint: %s)", -- psa->psa_name, psa->psa_source, -- (unsigned long long)psa->psa_effective_value, -- (unsigned long long)intval, setpoint); -- } --} --#endif -- - /* - * Unregister this callback. Return 0 on success, ENOENT if ddname is -- * invalid, ENOMSG if no matching callback registered. -+ * invalid, or ENOMSG if no matching callback registered. - */ -@@ -456,3 +403,3 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, - mutex_exit(&dd->dd_lock); -- return (ENOMSG); -+ return (SET_ERROR(ENOMSG)); - } -@@ -467,21 +414,53 @@ dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, - --/* -- * Return the number of callbacks that are registered for this dataset. 
-- */ --int --dsl_prop_numcb(dsl_dataset_t *ds) -+boolean_t -+dsl_prop_hascb(dsl_dataset_t *ds) - { - dsl_dir_t *dd = ds->ds_dir; -+ boolean_t rv = B_FALSE; - dsl_prop_cb_record_t *cbr; -- int num = 0; - - mutex_enter(&dd->dd_lock); -- for (cbr = list_head(&dd->dd_prop_cbs); -- cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { -- if (cbr->cbr_ds == ds) -- num++; -+ for (cbr = list_head(&dd->dd_prop_cbs); cbr; -+ cbr = list_next(&dd->dd_prop_cbs, cbr)) { -+ if (cbr->cbr_ds == ds) { -+ rv = B_TRUE; -+ break; -+ } - } - mutex_exit(&dd->dd_lock); -+ return (rv); -+} - -- return (num); -+/* ARGSUSED */ -+static int -+dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -+{ -+ dsl_dir_t *dd = ds->ds_dir; -+ dsl_prop_cb_record_t *cbr; -+ -+ mutex_enter(&dd->dd_lock); -+ for (cbr = list_head(&dd->dd_prop_cbs); cbr; -+ cbr = list_next(&dd->dd_prop_cbs, cbr)) { -+ uint64_t value; -+ -+ if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname, -+ sizeof (value), 1, &value, NULL) == 0) -+ cbr->cbr_func(cbr->cbr_arg, value); -+ } -+ mutex_exit(&dd->dd_lock); -+ -+ return (0); -+} -+ -+/* -+ * Update all property values for ddobj & its descendants. This is used -+ * when renaming the dir. -+ */ -+void -+dsl_prop_notify_all(dsl_dir_t *dd) -+{ -+ dsl_pool_t *dp = dd->dd_pool; -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, -+ NULL, DS_FIND_CHILDREN); - } -@@ -499,4 +478,4 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - -- ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); -- err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); - if (err) -@@ -511,3 +490,3 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - if (err == 0) { -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); - return; -@@ -546,3 +525,3 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - zap_cursor_fini(&zc); -- dsl_dir_close(dd, FTAG); -+ dsl_dir_rele(dd, FTAG); - } -@@ -550,6 +529,6 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, - void --dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, -+ zprop_source_t source, int intsz, int numints, const void *value, -+ dmu_tx_t *tx) - { -- dsl_dataset_t *ds = arg1; -- dsl_prop_setarg_t *psa = arg2; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; -@@ -558,3 +537,3 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - char valbuf[32]; -- char *valstr = NULL; -+ const char *valstr = NULL; - char *inheritstr; -@@ -564,4 +543,2 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); -- const char *propname = psa->psa_name; -- zprop_source_t source = psa->psa_source; - -@@ -583,6 +560,2 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (version < SPA_VERSION_RECVD_PROPS) { -- zfs_prop_t prop = zfs_name_to_prop(propname); -- if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) -- return; -- - if (source & ZPROP_SRC_NONE) -@@ -615,4 +588,4 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - ASSERT(err == 0 || err == ENOENT); -- VERIFY(0 == zap_update(mos, zapobj, propname, -- psa->psa_intsz, psa->psa_numints, psa->psa_value, tx)); -+ VERIFY0(zap_update(mos, zapobj, propname, -+ intsz, numints, value, tx)); - break; -@@ -627,8 +600,6 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (version >= 
SPA_VERSION_RECVD_PROPS && -- dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, -- NULL) == 0) { -+ dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { - dummy = 0; -- err = zap_update(mos, zapobj, inheritstr, -- 8, 1, &dummy, tx); -- ASSERT(err == 0); -+ VERIFY0(zap_update(mos, zapobj, inheritstr, -+ 8, 1, &dummy, tx)); - } -@@ -640,3 +611,3 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - err = zap_update(mos, zapobj, recvdstr, -- psa->psa_intsz, psa->psa_numints, psa->psa_value, tx); -+ intsz, numints, value, tx); - ASSERT(err == 0); -@@ -670,3 +641,3 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (isint) { -- VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL)); -+ VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); - -@@ -697,3 +668,3 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - if (source == ZPROP_SRC_LOCAL) { -- valstr = (char *)psa->psa_value; -+ valstr = value; - } else { -@@ -706,7 +677,5 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- spa_history_log_internal((source == ZPROP_SRC_NONE || -- source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT : -- LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, -- "%s=%s dataset = %llu", propname, -- (valstr == NULL ? "" : valstr), ds->ds_object); -+ spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE || -+ source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx, -+ "%s=%s", propname, (valstr == NULL ? "" : valstr)); - -@@ -716,61 +685,26 @@ dsl_prop_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) - --void --dsl_props_set_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+int -+dsl_prop_set_int(const char *dsname, const char *propname, -+ zprop_source_t source, uint64_t value) - { -- dsl_dataset_t *ds = arg1; -- dsl_props_arg_t *pa = arg2; -- nvlist_t *props = pa->pa_props; -- dsl_prop_setarg_t psa; -- nvpair_t *elem = NULL; -- -- psa.psa_source = pa->pa_source; -- -- while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { -- nvpair_t *pair = elem; -- -- psa.psa_name = nvpair_name(pair); -- -- if (nvpair_type(pair) == DATA_TYPE_NVLIST) { -- /* -- * dsl_prop_get_all_impl() returns properties in this -- * format. 
-- */ -- nvlist_t *attrs; -- VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); -- VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, -- &pair) == 0); -- } -+ nvlist_t *nvl = fnvlist_alloc(); -+ int error; - -- if (nvpair_type(pair) == DATA_TYPE_STRING) { -- VERIFY(nvpair_value_string(pair, -- (char **)&psa.psa_value) == 0); -- psa.psa_intsz = 1; -- psa.psa_numints = strlen(psa.psa_value) + 1; -- } else { -- uint64_t intval; -- VERIFY(nvpair_value_uint64(pair, &intval) == 0); -- psa.psa_intsz = sizeof (intval); -- psa.psa_numints = 1; -- psa.psa_value = &intval; -- } -- dsl_prop_set_sync(ds, &psa, tx); -- } -+ fnvlist_add_uint64(nvl, propname, value); -+ error = dsl_props_set(dsname, source, nvl); -+ fnvlist_free(nvl); -+ return (error); - } - --void --dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, -- dmu_tx_t *tx) -+int -+dsl_prop_set_string(const char *dsname, const char *propname, -+ zprop_source_t source, const char *value) - { -- objset_t *mos = dd->dd_pool->dp_meta_objset; -- uint64_t zapobj = dd->dd_phys->dd_props_zapobj; -- -- ASSERT(dmu_tx_is_syncing(tx)); -- -- VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx)); -- -- dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); -+ nvlist_t *nvl = fnvlist_alloc(); -+ int error; - -- spa_history_log_internal(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, -- "%s=%llu dataset = %llu", name, (u_longlong_t)val, -- dd->dd_phys->dd_head_dataset_obj); -+ fnvlist_add_string(nvl, propname, value); -+ error = dsl_props_set(dsname, source, nvl); -+ fnvlist_free(nvl); -+ return (error); - } -@@ -778,50 +712,25 @@ dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - int --dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source, -- int intsz, int numints, const void *buf) -+dsl_prop_inherit(const char *dsname, const char *propname, -+ zprop_source_t source) - { -- dsl_dataset_t *ds; -- uint64_t version; -- int err; -- dsl_prop_setarg_t psa; -- -- /* -- * We must do these checks before we get to the syncfunc, since -- * it can't fail. -- */ -- if (strlen(propname) >= ZAP_MAXNAMELEN) -- return (ENAMETOOLONG); -- -- err = dsl_dataset_hold(dsname, FTAG, &ds); -- if (err) -- return (err); -- -- version = spa_version(ds->ds_dir->dd_pool->dp_spa); -- if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? 
-- ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { -- dsl_dataset_rele(ds, FTAG); -- return (E2BIG); -- } -- if (dsl_dataset_is_snapshot(ds) && -- version < SPA_VERSION_SNAP_PROPS) { -- dsl_dataset_rele(ds, FTAG); -- return (ENOTSUP); -- } -+ nvlist_t *nvl = fnvlist_alloc(); -+ int error; - -- psa.psa_name = propname; -- psa.psa_source = source; -- psa.psa_intsz = intsz; -- psa.psa_numints = numints; -- psa.psa_value = buf; -- psa.psa_effective_value = -1ULL; -- -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- NULL, dsl_prop_set_sync, ds, &psa, 2); -- -- dsl_dataset_rele(ds, FTAG); -- return (err); -+ fnvlist_add_boolean(nvl, propname); -+ error = dsl_props_set(dsname, source, nvl); -+ fnvlist_free(nvl); -+ return (error); - } - --int --dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) -+typedef struct dsl_props_set_arg { -+ const char *dpsa_dsname; -+ zprop_source_t dpsa_source; -+ nvlist_t *dpsa_props; -+} dsl_props_set_arg_t; -+ -+static int -+dsl_props_set_check(void *arg, dmu_tx_t *tx) - { -+ dsl_props_set_arg_t *dpsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; -@@ -829,19 +738,16 @@ dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) - nvpair_t *elem = NULL; -- dsl_props_arg_t pa; - int err; - -- if ((err = dsl_dataset_hold(dsname, FTAG, &ds))) -+ err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); -+ if (err != 0) - return (err); -- /* -- * Do these checks before the syncfunc, since it can't fail. -- */ -+ - version = spa_version(ds->ds_dir->dd_pool->dp_spa); -- while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { -+ while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { - if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { - dsl_dataset_rele(ds, FTAG); -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - } - if (nvpair_type(elem) == DATA_TYPE_STRING) { -- char *valstr; -- VERIFY(nvpair_value_string(elem, &valstr) == 0); -+ char *valstr = fnvpair_value_string(elem); - if (strlen(valstr) >= (version < -@@ -855,16 +761,79 @@ dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) - -- if (dsl_dataset_is_snapshot(ds) && -- version < SPA_VERSION_SNAP_PROPS) { -+ if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) { - dsl_dataset_rele(ds, FTAG); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -+ dsl_dataset_rele(ds, FTAG); -+ return (0); -+} -+ -+void -+dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, -+ nvlist_t *props, dmu_tx_t *tx) -+{ -+ nvpair_t *elem = NULL; -+ -+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { -+ nvpair_t *pair = elem; - -- pa.pa_props = props; -- pa.pa_source = source; -+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) { -+ /* -+ * dsl_prop_get_all_impl() returns properties in this -+ * format. 
-+ */ -+ nvlist_t *attrs = fnvpair_value_nvlist(pair); -+ pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); -+ } - -- err = dsl_sync_task_do(ds->ds_dir->dd_pool, -- NULL, dsl_props_set_sync, ds, &pa, 2); -+ if (nvpair_type(pair) == DATA_TYPE_STRING) { -+ const char *value = fnvpair_value_string(pair); -+ dsl_prop_set_sync_impl(ds, nvpair_name(pair), -+ source, 1, strlen(value) + 1, value, tx); -+ } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { -+ uint64_t intval = fnvpair_value_uint64(pair); -+ dsl_prop_set_sync_impl(ds, nvpair_name(pair), -+ source, sizeof (intval), 1, &intval, tx); -+ } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { -+ dsl_prop_set_sync_impl(ds, nvpair_name(pair), -+ source, 0, 0, NULL, tx); -+ } else { -+ panic("invalid nvpair type"); -+ } -+ } -+} -+ -+static void -+dsl_props_set_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_props_set_arg_t *dpsa = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ dsl_dataset_t *ds; - -+ VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); -+ dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); - dsl_dataset_rele(ds, FTAG); -- return (err); -+} -+ -+/* -+ * All-or-nothing; if any prop can't be set, nothing will be modified. -+ */ -+int -+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) -+{ -+ dsl_props_set_arg_t dpsa; -+ int nblks = 0; -+ -+ dpsa.dpsa_dsname = dsname; -+ dpsa.dpsa_source = source; -+ dpsa.dpsa_props = props; -+ -+ /* -+ * If the source includes NONE, then we will only be removing entries -+ * from the ZAP object. In that case don't check for ENOSPC. -+ */ -+ if ((source & ZPROP_SRC_NONE) == 0) -+ nblks = 2 * fnvlist_num_pairs(props); -+ -+ return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, -+ &dpsa, nblks)); - } -@@ -1018,3 +987,3 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, - -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ ASSERT(dsl_pool_config_held(dp)); - -@@ -1043,3 +1012,2 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, - out: -- rw_exit(&dp->dp_config_rwlock); - return (err); -@@ -1048,29 +1016,24 @@ out: - boolean_t --dsl_prop_get_hasrecvd(objset_t *os) -+dsl_prop_get_hasrecvd(const char *dsname) - { -- dsl_dataset_t *ds = os->os_dsl_dataset; -- int rc; - uint64_t dummy; - -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL); -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); -- ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); -- return (rc == 0); -+ return (0 == -+ dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); - } - --static void --dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) -+static int -+dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) - { -- dsl_dataset_t *ds = os->os_dsl_dataset; -- uint64_t dummy = 0; -- dsl_prop_setarg_t psa; -- -- if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS) -- return; -+ uint64_t version; -+ spa_t *spa; -+ int error = 0; - -- dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy); -+ VERIFY0(spa_open(dsname, &spa, FTAG)); -+ version = spa_version(spa); -+ spa_close(spa, FTAG); - -- (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, -- dsl_prop_set_sync, ds, &psa, 2); -+ if (version >= SPA_VERSION_RECVD_PROPS) -+ error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); -+ return (error); - } -@@ -1081,10 +1044,9 @@ dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) - */ --void 
--dsl_prop_set_hasrecvd(objset_t *os) -+int -+dsl_prop_set_hasrecvd(const char *dsname) - { -- if (dsl_prop_get_hasrecvd(os)) { -- ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); -- return; -- } -- dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL); -+ int error = 0; -+ if (!dsl_prop_get_hasrecvd(dsname)) -+ error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); -+ return (error); - } -@@ -1092,5 +1054,5 @@ dsl_prop_set_hasrecvd(objset_t *os) - void --dsl_prop_unset_hasrecvd(objset_t *os) -+dsl_prop_unset_hasrecvd(const char *dsname) - { -- dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE); -+ VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); - } -@@ -1104,4 +1066,7 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) - int --dsl_prop_get_received(objset_t *os, nvlist_t **nvp) -+dsl_prop_get_received(const char *dsname, nvlist_t **nvp) - { -+ objset_t *os; -+ int error; -+ - /* -@@ -1111,5 +1076,11 @@ dsl_prop_get_received(objset_t *os, nvlist_t **nvp) - */ -- dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ? -+ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? - DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); -- return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags)); -+ -+ error = dmu_objset_hold(dsname, FTAG, &os); -+ if (error != 0) -+ return (error); -+ error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); -+ dmu_objset_rele(os, FTAG); -+ return (error); - } -@@ -1159,4 +1130,2 @@ EXPORT_SYMBOL(dsl_prop_register); - EXPORT_SYMBOL(dsl_prop_unregister); --EXPORT_SYMBOL(dsl_prop_numcb); --EXPORT_SYMBOL(dsl_prop_set); - EXPORT_SYMBOL(dsl_prop_get); -@@ -1166,3 +1135,9 @@ EXPORT_SYMBOL(dsl_prop_get_received); - EXPORT_SYMBOL(dsl_prop_get_ds); -+EXPORT_SYMBOL(dsl_prop_get_int_ds); - EXPORT_SYMBOL(dsl_prop_get_dd); -+EXPORT_SYMBOL(dsl_props_set); -+EXPORT_SYMBOL(dsl_prop_set_int); -+EXPORT_SYMBOL(dsl_prop_set_string); -+EXPORT_SYMBOL(dsl_prop_inherit); -+EXPORT_SYMBOL(dsl_prop_predict); - EXPORT_SYMBOL(dsl_prop_nvlist_add_uint64); -diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c -index 34a4f03..7807f84 100644 ---- a/module/zfs/dsl_scan.c -+++ b/module/zfs/dsl_scan.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -55,3 +55,3 @@ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); - static scan_cb_t dsl_scan_scrub_cb; --static dsl_syncfunc_t dsl_scan_cancel_sync; -+static void dsl_scan_cancel_sync(void *, dmu_tx_t *); - static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); -@@ -93,2 +93,11 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) - -+ /* -+ * It's possible that we're resuming a scan after a reboot so -+ * make sure that the scan_async_destroying flag is initialized -+ * appropriately. -+ */ -+ ASSERT(!scn->scn_async_destroying); -+ scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, -+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]); -+ - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, -@@ -116,2 +125,38 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) - &scn->scn_phys); -+ /* -+ * Detect if the pool contains the signature of #2094. If it -+ * does properly update the scn->scn_phys structure and notify -+ * the administrator by setting an errata for the pool. 
-+ */ -+ if (err == EOVERFLOW) { -+ uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1]; -+ VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24); -+ VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==, -+ (23 * sizeof (uint64_t))); -+ -+ err = zap_lookup(dp->dp_meta_objset, -+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, -+ sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp); -+ if (err == 0) { -+ uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS]; -+ -+ if (overflow & ~DSL_SCAN_FLAGS_MASK || -+ scn->scn_async_destroying) { -+ spa->spa_errata = -+ ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY; -+ return (EOVERFLOW); -+ } -+ -+ bcopy(zaptmp, &scn->scn_phys, -+ SCAN_PHYS_NUMINTS * sizeof (uint64_t)); -+ scn->scn_phys.scn_flags = overflow; -+ -+ /* Required scrub already in progress. */ -+ if (scn->scn_phys.scn_state == DSS_FINISHED || -+ scn->scn_phys.scn_state == DSS_CANCELED) -+ spa->spa_errata = -+ ZPOOL_ERRATA_ZOL_2094_SCRUB; -+ } -+ } -+ - if (err == ENOENT) -@@ -152,8 +197,8 @@ dsl_scan_fini(dsl_pool_t *dp) - static int --dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_scan_setup_check(void *arg, dmu_tx_t *tx) - { -- dsl_scan_t *scn = arg1; -+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - if (scn->scn_phys.scn_state == DSS_SCANNING) -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - -@@ -162,8 +207,7 @@ dsl_scan_setup_check(void *arg1, void *arg2, dmu_tx_t *tx) - --/* ARGSUSED */ - static void --dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) - { -- dsl_scan_t *scn = arg1; -- pool_scan_func_t *funcp = arg2; -+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; -+ pool_scan_func_t *funcp = arg; - dmu_object_type_t ot = 0; -@@ -184,2 +228,3 @@ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) - scn->scn_restart_txg = 0; -+ scn->scn_done_txg = 0; - spa_scan_stat_init(spa); -@@ -194,5 +239,7 @@ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) - &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { -- spa_event_notify(spa, NULL, FM_EREPORT_ZFS_RESILVER_START); -+ spa_event_notify(spa, NULL, -+ FM_EREPORT_ZFS_RESILVER_START); - } else { -- spa_event_notify(spa, NULL, FM_EREPORT_ZFS_SCRUB_START); -+ spa_event_notify(spa, NULL, -+ FM_EREPORT_ZFS_SCRUB_START); - } -@@ -226,3 +273,3 @@ dsl_scan_setup_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- spa_history_log_internal(LOG_POOL_SCAN, spa, tx, -+ spa_history_log_internal(spa, "scan setup", tx, - "func=%u mintxg=%llu maxtxg=%llu", -@@ -275,3 +322,3 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) - -- spa_history_log_internal(LOG_POOL_SCAN_DONE, spa, tx, -+ spa_history_log_internal(spa, "scan done", tx, - "complete=%u", complete); -@@ -310,2 +357,5 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) - scn->scn_phys.scn_end_time = gethrestime_sec(); -+ -+ if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) -+ spa->spa_errata = 0; - } -@@ -314,8 +364,8 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) - static int --dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) - { -- dsl_scan_t *scn = arg1; -+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - - if (scn->scn_phys.scn_state != DSS_SCANNING) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - return (0); -@@ -325,5 +375,5 @@ dsl_scan_cancel_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --dsl_scan_cancel_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) - { -- dsl_scan_t *scn = arg1; -+ dsl_scan_t 
*scn = dmu_tx_pool(tx)->dp_scan; - -@@ -336,8 +386,4 @@ dsl_scan_cancel(dsl_pool_t *dp) - { -- boolean_t complete = B_FALSE; -- int err; -- -- err = dsl_sync_task_do(dp, dsl_scan_cancel_check, -- dsl_scan_cancel_sync, dp->dp_scan, &complete, 3); -- return (err); -+ return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, -+ dsl_scan_cancel_sync, NULL, 3)); - } -@@ -377,3 +423,3 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) - { -- VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset, -+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, -@@ -407,3 +453,3 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb) - if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || -- (elapsed_nanosecs / MICROSEC > mintime && -+ (NSEC2MSEC(elapsed_nanosecs) > mintime && - txg_sync_waiting(scn->scn_dp)) || -@@ -774,3 +820,3 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, - */ -- if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) { -+ if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { - scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); -@@ -780,3 +826,3 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb, - out: -- kmem_free(bp_toread, sizeof(blkptr_t)); -+ kmem_free(bp_toread, sizeof (blkptr_t)); - } -@@ -961,3 +1007,3 @@ struct enqueue_clones_arg { - static int --enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -+enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) - { -@@ -966,6 +1012,8 @@ enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - int err; -- dsl_pool_t *dp = spa->spa_dsl_pool; - dsl_scan_t *scn = dp->dp_scan; - -- err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); -+ if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj) -+ return (0); -+ -+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) -@@ -973,17 +1021,15 @@ enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - -- if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { -- while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { -- dsl_dataset_t *prev; -- err = dsl_dataset_hold_obj(dp, -- ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); -+ while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { -+ dsl_dataset_t *prev; -+ err = dsl_dataset_hold_obj(dp, -+ ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - -- dsl_dataset_rele(ds, FTAG); -- if (err) -- return (err); -- ds = prev; -- } -- VERIFY(zap_add_int_key(dp->dp_meta_objset, -- scn->scn_phys.scn_queue_obj, ds->ds_object, -- ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); -+ dsl_dataset_rele(ds, FTAG); -+ if (err) -+ return (err); -+ ds = prev; - } -+ VERIFY(zap_add_int_key(dp->dp_meta_objset, -+ scn->scn_phys.scn_queue_obj, ds->ds_object, -+ ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0); - dsl_dataset_rele(ds, FTAG); -@@ -1077,6 +1123,6 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) - if (usenext) { -- VERIFY(zap_join_key(dp->dp_meta_objset, -+ VERIFY0(zap_join_key(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, - scn->scn_phys.scn_queue_obj, -- ds->ds_phys->ds_creation_txg, tx) == 0); -+ ds->ds_phys->ds_creation_txg, tx)); - } else { -@@ -1086,4 +1132,4 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) - -- (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, -- NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); -+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, -+ enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); - } -@@ -1097,3 +1143,3 @@ out: - static int 
--enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -+enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) - { -@@ -1102,6 +1148,5 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) - int err; -- dsl_pool_t *dp = spa->spa_dsl_pool; - dsl_scan_t *scn = dp->dp_scan; - -- err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); -+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); - if (err) -@@ -1226,3 +1271,3 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - if (ddp->ddp_phys_birth == 0 || -- ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg) -+ ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) - continue; -@@ -1263,4 +1308,4 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { -- VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, -- NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); -+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, -+ enqueue_cb, tx, DS_FIND_CHILDREN)); - } else { -@@ -1288,4 +1333,4 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) - bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t)); -- zc = kmem_alloc(sizeof(zap_cursor_t), KM_PUSHPAGE); -- za = kmem_alloc(sizeof(zap_attribute_t), KM_PUSHPAGE); -+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_PUSHPAGE); -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); - -@@ -1323,4 +1368,4 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) - out: -- kmem_free(za, sizeof(zap_attribute_t)); -- kmem_free(zc, sizeof(zap_cursor_t)); -+ kmem_free(za, sizeof (zap_attribute_t)); -+ kmem_free(zc, sizeof (zap_cursor_t)); - } -@@ -1332,5 +1377,8 @@ dsl_scan_free_should_pause(dsl_scan_t *scn) - -+ if (zfs_recover) -+ return (B_FALSE); -+ - elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; - return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || -- (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && -+ (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && - txg_sync_waiting(scn->scn_dp)) || -@@ -1347,3 +1395,3 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - if (dsl_scan_free_should_pause(scn)) -- return (ERESTART); -+ return (SET_ERROR(ERESTART)); - } -@@ -1370,9 +1418,6 @@ dsl_scan_active(dsl_scan_t *scn) - -- if (scn->scn_phys.scn_state == DSS_SCANNING) -+ if (scn->scn_phys.scn_state == DSS_SCANNING || -+ scn->scn_async_destroying) - return (B_TRUE); - -- if (spa_feature_is_active(spa, -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { -- return (B_TRUE); -- } - if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { -@@ -1404,3 +1449,3 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - func, tx->tx_txg); -- dsl_scan_setup_sync(scn, &func, tx); -+ dsl_scan_setup_sync(&func, tx); - } -@@ -1432,2 +1477,3 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { -+ ASSERT(scn->scn_async_destroying); - scn->scn_is_bptree = B_TRUE; -@@ -1438,17 +1484,18 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - scn, tx); -- VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); -- if (err != 0) -- return; -- -- /* disable async destroy feature */ -- spa_feature_decr(spa, -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); -- ASSERT(!spa_feature_is_active(spa, -- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); -- VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, -- DMU_POOL_DIRECTORY_OBJECT, -- DMU_POOL_BPTREE_OBJ, tx)); -- VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, -- dp->dp_bptree_obj, tx)); -- dp->dp_bptree_obj = 0; -+ VERIFY0(zio_wait(scn->scn_zio_root)); -+ 
-+ if (err == 0) { -+ zfeature_info_t *feat = &spa_feature_table -+ [SPA_FEATURE_ASYNC_DESTROY]; -+ /* finished; deactivate async destroy feature */ -+ spa_feature_decr(spa, feat, tx); -+ ASSERT(!spa_feature_is_active(spa, feat)); -+ VERIFY0(zap_remove(dp->dp_meta_objset, -+ DMU_POOL_DIRECTORY_OBJECT, -+ DMU_POOL_BPTREE_OBJ, tx)); -+ VERIFY0(bptree_free(dp->dp_meta_objset, -+ dp->dp_bptree_obj, tx)); -+ dp->dp_bptree_obj = 0; -+ scn->scn_async_destroying = B_FALSE; -+ } - } -@@ -1459,3 +1506,3 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - (longlong_t) -- (gethrtime() - scn->scn_sync_start_time) / MICROSEC, -+ NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), - (longlong_t)tx->tx_txg); -@@ -1475,2 +1522,12 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - -+ if (scn->scn_done_txg == tx->tx_txg) { -+ ASSERT(!scn->scn_pausing); -+ /* finished with scan. */ -+ zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); -+ dsl_scan_done(scn, B_TRUE, tx); -+ ASSERT3U(spa->spa_scrub_inflight, ==, 0); -+ dsl_scan_sync_state(scn, tx); -+ return; -+ } -+ - if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= -@@ -1499,3 +1556,5 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - NULL, ZIO_FLAG_CANFAIL); -+ dsl_pool_config_enter(dp, FTAG); - dsl_scan_visit(scn, tx); -+ dsl_pool_config_exit(dp, FTAG); - (void) zio_wait(scn->scn_zio_root); -@@ -1505,8 +1564,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) - (longlong_t)scn->scn_visited_this_txg, -- (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC); -+ (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); - - if (!scn->scn_pausing) { -- /* finished with scan. */ -- zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg); -- dsl_scan_done(scn, B_TRUE, tx); -+ scn->scn_done_txg = tx->tx_txg + 1; -+ zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", -+ tx->tx_txg, scn->scn_done_txg); - } -@@ -1634,3 +1693,2 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, - int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; -- int zio_priority = 0; - int scan_delay = 0; -@@ -1647,8 +1705,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, - zio_flags |= ZIO_FLAG_SCRUB; -- zio_priority = ZIO_PRIORITY_SCRUB; - needs_io = B_TRUE; - scan_delay = zfs_scrub_delay; -- } else if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { -+ } else { -+ ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); - zio_flags |= ZIO_FLAG_RESILVER; -- zio_priority = ZIO_PRIORITY_RESILVER; - needs_io = B_FALSE; -@@ -1710,3 +1767,3 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, - zio_nowait(zio_read(NULL, spa, bp, data, size, -- dsl_scan_scrub_done, NULL, zio_priority, -+ dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, - zio_flags, zb)); -@@ -1736,4 +1793,4 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) - -- return (dsl_sync_task_do(dp, dsl_scan_setup_check, -- dsl_scan_setup_sync, dp->dp_scan, &func, 0)); -+ return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, -+ dsl_scan_setup_sync, &func, 0)); - } -diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c -index 75eb507..5f345f4 100644 ---- a/module/zfs/dsl_synctask.c -+++ b/module/zfs/dsl_synctask.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -35,3 +36,3 @@ - static int --dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) -+dsl_null_checkfunc(void *arg, dmu_tx_t *tx) - { -@@ -40,78 +41,64 @@ dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) - --dsl_sync_task_group_t * --dsl_sync_task_group_create(dsl_pool_t *dp) --{ -- dsl_sync_task_group_t *dstg; -- -- dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP); -- list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), -- offsetof(dsl_sync_task_t, dst_node)); -- dstg->dstg_pool = dp; -- -- return (dstg); --} -- --void --dsl_sync_task_create(dsl_sync_task_group_t *dstg, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified) --{ -- dsl_sync_task_t *dst; -- -- if (checkfunc == NULL) -- checkfunc = dsl_null_checkfunc; -- dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP); -- dst->dst_checkfunc = checkfunc; -- dst->dst_syncfunc = syncfunc; -- dst->dst_arg1 = arg1; -- dst->dst_arg2 = arg2; -- list_insert_tail(&dstg->dstg_tasks, dst); -- -- dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT; --} -- -+/* -+ * Called from open context to perform a callback in syncing context. Waits -+ * for the operation to complete. -+ * -+ * The checkfunc will be called from open context as a preliminary check -+ * which can quickly fail. If it succeeds, it will be called again from -+ * syncing context. The checkfunc should generally be designed to work -+ * properly in either context, but if necessary it can check -+ * dmu_tx_is_syncing(tx). -+ * -+ * The synctask infrastructure enforces proper locking strategy with respect -+ * to the dp_config_rwlock -- the lock will always be held when the callbacks -+ * are called. It will be held for read during the open-context (preliminary) -+ * call to the checkfunc, and then held for write from syncing context during -+ * the calls to the check and sync funcs. -+ * -+ * A dataset or pool name can be passed as the first argument. Typically, -+ * the check func will hold, check the return value of the hold, and then -+ * release the dataset. The sync func will VERIFYO(hold()) the dataset. -+ * This is safe because no changes can be made between the check and sync funcs, -+ * and the sync func will only be called if the check func successfully opened -+ * the dataset. -+ */ - int --dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) -+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, -+ dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified) - { -+ spa_t *spa; - dmu_tx_t *tx; -- uint64_t txg; -- dsl_sync_task_t *dst; -+ int err; -+ dsl_sync_task_t dst = { { { NULL } } }; -+ dsl_pool_t *dp; - --top: -- tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir); -- VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); -- -- txg = dmu_tx_get_txg(tx); -- -- /* Do a preliminary error check. */ -- dstg->dstg_err = 0; -- rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER); -- for (dst = list_head(&dstg->dstg_tasks); dst; -- dst = list_next(&dstg->dstg_tasks, dst)) { --#ifdef ZFS_DEBUG -- /* -- * Only check half the time, otherwise, the sync-context -- * check will almost never fail. 
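As a concrete illustration of the check/sync contract described above, here is a minimal sketch of a caller. It is an editorial example, not part of the patch: the example_* names, the validated condition, and the blocks_modified value of 1 are hypothetical, and the include list is abbreviated; FTAG and the dsl_* calls follow the conventions of the surrounding hunks. It mirrors the dsl_props_set_check()/dsl_props_set_sync() pattern used elsewhere in this patch: the check function holds and validates the dataset, and the sync function re-holds it under VERIFY0() because the check has already guaranteed success.

#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_synctask.h>

typedef struct example_arg {
	const char *ea_dsname;		/* dataset to operate on (hypothetical) */
} example_arg_t;

static int
example_check(void *arg, dmu_tx_t *tx)
{
	example_arg_t *ea = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;

	error = dsl_dataset_hold(dp, ea->ea_dsname, FTAG, &ds);
	if (error != 0)
		return (error);
	/* ... verify the operation can succeed ... */
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
example_sync(void *arg, dmu_tx_t *tx)
{
	example_arg_t *ea = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;

	/* Cannot fail: the check func already validated this hold. */
	VERIFY0(dsl_dataset_hold(dp, ea->ea_dsname, FTAG, &ds));
	/* ... apply the change in syncing context ... */
	dsl_dataset_rele(ds, FTAG);
}

/* Open-context entry point; blocks until the sync task has run. */
static int
example_do(const char *dsname)
{
	example_arg_t ea;

	ea.ea_dsname = dsname;
	return (dsl_sync_task(dsname, example_check, example_sync, &ea, 1));
}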
-- */ -- if (spa_get_random(2) == 0) -- continue; --#endif -- dst->dst_err = -- dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); -- if (dst->dst_err) -- dstg->dstg_err = dst->dst_err; -- } -- rw_exit(&dstg->dstg_pool->dp_config_rwlock); -+ err = spa_open(pool, &spa, FTAG); -+ if (err != 0) -+ return (err); -+ dp = spa_get_dsl(spa); - -- if (dstg->dstg_err) { -+top: -+ tx = dmu_tx_create_dd(dp->dp_mos_dir); -+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); -+ -+ dst.dst_pool = dp; -+ dst.dst_txg = dmu_tx_get_txg(tx); -+ dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT; -+ dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc; -+ dst.dst_syncfunc = syncfunc; -+ dst.dst_arg = arg; -+ dst.dst_error = 0; -+ dst.dst_nowaiter = B_FALSE; -+ -+ dsl_pool_config_enter(dp, FTAG); -+ err = dst.dst_checkfunc(arg, tx); -+ dsl_pool_config_exit(dp, FTAG); -+ -+ if (err != 0) { - dmu_tx_commit(tx); -- return (dstg->dstg_err); -+ spa_close(spa, FTAG); -+ return (err); - } - -- /* -- * We don't generally have many sync tasks, so pay the price of -- * add_tail to get the tasks executed in the right order. -- */ -- VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, -- dstg, txg)); -+ VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg)); - -@@ -119,6 +106,6 @@ top: - -- txg_wait_synced(dstg->dstg_pool, txg); -+ txg_wait_synced(dp, dst.dst_txg); - -- if (dstg->dstg_err == EAGAIN) { -- txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE); -+ if (dst.dst_error == EAGAIN) { -+ txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); - goto top; -@@ -126,3 +113,4 @@ top: - -- return (dstg->dstg_err); -+ spa_close(spa, FTAG); -+ return (dst.dst_error); - } -@@ -130,36 +118,29 @@ top: - void --dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) -+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, -+ int blocks_modified, dmu_tx_t *tx) - { -- uint64_t txg; -- -- dstg->dstg_nowaiter = B_TRUE; -- txg = dmu_tx_get_txg(tx); -- /* -- * We don't generally have many sync tasks, so pay the price of -- * add_tail to get the tasks executed in the right order. -- */ -- VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks, -- dstg, txg)); --} -+ dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); - --void --dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) --{ -- dsl_sync_task_t *dst; -+ dst->dst_pool = dp; -+ dst->dst_txg = dmu_tx_get_txg(tx); -+ dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; -+ dst->dst_checkfunc = dsl_null_checkfunc; -+ dst->dst_syncfunc = syncfunc; -+ dst->dst_arg = arg; -+ dst->dst_error = 0; -+ dst->dst_nowaiter = B_TRUE; - -- while ((dst = list_head(&dstg->dstg_tasks))) { -- list_remove(&dstg->dstg_tasks, dst); -- kmem_free(dst, sizeof (dsl_sync_task_t)); -- } -- kmem_free(dstg, sizeof (dsl_sync_task_group_t)); -+ VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg)); - } - -+/* -+ * Called in syncing context to execute the synctask. -+ */ - void --dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) -+dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) - { -- dsl_sync_task_t *dst; -- dsl_pool_t *dp = dstg->dstg_pool; -+ dsl_pool_t *dp = dst->dst_pool; - uint64_t quota, used; - -- ASSERT0(dstg->dstg_err); -+ ASSERT0(dst->dst_error); - -@@ -175,4 +156,6 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) - /* MOS space is triple-dittoed, so we multiply by 3. 
*/ -- if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) { -- dstg->dstg_err = ENOSPC; -+ if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) { -+ dst->dst_error = SET_ERROR(ENOSPC); -+ if (dst->dst_nowaiter) -+ kmem_free(dst, sizeof (*dst)); - return; -@@ -181,60 +164,11 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) - /* -- * Check for errors by calling checkfuncs. -+ * Check for errors by calling checkfunc. - */ -- rw_enter(&dp->dp_config_rwlock, RW_WRITER); -- for (dst = list_head(&dstg->dstg_tasks); dst; -- dst = list_next(&dstg->dstg_tasks, dst)) { -- dst->dst_err = -- dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); -- if (dst->dst_err) -- dstg->dstg_err = dst->dst_err; -- } -- -- if (dstg->dstg_err == 0) { -- /* -- * Execute sync tasks. -- */ -- for (dst = list_head(&dstg->dstg_tasks); dst; -- dst = list_next(&dstg->dstg_tasks, dst)) { -- dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx); -- } -- } -- rw_exit(&dp->dp_config_rwlock); -- -- if (dstg->dstg_nowaiter) -- dsl_sync_task_group_destroy(dstg); --} -- --int --dsl_sync_task_do(dsl_pool_t *dp, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified) --{ -- dsl_sync_task_group_t *dstg; -- int err; -- -- ASSERT(spa_writeable(dp->dp_spa)); -- -- dstg = dsl_sync_task_group_create(dp); -- dsl_sync_task_create(dstg, checkfunc, syncfunc, -- arg1, arg2, blocks_modified); -- err = dsl_sync_task_group_wait(dstg); -- dsl_sync_task_group_destroy(dstg); -- return (err); --} -- --void --dsl_sync_task_do_nowait(dsl_pool_t *dp, -- dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, -- void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) --{ -- dsl_sync_task_group_t *dstg; -- -- if (!spa_writeable(dp->dp_spa)) -- return; -- -- dstg = dsl_sync_task_group_create(dp); -- dsl_sync_task_create(dstg, checkfunc, syncfunc, -- arg1, arg2, blocks_modified); -- dsl_sync_task_group_nowait(dstg, tx); -+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); -+ dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx); -+ if (dst->dst_error == 0) -+ dst->dst_syncfunc(dst->dst_arg, tx); -+ rrw_exit(&dp->dp_config_rwlock, FTAG); -+ if (dst->dst_nowaiter) -+ kmem_free(dst, sizeof (*dst)); - } -@@ -242,4 +176,4 @@ dsl_sync_task_do_nowait(dsl_pool_t *dp, - #if defined(_KERNEL) && defined(HAVE_SPL) --EXPORT_SYMBOL(dsl_sync_task_do); --EXPORT_SYMBOL(dsl_sync_task_do_nowait); -+EXPORT_SYMBOL(dsl_sync_task); -+EXPORT_SYMBOL(dsl_sync_task_nowait); - #endif -diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c -new file mode 100644 -index 0000000..e24ed64 ---- /dev/null -+++ b/module/zfs/dsl_userhold.c -@@ -0,0 +1,675 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+/* -+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+typedef struct dsl_dataset_user_hold_arg { -+ nvlist_t *dduha_holds; -+ nvlist_t *dduha_chkholds; -+ nvlist_t *dduha_errlist; -+ minor_t dduha_minor; -+} dsl_dataset_user_hold_arg_t; -+ -+/* -+ * If you add new checks here, you may need to add additional checks to the -+ * "temporary" case in snapshot_check() in dmu_objset.c. -+ */ -+int -+dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, -+ boolean_t temphold, dmu_tx_t *tx) -+{ -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ objset_t *mos = dp->dp_meta_objset; -+ int error = 0; -+ -+ ASSERT(dsl_pool_config_held(dp)); -+ -+ if (strlen(htag) > MAXNAMELEN) -+ return (SET_ERROR(E2BIG)); -+ /* Tempholds have a more restricted length */ -+ if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) -+ return (SET_ERROR(E2BIG)); -+ -+ /* tags must be unique (if ds already exists) */ -+ if (ds != NULL && ds->ds_phys->ds_userrefs_obj != 0) { -+ uint64_t value; -+ -+ error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, -+ htag, 8, 1, &value); -+ if (error == 0) -+ error = SET_ERROR(EEXIST); -+ else if (error == ENOENT) -+ error = 0; -+ } -+ -+ return (error); -+} -+ -+static int -+dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_user_hold_arg_t *dduha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ -+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) -+ return (SET_ERROR(ENOTSUP)); -+ -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); -+ -+ for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { -+ dsl_dataset_t *ds; -+ int error = 0; -+ char *htag, *name; -+ -+ /* must be a snapshot */ -+ name = nvpair_name(pair); -+ if (strchr(name, '@') == NULL) -+ error = SET_ERROR(EINVAL); -+ -+ if (error == 0) -+ error = nvpair_value_string(pair, &htag); -+ -+ if (error == 0) -+ error = dsl_dataset_hold(dp, name, FTAG, &ds); -+ -+ if (error == 0) { -+ error = dsl_dataset_user_hold_check_one(ds, htag, -+ dduha->dduha_minor != 0, tx); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ -+ if (error == 0) { -+ fnvlist_add_string(dduha->dduha_chkholds, name, htag); -+ } else { -+ /* -+ * We register ENOENT errors so they can be correctly -+ * reported if needed, such as when all holds fail. -+ */ -+ fnvlist_add_int32(dduha->dduha_errlist, name, error); -+ if (error != ENOENT) -+ return (error); -+ } -+ } -+ -+ return (0); -+} -+ -+ -+static void -+dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, -+ const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) -+{ -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ objset_t *mos = dp->dp_meta_objset; -+ uint64_t zapobj; -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ if (ds->ds_phys->ds_userrefs_obj == 0) { -+ /* -+ * This is the first user hold for this dataset. Create -+ * the userrefs zap object. 
-+ */ -+ dmu_buf_will_dirty(ds->ds_dbuf, tx); -+ zapobj = ds->ds_phys->ds_userrefs_obj = -+ zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); -+ } else { -+ zapobj = ds->ds_phys->ds_userrefs_obj; -+ } -+ ds->ds_userrefs++; -+ -+ VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); -+ -+ if (minor != 0) { -+ char name[MAXNAMELEN]; -+ nvlist_t *tags; -+ -+ VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, -+ htag, now, tx)); -+ (void) snprintf(name, sizeof (name), "%llx", -+ (u_longlong_t)ds->ds_object); -+ -+ if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) { -+ VERIFY0(nvlist_alloc(&tags, NV_UNIQUE_NAME, -+ KM_PUSHPAGE)); -+ fnvlist_add_boolean(tags, htag); -+ fnvlist_add_nvlist(tmpholds, name, tags); -+ fnvlist_free(tags); -+ } else { -+ fnvlist_add_boolean(tags, htag); -+ } -+ } -+ -+ spa_history_log_internal_ds(ds, "hold", tx, -+ "tag=%s temp=%d refs=%llu", -+ htag, minor != 0, ds->ds_userrefs); -+} -+ -+typedef struct zfs_hold_cleanup_arg { -+ char zhca_spaname[MAXNAMELEN]; -+ uint64_t zhca_spa_load_guid; -+ nvlist_t *zhca_holds; -+} zfs_hold_cleanup_arg_t; -+ -+static void -+dsl_dataset_user_release_onexit(void *arg) -+{ -+ zfs_hold_cleanup_arg_t *ca = arg; -+ spa_t *spa; -+ int error; -+ -+ error = spa_open(ca->zhca_spaname, &spa, FTAG); -+ if (error != 0) { -+ zfs_dbgmsg("couldn't release holds on pool=%s " -+ "because pool is no longer loaded", -+ ca->zhca_spaname); -+ return; -+ } -+ if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { -+ zfs_dbgmsg("couldn't release holds on pool=%s " -+ "because pool is no longer loaded (guid doesn't match)", -+ ca->zhca_spaname); -+ spa_close(spa, FTAG); -+ return; -+ } -+ -+ (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds); -+ fnvlist_free(ca->zhca_holds); -+ kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); -+ spa_close(spa, FTAG); -+} -+ -+static void -+dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor) -+{ -+ zfs_hold_cleanup_arg_t *ca; -+ -+ if (minor == 0 || nvlist_empty(holds)) { -+ fnvlist_free(holds); -+ return; -+ } -+ -+ ASSERT(spa != NULL); -+ ca = kmem_alloc(sizeof (*ca), KM_PUSHPAGE); -+ -+ (void) strlcpy(ca->zhca_spaname, spa_name(spa), -+ sizeof (ca->zhca_spaname)); -+ ca->zhca_spa_load_guid = spa_load_guid(spa); -+ ca->zhca_holds = holds; -+ VERIFY0(zfs_onexit_add_cb(minor, -+ dsl_dataset_user_release_onexit, ca, NULL)); -+} -+ -+void -+dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, -+ minor_t minor, uint64_t now, dmu_tx_t *tx) -+{ -+ nvlist_t *tmpholds; -+ -+ if (minor != 0) -+ VERIFY0(nvlist_alloc(&tmpholds, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ else -+ tmpholds = NULL; -+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx); -+ dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor); -+} -+ -+static void -+dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_user_hold_arg_t *dduha = arg; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvlist_t *tmpholds; -+ nvpair_t *pair; -+ uint64_t now = gethrestime_sec(); -+ -+ if (dduha->dduha_minor != 0) -+ VERIFY0(nvlist_alloc(&tmpholds, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ else -+ tmpholds = NULL; -+ for (pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL); -+ pair != NULL; -+ pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) { -+ dsl_dataset_t *ds; -+ -+ VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); -+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, -+ fnvpair_value_string(pair), dduha->dduha_minor, now, tx); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ 
dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor); -+} -+ -+/* -+ * The full semantics of this function are described in the comment above -+ * lzc_hold(). -+ * -+ * To summarize: -+ * holds is nvl of snapname -> holdname -+ * errlist will be filled in with snapname -> error -+ * -+ * The snaphosts must all be in the same pool. -+ * -+ * Holds for snapshots that don't exist will be skipped. -+ * -+ * If none of the snapshots for requested holds exist then ENOENT will be -+ * returned. -+ * -+ * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned -+ * up when the process exits. -+ * -+ * On success all the holds, for snapshots that existed, will be created and 0 -+ * will be returned. -+ * -+ * On failure no holds will be created, the errlist will be filled in, -+ * and an errno will returned. -+ * -+ * In all cases the errlist will contain entries for holds where the snapshot -+ * didn't exist. -+ */ -+int -+dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) -+{ -+ dsl_dataset_user_hold_arg_t dduha; -+ nvpair_t *pair; -+ int ret; -+ -+ pair = nvlist_next_nvpair(holds, NULL); -+ if (pair == NULL) -+ return (0); -+ -+ dduha.dduha_holds = holds; -+ VERIFY0(nvlist_alloc(&dduha.dduha_chkholds, NV_UNIQUE_NAME, -+ KM_PUSHPAGE)); -+ dduha.dduha_errlist = errlist; -+ dduha.dduha_minor = cleanup_minor; -+ -+ ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, -+ dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds)); -+ fnvlist_free(dduha.dduha_chkholds); -+ -+ return (ret); -+} -+ -+typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, -+ dsl_dataset_t **dsp); -+ -+typedef struct dsl_dataset_user_release_arg { -+ dsl_holdfunc_t *ddura_holdfunc; -+ nvlist_t *ddura_holds; -+ nvlist_t *ddura_todelete; -+ nvlist_t *ddura_errlist; -+ nvlist_t *ddura_chkholds; -+} dsl_dataset_user_release_arg_t; -+ -+/* Place a dataset hold on the snapshot identified by passed dsobj string */ -+static int -+dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, -+ dsl_dataset_t **dsp) -+{ -+ return (dsl_dataset_hold_obj(dp, strtonum(dsobj, NULL), tag, dsp)); -+} -+ -+static int -+dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, -+ dsl_dataset_t *ds, nvlist_t *holds, const char *snapname) -+{ -+ uint64_t zapobj; -+ nvlist_t *holds_found; -+ nvpair_t *pair; -+ objset_t *mos; -+ int numholds; -+ -+ if (!dsl_dataset_is_snapshot(ds)) -+ return (SET_ERROR(EINVAL)); -+ -+ if (nvlist_empty(holds)) -+ return (0); -+ -+ numholds = 0; -+ mos = ds->ds_dir->dd_pool->dp_meta_objset; -+ zapobj = ds->ds_phys->ds_userrefs_obj; -+ VERIFY0(nvlist_alloc(&holds_found, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ uint64_t tmp; -+ int error; -+ const char *holdname = nvpair_name(pair); -+ -+ if (zapobj != 0) -+ error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp); -+ else -+ error = SET_ERROR(ENOENT); -+ -+ /* -+ * Non-existent holds are put on the errlist, but don't -+ * cause an overall failure. 
-+ */ -+ if (error == ENOENT) { -+ if (ddura->ddura_errlist != NULL) { -+ char *errtag = kmem_asprintf("%s#%s", -+ snapname, holdname); -+ fnvlist_add_int32(ddura->ddura_errlist, errtag, -+ ENOENT); -+ strfree(errtag); -+ } -+ continue; -+ } -+ -+ if (error != 0) { -+ fnvlist_free(holds_found); -+ return (error); -+ } -+ -+ fnvlist_add_boolean(holds_found, holdname); -+ numholds++; -+ } -+ -+ if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 && -+ ds->ds_userrefs == numholds) { -+ /* we need to destroy the snapshot as well */ -+ if (dsl_dataset_long_held(ds)) { -+ fnvlist_free(holds_found); -+ return (SET_ERROR(EBUSY)); -+ } -+ fnvlist_add_boolean(ddura->ddura_todelete, snapname); -+ } -+ -+ if (numholds != 0) { -+ fnvlist_add_nvlist(ddura->ddura_chkholds, snapname, -+ holds_found); -+ } -+ fnvlist_free(holds_found); -+ -+ return (0); -+} -+ -+static int -+dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_user_release_arg_t *ddura; -+ dsl_holdfunc_t *holdfunc; -+ dsl_pool_t *dp; -+ nvpair_t *pair; -+ -+ if (!dmu_tx_is_syncing(tx)) -+ return (0); -+ -+ dp = dmu_tx_pool(tx); -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ ddura = arg; -+ holdfunc = ddura->ddura_holdfunc; -+ -+ for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { -+ int error; -+ dsl_dataset_t *ds; -+ nvlist_t *holds; -+ const char *snapname = nvpair_name(pair); -+ -+ error = nvpair_value_nvlist(pair, &holds); -+ if (error != 0) -+ error = (SET_ERROR(EINVAL)); -+ else -+ error = holdfunc(dp, snapname, FTAG, &ds); -+ if (error == 0) { -+ error = dsl_dataset_user_release_check_one(ddura, ds, -+ holds, snapname); -+ dsl_dataset_rele(ds, FTAG); -+ } -+ if (error != 0) { -+ if (ddura->ddura_errlist != NULL) { -+ fnvlist_add_int32(ddura->ddura_errlist, -+ snapname, error); -+ } -+ /* -+ * Non-existent snapshots are put on the errlist, -+ * but don't cause an overall failure. -+ */ -+ if (error != ENOENT) -+ return (error); -+ } -+ } -+ -+ return (0); -+} -+ -+static void -+dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, -+ dmu_tx_t *tx) -+{ -+ dsl_pool_t *dp = ds->ds_dir->dd_pool; -+ objset_t *mos = dp->dp_meta_objset; -+ nvpair_t *pair; -+ -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ int error; -+ const char *holdname = nvpair_name(pair); -+ -+ /* Remove temporary hold if one exists. 
*/ -+ error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx); -+ VERIFY(error == 0 || error == ENOENT); -+ -+ VERIFY0(zap_remove(mos, ds->ds_phys->ds_userrefs_obj, holdname, -+ tx)); -+ ds->ds_userrefs--; -+ -+ spa_history_log_internal_ds(ds, "release", tx, -+ "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs); -+ } -+} -+ -+static void -+dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) -+{ -+ dsl_dataset_user_release_arg_t *ddura = arg; -+ dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc; -+ dsl_pool_t *dp = dmu_tx_pool(tx); -+ nvpair_t *pair; -+ -+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); -+ -+ for (pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL); -+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds, -+ pair)) { -+ dsl_dataset_t *ds; -+ const char *name = nvpair_name(pair); -+ -+ VERIFY0(holdfunc(dp, name, FTAG, &ds)); -+ -+ dsl_dataset_user_release_sync_one(ds, -+ fnvpair_value_nvlist(pair), tx); -+ if (nvlist_exists(ddura->ddura_todelete, name)) { -+ ASSERT(ds->ds_userrefs == 0 && -+ ds->ds_phys->ds_num_children == 1 && -+ DS_IS_DEFER_DESTROY(ds)); -+ dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); -+ } -+ dsl_dataset_rele(ds, FTAG); -+ } -+} -+ -+/* -+ * The full semantics of this function are described in the comment above -+ * lzc_release(). -+ * -+ * To summarize: -+ * Releases holds specified in the nvl holds. -+ * -+ * holds is nvl of snapname -> { holdname, ... } -+ * errlist will be filled in with snapname -> error -+ * -+ * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots, -+ * otherwise they should be the names of shapshots. -+ * -+ * As a release may cause snapshots to be destroyed this trys to ensure they -+ * aren't mounted. -+ * -+ * The release of non-existent holds are skipped. -+ * -+ * At least one hold must have been released for the this function to succeed -+ * and return 0. -+ */ -+static int -+dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, -+ dsl_pool_t *tmpdp) -+{ -+ dsl_dataset_user_release_arg_t ddura; -+ nvpair_t *pair; -+ char *pool; -+ int error; -+ -+ pair = nvlist_next_nvpair(holds, NULL); -+ if (pair == NULL) -+ return (0); -+ -+ /* -+ * The release may cause snapshots to be destroyed; make sure they -+ * are not mounted. -+ */ -+ if (tmpdp != NULL) { -+ /* Temporary holds are specified by dsobj string. */ -+ ddura.ddura_holdfunc = dsl_dataset_hold_obj_string; -+ pool = spa_name(tmpdp->dp_spa); -+#ifdef _KERNEL -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ dsl_dataset_t *ds; -+ -+ dsl_pool_config_enter(tmpdp, FTAG); -+ error = dsl_dataset_hold_obj_string(tmpdp, -+ nvpair_name(pair), FTAG, &ds); -+ if (error == 0) { -+ char name[MAXNAMELEN]; -+ dsl_dataset_name(ds, name); -+ dsl_pool_config_exit(tmpdp, FTAG); -+ dsl_dataset_rele(ds, FTAG); -+ (void) zfs_unmount_snap(name); -+ } else { -+ dsl_pool_config_exit(tmpdp, FTAG); -+ } -+ } -+#endif -+ } else { -+ /* Non-temporary holds are specified by name. 
*/ -+ ddura.ddura_holdfunc = dsl_dataset_hold; -+ pool = nvpair_name(pair); -+#ifdef _KERNEL -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ (void) zfs_unmount_snap(nvpair_name(pair)); -+ } -+#endif -+ } -+ -+ ddura.ddura_holds = holds; -+ ddura.ddura_errlist = errlist; -+ VERIFY0(nvlist_alloc(&ddura.ddura_todelete, NV_UNIQUE_NAME, -+ KM_PUSHPAGE)); -+ VERIFY0(nvlist_alloc(&ddura.ddura_chkholds, NV_UNIQUE_NAME, -+ KM_PUSHPAGE)); -+ -+ error = dsl_sync_task(pool, dsl_dataset_user_release_check, -+ dsl_dataset_user_release_sync, &ddura, -+ fnvlist_num_pairs(holds)); -+ fnvlist_free(ddura.ddura_todelete); -+ fnvlist_free(ddura.ddura_chkholds); -+ -+ return (error); -+} -+ -+/* -+ * holds is nvl of snapname -> { holdname, ... } -+ * errlist will be filled in with snapname -> error -+ */ -+int -+dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) -+{ -+ return (dsl_dataset_user_release_impl(holds, errlist, NULL)); -+} -+ -+/* -+ * holds is nvl of snapdsobj -> { holdname, ... } -+ */ -+void -+dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds) -+{ -+ ASSERT(dp != NULL); -+ (void) dsl_dataset_user_release_impl(holds, NULL, dp); -+} -+ -+int -+dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) -+{ -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds; -+ int err; -+ -+ err = dsl_pool_hold(dsname, FTAG, &dp); -+ if (err != 0) -+ return (err); -+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds); -+ if (err != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (err); -+ } -+ -+ if (ds->ds_phys->ds_userrefs_obj != 0) { -+ zap_attribute_t *za; -+ zap_cursor_t zc; -+ -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE); -+ for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, -+ ds->ds_phys->ds_userrefs_obj); -+ zap_cursor_retrieve(&zc, za) == 0; -+ zap_cursor_advance(&zc)) { -+ fnvlist_add_uint64(nvl, za->za_name, -+ za->za_first_integer); -+ } -+ zap_cursor_fini(&zc); -+ kmem_free(za, sizeof (zap_attribute_t)); -+ } -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (0); -+} -diff --git a/module/zfs/fm.c b/module/zfs/fm.c -index c004032..246b3d2 100644 ---- a/module/zfs/fm.c -+++ b/module/zfs/fm.c -@@ -86,2 +86,10 @@ static int zevent_flags = 0; - -+/* -+ * The EID (Event IDentifier) is used to uniquely tag a zevent when it is -+ * posted. The posted EIDs are monotonically increasing but not persistent. -+ * They will be reset to the initial value (1) each time the kernel module is -+ * loaded. 
-+ */ -+static uint64_t zevent_eid = 0; -+ - static kmutex_t zevent_lock; -@@ -278,4 +286,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -292,4 +300,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -306,4 +314,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -320,4 +328,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -334,4 +342,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -348,4 +356,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -362,4 +370,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -376,4 +384,4 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) - for (i = 0; i < nelem; i++) -- c = fm_printf(d + 1, c, cols, "0x%llx ", -- (u_longlong_t)val[i]); -+ c = fm_printf(d + 1, c, cols, "0x%llx ", -+ (u_longlong_t)val[i]); - -@@ -420,7 +428,7 @@ zfs_zevent_alloc(void) - -- ev = kmem_zalloc(sizeof(zevent_t), KM_PUSHPAGE); -+ ev = kmem_zalloc(sizeof (zevent_t), KM_PUSHPAGE); - if (ev == NULL) -- return NULL; -+ return (NULL); - -- list_create(&ev->ev_ze_list, sizeof(zfs_zevent_t), -+ list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t), - offsetof(zfs_zevent_t, ze_node)); -@@ -428,3 +436,3 @@ zfs_zevent_alloc(void) - -- return ev; -+ return (ev); - } -@@ -438,3 +446,3 @@ zfs_zevent_free(zevent_t *ev) - list_destroy(&ev->ev_ze_list); -- kmem_free(ev, sizeof(zevent_t)); -+ kmem_free(ev, sizeof (zevent_t)); - } -@@ -500,2 +508,3 @@ zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) - timestruc_t tv; -+ uint64_t eid; - size_t nvl_size = 0; -@@ -511,2 +520,8 @@ zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) - -+ eid = atomic_inc_64_nv(&zevent_eid); -+ if (nvlist_add_uint64(nvl, FM_EREPORT_EID, eid)) { -+ atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1); -+ return; -+ } -+ - (void) nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE); -@@ -526,5 +541,6 @@ zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) - -- ev->ev_nvl = nvl; -+ ev->ev_nvl = nvl; - ev->ev_detector = detector; - ev->ev_cb = cb; -+ ev->ev_eid = eid; - -@@ -552,8 +568,8 @@ zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze) - -- fp = getf(fd); -- if (fp == NULL) -- return (EBADF); -+ fp = getf(fd); -+ if (fp == NULL) -+ return (EBADF); - -- *minorp = zfsdev_getminor(fp->f_file); -- error = 
zfs_zevent_minor_to_state(*minorp, ze); -+ *minorp = zfsdev_getminor(fp->f_file); -+ error = zfs_zevent_minor_to_state(*minorp, ze); - -@@ -579,3 +595,3 @@ int - zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, -- uint64_t *dropped) -+ uint64_t *dropped) - { -@@ -594,4 +610,6 @@ zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, - } else { -- /* Existing stream continue with the next element and remove -- * ourselves from the wait queue for the previous element */ -+ /* -+ * Existing stream continue with the next element and remove -+ * ourselves from the wait queue for the previous element -+ */ - ev = list_prev(&zevent_list, ze->ze_zevent); -@@ -621,3 +639,3 @@ out: - -- return error; -+ return (error); - } -@@ -645,3 +663,64 @@ out: - -- return error; -+ return (error); -+} -+ -+/* -+ * The caller may seek to a specific EID by passing that EID. If the EID -+ * is still available in the posted list of events the cursor is positioned -+ * there. Otherwise ENOENT is returned and the cursor is not moved. -+ * -+ * There are two reserved EIDs which may be passed and will never fail. -+ * ZEVENT_SEEK_START positions the cursor at the start of the list, and -+ * ZEVENT_SEEK_END positions the cursor at the end of the list. -+ */ -+int -+zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid) -+{ -+ zevent_t *ev; -+ int error = 0; -+ -+ mutex_enter(&zevent_lock); -+ -+ if (eid == ZEVENT_SEEK_START) { -+ if (ze->ze_zevent) -+ list_remove(&ze->ze_zevent->ev_ze_list, ze); -+ -+ ze->ze_zevent = NULL; -+ goto out; -+ } -+ -+ if (eid == ZEVENT_SEEK_END) { -+ if (ze->ze_zevent) -+ list_remove(&ze->ze_zevent->ev_ze_list, ze); -+ -+ ev = list_head(&zevent_list); -+ if (ev) { -+ ze->ze_zevent = ev; -+ list_insert_head(&ev->ev_ze_list, ze); -+ } else { -+ ze->ze_zevent = NULL; -+ } -+ -+ goto out; -+ } -+ -+ for (ev = list_tail(&zevent_list); ev != NULL; -+ ev = list_prev(&zevent_list, ev)) { -+ if (ev->ev_eid == eid) { -+ if (ze->ze_zevent) -+ list_remove(&ze->ze_zevent->ev_ze_list, ze); -+ -+ ze->ze_zevent = ev; -+ list_insert_head(&ev->ev_ze_list, ze); -+ break; -+ } -+ } -+ -+ if (ev == NULL) -+ error = ENOENT; -+ -+out: -+ mutex_exit(&zevent_lock); -+ -+ return (error); - } -@@ -1514,3 +1593,4 @@ fm_init(void) - mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL); -- list_create(&zevent_list, sizeof(zevent_t), offsetof(zevent_t, ev_node)); -+ list_create(&zevent_list, sizeof (zevent_t), -+ offsetof(zevent_t, ev_node)); - cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); -diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c -index 155404e..011fb91 100644 ---- a/module/zfs/gzip.c -+++ b/module/zfs/gzip.c -@@ -37,4 +37,4 @@ - typedef size_t zlen_t; --#define compress_func z_compress_level --#define uncompress_func z_uncompress -+#define compress_func z_compress_level -+#define uncompress_func z_uncompress - -@@ -46,4 +46,4 @@ typedef size_t zlen_t; - typedef uLongf zlen_t; --#define compress_func compress2 --#define uncompress_func uncompress -+#define compress_func compress2 -+#define uncompress_func uncompress - -diff --git a/module/zfs/lz4.c b/module/zfs/lz4.c -index 8afaad1..df96373 100644 ---- a/module/zfs/lz4.c -+++ b/module/zfs/lz4.c -@@ -49,3 +49,4 @@ static kmem_cache_t *lz4_cache; - size_t --lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -+lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, -+ size_t d_len, int n) - { -@@ -76,3 +77,4 @@ lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - int 
--lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) -+lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, -+ size_t d_len, int n) - { -@@ -145,4 +147,4 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated -- * by the caller (either on the stack or using kmem_cache_alloc). Passing NULL -- * isn't valid. -+ * by the caller (either on the stack or using kmem_cache_alloc). Passing -+ * NULL isn't valid. - * -@@ -153,4 +155,4 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated -- * by the caller (either on the stack or using kmem_cache_alloc). Passing NULL -- * isn't valid. -+ * by the caller (either on the stack or using kmem_cache_alloc). Passing -+ * NULL isn't valid. - */ -@@ -238,2 +240,5 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - #undef LZ4_FORCE_SW_BITCOUNT -+#if defined(__sparc) -+#define LZ4_FORCE_SW_BITCOUNT -+#endif - -@@ -269,3 +274,3 @@ lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - --#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ -+#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ - (((x) & 0xffu) << 8))) -@@ -1011,2 +1016 @@ lz4_fini(void) - } -- -diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c -index 43d0df0..83ff409 100644 ---- a/module/zfs/lzjb.c -+++ b/module/zfs/lzjb.c -@@ -52,3 +52,4 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - uchar_t *dst = d_start; -- uchar_t *cpy, *copymap = NULL; -+ uchar_t *cpy; -+ uchar_t *copymap = NULL; - int copymask = 1 << (NBBY - 1); -@@ -62,3 +63,4 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) { -- kmem_free(lempel, LEMPEL_SIZE*sizeof(uint16_t)); -+ kmem_free(lempel, -+ LEMPEL_SIZE*sizeof (uint16_t)); - return (s_len); -@@ -106,3 +108,4 @@ lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) - uchar_t *d_end = (uchar_t *)d_start + d_len; -- uchar_t *cpy, copymap = 0; -+ uchar_t *cpy; -+ uchar_t copymap = 0; - int copymask = 1 << (NBBY - 1); -diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c -index cd1b6ce..6356f79 100644 ---- a/module/zfs/metaslab.c -+++ b/module/zfs/metaslab.c -@@ -22,3 +22,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - */ -@@ -33,3 +34,3 @@ - --#define WITH_DF_BLOCK_ALLOCATOR -+#define WITH_DF_BLOCK_ALLOCATOR - -@@ -61,5 +62,21 @@ int zfs_condense_pct = 200; - * If a device reaches this threshold in a given txg then we consider skipping -- * allocations on that device. -+ * allocations on that device. The value of zfs_mg_alloc_failures is computed -+ * in zio_init() unless it has been overridden in /etc/system. - */ --int zfs_mg_alloc_failures; -+int zfs_mg_alloc_failures = 0; -+ -+/* -+ * The zfs_mg_noalloc_threshold defines which metaslab groups should -+ * be eligible for allocation. The value is defined as a percentage of -+ * a free space. Metaslab groups that have more free space than -+ * zfs_mg_noalloc_threshold are always eligible for allocations. 
Once -+ * a metaslab group's free space is less than or equal to the -+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that -+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. -+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all -+ * groups are allowed to accept allocations. Gang blocks are always -+ * eligible to allocate on any metaslab group. The default value of 0 means -+ * no metaslab group will be excluded based on this criterion. -+ */ -+int zfs_mg_noalloc_threshold = 0; - -@@ -103,2 +120,7 @@ int metaslab_smo_bonus_pct = 150; - /* -+ * Should we be willing to write data to degraded vdevs? -+ */ -+boolean_t zfs_write_to_degraded = B_FALSE; -+ -+/* - * ========================================================================== -@@ -224,2 +246,49 @@ metaslab_compare(const void *x1, const void *x2) - -+/* -+ * Update the allocatable flag and the metaslab group's capacity. -+ * The allocatable flag is set to true if the capacity is below -+ * the zfs_mg_noalloc_threshold. If a metaslab group transitions -+ * from allocatable to non-allocatable or vice versa then the metaslab -+ * group's class is updated to reflect the transition. -+ */ -+static void -+metaslab_group_alloc_update(metaslab_group_t *mg) -+{ -+ vdev_t *vd = mg->mg_vd; -+ metaslab_class_t *mc = mg->mg_class; -+ vdev_stat_t *vs = &vd->vdev_stat; -+ boolean_t was_allocatable; -+ -+ ASSERT(vd == vd->vdev_top); -+ -+ mutex_enter(&mg->mg_lock); -+ was_allocatable = mg->mg_allocatable; -+ -+ mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / -+ (vs->vs_space + 1); -+ -+ mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); -+ -+ /* -+ * The mc_alloc_groups maintains a count of the number of -+ * groups in this metaslab class that are still above the -+ * zfs_mg_noalloc_threshold. This is used by the allocating -+ * threads to determine if they should avoid allocations to -+ * a given group. The allocator will avoid allocations to a group -+ * if that group has reached or is below the zfs_mg_noalloc_threshold -+ * and there are still other groups that are above the threshold. -+ * When a group transitions from allocatable to non-allocatable or -+ * vice versa we update the metaslab class to reflect that change. -+ * When the mc_alloc_groups value drops to 0 that means that all -+ * groups have reached the zfs_mg_noalloc_threshold making all groups -+ * eligible for allocations. This effectively means that all devices -+ * are balanced again. -+ */ -+ if (was_allocatable && !mg->mg_allocatable) -+ mc->mc_alloc_groups--; -+ else if (!was_allocatable && mg->mg_allocatable) -+ mc->mc_alloc_groups++; -+ mutex_exit(&mg->mg_lock); -+} -+ - metaslab_group_t * -@@ -274,2 +343,3 @@ metaslab_group_activate(metaslab_group_t *mg) - mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); -+ metaslab_group_alloc_update(mg); - -@@ -359,2 +429,25 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) - /* -+ * Determine if a given metaslab group should skip allocations. A metaslab -+ * group should avoid allocations if its used capacity has crossed the -+ * zfs_mg_noalloc_threshold and there is at least one metaslab group -+ * that can still handle allocations. 
-+ */ -+static boolean_t -+metaslab_group_allocatable(metaslab_group_t *mg) -+{ -+ vdev_t *vd = mg->mg_vd; -+ spa_t *spa = vd->vdev_spa; -+ metaslab_class_t *mc = mg->mg_class; -+ -+ /* -+ * A metaslab group is considered allocatable if its free capacity -+ * is greater than the set value of zfs_mg_noalloc_threshold, it's -+ * associated with a slog, or there are no other metaslab groups -+ * with free capacity greater than zfs_mg_noalloc_threshold. -+ */ -+ return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || -+ mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); -+} -+ -+/* - * ========================================================================== -@@ -833,2 +926,12 @@ metaslab_weight(metaslab_t *msp) - /* -+ * This vdev is in the process of being removed so there is nothing -+ * for us to do here. -+ */ -+ if (vd->vdev_removing) { -+ ASSERT0(smo->smo_alloc); -+ ASSERT0(vd->vdev_ms_shift); -+ return (0); -+ } -+ -+ /* - * The baseline weight is the metaslab's free space. -@@ -1214,4 +1317,4 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - space_map_t *sm = msp->ms_map; -- space_map_t *freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; -- space_map_t *defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; -+ space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; -+ space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; - metaslab_group_t *mg = msp->ms_group; -@@ -1229,4 +1332,4 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - */ -- if (freed_map == NULL) { -- ASSERT(defer_map == NULL); -+ if (*freed_map == NULL) { -+ ASSERT(*defer_map == NULL); - for (t = 0; t < TXG_SIZE; t++) { -@@ -1249,4 +1352,4 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - -- freed_map = msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; -- defer_map = msp->ms_defermap[txg % TXG_DEFER_SIZE]; -+ freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; -+ defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; - -@@ -1256,3 +1359,3 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - alloc_delta = smosync->smo_alloc - smo->smo_alloc; -- defer_delta = freed_map->sm_space - defer_map->sm_space; -+ defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space; - -@@ -1266,8 +1369,14 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - * so that we have a consistent view of the in-core space map. -- * Then, add defer_map (oldest deferred frees) to this map and -- * transfer freed_map (this txg's frees) to defer_map. - */ - space_map_load_wait(sm); -- space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); -- space_map_vacate(freed_map, space_map_add, defer_map); -+ -+ /* -+ * Move the frees from the defer_map to this map (if it's loaded). -+ * Swap the freed_map and the defer_map -- this is safe to do -+ * because we've just emptied out the defer_map. -+ */ -+ space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm); -+ ASSERT0((*defer_map)->sm_space); -+ ASSERT0(avl_numnodes(&(*defer_map)->sm_root)); -+ space_map_swap(freed_map, defer_map); - -@@ -1286,2 +1395,4 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) - -+ metaslab_group_alloc_update(mg); -+ - /* -@@ -1390,2 +1501,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, - } -+ -+ /* -+ * If the selected metaslab is condensing, skip it. 
-+ */ -+ if (msp->ms_map->sm_condensing) -+ continue; -+ - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; -@@ -1408,2 +1526,4 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, - -+ mutex_enter(&msp->ms_lock); -+ - /* -@@ -1424,15 +1544,4 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, - mg->mg_alloc_failures); -- return (-1ULL); -- } -- -- mutex_enter(&msp->ms_lock); -- -- /* -- * If this metaslab is currently condensing then pick again as -- * we can't manipulate this metaslab until it's committed -- * to disk. -- */ -- if (msp->ms_map->sm_condensing) { - mutex_exit(&msp->ms_lock); -- continue; -+ return (-1ULL); - } -@@ -1465,2 +1574,12 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, - -+ /* -+ * If this metaslab is currently condensing then pick again as -+ * we can't manipulate this metaslab until it's committed -+ * to disk. -+ */ -+ if (msp->ms_map->sm_condensing) { -+ mutex_exit(&msp->ms_lock); -+ continue; -+ } -+ - if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL) -@@ -1508,3 +1627,3 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - -@@ -1593,2 +1712,17 @@ top: - } -+ -+ /* -+ * Determine if the selected metaslab group is eligible -+ * for allocations. If we're ganging or have requested -+ * an allocation for the smallest gang block size -+ * then we don't want to avoid allocating to the this -+ * metaslab group. If we're in this condition we should -+ * try to allocate from any device possible so that we -+ * don't inadvertently return ENOSPC and suspend the pool -+ * even though space is still available. -+ */ -+ if (allocatable && CAN_FASTGANG(flags) && -+ psize > SPA_GANGBLOCKSIZE) -+ allocatable = metaslab_group_allocatable(mg); -+ - if (!allocatable) -@@ -1598,2 +1732,3 @@ top: - * Avoid writing single-copy data to a failing vdev -+ * unless the user instructs us that it is okay. 
- */ -@@ -1601,3 +1736,5 @@ top: - vd->vdev_state < VDEV_STATE_HEALTHY) && -- d == 0 && dshift == 3) { -+ d == 0 && dshift == 3 && -+ !(zfs_write_to_degraded && vd->vdev_state == -+ VDEV_STATE_DEGRADED)) { - all_zero = B_FALSE; -@@ -1691,3 +1828,4 @@ next: - mutex_exit(&mc->mc_fastwrite_lock); -- return (ENOSPC); -+ -+ return (SET_ERROR(ENOSPC)); - } -@@ -1760,3 +1898,3 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) - (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - -@@ -1773,3 +1911,3 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) - if (error == 0 && !space_map_contains(msp->ms_map, offset, size)) -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - -@@ -1808,3 +1946,3 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - spa_config_exit(spa, SCL_ALLOC, FTAG); -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - } -@@ -1885,3 +2023,4 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) - --void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) -+void -+metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) - { -@@ -1907,3 +2046,4 @@ void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) - --void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) -+void -+metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) - { -@@ -1930,2 +2070,42 @@ void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) - -+static void -+checkmap(space_map_t *sm, uint64_t off, uint64_t size) -+{ -+ space_seg_t *ss; -+ avl_index_t where; -+ -+ mutex_enter(sm->sm_lock); -+ ss = space_map_find(sm, off, size, &where); -+ if (ss != NULL) -+ panic("freeing free block; ss=%p", (void *)ss); -+ mutex_exit(sm->sm_lock); -+} -+ -+void -+metaslab_check_free(spa_t *spa, const blkptr_t *bp) -+{ -+ int i, j; -+ -+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) -+ return; -+ -+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); -+ for (i = 0; i < BP_GET_NDVAS(bp); i++) { -+ uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]); -+ vdev_t *vd = vdev_lookup_top(spa, vdid); -+ uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]); -+ uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); -+ metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift]; -+ -+ if (ms->ms_map->sm_loaded) -+ checkmap(ms->ms_map, off, size); -+ -+ for (j = 0; j < TXG_SIZE; j++) -+ checkmap(ms->ms_freemap[j], off, size); -+ for (j = 0; j < TXG_DEFER_SIZE; j++) -+ checkmap(ms->ms_defermap[j], off, size); -+ } -+ spa_config_exit(spa, SCL_VDEV, FTAG); -+} -+ - #if defined(_KERNEL) && defined(HAVE_SPL) -diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c -index e43807c..49980ef 100644 ---- a/module/zfs/refcount.c -+++ b/module/zfs/refcount.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2012 by Delphix. All rights reserved. 
- */ -@@ -34,3 +35,3 @@ int reference_tracking_enable = TRUE; - #endif --int reference_history = 4; /* tunable */ -+int reference_history = 3; /* tunable */ - -@@ -66,2 +67,10 @@ refcount_create(refcount_t *rc) - rc->rc_removed_count = 0; -+ rc->rc_tracked = reference_tracking_enable; -+} -+ -+void -+refcount_create_untracked(refcount_t *rc) -+{ -+ refcount_create(rc); -+ rc->rc_tracked = B_FALSE; - } -@@ -98,3 +107,2 @@ refcount_is_zero(refcount_t *rc) - { -- ASSERT(rc->rc_count >= 0); - return (rc->rc_count == 0); -@@ -105,3 +113,2 @@ refcount_count(refcount_t *rc) - { -- ASSERT(rc->rc_count >= 0); - return (rc->rc_count); -@@ -115,3 +122,3 @@ refcount_add_many(refcount_t *rc, uint64_t number, void *holder) - -- if (reference_tracking_enable) { -+ if (rc->rc_tracked) { - ref = kmem_cache_alloc(reference_cache, KM_PUSHPAGE); -@@ -122,3 +129,3 @@ refcount_add_many(refcount_t *rc, uint64_t number, void *holder) - ASSERT(rc->rc_count >= 0); -- if (reference_tracking_enable) -+ if (rc->rc_tracked) - list_insert_head(&rc->rc_list, ref); -@@ -146,3 +153,3 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) - -- if (!reference_tracking_enable) { -+ if (!rc->rc_tracked) { - rc->rc_count -= number; -@@ -163,3 +170,3 @@ refcount_remove_many(refcount_t *rc, uint64_t number, void *holder) - rc->rc_removed_count++; -- if (rc->rc_removed_count >= reference_history) { -+ if (rc->rc_removed_count > reference_history) { - ref = list_tail(&rc->rc_removed); -diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c -index 4cef53f..357afbf 100644 ---- a/module/zfs/rrwlock.c -+++ b/module/zfs/rrwlock.c -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2012 by Delphix. All rights reserved. -+ */ - -@@ -74,4 +77,5 @@ uint_t rrw_tsd_key; - typedef struct rrw_node { -- struct rrw_node *rn_next; -- rrwlock_t *rn_rrl; -+ struct rrw_node *rn_next; -+ rrwlock_t *rn_rrl; -+ void *rn_tag; - } rrw_node_t; -@@ -97,3 +101,3 @@ rrn_find(rrwlock_t *rrl) - static void --rrn_add(rrwlock_t *rrl) -+rrn_add(rrwlock_t *rrl, void *tag) - { -@@ -101,5 +105,6 @@ rrn_add(rrwlock_t *rrl) - -- rn = kmem_alloc(sizeof (*rn), KM_SLEEP); -+ rn = kmem_alloc(sizeof (*rn), KM_PUSHPAGE); - rn->rn_rrl = rrl; - rn->rn_next = tsd_get(rrw_tsd_key); -+ rn->rn_tag = tag; - VERIFY(tsd_set(rrw_tsd_key, rn) == 0); -@@ -112,3 +117,3 @@ rrn_add(rrwlock_t *rrl) - static boolean_t --rrn_find_and_remove(rrwlock_t *rrl) -+rrn_find_and_remove(rrwlock_t *rrl, void *tag) - { -@@ -121,3 +126,3 @@ rrn_find_and_remove(rrwlock_t *rrl) - for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { -- if (rn->rn_rrl == rrl) { -+ if (rn->rn_rrl == rrl && rn->rn_tag == tag) { - if (prev) -@@ -135,3 +140,3 @@ rrn_find_and_remove(rrwlock_t *rrl) - void --rrw_init(rrwlock_t *rrl) -+rrw_init(rrwlock_t *rrl, boolean_t track_all) - { -@@ -143,2 +148,3 @@ rrw_init(rrwlock_t *rrl) - rrl->rr_writer_wanted = B_FALSE; -+ rrl->rr_track_all = track_all; - } -@@ -155,3 +161,3 @@ rrw_destroy(rrwlock_t *rrl) - --static void -+void - rrw_enter_read(rrwlock_t *rrl, void *tag) -@@ -160,3 +166,4 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) - #if !defined(DEBUG) && defined(_KERNEL) -- if (!rrl->rr_writer && !rrl->rr_writer_wanted) { -+ if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && -+ !rrl->rr_track_all) { - rrl->rr_anon_rcount.rc_count++; -@@ -170,3 +177,3 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) - -- while (rrl->rr_writer || (rrl->rr_writer_wanted && -+ while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && - refcount_is_zero(&rrl->rr_anon_rcount) && 
-@@ -175,5 +182,5 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) - -- if (rrl->rr_writer_wanted) { -+ if (rrl->rr_writer_wanted || rrl->rr_track_all) { - /* may or may not be a re-entrant enter */ -- rrn_add(rrl); -+ rrn_add(rrl, tag); - (void) refcount_add(&rrl->rr_linked_rcount, tag); -@@ -186,3 +193,3 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) - --static void -+void - rrw_enter_write(rrwlock_t *rrl) -@@ -232,6 +239,8 @@ rrw_exit(rrwlock_t *rrl, void *tag) - int64_t count; -- if (rrn_find_and_remove(rrl)) -+ if (rrn_find_and_remove(rrl, tag)) { - count = refcount_remove(&rrl->rr_linked_rcount, tag); -- else -+ } else { -+ ASSERT(!rrl->rr_track_all); - count = refcount_remove(&rrl->rr_anon_rcount, tag); -+ } - if (count == 0) -@@ -248,2 +257,7 @@ rrw_exit(rrwlock_t *rrl, void *tag) - -+/* -+ * If the lock was created with track_all, rrw_held(RW_READER) will return -+ * B_TRUE iff the current thread has the lock for reader. Otherwise it may -+ * return B_TRUE if any thread has the lock for reader. -+ */ - boolean_t -@@ -258,3 +272,3 @@ rrw_held(rrwlock_t *rrl, krw_t rw) - held = (!refcount_is_zero(&rrl->rr_anon_rcount) || -- !refcount_is_zero(&rrl->rr_linked_rcount)); -+ rrn_find(rrl) != NULL); - } -@@ -264 +278,11 @@ rrw_held(rrwlock_t *rrl, krw_t rw) - } -+ -+void -+rrw_tsd_destroy(void *arg) -+{ -+ rrw_node_t *rn = arg; -+ if (rn != NULL) { -+ panic("thread %p terminating with rrw lock %p held", -+ (void *)curthread, (void *)rn->rn_rrl); -+ } -+} -diff --git a/module/zfs/sa.c b/module/zfs/sa.c -index 581cf4b..fcc5f3b 100644 ---- a/module/zfs/sa.c -+++ b/module/zfs/sa.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -112,2 +112,3 @@ - * Byteswap implications: -+ * - * Since the SA attributes are not entirely self describing we can't do -@@ -190,3 +191,2 @@ sa_attr_reg_t sa_legacy_attrs[] = { - /* -- * ZPL legacy layout - * This is only used for objects of type DMU_OT_ZNODE -@@ -200,3 +200,2 @@ sa_attr_type_t sa_legacy_zpl_layout[] = { - */ -- - sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; -@@ -254,3 +253,3 @@ sa_spill_alloc(int flags) - { -- return kmem_cache_alloc(spill_cache, flags); -+ return (kmem_cache_alloc(spill_cache, flags)); - } -@@ -392,3 +391,3 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, - if (bulk[i].sa_addr == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - if (bulk[i].sa_data) { -@@ -524,3 +523,3 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) - ASSERT(0); -- return (EFBIG); -+ return (SET_ERROR(EFBIG)); - } else { -@@ -574,6 +573,5 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - int i; -- int j = -1; - int full_space; - int hdrsize; -- boolean_t done = B_FALSE; -+ int extra_hdrsize; - -@@ -588,6 +586,5 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - *total = 0; -+ *will_spill = B_FALSE; - -- if (buftype == SA_BONUS) -- *will_spill = B_FALSE; -- -+ extra_hdrsize = 0; - hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 
0 : -@@ -603,4 +600,4 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - *total += attr_desc[i].sa_length; -- if (done) -- goto next; -+ if (*will_spill) -+ continue; - -@@ -612,3 +609,8 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - if (is_var_sz && var_size > 1) { -- if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + -+ /* -+ * Don't worry that the spill block might overflow. -+ * It will be resized if needed in sa_build_layouts(). -+ */ -+ if (buftype == SA_SPILL || -+ P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + - *total < full_space) { -@@ -617,12 +619,14 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - * optional sizes of variable-length attributes. -- * Record the index in case this increase needs -- * to be reversed due to spill-over. -+ * Record the extra header size in case this -+ * increase needs to be reversed due to -+ * spill-over. - */ - hdrsize += sizeof (uint16_t); -- j = i; -+ if (*index != -1) -+ extra_hdrsize += sizeof (uint16_t); - } else { -- done = B_TRUE; -- *index = i; -- if (buftype == SA_BONUS) -- *will_spill = B_TRUE; -+ ASSERT(buftype == SA_BONUS); -+ if (*index == -1) -+ *index = i; -+ *will_spill = B_TRUE; - continue; -@@ -641,6 +645,4 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, - *index = i; -- done = B_TRUE; - } - --next: - if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && -@@ -650,9 +652,4 @@ next: - -- /* -- * j holds the index of the last variable-sized attribute for -- * which hdrsize was increased. Reverse the increase if that -- * attribute will be relocated to the spill block. -- */ -- if (*will_spill && j == *index) -- hdrsize -= sizeof (uint16_t); -+ if (*will_spill) -+ hdrsize -= extra_hdrsize; - -@@ -681,3 +678,4 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, - int i, lot_count; -- int hdrsize, spillhdrsize = 0; -+ int hdrsize; -+ int spillhdrsize = 0; - int used; -@@ -697,3 +695,3 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, - if (used > SPA_MAXBLOCKSIZE) -- return (EFBIG); -+ return (SET_ERROR(EFBIG)); - -@@ -721,3 +719,3 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, - if (spill_used > SPA_MAXBLOCKSIZE) -- return (EFBIG); -+ return (SET_ERROR(EFBIG)); - -@@ -878,3 +876,3 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) - if (error == 0) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto bail; -@@ -909,3 +907,3 @@ sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) - else -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - switch (error) { -@@ -1021,6 +1019,6 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - -- mutex_enter(&os->os_lock); -+ mutex_enter(&os->os_user_ptr_lock); - if (os->os_sa) { - mutex_enter(&os->os_sa->sa_lock); -- mutex_exit(&os->os_lock); -+ mutex_exit(&os->os_user_ptr_lock); - tb = os->os_sa->sa_user_table; -@@ -1037,3 +1035,3 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - mutex_enter(&sa->sa_lock); -- mutex_exit(&os->os_lock); -+ mutex_exit(&os->os_user_ptr_lock); - avl_create(&sa->sa_layout_num_tree, layout_num_compare, -@@ -1068,3 +1066,3 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - if (error == 0) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto fail; -@@ -1147,3 +1145,4 @@ sa_tear_down(objset_t *os) - cookie = NULL; -- while ((layout = 
avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))){ -+ while ((layout = -+ avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) { - sa_idx_tab_t *tab; -@@ -1156,3 +1155,3 @@ sa_tear_down(objset_t *os) - cookie = NULL; -- while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))){ -+ while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) { - kmem_free(layout->lot_attrs, -@@ -1735,3 +1734,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - for (; k != 2; k++) { -- /* iterate over each attribute in layout */ -+ /* -+ * Iterate over each attribute in layout. Fetch the -+ * size of variable-length attributes needing rewrite -+ * from sa_lengths[]. -+ */ - for (i = 0, length_idx = 0; i != count; i++) { -@@ -1740,3 +1743,6 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - attr = idx_tab->sa_layout->lot_attrs[i]; -+ length = SA_REGISTERED_LEN(sa, attr); - if (attr == newattr) { -+ if (length == 0) -+ ++length_idx; - if (action == SA_REMOVE) { -@@ -1745,3 +1751,3 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - } -- ASSERT(SA_REGISTERED_LEN(sa, attr) == 0); -+ ASSERT(length == 0); - ASSERT(action == SA_REPLACE); -@@ -1750,6 +1756,4 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - } else { -- length = SA_REGISTERED_LEN(sa, attr); -- if (length == 0) { -+ if (length == 0) - length = hdr->sa_lengths[length_idx++]; -- } - -@@ -1775,3 +1779,3 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, - SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, -- datastart, buflen); -+ datastart, length); - } -diff --git a/module/zfs/spa.c b/module/zfs/spa.c -index 65f78b7..af93b7c 100644 ---- a/module/zfs/spa.c -+++ b/module/zfs/spa.c -@@ -23,4 +23,4 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - */ -@@ -28,2 +28,4 @@ - /* -+ * SPA: Storage Pool Allocator -+ * - * This file contains all the routines used when modifying on-disk SPA state. 
-@@ -66,2 +68,3 @@ - #include -+#include - #include -@@ -82,3 +85,2 @@ typedef enum zti_modes { - ZTI_MODE_FIXED, /* value is # of threads (min 1) */ -- ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ -@@ -133,6 +135,4 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { - --static dsl_syncfunc_t spa_sync_version; --static dsl_syncfunc_t spa_sync_props; --static dsl_checkfunc_t spa_change_guid_check; --static dsl_syncfunc_t spa_change_guid_sync; -+static void spa_sync_version(void *arg, dmu_tx_t *tx); -+static void spa_sync_props(void *arg, dmu_tx_t *tx); - static boolean_t spa_has_active_shared_spare(spa_t *spa); -@@ -143,3 +143,3 @@ static void spa_vdev_resilver_done(spa_t *spa); - --uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ -+uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ - id_t zio_taskq_psrset_bind = PS_NONE; -@@ -290,3 +290,3 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) - if (err) -- return err; -+ return (err); - -@@ -331,6 +331,6 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) - dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -+ dsl_pool_config_enter(dp, FTAG); - if ((err = dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &ds))) { -- rw_exit(&dp->dp_config_rwlock); -+ dsl_pool_config_exit(dp, FTAG); - break; -@@ -343,3 +343,3 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) - dsl_dataset_rele(ds, FTAG); -- rw_exit(&dp->dp_config_rwlock); -+ dsl_pool_config_exit(dp, FTAG); - } else { -@@ -408,3 +408,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (!zpool_prop_feature(propname)) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -416,3 +416,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (nvpair_type(elem) != DATA_TYPE_UINT64) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -421,3 +421,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (nvpair_value_uint64(elem, &intval) != 0) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -426,3 +426,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (intval != 0) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -432,3 +432,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (zfeature_lookup_name(fname, NULL) != 0) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -445,3 +445,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - has_feature)) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -454,3 +454,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (!error && intval > 1) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -464,3 +464,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (spa_version(spa) < SPA_VERSION_BOOTFS) { -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - break; -@@ -472,3 +472,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (!vdev_is_bootable(spa->spa_root_vdev)) { -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - break; -@@ -490,3 +490,4 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - -- if ((error = dmu_objset_hold(strval,FTAG,&os))) -+ error = dmu_objset_hold(strval, FTAG, &os); -+ if (error) - break; -@@ -496,8 +497,9 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (dmu_objset_type(os) != DMU_OST_ZFS) { -- error = ENOTSUP; -- } else if ((error = dsl_prop_get_integer(strval, -+ error = SET_ERROR(ENOTSUP); -+ } else if ((error = -+ dsl_prop_get_int_ds(dmu_objset_ds(os), - zfs_prop_to_name(ZFS_PROP_COMPRESSION), -- 
&compress, NULL)) == 0 && -+ &compress)) == 0 && - !BOOTFS_COMPRESS_VALID(compress)) { -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } else { -@@ -513,3 +515,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - intval > ZIO_FAILURE_MODE_PANIC)) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - -@@ -527,3 +529,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - spa->spa_failmode = intval; -- error = EIO; -+ error = SET_ERROR(EIO); - } -@@ -542,3 +544,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (strval[0] != '/') { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -551,3 +553,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - strcmp(slash, "/..") == 0) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -559,3 +561,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (!isprint(*check)) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -565,3 +567,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (strlen(strval) > ZPROP_MAX_COMMENT) -- error = E2BIG; -+ error = SET_ERROR(E2BIG); - break; -@@ -570,3 +572,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - if (spa_version(spa) < SPA_VERSION_DEDUP) -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - else -@@ -575,3 +577,3 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) - intval != 0 && intval < ZIO_DEDUPDITTO_MIN) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -663,4 +665,4 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) - */ -- error = dsl_sync_task_do(spa_get_dsl(spa), NULL, -- spa_sync_version, spa, &ver, 6); -+ error = dsl_sync_task(spa->spa_name, NULL, -+ spa_sync_version, &ver, 6); - if (error) -@@ -675,4 +677,4 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) - if (need_sync) { -- return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, -- spa, nvp, 6)); -+ return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, -+ nvp, 6)); - } -@@ -698,8 +700,8 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) - static int --spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_change_guid_check(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *rvd = spa->spa_root_vdev; - uint64_t vdev_state; -- ASSERTV(uint64_t *newguid = arg2); -+ ASSERTV(uint64_t *newguid = arg); - -@@ -710,3 +712,3 @@ spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) - if (vdev_state != VDEV_STATE_HEALTHY) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - -@@ -718,6 +720,6 @@ spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_change_guid_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- uint64_t *newguid = arg2; -+ uint64_t *newguid = arg; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - uint64_t oldguid; -@@ -733,4 +735,4 @@ spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- spa_history_log_internal(LOG_POOL_GUID_CHANGE, spa, tx, -- "old=%lld new=%lld", oldguid, *newguid); -+ spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", -+ oldguid, *newguid); - } -@@ -752,2 +754,3 @@ spa_change_guid(spa_t *spa) - -+ mutex_enter(&spa->spa_vdev_top_lock); - mutex_enter(&spa_namespace_lock); -@@ -755,4 +758,4 @@ spa_change_guid(spa_t *spa) - -- error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check, -- spa_change_guid_sync, spa, &guid, 5); -+ error = dsl_sync_task(spa->spa_name, spa_change_guid_check, -+ spa_change_guid_sync, &guid, 5); - -@@ -764,2 +767,3 @@ spa_change_guid(spa_t 
*spa) - mutex_exit(&spa_namespace_lock); -+ mutex_exit(&spa->spa_vdev_top_lock); - -@@ -835,27 +839,23 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) - -- for (i = 0; i < count; i++) { -- taskq_t *tq; -- -- switch (mode) { -- case ZTI_MODE_FIXED: -- ASSERT3U(value, >=, 1); -- value = MAX(value, 1); -- break; -+ switch (mode) { -+ case ZTI_MODE_FIXED: -+ ASSERT3U(value, >=, 1); -+ value = MAX(value, 1); -+ break; - -- case ZTI_MODE_BATCH: -- batch = B_TRUE; -- flags |= TASKQ_THREADS_CPU_PCT; -- value = zio_taskq_batch_pct; -- break; -+ case ZTI_MODE_BATCH: -+ batch = B_TRUE; -+ flags |= TASKQ_THREADS_CPU_PCT; -+ value = zio_taskq_batch_pct; -+ break; - -- case ZTI_MODE_ONLINE_PERCENT: -- flags |= TASKQ_THREADS_CPU_PCT; -- break; -+ default: -+ panic("unrecognized mode for %s_%s taskq (%u:%u) in " -+ "spa_activate()", -+ zio_type_name[t], zio_taskq_types[q], mode, value); -+ break; -+ } - -- default: -- panic("unrecognized mode for %s_%s taskq (%u:%u) in " -- "spa_activate()", -- zio_type_name[t], zio_taskq_types[q], mode, value); -- break; -- } -+ for (i = 0; i < count; i++) { -+ taskq_t *tq; - -@@ -876,3 +876,12 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) - } else { -- tq = taskq_create_proc(name, value, maxclsyspri, 50, -+ pri_t pri = maxclsyspri; -+ /* -+ * The write issue taskq can be extremely CPU -+ * intensive. Run it at slightly lower priority -+ * than the other taskqs. -+ */ -+ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) -+ pri--; -+ -+ tq = taskq_create_proc(name, value, pri, 50, - INT_MAX, spa->spa_proc, flags); -@@ -1201,3 +1210,3 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, - *vdp = NULL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1370,3 +1379,3 @@ spa_load_spares(spa_t *spa) - */ -- spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), -+ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), - KM_PUSHPAGE); -@@ -1445,3 +1454,3 @@ spa_load_l2cache(spa_t *spa) - uint64_t guid; -- vdev_t *vd, **oldvdevs, **newvdevs = NULL; -+ vdev_t *vd, **oldvdevs, **newvdevs; - spa_aux_vdev_t *sav = &spa->spa_l2cache; -@@ -1456,2 +1465,3 @@ spa_load_l2cache(spa_t *spa) - nl2cache = 0; -+ newvdevs = NULL; - } -@@ -1598,3 +1608,4 @@ spa_check_removed(vdev_t *vd) - -- if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { -+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && -+ !vd->vdev_ishole) { - zfs_ereport_post(FM_EREPORT_RESOURCE_AUTOREPLACE, -@@ -1731,5 +1742,7 @@ spa_config_valid(spa_t *spa, nvlist_t *config) - */ --static int -+static boolean_t - spa_check_logs(spa_t *spa) - { -+ boolean_t rv = B_FALSE; -+ - switch (spa->spa_log_state) { -@@ -1740,10 +1753,9 @@ spa_check_logs(spa_t *spa) - case SPA_LOG_UNKNOWN: -- if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, -- DS_FIND_CHILDREN)) { -+ rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, -+ NULL, DS_FIND_CHILDREN) != 0); -+ if (rv) - spa_set_log_state(spa, SPA_LOG_MISSING); -- return (1); -- } - break; - } -- return (0); -+ return (rv); - } -@@ -1795,7 +1807,7 @@ spa_offline_log(spa_t *spa) - { -- int error = 0; -- -- if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, -- NULL, DS_FIND_CHILDREN)) == 0) { -+ int error; - -+ error = dmu_objset_find(spa_name(spa), zil_vdev_offline, -+ NULL, DS_FIND_CHILDREN); -+ if (error == 0) { - /* -@@ -1920,3 +1932,3 @@ spa_load_verify(spa_t *spa) - if (error != ENXIO && error != EIO) -- error = EIO; -+ error = SET_ERROR(EIO); - return (error); -@@ -2048,3 
+2060,3 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2067,3 +2079,3 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, - spa_guid_exists(pool_guid, 0)) { -- error = EEXIST; -+ error = SET_ERROR(EEXIST); - } else { -@@ -2134,3 +2146,3 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2194,3 +2206,3 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -2425,8 +2437,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - cmn_err(CE_WARN, "pool '%s' could not be " -- "loaded as it was last accessed by " -- "another system (host: %s hostid: 0x%lx). " -- "See: http://zfsonlinux.org/msg/ZFS-8000-EY", -+ "loaded as it was last accessed by another " -+ "system (host: %s hostid: 0x%lx). See: " -+ "http://zfsonlinux.org/msg/ZFS-8000-EY", - spa_name(spa), hostname, - (unsigned long)hostid); -- return (EBADF); -+ return (SET_ERROR(EBADF)); - } -@@ -2619,3 +2631,3 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - -@@ -2718,2 +2730,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - /* -+ * Log the fact that we booted up (so that we can detect if -+ * we rebooted in the middle of an operation). -+ */ -+ spa_history_log_version(spa, "open"); -+ -+ /* - * Delete any inconsistent datasets. 
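
The spa.c hunks above track two related interface changes: the old two-argument sync-task callbacks driven by dsl_sync_task_do() are replaced by single-argument callbacks dispatched through dsl_sync_task() by pool name, with the spa_t recovered from the transaction, and the enum-based internal history calls (LOG_POOL_GUID_CHANGE and friends) are replaced by spa_history_log_internal() taking a free-form operation string. A minimal sketch of the new shape, modelled on the spa_change_guid_check()/spa_change_guid_sync() pair shown above; the names my_feature_check, my_feature_sync, the SPA_VERSION_FEATURES check and the value being validated are illustrative only, not part of the patch:

    #include <sys/spa.h>
    #include <sys/dmu_tx.h>
    #include <sys/dsl_pool.h>
    #include <sys/dsl_synctask.h>

    static int
    my_feature_check(void *arg, dmu_tx_t *tx)
    {
            spa_t *spa = dmu_tx_pool(tx)->dp_spa;   /* pool comes from the tx, not arg2 */
            uint64_t *valp = arg;                    /* the single user argument */

            if (spa_version(spa) < SPA_VERSION_FEATURES)
                    return (SET_ERROR(ENOTSUP));
            return (*valp == 0 ? SET_ERROR(EINVAL) : 0);
    }

    static void
    my_feature_sync(void *arg, dmu_tx_t *tx)
    {
            spa_t *spa = dmu_tx_pool(tx)->dp_spa;
            uint64_t *valp = arg;

            /* free-form operation string instead of a LOG_* enum */
            spa_history_log_internal(spa, "my feature", tx,
                "value=%llu", (u_longlong_t)*valp);
    }

    /* callers now pass the pool name rather than a dsl_pool_t */
    error = dsl_sync_task(spa->spa_name, my_feature_check,
        my_feature_sync, &value, 3);

The last call assumes spa, value and error are in scope; the trailing integer argument (3 here, 5 and 6 in the hunks above) is carried over unchanged from the old interface.
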
-@@ -2877,3 +2895,3 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, - mutex_exit(&spa_namespace_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -2912,3 +2930,3 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, - mutex_exit(&spa_namespace_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -3248,3 +3266,3 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - if (ndev == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -3255,3 +3273,3 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - if (spa_version(spa) < version) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -3271,3 +3289,3 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - vdev_free(vd); -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -3282,3 +3300,3 @@ spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, - strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { -- error = ENOTBLK; -+ error = SET_ERROR(ENOTBLK); - vdev_free(vd); -@@ -3401,3 +3419,3 @@ int - spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, -- const char *history_str, nvlist_t *zplprops) -+ nvlist_t *zplprops) - { -@@ -3423,3 +3441,3 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - mutex_exit(&spa_namespace_lock); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -3476,3 +3494,3 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - if (error == 0 && !zfs_allocatable_devs(nvroot)) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - -@@ -3606,3 +3624,3 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - spa_configfile_set(spa, props, B_FALSE); -- spa_sync_props(spa, props, tx); -+ spa_sync_props(props, tx); - } -@@ -3622,5 +3640,3 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - -- if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) -- (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); -- spa_history_log_version(spa, LOG_POOL_CREATE); -+ spa_history_log_version(spa, "create"); - -@@ -3752,3 +3768,3 @@ spa_import_rootpool(char *devpath, char *devid) - devpath); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -3795,3 +3811,3 @@ spa_import_rootpool(char *devpath, char *devid) - (u_longlong_t)guid); -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - goto out; -@@ -3807,3 +3823,3 @@ spa_import_rootpool(char *devpath, char *devid) - "try booting from '%s'", avd->vdev_path); -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -3821,3 +3837,3 @@ spa_import_rootpool(char *devpath, char *devid) - vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -3826,3 +3842,2 @@ spa_import_rootpool(char *devpath, char *devid) - error = 0; -- spa_history_log_version(spa, LOG_POOL_IMPORT); - out: -@@ -3843,3 +3858,3 @@ out: - int --spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) -+spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) - { -@@ -3862,3 +3877,3 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) - mutex_exit(&spa_namespace_lock); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -3888,3 +3903,3 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) - mutex_exit(&spa_namespace_lock); -- 
spa_history_log_version(spa, LOG_POOL_IMPORT); -+ spa_history_log_version(spa, "import"); - -@@ -4019,3 +4034,3 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) - mutex_exit(&spa_namespace_lock); -- spa_history_log_version(spa, LOG_POOL_IMPORT); -+ spa_history_log_version(spa, "import"); - -@@ -4070,2 +4085,4 @@ spa_tryimport(nvlist_t *tryconfig) - spa->spa_load_info) == 0); -+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, -+ spa->spa_errata) == 0); - -@@ -4086,3 +4103,5 @@ spa_tryimport(nvlist_t *tryconfig) - char *cp; -- char *dsname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); -+ char *dsname; -+ -+ dsname = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); - -@@ -4139,3 +4158,3 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - if (!(spa_mode_global & FWRITE)) -- return (EROFS); -+ return (SET_ERROR(EROFS)); - -@@ -4144,3 +4163,3 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - mutex_exit(&spa_namespace_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -4178,3 +4197,3 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - mutex_exit(&spa_namespace_lock); -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - } -@@ -4191,3 +4210,3 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - mutex_exit(&spa_namespace_lock); -- return (EXDEV); -+ return (SET_ERROR(EXDEV)); - } -@@ -4387,3 +4406,2 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - uint64_t txg, dtl_max_txg; -- ASSERTV(vdev_t *rvd = spa->spa_root_vdev;) - vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; -@@ -4393,2 +4411,3 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - int error; -+ ASSERTV(vdev_t *rvd = spa->spa_root_vdev); - -@@ -4501,3 +4520,3 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - /* mark the device being resilvered */ -- newvd->vdev_resilvering = B_TRUE; -+ newvd->vdev_resilver_txg = txg; - -@@ -4562,3 +4581,3 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - -- spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, -+ spa_history_log_internal(spa, "vdev attach", NULL, - "%s vdev=%s %s vdev=%s", -@@ -4579,2 +4598,3 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) - * Detach a device from a mirror or replacing vdev. 
-+ * - * If 'replace_done' is specified, only detach if the parent -@@ -4587,3 +4607,2 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) - int error; -- ASSERTV(vdev_t *rvd = spa->spa_root_vdev;) - vdev_t *vd, *pvd, *cvd, *tvd; -@@ -4593,3 +4612,3 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) - int c, t; -- -+ ASSERTV(vdev_t *rvd = spa->spa_root_vdev); - ASSERT(spa_writeable(spa)); -@@ -4731,3 +4750,2 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) - vdev_remove_parent(cvd); -- cvd->vdev_resilvering = B_FALSE; - } -@@ -4780,3 +4798,3 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) - -- spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, -+ spa_history_log_internal(spa, "detach", NULL, - "vdev=%s", vdpath); -@@ -4901,3 +4919,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - } else { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -4909,3 +4927,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - &glist[c]) != 0) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -4916,3 +4934,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - if (vml[c] == NULL) { -- error = ENODEV; -+ error = SET_ERROR(ENODEV); - break; -@@ -4930,3 +4948,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -4935,3 +4953,3 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - if (vdev_dtl_required(vml[c])) { -- error = EBUSY; -+ error = SET_ERROR(EBUSY); - break; -@@ -5049,5 +5067,4 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - if (error == 0) -- spa_history_log_internal(LOG_POOL_VDEV_DETACH, -- spa, tx, "vdev=%s", -- vml[c]->vdev_path); -+ spa_history_log_internal(spa, "detach", tx, -+ "vdev=%s", vml[c]->vdev_path); - vdev_free(vml[c]); -@@ -5066,4 +5083,4 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - /* split is complete; log a history record */ -- spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, -- "split new pool %s from pool %s", newname, spa_name(spa)); -+ spa_history_log_internal(newspa, "split", NULL, -+ "from pool %s", spa_name(spa)); - -@@ -5167,3 +5184,3 @@ spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) - } else { -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } -@@ -5238,7 +5255,5 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) - * lock. During each step the configuration is synced out. -- */ -- --/* -- * Remove a device from the pool. Currently, this supports removing only hot -- * spares, slogs, and level 2 ARC devices. -+ * -+ * Currently, this supports removing only hot spares, slogs, and level 2 ARC -+ * devices. - */ -@@ -5276,3 +5291,3 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) - } else { -- error = EBUSY; -+ error = SET_ERROR(EBUSY); - } -@@ -5336,3 +5351,3 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) - */ -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } else { -@@ -5341,3 +5356,3 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) - */ -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } -@@ -5352,3 +5367,3 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) - * Find any device that's done replacing, or a vdev marked 'unspare' that's -- * current spared, so we can detach it. 
-+ * currently spared, so we can detach it. - */ -@@ -5455,2 +5470,4 @@ spa_vdev_resilver_done(spa_t *spa) - } -+ ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); -+ - spa_config_exit(spa, SCL_ALL, FTAG); -@@ -5529,3 +5546,3 @@ spa_scan_stop(spa_t *spa) - if (dsl_scan_resilvering(spa->spa_dsl_pool)) -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - return (dsl_scan_cancel(spa->spa_dsl_pool)); -@@ -5539,3 +5556,3 @@ spa_scan(spa_t *spa, pool_scan_func_t func) - if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -5649,4 +5666,3 @@ spa_async_thread(spa_t *spa) - if (new_space != old_space) { -- spa_history_log_internal(LOG_POOL_VDEV_ONLINE, -- spa, NULL, -+ spa_history_log_internal(spa, "vdev online", NULL, - "pool '%s' size: %llu(+%llu)", -@@ -5770,2 +5786,27 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) - -+/* -+ * Note: this simple function is not inlined to make it easier to dtrace the -+ * amount of time spent syncing frees. -+ */ -+static void -+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) -+{ -+ zio_t *zio = zio_root(spa, NULL, NULL, 0); -+ bplist_iterate(bpl, spa_free_sync_cb, zio, tx); -+ VERIFY(zio_wait(zio) == 0); -+} -+ -+/* -+ * Note: this simple function is not inlined to make it easier to dtrace the -+ * amount of time spent syncing deferred frees. -+ */ -+static void -+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) -+{ -+ zio_t *zio = zio_root(spa, NULL, NULL, 0); -+ VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, -+ spa_free_sync_cb, zio, tx), ==, 0); -+ VERIFY0(zio_wait(zio)); -+} -+ - static void -@@ -5831,3 +5872,3 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, - } else { -- list = kmem_alloc(sav->sav_count * sizeof (void *), KM_PUSHPAGE); -+ list = kmem_alloc(sav->sav_count*sizeof (void *), KM_PUSHPAGE); - for (i = 0; i < sav->sav_count; i++) -@@ -5879,6 +5920,7 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) - static void --spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_sync_version(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- uint64_t version = *(uint64_t *)arg2; -+ uint64_t *versionp = arg; -+ uint64_t version = *versionp; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - -@@ -5894,2 +5936,3 @@ spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) - vdev_config_dirty(spa->spa_root_vdev); -+ spa_history_log_internal(spa, "set", tx, "version=%lld", version); - } -@@ -5900,7 +5943,7 @@ spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx) - static void --spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_sync_props(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -+ nvlist_t *nvp = arg; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = spa->spa_meta_objset; -- nvlist_t *nvp = arg2; - nvpair_t *elem = NULL; -@@ -5929,2 +5972,4 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - spa_feature_enable(spa, feature, tx); -+ spa_history_log_internal(spa, "set", tx, -+ "%s=enabled", nvpair_name(elem)); - break; -@@ -5968,2 +6013,4 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - vdev_config_dirty(spa->spa_root_vdev); -+ spa_history_log_internal(spa, "set", tx, -+ "%s=%s", nvpair_name(elem), strval); - break; -@@ -5990,3 +6037,4 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - 1, strlen(strval) + 1, strval, tx) == 0); -- -+ spa_history_log_internal(spa, "set", tx, -+ "%s=%s", nvpair_name(elem), strval); - } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { -@@ -6002,2 +6050,4 @@ 
spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - 8, 1, &intval, tx) == 0); -+ spa_history_log_internal(spa, "set", tx, -+ "%s=%lld", nvpair_name(elem), intval); - } else { -@@ -6030,9 +6080,2 @@ spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) - -- /* log internal history if this is not a zpool create */ -- if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && -- tx->tx_txg != TXG_INITIAL) { -- spa_history_log_internal(LOG_POOL_PROPSET, -- spa, tx, "%s %lld %s", -- nvpair_name(elem), intval, spa_name(spa)); -- } - } -@@ -6056,2 +6099,4 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) - -+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); -+ - if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && -@@ -6081,2 +6126,3 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) - } -+ rrw_exit(&dp->dp_config_rwlock, FTAG); - } -@@ -6092,3 +6138,2 @@ spa_sync(spa_t *spa, uint64_t txg) - objset_t *mos = spa->spa_meta_objset; -- bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; - bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; -@@ -6139,3 +6184,3 @@ spa_sync(spa_t *spa, uint64_t txg) - spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, -- spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + -+ spa_deadman, spa, TQ_PUSHPAGE, ddi_get_lbolt() + - NSEC_TO_TICK(spa->spa_deadman_synctime)); -@@ -6175,6 +6220,3 @@ spa_sync(spa_t *spa, uint64_t txg) - txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { -- zio_t *zio = zio_root(spa, NULL, NULL, 0); -- VERIFY3U(bpobj_iterate(defer_bpo, -- spa_free_sync_cb, zio, tx), ==, 0); -- VERIFY0(zio_wait(zio)); -+ spa_sync_deferred_frees(spa, tx); - } -@@ -6196,9 +6238,6 @@ spa_sync(spa_t *spa, uint64_t txg) - if (pass < zfs_sync_pass_deferred_free) { -- zio_t *zio = zio_root(spa, NULL, NULL, 0); -- bplist_iterate(free_bpl, spa_free_sync_cb, -- zio, tx); -- VERIFY(zio_wait(zio) == 0); -+ spa_sync_frees(spa, free_bpl, tx); - } else { - bplist_iterate(free_bpl, bpobj_enqueue_cb, -- defer_bpo, tx); -+ &spa->spa_deferred_bpobj, tx); - } -diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c -index 5e5b405..c8fe79e 100644 ---- a/module/zfs/spa_config.c -+++ b/module/zfs/spa_config.c -@@ -24,3 +24,3 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -198,3 +198,8 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) - * Synchronize pool configuration to disk. This must be called with the -- * namespace lock held. -+ * namespace lock held. Synchronizing the pool cache is typically done after -+ * the configuration has been synced to the MOS. This exposes a window where -+ * the MOS config will have been updated but the cache file has not. If -+ * the system were to crash at that instant then the cached config may not -+ * contain the correct information to open the pool and an explicity import -+ * would be required. 
- */ -@@ -205,2 +210,3 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) - nvlist_t *nvl; -+ char *pool_name; - -@@ -251,3 +257,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) - -- VERIFY(nvlist_add_nvlist(nvl, spa->spa_name, -+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { -+ VERIFY0(nvlist_lookup_string(spa->spa_config, -+ ZPOOL_CONFIG_POOL_NAME, &pool_name)); -+ } else -+ pool_name = spa_name(spa); -+ -+ VERIFY(nvlist_add_nvlist(nvl, pool_name, - spa->spa_config) == 0); -@@ -322,2 +334,3 @@ spa_config_set(spa_t *spa, nvlist_t *config) - * Generate the pool's configuration based on the current in-core state. -+ * - * We infer whether to generate a complete config or just one top-level config -@@ -333,2 +346,3 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) - uint64_t split_guid; -+ char *pool_name; - -@@ -349,2 +363,18 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) - -+ /* -+ * Originally, users had to handle spa namespace collisions by either -+ * exporting the already imported pool or by specifying a new name for -+ * the pool with a conflicting name. In the case of root pools from -+ * virtual guests, neither approach to collision resolution is -+ * reasonable. This is addressed by extending the new name syntax with -+ * an option to specify that the new name is temporary. When specified, -+ * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us -+ * to use the previous name, which we do below. -+ */ -+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { -+ VERIFY0(nvlist_lookup_string(spa->spa_config, -+ ZPOOL_CONFIG_POOL_NAME, &pool_name)); -+ } else -+ pool_name = spa_name(spa); -+ - VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); -@@ -354,3 +384,3 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, -- spa_name(spa)) == 0); -+ pool_name) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, -@@ -361,2 +391,4 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) - spa_guid(spa)) == 0); -+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, -+ spa->spa_errata) == 0); - VERIFY(spa->spa_comment == NULL || nvlist_add_string(config, -diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c -index 3fab192..35853e2 100644 ---- a/module/zfs/spa_errlog.c -+++ b/module/zfs/spa_errlog.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -177,3 +178,3 @@ process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) - zap_cursor_fini(&zc); -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - } -@@ -184,4 +185,6 @@ process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) - (*count - 1) * sizeof (zbookmark_t), -- sizeof (zbookmark_t)) != 0) -- return (EFAULT); -+ sizeof (zbookmark_t)) != 0) { -+ zap_cursor_fini(&zc); -+ return (SET_ERROR(EFAULT)); -+ } - -@@ -203,3 +206,3 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) - if (*count == 0) -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - -@@ -208,3 +211,3 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) - sizeof (zbookmark_t)) != 0) -- return (EFAULT); -+ return (SET_ERROR(EFAULT)); - -diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c -index 9fb75f3..5b82238 100644 ---- a/module/zfs/spa_history.c -+++ b/module/zfs/spa_history.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2011 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -32,2 +32,4 @@ - #include -+#include -+#include - #include -@@ -35,2 +37,3 @@ - #include -+#include - #include "zfs_comutil.h" -@@ -187,3 +190,3 @@ spa_history_zone(void) - #else -- return ("global"); -+ return (NULL); - #endif -@@ -196,7 +199,6 @@ spa_history_zone(void) - static void --spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+spa_history_log_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- history_arg_t *hap = arg2; -- const char *history_str = hap->ha_history_str; -+ nvlist_t *nvl = arg; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; - objset_t *mos = spa->spa_meta_objset; -@@ -206,3 +208,2 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - uint64_t le_len; -- nvlist_t *nvrecord; - char *record_packed = NULL; -@@ -223,3 +224,3 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - */ -- VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); -+ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - shpp = dbp->db_data; -@@ -236,42 +237,32 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_PUSHPAGE) == 0); -- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, -- gethrestime_sec()) == 0); -- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, hap->ha_uid) == 0); -- if (hap->ha_zone != NULL) -- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, -- hap->ha_zone) == 0); -+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); - #ifdef _KERNEL -- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST, -- utsname.nodename) == 0); -+ fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename); - #endif -- if (hap->ha_log_type == LOG_CMD_POOL_CREATE || -- hap->ha_log_type == LOG_CMD_NORMAL) { -- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, -- history_str) == 0); -- -- zfs_dbgmsg("command: %s", history_str); -- } else { -- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, -- hap->ha_event) == 0); -- VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG, -- tx->tx_txg) == 0); -- VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, -- history_str) == 0); -- -- zfs_dbgmsg("internal %s pool:%s txg:%llu %s", -- zfs_history_event_names[hap->ha_event], spa_name(spa), -- (longlong_t)tx->tx_txg, history_str); -- -+ if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { -+ zfs_dbgmsg("command: %s", -+ fnvlist_lookup_string(nvl, 
ZPOOL_HIST_CMD)); -+ } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { -+ if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { -+ zfs_dbgmsg("txg %lld %s %s (id %llu) %s", -+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), -+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); -+ } else { -+ zfs_dbgmsg("txg %lld %s %s", -+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); -+ } -+ } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { -+ zfs_dbgmsg("ioctl %s", -+ fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); - } - -- VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); -- record_packed = kmem_alloc(reclen, KM_PUSHPAGE); -- -- VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, -- NV_ENCODE_XDR, KM_PUSHPAGE) == 0); -+ VERIFY3U(nvlist_pack(nvl, &record_packed, &reclen, NV_ENCODE_NATIVE, -+ KM_PUSHPAGE), ==, 0); - - mutex_enter(&spa->spa_history_lock); -- if (hap->ha_log_type == LOG_CMD_POOL_CREATE) -- VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); - -@@ -283,5 +274,6 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - -- if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) { -- shpp->sh_pool_create_len += sizeof (le_len) + reclen; -- shpp->sh_bof = shpp->sh_pool_create_len; -+ /* The first command is the create, which we keep forever */ -+ if (ret == 0 && shpp->sh_pool_create_len == 0 && -+ nvlist_exists(nvl, ZPOOL_HIST_CMD)) { -+ shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; - } -@@ -289,10 +281,5 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - mutex_exit(&spa->spa_history_lock); -- nvlist_free(nvrecord); -- kmem_free(record_packed, reclen); -+ fnvlist_pack_free(record_packed, reclen); - dmu_buf_rele(dbp, FTAG); -- -- strfree(hap->ha_history_str); -- if (hap->ha_zone != NULL) -- strfree(hap->ha_zone); -- kmem_free(hap, sizeof (history_arg_t)); -+ fnvlist_free(nvl); - } -@@ -303,9 +290,24 @@ spa_history_log_sync(void *arg1, void *arg2, dmu_tx_t *tx) - int --spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) -+spa_history_log(spa_t *spa, const char *msg) -+{ -+ int err; -+ nvlist_t *nvl; -+ -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ -+ fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); -+ err = spa_history_log_nvl(spa, nvl); -+ fnvlist_free(nvl); -+ return (err); -+} -+ -+int -+spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) - { -- history_arg_t *ha; - int err = 0; - dmu_tx_t *tx; -+ nvlist_t *nvarg; - -- ASSERT(what != LOG_INTERNAL); -+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) -+ return (SET_ERROR(EINVAL)); - -@@ -318,15 +320,17 @@ spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) - -- ha = kmem_alloc(sizeof (history_arg_t), KM_PUSHPAGE); -- ha->ha_history_str = strdup(history_str); -- ha->ha_zone = strdup(spa_history_zone()); -- ha->ha_log_type = what; -- ha->ha_uid = crgetuid(CRED()); -+ VERIFY0(nvlist_dup(nvl, &nvarg, KM_PUSHPAGE)); -+ if (spa_history_zone() != NULL) { -+ fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, -+ spa_history_zone()); -+ } -+ fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); - - /* Kick this off asynchronously; errors are ignored. 
*/ -- dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, -- spa_history_log_sync, spa, ha, 0, tx); -+ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, -+ nvarg, 0, tx); - dmu_tx_commit(tx); - -- /* spa_history_log_sync will free ha and strings */ -+ /* spa_history_log_sync will free nvl */ - return (err); -+ - } -@@ -347,3 +351,3 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) - /* -- * If the command history doesn't exist (older pool), -+ * If the command history doesn't exist (older pool), - * that's ok, just return ENOENT. -@@ -351,3 +355,3 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) - if (!spa->spa_history) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -430,8 +434,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) - -+/* -+ * The nvlist will be consumed by this call. -+ */ - static void --log_internal(history_internal_events_t event, spa_t *spa, -+log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, - dmu_tx_t *tx, const char *fmt, va_list adx) - { -- history_arg_t *ha; -- va_list adx_copy; -+ char *msg; -+ va_list adx1; -+ int size; - -@@ -440,22 +448,29 @@ log_internal(history_internal_events_t event, spa_t *spa, - * initialized yet, so don't bother logging the internal events. -+ * Likewise if the pool is not writeable. - */ -- if (tx->tx_txg == TXG_INITIAL) -+ if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { -+ fnvlist_free(nvl); - return; -+ } -+ -+ va_copy(adx1, adx); -+ size = vsnprintf(NULL, 0, fmt, adx1) + 1; -+ msg = kmem_alloc(size, KM_PUSHPAGE); -+ va_end(adx1); -+ va_copy(adx1, adx); -+ (void) vsprintf(msg, fmt, adx1); -+ va_end(adx1); -+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); -+ kmem_free(msg, size); - -- ha = kmem_alloc(sizeof (history_arg_t), KM_PUSHPAGE); -- va_copy(adx_copy, adx); -- ha->ha_history_str = kmem_vasprintf(fmt, adx_copy); -- va_end(adx_copy); -- ha->ha_log_type = LOG_INTERNAL; -- ha->ha_event = event; -- ha->ha_zone = NULL; -- ha->ha_uid = 0; -+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); -+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); - - if (dmu_tx_is_syncing(tx)) { -- spa_history_log_sync(spa, ha, tx); -+ spa_history_log_sync(nvl, tx); - } else { -- dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, -- spa_history_log_sync, spa, ha, 0, tx); -+ dsl_sync_task_nowait(spa_get_dsl(spa), -+ spa_history_log_sync, nvl, 0, tx); - } -- /* spa_history_log_sync() will free ha and strings */ -+ /* spa_history_log_sync() will free nvl */ - } -@@ -463,3 +478,3 @@ log_internal(history_internal_events_t event, spa_t *spa, - void --spa_history_log_internal(history_internal_events_t event, spa_t *spa, -+spa_history_log_internal(spa_t *spa, const char *operation, - dmu_tx_t *tx, const char *fmt, ...) -@@ -468,2 +483,3 @@ spa_history_log_internal(history_internal_events_t event, spa_t *spa, - va_list adx; -+ nvlist_t *nvl; - -@@ -479,3 +495,4 @@ spa_history_log_internal(history_internal_events_t event, spa_t *spa, - va_start(adx, fmt); -- log_internal(event, spa, htx, fmt, adx); -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ log_internal(nvl, operation, spa, htx, fmt, adx); - va_end(adx); -@@ -488,19 +505,50 @@ spa_history_log_internal(history_internal_events_t event, spa_t *spa, - void --spa_history_log_version(spa_t *spa, history_internal_events_t event) -+spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, -+ dmu_tx_t *tx, const char *fmt, ...) 
-+{ -+ va_list adx; -+ char namebuf[MAXNAMELEN]; -+ nvlist_t *nvl; -+ -+ ASSERT(tx != NULL); -+ -+ dsl_dataset_name(ds, namebuf); -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); -+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); -+ -+ va_start(adx, fmt); -+ log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); -+ va_end(adx); -+} -+ -+void -+spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, -+ dmu_tx_t *tx, const char *fmt, ...) - { --#ifdef _KERNEL -- uint64_t current_vers = spa_version(spa); -- -- if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { -- spa_history_log_internal(event, spa, NULL, -- "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", -- (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, -- utsname.nodename, utsname.release, utsname.version, -- utsname.machine); -- } -- cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", -- event == LOG_POOL_IMPORT ? "imported" : -- event == LOG_POOL_CREATE ? "created" : "accessed", -- (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); --#endif -+ va_list adx; -+ char namebuf[MAXNAMELEN]; -+ nvlist_t *nvl; -+ -+ ASSERT(tx != NULL); -+ -+ dsl_dir_name(dd, namebuf); -+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); -+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, -+ dd->dd_phys->dd_head_dataset_obj); -+ -+ va_start(adx, fmt); -+ log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); -+ va_end(adx); -+} -+ -+void -+spa_history_log_version(spa_t *spa, const char *operation) -+{ -+ spa_history_log_internal(spa, operation, NULL, -+ "pool version %llu; software version %llu/%d; uts %s %s %s %s", -+ (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, -+ utsname.nodename, utsname.release, utsname.version, -+ utsname.machine); - } -diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c -index 0ca9f3a..f1e1a72 100644 ---- a/module/zfs/spa_misc.c -+++ b/module/zfs/spa_misc.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -@@ -50,2 +50,3 @@ - #include -+#include - #include "zfs_prop.h" -@@ -239,12 +240,10 @@ int spa_mode_global; - /* -- * Expiration time in units of zfs_txg_synctime_ms. This value has two -- * meanings. First it is used to determine when the spa_deadman logic -- * should fire. By default the spa_deadman will fire if spa_sync has -- * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds). -- * Secondly, the value determines if an I/O is considered "hung". -- * Any I/O that has not completed in zfs_deadman_synctime is considered -- * "hung" resulting in a zevent being posted. -- * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds). -+ * Expiration time in milliseconds. This value has two meanings. First it is -+ * used to determine when the spa_deadman() logic should fire. By default the -+ * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. -+ * Secondly, the value determines if an I/O is considered "hung". Any I/O that -+ * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting -+ * in a system panic. 
- */ --unsigned long zfs_deadman_synctime = 1000ULL; -+unsigned long zfs_deadman_synctime_ms = 1000000ULL; - -@@ -255,2 +254,12 @@ int zfs_deadman_enabled = 1; - -+/* -+ * The worst case is single-sector max-parity RAID-Z blocks, in which -+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) -+ * times the size; so just assume that. Add to this the fact that -+ * we can have up to 3 DVAs per bp, and one more factor of 2 because -+ * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, -+ * the worst case is: -+ * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 -+ */ -+int spa_asize_inflation = 24; - -@@ -270,3 +279,3 @@ spa_config_lock_init(spa_t *spa) - cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); -- refcount_create(&scl->scl_count); -+ refcount_create_untracked(&scl->scl_count); - scl->scl_writer = NULL; -@@ -328,2 +337,4 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) - -+ ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); -+ - for (i = 0; i < SCL_LOCKS; i++) { -@@ -408,3 +419,2 @@ spa_lookup(const char *name) - avl_index_t where; -- char c = 0; - char *cp; -@@ -413,2 +423,4 @@ spa_lookup(const char *name) - -+ (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); -+ - /* -@@ -417,14 +429,8 @@ spa_lookup(const char *name) - */ -- cp = strpbrk(name, "/@"); -- if (cp) { -- c = *cp; -+ cp = strpbrk(search.spa_name, "/@"); -+ if (cp != NULL) - *cp = '\0'; -- } - -- (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); - spa = avl_find(&spa_namespace_avl, &search, &where); - -- if (cp) -- *cp = c; -- - return (spa); -@@ -449,3 +455,3 @@ spa_deadman(void *arg) - spa->spa_deadman_tqid = taskq_dispatch_delay(system_taskq, -- spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + -+ spa_deadman, spa, TQ_PUSHPAGE, ddi_get_lbolt() + - NSEC_TO_TICK(spa->spa_deadman_synctime)); -@@ -495,4 +501,3 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) - -- spa->spa_deadman_synctime = zfs_deadman_synctime * -- zfs_txg_synctime_ms * MICROSEC; -+ spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); - -@@ -500,2 +505,3 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) - spa_config_lock_init(spa); -+ spa_stats_init(spa); - -@@ -538,5 +544,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) - VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, -- KM_SLEEP) == 0); -+ KM_PUSHPAGE) == 0); - } - -+ spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); -+ - return (spa); -@@ -583,2 +591,3 @@ spa_remove(spa_t *spa) - -+ spa_stats_destroy(spa); - spa_config_lock_destroy(spa); -@@ -1289,3 +1298,3 @@ spa_freeze(spa_t *spa) - * This is a stripped-down version of strtoull, suitable only for converting -- * lowercase hexidecimal numbers that don't overflow. -+ * lowercase hexadecimal numbers that don't overflow. - */ -@@ -1454,10 +1463,3 @@ spa_get_asize(spa_t *spa, uint64_t lsize) - { -- /* -- * The worst case is single-sector max-parity RAID-Z blocks, in which -- * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) -- * times the size; so just assume that. Add to this the fact that -- * we can have up to 3 DVAs per bp, and one more factor of 2 because -- * the block may be dittoed with up to 3 DVAs by ddt_sync(). 
-- */ -- return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2); -+ return (lsize * spa_asize_inflation); - } -@@ -1632,2 +1634,19 @@ spa_init(int mode) - -+#ifndef _KERNEL -+ if (spa_mode_global != FREAD && dprintf_find_string("watch")) { -+ struct sigaction sa; -+ -+ sa.sa_flags = SA_SIGINFO; -+ sigemptyset(&sa.sa_mask); -+ sa.sa_sigaction = arc_buf_sigsegv; -+ -+ if (sigaction(SIGSEGV, &sa, NULL) == -1) { -+ perror("could not enable watchpoints: " -+ "sigaction(SIGSEGV, ...) = "); -+ } else { -+ arc_watch = B_TRUE; -+ } -+ } -+#endif -+ - fm_init(); -@@ -1636,2 +1655,3 @@ spa_init(int mode) - space_map_init(); -+ ddt_init(); - zio_init(); -@@ -1658,2 +1678,3 @@ spa_fini(void) - zio_fini(); -+ ddt_fini(); - space_map_fini(); -@@ -1759,3 +1780,3 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) - if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - bzero(ps, sizeof (pool_scan_stat_t)); -@@ -1865,4 +1886,4 @@ EXPORT_SYMBOL(spa_namespace_lock); - --module_param(zfs_deadman_synctime, ulong, 0644); --MODULE_PARM_DESC(zfs_deadman_synctime,"Expire in units of zfs_txg_synctime_ms"); -+module_param(zfs_deadman_synctime_ms, ulong, 0644); -+MODULE_PARM_DESC(zfs_deadman_synctime_ms, "Expiration time in milliseconds"); - -@@ -1870,2 +1891,6 @@ module_param(zfs_deadman_enabled, int, 0644); - MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer"); -+ -+module_param(spa_asize_inflation, int, 0644); -+MODULE_PARM_DESC(spa_asize_inflation, -+ "SPA size estimate multiplication factor"); - #endif -diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c -new file mode 100644 -index 0000000..dbc761e ---- /dev/null -+++ b/module/zfs/spa_stats.c -@@ -0,0 +1,691 @@ -+/* -+ * CDDL HEADER START -+ * -+ * The contents of this file are subject to the terms of the -+ * Common Development and Distribution License (the "License"). -+ * You may not use this file except in compliance with the License. -+ * -+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+ * or http://www.opensolaris.org/os/licensing. -+ * See the License for the specific language governing permissions -+ * and limitations under the License. -+ * -+ * When distributing Covered Code, include this CDDL HEADER in each -+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+ * If applicable, add the following below this CDDL HEADER, with the -+ * fields enclosed by brackets "[]" replaced with your own identifying -+ * information: Portions Copyright [yyyy] [name of copyright owner] -+ * -+ * CDDL HEADER END -+ */ -+ -+#include -+#include -+ -+/* -+ * Keeps stats on last N reads per spa_t, disabled by default. -+ */ -+int zfs_read_history = 0; -+ -+/* -+ * Include cache hits in history, disabled by default. -+ */ -+int zfs_read_history_hits = 0; -+ -+/* -+ * Keeps stats on the last N txgs, disabled by default. 
-+ */ -+int zfs_txg_history = 0; -+ -+/* -+ * ========================================================================== -+ * SPA Read History Routines -+ * ========================================================================== -+ */ -+ -+/* -+ * Read statistics - Information exported regarding each arc_read call -+ */ -+typedef struct spa_read_history { -+ uint64_t uid; /* unique identifier */ -+ hrtime_t start; /* time read completed */ -+ uint64_t objset; /* read from this objset */ -+ uint64_t object; /* read of this object number */ -+ uint64_t level; /* block's indirection level */ -+ uint64_t blkid; /* read of this block id */ -+ char origin[24]; /* read originated from here */ -+ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ -+ pid_t pid; /* PID of task doing read */ -+ char comm[16]; /* process name of task doing read */ -+ list_node_t srh_link; -+} spa_read_history_t; -+ -+static int -+spa_read_history_headers(char *buf, size_t size) -+{ -+ size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " -+ "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", -+ "level", "blkid", "aflags", "origin", "pid", "process"); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+static int -+spa_read_history_data(char *buf, size_t size, void *data) -+{ -+ spa_read_history_t *srh = (spa_read_history_t *)data; -+ -+ size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx " -+ "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", -+ (u_longlong_t)srh->uid, srh->start, -+ (longlong_t)srh->objset, (longlong_t)srh->object, -+ (longlong_t)srh->level, (longlong_t)srh->blkid, -+ srh->aflags, srh->origin, srh->pid, srh->comm); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+/* -+ * Calculate the address for the next spa_stats_history_t entry. The -+ * ssh->lock will be held until ksp->ks_ndata entries are processed. -+ */ -+static void * -+spa_read_history_addr(kstat_t *ksp, loff_t n) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ -+ ASSERT(MUTEX_HELD(&ssh->lock)); -+ -+ if (n == 0) -+ ssh->private = list_tail(&ssh->list); -+ else if (ssh->private) -+ ssh->private = list_prev(&ssh->list, ssh->private); -+ -+ return (ssh->private); -+} -+ -+/* -+ * When the kstat is written discard all spa_read_history_t entires. The -+ * ssh->lock will be held until ksp->ks_ndata entries are processed. 
-+ */ -+static int -+spa_read_history_update(kstat_t *ksp, int rw) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ -+ if (rw == KSTAT_WRITE) { -+ spa_read_history_t *srh; -+ -+ while ((srh = list_remove_head(&ssh->list))) { -+ ssh->size--; -+ kmem_free(srh, sizeof (spa_read_history_t)); -+ } -+ -+ ASSERT3U(ssh->size, ==, 0); -+ } -+ -+ ksp->ks_ndata = ssh->size; -+ ksp->ks_data_size = ssh->size * sizeof (spa_read_history_t); -+ -+ return (0); -+} -+ -+static void -+spa_read_history_init(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ char name[KSTAT_STRLEN]; -+ kstat_t *ksp; -+ -+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); -+ list_create(&ssh->list, sizeof (spa_read_history_t), -+ offsetof(spa_read_history_t, srh_link)); -+ -+ ssh->count = 0; -+ ssh->size = 0; -+ ssh->private = NULL; -+ -+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); -+ name[KSTAT_STRLEN-1] = '\0'; -+ -+ ksp = kstat_create(name, 0, "reads", "misc", -+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); -+ ssh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &ssh->lock; -+ ksp->ks_data = NULL; -+ ksp->ks_private = spa; -+ ksp->ks_update = spa_read_history_update; -+ kstat_set_raw_ops(ksp, spa_read_history_headers, -+ spa_read_history_data, spa_read_history_addr); -+ kstat_install(ksp); -+ } -+} -+ -+static void -+spa_read_history_destroy(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ spa_read_history_t *srh; -+ kstat_t *ksp; -+ -+ ksp = ssh->kstat; -+ if (ksp) -+ kstat_delete(ksp); -+ -+ mutex_enter(&ssh->lock); -+ while ((srh = list_remove_head(&ssh->list))) { -+ ssh->size--; -+ kmem_free(srh, sizeof (spa_read_history_t)); -+ } -+ -+ ASSERT3U(ssh->size, ==, 0); -+ list_destroy(&ssh->list); -+ mutex_exit(&ssh->lock); -+ -+ mutex_destroy(&ssh->lock); -+} -+ -+void -+spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.read_history; -+ spa_read_history_t *srh, *rm; -+ -+ ASSERT3P(spa, !=, NULL); -+ ASSERT3P(zb, !=, NULL); -+ -+ if (zfs_read_history == 0 && ssh->size == 0) -+ return; -+ -+ if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) -+ return; -+ -+ srh = kmem_zalloc(sizeof (spa_read_history_t), KM_PUSHPAGE); -+ strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); -+ srh->start = gethrtime(); -+ srh->objset = zb->zb_objset; -+ srh->object = zb->zb_object; -+ srh->level = zb->zb_level; -+ srh->blkid = zb->zb_blkid; -+ srh->aflags = aflags; -+ srh->pid = getpid(); -+ -+ mutex_enter(&ssh->lock); -+ -+ srh->uid = ssh->count++; -+ list_insert_head(&ssh->list, srh); -+ ssh->size++; -+ -+ while (ssh->size > zfs_read_history) { -+ ssh->size--; -+ rm = list_remove_tail(&ssh->list); -+ kmem_free(rm, sizeof (spa_read_history_t)); -+ } -+ -+ mutex_exit(&ssh->lock); -+} -+ -+/* -+ * ========================================================================== -+ * SPA TXG History Routines -+ * ========================================================================== -+ */ -+ -+/* -+ * Txg statistics - Information exported regarding each txg sync -+ */ -+ -+typedef struct spa_txg_history { -+ uint64_t txg; /* txg id */ -+ txg_state_t state; /* active txg state */ -+ uint64_t nread; /* number of bytes read */ -+ uint64_t nwritten; /* number of bytes written */ -+ uint64_t reads; /* number of read operations */ -+ uint64_t writes; /* number of write operations */ -+ uint64_t ndirty; /* number of dirty bytes */ -+ hrtime_t 
times[TXG_STATE_COMMITTED]; /* completion times */ -+ list_node_t sth_link; -+} spa_txg_history_t; -+ -+static int -+spa_txg_history_headers(char *buf, size_t size) -+{ -+ size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s " -+ "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", -+ "ndirty", "nread", "nwritten", "reads", "writes", -+ "otime", "qtime", "wtime", "stime"); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+static int -+spa_txg_history_data(char *buf, size_t size, void *data) -+{ -+ spa_txg_history_t *sth = (spa_txg_history_t *)data; -+ uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; -+ char state; -+ -+ switch (sth->state) { -+ case TXG_STATE_BIRTH: state = 'B'; break; -+ case TXG_STATE_OPEN: state = 'O'; break; -+ case TXG_STATE_QUIESCED: state = 'Q'; break; -+ case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; -+ case TXG_STATE_SYNCED: state = 'S'; break; -+ case TXG_STATE_COMMITTED: state = 'C'; break; -+ default: state = '?'; break; -+ } -+ -+ if (sth->times[TXG_STATE_OPEN]) -+ open = sth->times[TXG_STATE_OPEN] - -+ sth->times[TXG_STATE_BIRTH]; -+ -+ if (sth->times[TXG_STATE_QUIESCED]) -+ quiesce = sth->times[TXG_STATE_QUIESCED] - -+ sth->times[TXG_STATE_OPEN]; -+ -+ if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) -+ wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - -+ sth->times[TXG_STATE_QUIESCED]; -+ -+ if (sth->times[TXG_STATE_SYNCED]) -+ sync = sth->times[TXG_STATE_SYNCED] - -+ sth->times[TXG_STATE_WAIT_FOR_SYNC]; -+ -+ size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu " -+ "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", -+ (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, -+ (u_longlong_t)sth->ndirty, -+ (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, -+ (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, -+ (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, -+ (u_longlong_t)sync); -+ buf[size] = '\0'; -+ -+ return (0); -+} -+ -+/* -+ * Calculate the address for the next spa_stats_history_t entry. The -+ * ssh->lock will be held until ksp->ks_ndata entries are processed. -+ */ -+static void * -+spa_txg_history_addr(kstat_t *ksp, loff_t n) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ -+ ASSERT(MUTEX_HELD(&ssh->lock)); -+ -+ if (n == 0) -+ ssh->private = list_tail(&ssh->list); -+ else if (ssh->private) -+ ssh->private = list_prev(&ssh->list, ssh->private); -+ -+ return (ssh->private); -+} -+ -+/* -+ * When the kstat is written discard all spa_txg_history_t entires. The -+ * ssh->lock will be held until ksp->ks_ndata entries are processed. 
-+ */ -+static int -+spa_txg_history_update(kstat_t *ksp, int rw) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ -+ ASSERT(MUTEX_HELD(&ssh->lock)); -+ -+ if (rw == KSTAT_WRITE) { -+ spa_txg_history_t *sth; -+ -+ while ((sth = list_remove_head(&ssh->list))) { -+ ssh->size--; -+ kmem_free(sth, sizeof (spa_txg_history_t)); -+ } -+ -+ ASSERT3U(ssh->size, ==, 0); -+ } -+ -+ ksp->ks_ndata = ssh->size; -+ ksp->ks_data_size = ssh->size * sizeof (spa_txg_history_t); -+ -+ return (0); -+} -+ -+static void -+spa_txg_history_init(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ char name[KSTAT_STRLEN]; -+ kstat_t *ksp; -+ -+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); -+ list_create(&ssh->list, sizeof (spa_txg_history_t), -+ offsetof(spa_txg_history_t, sth_link)); -+ -+ ssh->count = 0; -+ ssh->size = 0; -+ ssh->private = NULL; -+ -+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); -+ name[KSTAT_STRLEN-1] = '\0'; -+ -+ ksp = kstat_create(name, 0, "txgs", "misc", -+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); -+ ssh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &ssh->lock; -+ ksp->ks_data = NULL; -+ ksp->ks_private = spa; -+ ksp->ks_update = spa_txg_history_update; -+ kstat_set_raw_ops(ksp, spa_txg_history_headers, -+ spa_txg_history_data, spa_txg_history_addr); -+ kstat_install(ksp); -+ } -+} -+ -+static void -+spa_txg_history_destroy(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ spa_txg_history_t *sth; -+ kstat_t *ksp; -+ -+ ksp = ssh->kstat; -+ if (ksp) -+ kstat_delete(ksp); -+ -+ mutex_enter(&ssh->lock); -+ while ((sth = list_remove_head(&ssh->list))) { -+ ssh->size--; -+ kmem_free(sth, sizeof (spa_txg_history_t)); -+ } -+ -+ ASSERT3U(ssh->size, ==, 0); -+ list_destroy(&ssh->list); -+ mutex_exit(&ssh->lock); -+ -+ mutex_destroy(&ssh->lock); -+} -+ -+/* -+ * Add a new txg to historical record. -+ */ -+void -+spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ spa_txg_history_t *sth, *rm; -+ -+ if (zfs_txg_history == 0 && ssh->size == 0) -+ return; -+ -+ sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_PUSHPAGE); -+ sth->txg = txg; -+ sth->state = TXG_STATE_OPEN; -+ sth->times[TXG_STATE_BIRTH] = birth_time; -+ -+ mutex_enter(&ssh->lock); -+ -+ list_insert_head(&ssh->list, sth); -+ ssh->size++; -+ -+ while (ssh->size > zfs_txg_history) { -+ ssh->size--; -+ rm = list_remove_tail(&ssh->list); -+ kmem_free(rm, sizeof (spa_txg_history_t)); -+ } -+ -+ mutex_exit(&ssh->lock); -+} -+ -+/* -+ * Set txg state completion time and increment current state. -+ */ -+int -+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, -+ hrtime_t completed_time) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ spa_txg_history_t *sth; -+ int error = ENOENT; -+ -+ if (zfs_txg_history == 0) -+ return (0); -+ -+ mutex_enter(&ssh->lock); -+ for (sth = list_head(&ssh->list); sth != NULL; -+ sth = list_next(&ssh->list, sth)) { -+ if (sth->txg == txg) { -+ sth->times[completed_state] = completed_time; -+ sth->state++; -+ error = 0; -+ break; -+ } -+ } -+ mutex_exit(&ssh->lock); -+ -+ return (error); -+} -+ -+/* -+ * Set txg IO stats. 
-+ */ -+int -+spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, -+ uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.txg_history; -+ spa_txg_history_t *sth; -+ int error = ENOENT; -+ -+ if (zfs_txg_history == 0) -+ return (0); -+ -+ mutex_enter(&ssh->lock); -+ for (sth = list_head(&ssh->list); sth != NULL; -+ sth = list_next(&ssh->list, sth)) { -+ if (sth->txg == txg) { -+ sth->nread = nread; -+ sth->nwritten = nwritten; -+ sth->reads = reads; -+ sth->writes = writes; -+ sth->ndirty = ndirty; -+ error = 0; -+ break; -+ } -+ } -+ mutex_exit(&ssh->lock); -+ -+ return (error); -+} -+ -+/* -+ * ========================================================================== -+ * SPA TX Assign Histogram Routines -+ * ========================================================================== -+ */ -+ -+/* -+ * Tx statistics - Information exported regarding dmu_tx_assign time. -+ */ -+ -+/* -+ * When the kstat is written zero all buckets. When the kstat is read -+ * count the number of trailing buckets set to zero and update ks_ndata -+ * such that they are not output. -+ */ -+static int -+spa_tx_assign_update(kstat_t *ksp, int rw) -+{ -+ spa_t *spa = ksp->ks_private; -+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; -+ int i; -+ -+ if (rw == KSTAT_WRITE) { -+ for (i = 0; i < ssh->count; i++) -+ ((kstat_named_t *)ssh->private)[i].value.ui64 = 0; -+ } -+ -+ for (i = ssh->count; i > 0; i--) -+ if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0) -+ break; -+ -+ ksp->ks_ndata = i; -+ ksp->ks_data_size = i * sizeof (kstat_named_t); -+ -+ return (0); -+} -+ -+static void -+spa_tx_assign_init(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; -+ char name[KSTAT_STRLEN]; -+ kstat_named_t *ks; -+ kstat_t *ksp; -+ int i; -+ -+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); -+ -+ ssh->count = 42; /* power of two buckets for 1ns to 2,199s */ -+ ssh->size = ssh->count * sizeof (kstat_named_t); -+ ssh->private = kmem_alloc(ssh->size, KM_SLEEP); -+ -+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); -+ name[KSTAT_STRLEN-1] = '\0'; -+ -+ for (i = 0; i < ssh->count; i++) { -+ ks = &((kstat_named_t *)ssh->private)[i]; -+ ks->data_type = KSTAT_DATA_UINT64; -+ ks->value.ui64 = 0; -+ (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", -+ (u_longlong_t)1 << i); -+ } -+ -+ ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", -+ KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); -+ ssh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &ssh->lock; -+ ksp->ks_data = ssh->private; -+ ksp->ks_ndata = ssh->count; -+ ksp->ks_data_size = ssh->size; -+ ksp->ks_private = spa; -+ ksp->ks_update = spa_tx_assign_update; -+ kstat_install(ksp); -+ } -+} -+ -+static void -+spa_tx_assign_destroy(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; -+ kstat_t *ksp; -+ -+ ksp = ssh->kstat; -+ if (ksp) -+ kstat_delete(ksp); -+ -+ kmem_free(ssh->private, ssh->size); -+ mutex_destroy(&ssh->lock); -+} -+ -+void -+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; -+ uint64_t idx = 0; -+ -+ while (((1 << idx) < nsecs) && (idx < ssh->size - 1)) -+ idx++; -+ -+ atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64); -+} -+ -+/* -+ * ========================================================================== -+ * SPA IO History Routines -+ * 
========================================================================== -+ */ -+static int -+spa_io_history_update(kstat_t *ksp, int rw) -+{ -+ if (rw == KSTAT_WRITE) -+ memset(ksp->ks_data, 0, ksp->ks_data_size); -+ -+ return (0); -+} -+ -+static void -+spa_io_history_init(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ char name[KSTAT_STRLEN]; -+ kstat_t *ksp; -+ -+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); -+ -+ (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); -+ name[KSTAT_STRLEN-1] = '\0'; -+ -+ ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); -+ ssh->kstat = ksp; -+ -+ if (ksp) { -+ ksp->ks_lock = &ssh->lock; -+ ksp->ks_private = spa; -+ ksp->ks_update = spa_io_history_update; -+ kstat_install(ksp); -+ } -+} -+ -+static void -+spa_io_history_destroy(spa_t *spa) -+{ -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ if (ssh->kstat) -+ kstat_delete(ssh->kstat); -+ -+ mutex_destroy(&ssh->lock); -+} -+ -+void -+spa_stats_init(spa_t *spa) -+{ -+ spa_read_history_init(spa); -+ spa_txg_history_init(spa); -+ spa_tx_assign_init(spa); -+ spa_io_history_init(spa); -+} -+ -+void -+spa_stats_destroy(spa_t *spa) -+{ -+ spa_tx_assign_destroy(spa); -+ spa_txg_history_destroy(spa); -+ spa_read_history_destroy(spa); -+ spa_io_history_destroy(spa); -+} -+ -+#if defined(_KERNEL) && defined(HAVE_SPL) -+module_param(zfs_read_history, int, 0644); -+MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); -+ -+module_param(zfs_read_history_hits, int, 0644); -+MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); -+ -+module_param(zfs_txg_history, int, 0644); -+MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); -+#endif -diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c -index a031f3a..2cf1d2a 100644 ---- a/module/zfs/space_map.c -+++ b/module/zfs/space_map.c -@@ -104,3 +104,3 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) - avl_index_t where; -- space_seg_t ssearch, *ss_before, *ss_after, *ss; -+ space_seg_t *ss_before, *ss_after, *ss; - uint64_t end = start + size; -@@ -117,7 +117,4 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) - -- ssearch.ss_start = start; -- ssearch.ss_end = end; -- ss = avl_find(&sm->sm_root, &ssearch, &where); -- -- if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) { -+ ss = space_map_find(sm, start, size, &where); -+ if (ss != NULL) { - zfs_panic_recover("zfs: allocating allocated segment" -@@ -173,3 +170,3 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) - avl_index_t where; -- space_seg_t ssearch, *ss, *newseg; -+ space_seg_t *ss, *newseg; - uint64_t end = start + size; -@@ -177,11 +174,4 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) - -- ASSERT(MUTEX_HELD(sm->sm_lock)); - VERIFY(!sm->sm_condensing); -- VERIFY(size != 0); -- VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); -- VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); -- -- ssearch.ss_start = start; -- ssearch.ss_end = end; -- ss = avl_find(&sm->sm_root, &ssearch, &where); -+ ss = space_map_find(sm, start, size, &where); - -@@ -228,8 +218,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) - --boolean_t --space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) -+space_seg_t * -+space_map_find(space_map_t *sm, uint64_t start, uint64_t size, -+ avl_index_t *wherep) - { -- avl_index_t where; - space_seg_t ssearch, *ss; -- uint64_t end = start + 
size; - -@@ -241,6 +230,16 @@ space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) - ssearch.ss_start = start; -- ssearch.ss_end = end; -- ss = avl_find(&sm->sm_root, &ssearch, &where); -+ ssearch.ss_end = start + size; -+ ss = avl_find(&sm->sm_root, &ssearch, wherep); -+ -+ if (ss != NULL && ss->ss_start <= start && ss->ss_end >= start + size) -+ return (ss); -+ return (NULL); -+} -+ -+boolean_t -+space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) -+{ -+ avl_index_t where; - -- return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); -+ return (space_map_find(sm, start, size, &where) != 0); - } -diff --git a/module/zfs/txg.c b/module/zfs/txg.c -index 7c820af..524fe8e 100644 ---- a/module/zfs/txg.c -+++ b/module/zfs/txg.c -@@ -29,2 +29,3 @@ - #include -+#include - #include -@@ -33,3 +34,2 @@ - #include --#include - -@@ -48,3 +48,3 @@ - * (though it may be blocked waiting to enter the quiescing state). In broad -- * strokes, transactions — operations that change in-memory structures — are -+ * strokes, transactions -- operations that change in-memory structures -- are - * accepted into the txg in the open state, and are completed while the txg is -@@ -56,3 +56,3 @@ - * When a new txg becomes active, it first enters the open state. New -- * transactions — updates to in-memory structures — are assigned to the -+ * transactions -- updates to in-memory structures -- are assigned to the - * currently open txg. There is always a txg in the open state so that ZFS can -@@ -129,2 +129,4 @@ txg_init(dsl_pool_t *dp, uint64_t txg) - mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); -+ mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT, -+ NULL); - for (i = 0; i < TXG_SIZE; i++) { -@@ -171,2 +173,3 @@ txg_fini(dsl_pool_t *dp) - -+ mutex_destroy(&tx->tx_cpu[c].tc_open_lock); - mutex_destroy(&tx->tx_cpu[c].tc_lock); -@@ -235,3 +238,3 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) - static void --txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) -+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) - { -@@ -305,6 +308,8 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) - -- mutex_enter(&tc->tc_lock); -- -+ mutex_enter(&tc->tc_open_lock); - txg = tx->tx_open_txg; -+ -+ mutex_enter(&tc->tc_lock); - tc->tc_count[txg & TXG_MASK]++; -+ mutex_exit(&tc->tc_lock); - -@@ -321,3 +326,4 @@ txg_rele_to_quiesce(txg_handle_t *th) - -- mutex_exit(&tc->tc_lock); -+ ASSERT(!MUTEX_HELD(&tc->tc_lock)); -+ mutex_exit(&tc->tc_open_lock); - } -@@ -350,2 +356,8 @@ txg_rele_to_sync(txg_handle_t *th) - -+/* -+ * Blocks until all transactions in the group are committed. -+ * -+ * On return, the transaction group has reached a stable state in which it can -+ * then be passed off to the syncing context. -+ */ - static void -@@ -353,4 +365,2 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - { -- hrtime_t start; -- txg_history_t *th; - tx_state_t *tx = &dp->dp_tx; -@@ -360,6 +370,6 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - /* -- * Grab all tx_cpu locks so nobody else can get into this txg. -+ * Grab all tc_open_locks so nobody else can get into this txg. 
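The txg.c hunks above introduce a second per-CPU lock: tc_open_lock is taken in txg_hold_open() and held until txg_rele_to_quiesce(), and it is what txg_quiesce() grabs on every CPU to fence new holds out of the txg, while the original tc_lock now only protects the per-txg hold counts. A hedged pthread model of that split follows; the types, sizes, and function names are illustrative stand-ins, not the SPL primitives (build with -lpthread).

/*
 * Illustrative pthread model of the tc_open_lock / tc_lock split: the
 * quiescing step takes every per-CPU open lock to fence out new holds,
 * advances the open txg, then releases them; hold counts live under the
 * narrower tc_lock.  NCPUS and TXG_SIZE are stand-ins, not kernel values.
 */
#include <pthread.h>
#include <stdint.h>

#define	NCPUS		4
#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

typedef struct tx_cpu {
	pthread_mutex_t	tc_open_lock;	/* held while assigned to the open txg */
	pthread_mutex_t	tc_lock;	/* protects tc_count[] only */
	int		tc_count[TXG_SIZE];
} tx_cpu_t;

static tx_cpu_t tx_cpu[NCPUS];
static uint64_t tx_open_txg = 1;

void
tx_init(void)
{
	int c;

	for (c = 0; c < NCPUS; c++) {
		(void) pthread_mutex_init(&tx_cpu[c].tc_open_lock, NULL);
		(void) pthread_mutex_init(&tx_cpu[c].tc_lock, NULL);
	}
}

/* Analogue of txg_hold_open(): keeps tc_open_lock held until the release. */
uint64_t
tx_hold_open(int cpu)
{
	tx_cpu_t *tc = &tx_cpu[cpu];
	uint64_t txg;

	(void) pthread_mutex_lock(&tc->tc_open_lock);
	txg = tx_open_txg;
	(void) pthread_mutex_lock(&tc->tc_lock);
	tc->tc_count[txg & TXG_MASK]++;
	(void) pthread_mutex_unlock(&tc->tc_lock);
	return (txg);
}

/* Analogue of txg_rele_to_quiesce(): drop only the open lock. */
void
tx_rele_to_quiesce(int cpu)
{
	(void) pthread_mutex_unlock(&tx_cpu[cpu].tc_open_lock);
}

/* Analogue of the first half of txg_quiesce(): advance the open txg. */
void
tx_advance_open(void)
{
	int c;

	for (c = 0; c < NCPUS; c++)	/* fence out new holds */
		(void) pthread_mutex_lock(&tx_cpu[c].tc_open_lock);
	tx_open_txg++;
	for (c = 0; c < NCPUS; c++)
		(void) pthread_mutex_unlock(&tx_cpu[c].tc_open_lock);
}

Splitting the two means the long-held lock only gates entry into the open txg, while count updates such as txg_rele_to_sync() use the short-lived tc_lock.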
- */ - for (c = 0; c < max_ncpus; c++) -- mutex_enter(&tx->tx_cpu[c].tc_lock); -+ mutex_enter(&tx->tx_cpu[c].tc_open_lock); - -@@ -367,2 +377,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - tx->tx_open_txg++; -+ tx->tx_open_time = gethrtime(); -+ -+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx->tx_open_time); -+ spa_txg_history_add(dp->dp_spa, tx->tx_open_txg, tx->tx_open_time); -+ -+ DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); -+ DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); - -@@ -373,12 +390,3 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - for (c = 0; c < max_ncpus; c++) -- mutex_exit(&tx->tx_cpu[c].tc_lock); -- -- /* -- * Measure how long the txg was open and replace the kstat. -- */ -- th = dsl_pool_txg_history_get(dp, txg); -- th->th_kstat.open_time = gethrtime() - th->th_kstat.birth; -- th->th_kstat.state = TXG_STATE_QUIESCING; -- dsl_pool_txg_history_put(th); -- dsl_pool_txg_history_add(dp, tx->tx_open_txg); -+ mutex_exit(&tx->tx_cpu[c].tc_open_lock); - -@@ -387,4 +395,2 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - */ -- start = gethrtime(); -- - for (c = 0; c < max_ncpus; c++) { -@@ -397,8 +403,3 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) - -- /* -- * Measure how long the txg took to quiesce. -- */ -- th = dsl_pool_txg_history_get(dp, txg); -- th->th_kstat.quiesce_time = gethrtime() - start; -- dsl_pool_txg_history_put(th); -+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime()); - } -@@ -417,2 +418,5 @@ txg_do_callbacks(list_t *cb_list) - * Dispatch the commit callbacks registered on this txg to worker threads. -+ * -+ * If no callbacks are registered for a given TXG, nothing happens. -+ * This function creates a taskq for the associated pool, if needed. - */ -@@ -427,3 +431,6 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) - tx_cpu_t *tc = &tx->tx_cpu[c]; -- /* No need to lock tx_cpu_t at this point */ -+ /* -+ * No need to lock tx_cpu_t at this point, since this can -+ * only be called once a txg has been synced. 
-+ */ - -@@ -474,2 +481,3 @@ txg_sync_thread(dsl_pool_t *dp) - callb_cpr_t cpr; -+ vdev_stat_t *vs1, *vs2; - uint64_t start, delta; -@@ -487,8 +495,10 @@ txg_sync_thread(dsl_pool_t *dp) - -+ vs1 = kmem_alloc(sizeof (vdev_stat_t), KM_PUSHPAGE); -+ vs2 = kmem_alloc(sizeof (vdev_stat_t), KM_PUSHPAGE); -+ - start = delta = 0; - for (;;) { -- hrtime_t hrstart; -- txg_history_t *th; - uint64_t timer, timeout; - uint64_t txg; -+ uint64_t ndirty; - -@@ -505,3 +515,4 @@ txg_sync_thread(dsl_pool_t *dp) - tx->tx_synced_txg >= tx->tx_sync_txg_waiting && -- tx->tx_quiesced_txg == 0) { -+ tx->tx_quiesced_txg == 0 && -+ dp->dp_dirty_total < zfs_dirty_data_sync) { - dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", -@@ -524,4 +535,9 @@ txg_sync_thread(dsl_pool_t *dp) - -- if (tx->tx_exiting) -+ if (tx->tx_exiting) { -+ kmem_free(vs2, sizeof (vdev_stat_t)); -+ kmem_free(vs1, sizeof (vdev_stat_t)); - txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); -+ } -+ -+ vdev_get_stats(spa->spa_root_vdev, vs1); - -@@ -535,9 +551,5 @@ txg_sync_thread(dsl_pool_t *dp) - tx->tx_syncing_txg = txg; -+ DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_quiesce_more_cv); - -- th = dsl_pool_txg_history_get(dp, txg); -- th->th_kstat.state = TXG_STATE_SYNCING; -- vdev_get_stats(spa->spa_root_vdev, &th->th_vs1); -- dsl_pool_txg_history_put(th); -- - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", -@@ -546,4 +558,7 @@ txg_sync_thread(dsl_pool_t *dp) - -+ spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, -+ gethrtime()); -+ ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; -+ - start = ddi_get_lbolt(); -- hrstart = gethrtime(); - spa_sync(spa, txg); -@@ -554,2 +569,3 @@ txg_sync_thread(dsl_pool_t *dp) - tx->tx_syncing_txg = 0; -+ DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_done_cv); -@@ -561,18 +577,10 @@ txg_sync_thread(dsl_pool_t *dp) - -- /* -- * Measure the txg sync time determine the amount of I/O done. -- */ -- th = dsl_pool_txg_history_get(dp, txg); -- vdev_get_stats(spa->spa_root_vdev, &th->th_vs2); -- th->th_kstat.sync_time = gethrtime() - hrstart; -- th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] - -- th->th_vs1.vs_bytes[ZIO_TYPE_READ]; -- th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] - -- th->th_vs1.vs_bytes[ZIO_TYPE_WRITE]; -- th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] - -- th->th_vs1.vs_ops[ZIO_TYPE_READ]; -- th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] - -- th->th_vs1.vs_ops[ZIO_TYPE_WRITE]; -- th->th_kstat.state = TXG_STATE_COMMITTED; -- dsl_pool_txg_history_put(th); -+ vdev_get_stats(spa->spa_root_vdev, vs2); -+ spa_txg_history_set_io(spa, txg, -+ vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ], -+ vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE], -+ vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ], -+ vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE], -+ ndirty); -+ spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime()); - } -@@ -619,2 +627,3 @@ txg_quiesce_thread(dsl_pool_t *dp) - tx->tx_quiesced_txg = txg; -+ DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); - cv_broadcast(&tx->tx_sync_more_cv); -@@ -625,13 +634,13 @@ txg_quiesce_thread(dsl_pool_t *dp) - /* -- * Delay this thread by 'ticks' if we are still in the open transaction -- * group and there is already a waiting txg quiesing or quiesced. Abort -- * the delay if this txg stalls or enters the quiesing state. 
-+ * Delay this thread by delay nanoseconds if we are still in the open -+ * transaction group and there is already a waiting txg quiesing or quiesced. -+ * Abort the delay if this txg stalls or enters the quiesing state. - */ - void --txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) -+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) - { - tx_state_t *tx = &dp->dp_tx; -- clock_t timeout = ddi_get_lbolt() + ticks; -+ hrtime_t start = gethrtime(); - -- /* don't delay if this txg could transition to quiesing immediately */ -+ /* don't delay if this txg could transition to quiescing immediately */ - if (tx->tx_open_txg > txg || -@@ -646,6 +655,7 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) - -- while (ddi_get_lbolt() < timeout && -- tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) -- (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, -- timeout); -+ while (gethrtime() - start < delay && -+ tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { -+ (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, -+ &tx->tx_sync_lock, delay, resolution, 0); -+ } - -@@ -661,2 +671,4 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg) - -+ ASSERT(!dsl_pool_config_held(dp)); -+ - mutex_enter(&tx->tx_sync_lock); -@@ -684,2 +696,4 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) - -+ ASSERT(!dsl_pool_config_held(dp)); -+ - mutex_enter(&tx->tx_sync_lock); -@@ -699,2 +713,24 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg) - -+/* -+ * If there isn't a txg syncing or in the pipeline, push another txg through -+ * the pipeline by queiscing the open txg. -+ */ -+void -+txg_kick(dsl_pool_t *dp) -+{ -+ tx_state_t *tx = &dp->dp_tx; -+ -+ ASSERT(!dsl_pool_config_held(dp)); -+ -+ mutex_enter(&tx->tx_sync_lock); -+ if (tx->tx_syncing_txg == 0 && -+ tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && -+ tx->tx_sync_txg_waiting <= tx->tx_synced_txg && -+ tx->tx_quiesced_txg <= tx->tx_synced_txg) { -+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; -+ cv_broadcast(&tx->tx_quiesce_more_cv); -+ } -+ mutex_exit(&tx->tx_sync_lock); -+} -+ - boolean_t -@@ -749,6 +785,6 @@ txg_list_empty(txg_list_t *tl, uint64_t txg) - /* -- * Add an entry to the list. -- * Returns 0 if it's a new entry, 1 if it's already there. -+ * Add an entry to the list (unless it's already on the list). -+ * Returns B_TRUE if it was actually added. - */ --int -+boolean_t - txg_list_add(txg_list_t *tl, void *p, uint64_t txg) -@@ -757,7 +793,7 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg) - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); -- int already_on_list; -+ boolean_t add; - - mutex_enter(&tl->tl_lock); -- already_on_list = tn->tn_member[t]; -- if (!already_on_list) { -+ add = (tn->tn_member[t] == 0); -+ if (add) { - tn->tn_member[t] = 1; -@@ -768,3 +804,3 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg) - -- return (already_on_list); -+ return (add); - } -@@ -772,6 +808,7 @@ txg_list_add(txg_list_t *tl, void *p, uint64_t txg) - /* -- * Add an entry to the end of the list (walks list to find end). -- * Returns 0 if it's a new entry, 1 if it's already there. -+ * Add an entry to the end of the list, unless it's already on the list. -+ * (walks list to find end) -+ * Returns B_TRUE if it was actually added. 
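In the txg_delay() change above, the wait bound moves from lbolt ticks to nanoseconds: elapsed time is measured with gethrtime() and the sleep itself uses cv_timedwait_hires(), with the predicate rechecked after every wakeup. The same shape in portable POSIX C for readers outside the kernel tree; the 10 ms polling slice and the helper names are arbitrary choices for this sketch (link with -lpthread).

/*
 * Sketch of a high-resolution bounded wait: block on a condition variable
 * for at most delay_ns, measured on a monotonic clock, rechecking the
 * caller's predicate after each wakeup.
 */
#include <pthread.h>
#include <stdint.h>
#include <time.h>

static uint64_t
now_ns(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec);
}

/* Returns when either the predicate becomes true or delay_ns has elapsed. */
void
bounded_wait(pthread_mutex_t *lk, pthread_cond_t *cv,
    int (*pred)(void *), void *arg, uint64_t delay_ns)
{
	uint64_t start = now_ns();

	(void) pthread_mutex_lock(lk);
	while (!pred(arg) && now_ns() - start < delay_ns) {
		struct timespec abst;

		/* default condvars time out against CLOCK_REALTIME */
		(void) clock_gettime(CLOCK_REALTIME, &abst);
		abst.tv_nsec += 10 * 1000 * 1000;	/* 10 ms slices */
		if (abst.tv_nsec >= 1000000000L) {
			abst.tv_sec++;
			abst.tv_nsec -= 1000000000L;
		}
		(void) pthread_cond_timedwait(cv, lk, &abst);
	}
	(void) pthread_mutex_unlock(lk);
}

As in txg_delay(), the loop ends early once the predicate flips, so the delay is an upper bound rather than a fixed sleep.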
- */ --int -+boolean_t - txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) -@@ -780,7 +817,7 @@ txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) - txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); -- int already_on_list; -+ boolean_t add; - - mutex_enter(&tl->tl_lock); -- already_on_list = tn->tn_member[t]; -- if (!already_on_list) { -+ add = (tn->tn_member[t] == 0); -+ if (add) { - txg_node_t **tp; -@@ -796,3 +833,3 @@ txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) - -- return (already_on_list); -+ return (add); - } -@@ -847,3 +884,3 @@ txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) - --int -+boolean_t - txg_list_member(txg_list_t *tl, void *p, uint64_t txg) -@@ -853,3 +890,3 @@ txg_list_member(txg_list_t *tl, void *p, uint64_t txg) - -- return (tn->tn_member[t]); -+ return (tn->tn_member[t] != 0); - } -diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c -index 692cda1..a07dc00 100644 ---- a/module/zfs/uberblock.c -+++ b/module/zfs/uberblock.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -35,3 +36,3 @@ uberblock_verify(uberblock_t *ub) - if (ub->ub_magic != UBERBLOCK_MAGIC) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c -index 662a877..7751683 100644 ---- a/module/zfs/vdev.c -+++ b/module/zfs/vdev.c -@@ -24,3 +24,3 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -357,6 +357,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if ((ops = vdev_getops(type)) == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -371,15 +371,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - label_id != id) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_SPARE) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_L2CACHE) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -390,3 +390,3 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -398,6 +398,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (islog && spa_version(spa) < SPA_VERSION_SLOGS) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - - if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -411,3 +411,3 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - /* -@@ -418,6 +418,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, 
nvlist_t *nv, vdev_t *parent, uint_t id, - spa_version(spa) < SPA_VERSION_RAIDZ2) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } else { -@@ -428,3 +428,3 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - /* -@@ -528,4 +528,4 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, - -- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING, -- &vd->vdev_resilvering); -+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, -+ &vd->vdev_resilver_txg); - -@@ -951,3 +951,3 @@ vdev_probe_done(zio_t *zio) - spa, vd, NULL, 0, 0); -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - } -@@ -961,3 +961,3 @@ vdev_probe_done(zio_t *zio) - if (!vdev_accessible(vd, pio)) -- pio->io_error = ENXIO; -+ pio->io_error = SET_ERROR(ENXIO); - -@@ -968,5 +968,7 @@ vdev_probe_done(zio_t *zio) - /* -- * Determine whether this device is accessible by reading and writing -- * to several known locations: the pad regions of each vdev label -- * but the first (which we leave alone in case it contains a VTOC). -+ * Determine whether this device is accessible. -+ * -+ * Read and write to several known locations: the pad regions of each -+ * vdev label but the first, which we leave alone in case it contains -+ * a VTOC. - */ -@@ -1154,3 +1156,3 @@ vdev_open(vdev_t *vd) - vd->vdev_label_aux); -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } else if (vd->vdev_offline) { -@@ -1158,3 +1160,3 @@ vdev_open(vdev_t *vd) - vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -1193,3 +1195,3 @@ vdev_open(vdev_t *vd) - vd->vdev_label_aux); -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -1225,3 +1227,3 @@ vdev_open(vdev_t *vd) - VDEV_AUX_TOO_SMALL); -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - } -@@ -1236,3 +1238,3 @@ vdev_open(vdev_t *vd) - VDEV_AUX_TOO_SMALL); -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - } -@@ -1251,3 +1253,3 @@ vdev_open(vdev_t *vd) - VDEV_AUX_BAD_LABEL); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1338,3 +1340,3 @@ vdev_validate(vdev_t *vd, boolean_t strict) - if (vdev_validate(vd->vdev_child[c], strict) != 0) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -1424,3 +1426,3 @@ vdev_validate(vdev_t *vd, boolean_t strict) - state != POOL_STATE_ACTIVE) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -1683,2 +1685,71 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) - /* -+ * Returns the lowest txg in the DTL range. -+ */ -+static uint64_t -+vdev_dtl_min(vdev_t *vd) -+{ -+ space_seg_t *ss; -+ -+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); -+ ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0); -+ ASSERT0(vd->vdev_children); -+ -+ ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); -+ return (ss->ss_start - 1); -+} -+ -+/* -+ * Returns the highest txg in the DTL. -+ */ -+static uint64_t -+vdev_dtl_max(vdev_t *vd) -+{ -+ space_seg_t *ss; -+ -+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); -+ ASSERT3U(vd->vdev_dtl[DTL_MISSING].sm_space, !=, 0); -+ ASSERT0(vd->vdev_children); -+ -+ ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); -+ return (ss->ss_end); -+} -+ -+/* -+ * Determine if a resilvering vdev should remove any DTL entries from -+ * its range. 
If the vdev was resilvering for the entire duration of the -+ * scan then it should excise that range from its DTLs. Otherwise, this -+ * vdev is considered partially resilvered and should leave its DTL -+ * entries intact. The comment in vdev_dtl_reassess() describes how we -+ * excise the DTLs. -+ */ -+static boolean_t -+vdev_dtl_should_excise(vdev_t *vd) -+{ -+ spa_t *spa = vd->vdev_spa; -+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; -+ -+ ASSERT0(scn->scn_phys.scn_errors); -+ ASSERT0(vd->vdev_children); -+ -+ if (vd->vdev_resilver_txg == 0 || -+ vd->vdev_dtl[DTL_MISSING].sm_space == 0) -+ return (B_TRUE); -+ -+ /* -+ * When a resilver is initiated the scan will assign the scn_max_txg -+ * value to the highest txg value that exists in all DTLs. If this -+ * device's max DTL is not part of this scan (i.e. it is not in -+ * the range (scn_min_txg, scn_max_txg] then it is not eligible -+ * for excision. -+ */ -+ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { -+ ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); -+ ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); -+ ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); -+ return (B_TRUE); -+ } -+ return (B_FALSE); -+} -+ -+/* - * Reassess DTLs after a config change or scrub completion. -@@ -1705,5 +1776,13 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) - mutex_enter(&vd->vdev_dtl_lock); -+ -+ /* -+ * If we've completed a scan cleanly then determine -+ * if this vdev should remove any DTLs. We only want to -+ * excise regions on vdevs that were available during -+ * the entire duration of this scan. -+ */ - if (scrub_txg != 0 && - (spa->spa_scrub_started || -- (scn && scn->scn_phys.scn_errors == 0))) { -+ (scn != NULL && scn->scn_phys.scn_errors == 0)) && -+ vdev_dtl_should_excise(vd)) { - /* -@@ -1746,2 +1825,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) - space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); -+ -+ /* -+ * If the vdev was resilvering and no longer has any -+ * DTLs then reset its resilvering flag. -+ */ -+ if (vd->vdev_resilver_txg != 0 && -+ vd->vdev_dtl[DTL_MISSING].sm_space == 0 && -+ vd->vdev_dtl[DTL_OUTAGE].sm_space == 0) -+ vd->vdev_resilver_txg = 0; -+ - mutex_exit(&vd->vdev_dtl_lock); -@@ -1922,8 +2011,5 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) - vdev_writeable(vd)) { -- space_seg_t *ss; - -- ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); -- thismin = ss->ss_start - 1; -- ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); -- thismax = ss->ss_end; -+ thismin = vdev_dtl_min(vd); -+ thismax = vdev_dtl_max(vd); - needed = B_TRUE; -@@ -2204,6 +2290,8 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) - /* -- * Online the given vdev. If 'unspare' is set, it implies two things. First, -- * any attached spare device should be detached when the device finishes -- * resilvering. Second, the online should be treated like a 'test' online case, -- * so no FMA events are generated if the device fails to open. -+ * Online the given vdev. -+ * -+ * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached -+ * spare device should be detached when the device finishes resilvering. -+ * Second, the online should be treated like a 'test' online case, so no FMA -+ * events are generated if the device fails to open. 
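vdev_dtl_should_excise() above lets a vdev drop its DTL only when the completed scan covered every missing txg, i.e. when vdev_dtl_max() falls within the scan's scn_max_txg. A toy model of those bounds over a plain sorted array; seg_t here is an illustrative stand-in for the kernel's space_seg_t, not the real structure.

#include <stdint.h>

typedef struct seg {			/* [ss_start, ss_end) txg range */
	uint64_t ss_start;
	uint64_t ss_end;
} seg_t;

/* mirrors vdev_dtl_min()/vdev_dtl_max() over a sorted, non-empty array */
uint64_t
dtl_min(const seg_t *segs)
{
	return (segs[0].ss_start - 1);
}

uint64_t
dtl_max(const seg_t *segs, int nsegs)
{
	return (segs[nsegs - 1].ss_end);
}

/* excise only if the finished scan reached past the highest missing txg */
int
dtl_should_excise(const seg_t *segs, int nsegs, uint64_t scn_max_txg)
{
	return (nsegs == 0 || dtl_max(segs, nsegs) <= scn_max_txg);
}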
- */ -@@ -3210,3 +3298,3 @@ vdev_deadman(vdev_t *vd) - mutex_enter(&vq->vq_lock); -- if (avl_numnodes(&vq->vq_pending_tree) > 0) { -+ if (avl_numnodes(&vq->vq_active_tree) > 0) { - spa_t *spa = vd->vdev_spa; -@@ -3220,3 +3308,3 @@ vdev_deadman(vdev_t *vd) - */ -- fio = avl_first(&vq->vq_pending_tree); -+ fio = avl_first(&vq->vq_active_tree); - delta = gethrtime() - fio->io_timestamp; -diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c -index bf4ae7b..ffd50ec 100644 ---- a/module/zfs/vdev_cache.c -+++ b/module/zfs/vdev_cache.c -@@ -24,2 +24,5 @@ - */ -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. -+ */ - -@@ -255,4 +258,4 @@ vdev_cache_read(zio_t *zio) - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); -- ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);) - zio_t *fio; -+ ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS)); - -@@ -261,6 +264,6 @@ vdev_cache_read(zio_t *zio) - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (zio->io_size > zfs_vdev_cache_max) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - -@@ -270,3 +273,3 @@ vdev_cache_read(zio_t *zio) - if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) -- return (EXDEV); -+ return (SET_ERROR(EXDEV)); - -@@ -276,6 +279,6 @@ vdev_cache_read(zio_t *zio) - -- ve_search = kmem_alloc(sizeof(vdev_cache_entry_t), KM_PUSHPAGE); -+ ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_PUSHPAGE); - ve_search->ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, ve_search, NULL); -- kmem_free(ve_search, sizeof(vdev_cache_entry_t)); -+ kmem_free(ve_search, sizeof (vdev_cache_entry_t)); - -@@ -284,3 +287,3 @@ vdev_cache_read(zio_t *zio) - mutex_exit(&vc->vc_lock); -- return (ESTALE); -+ return (SET_ERROR(ESTALE)); - } -@@ -307,3 +310,3 @@ vdev_cache_read(zio_t *zio) - mutex_exit(&vc->vc_lock); -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - } -@@ -311,3 +314,3 @@ vdev_cache_read(zio_t *zio) - fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, -- ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, -+ ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); -diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c -index 2869716..cb0cdd7 100644 ---- a/module/zfs/vdev_disk.c -+++ b/module/zfs/vdev_disk.c -@@ -25,2 +25,3 @@ - * LLNL-CODE-403049. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -48,3 +49,3 @@ typedef struct dio_request { - int dr_bio_count; /* Count of bio's */ -- struct bio *dr_bio[0]; /* Attached bio's */ -+ struct bio *dr_bio[0]; /* Attached bio's */ - } dio_request_t; -@@ -66,3 +67,3 @@ vdev_bdev_mode(int smode) - -- return mode; -+ return (mode); - } -@@ -79,3 +80,3 @@ vdev_bdev_mode(int smode) - -- return mode; -+ return (mode); - } -@@ -140,3 +141,3 @@ vdev_elevator_switch(vdev_t *v, char *elevator) - /* Leave existing scheduler when set to "none" */ -- if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4)) -+ if (strncmp(elevator, "none", 4) && (strlen(elevator) == 4) == 0) - return (0); -@@ -146,3 +147,4 @@ vdev_elevator_switch(vdev_t *v, char *elevator) - #else -- /* For pre-2.6.36 kernels elevator_change() is not available. -+ /* -+ * For pre-2.6.36 kernels elevator_change() is not available. 
- * Therefore we fall back to using a usermodehelper to echo the -@@ -151,3 +153,3 @@ vdev_elevator_switch(vdev_t *v, char *elevator) - */ --# define SET_SCHEDULER_CMD \ -+#define SET_SCHEDULER_CMD \ - "exec 0vdev_path, device, error); -+ elevator, v->vdev_path, device, error); - -@@ -208,3 +210,3 @@ vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd) - if (IS_ERR(bdev)) -- return bdev; -+ return (bdev); - -@@ -232,5 +234,5 @@ vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd) - -- return result; -+ return (result); - #else -- return ERR_PTR(-EOPNOTSUPP); -+ return (ERR_PTR(-EOPNOTSUPP)); - #endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */ -@@ -249,3 +251,3 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, - v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return EINVAL; -+ return (EINVAL); - } -@@ -262,5 +264,5 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, - -- vd = kmem_zalloc(sizeof(vdev_disk_t), KM_PUSHPAGE); -+ vd = kmem_zalloc(sizeof (vdev_disk_t), KM_PUSHPAGE); - if (vd == NULL) -- return ENOMEM; -+ return (ENOMEM); - -@@ -287,4 +289,4 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, - if (IS_ERR(bdev)) { -- kmem_free(vd, sizeof(vdev_disk_t)); -- return -PTR_ERR(bdev); -+ kmem_free(vd, sizeof (vdev_disk_t)); -+ return (-PTR_ERR(bdev)); - } -@@ -313,3 +315,3 @@ skip_open: - -- return 0; -+ return (0); - } -@@ -326,5 +328,5 @@ vdev_disk_close(vdev_t *v) - vdev_bdev_close(vd->vd_bdev, -- vdev_bdev_mode(spa_mode(v->vdev_spa))); -+ vdev_bdev_mode(spa_mode(v->vdev_spa))); - -- kmem_free(vd, sizeof(vdev_disk_t)); -+ kmem_free(vd, sizeof (vdev_disk_t)); - v->vdev_tsd = NULL; -@@ -338,4 +340,4 @@ vdev_disk_dio_alloc(int bio_count) - -- dr = kmem_zalloc(sizeof(dio_request_t) + -- sizeof(struct bio *) * bio_count, KM_PUSHPAGE); -+ dr = kmem_zalloc(sizeof (dio_request_t) + -+ sizeof (struct bio *) * bio_count, KM_PUSHPAGE); - if (dr) { -@@ -350,3 +352,3 @@ vdev_disk_dio_alloc(int bio_count) - -- return dr; -+ return (dr); - } -@@ -362,4 +364,4 @@ vdev_disk_dio_free(dio_request_t *dr) - -- kmem_free(dr, sizeof(dio_request_t) + -- sizeof(struct bio *) * dr->dr_bio_count); -+ kmem_free(dr, sizeof (dio_request_t) + -+ sizeof (struct bio *) * dr->dr_bio_count); - } -@@ -371,15 +373,15 @@ vdev_disk_dio_is_sync(dio_request_t *dr) - /* BIO_RW_SYNC preferred interface from 2.6.12-2.6.29 */ -- return (dr->dr_rw & (1 << BIO_RW_SYNC)); -+ return (dr->dr_rw & (1 << BIO_RW_SYNC)); - #else --# ifdef HAVE_BIO_RW_SYNCIO -+#ifdef HAVE_BIO_RW_SYNCIO - /* BIO_RW_SYNCIO preferred interface from 2.6.30-2.6.35 */ -- return (dr->dr_rw & (1 << BIO_RW_SYNCIO)); --# else --# ifdef HAVE_REQ_SYNC -+ return (dr->dr_rw & (1 << BIO_RW_SYNCIO)); -+#else -+#ifdef HAVE_REQ_SYNC - /* REQ_SYNC preferred interface from 2.6.36-2.6.xx */ -- return (dr->dr_rw & REQ_SYNC); --# else --# error "Unable to determine bio sync flag" --# endif /* HAVE_REQ_SYNC */ --# endif /* HAVE_BIO_RW_SYNC */ -+ return (dr->dr_rw & REQ_SYNC); -+#else -+#error "Unable to determine bio sync flag" -+#endif /* HAVE_REQ_SYNC */ -+#endif /* HAVE_BIO_RW_SYNC */ - #endif /* HAVE_BIO_RW_SYNCIO */ -@@ -418,3 +420,3 @@ vdev_disk_dio_put(dio_request_t *dr) - -- return rc; -+ return (rc); - } -@@ -432,3 +434,3 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error) - bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt, -- bio->bi_idx, bio->bi_size, bio->bi_end_io, -+ BIO_BI_IDX(bio), BIO_BI_SIZE(bio), bio->bi_end_io, - atomic_read(&bio->bi_cnt)); -@@ -436,4 
+438,4 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error) - #ifndef HAVE_2ARGS_BIO_END_IO_T -- if (bio->bi_size) -- return 1; -+ if (BIO_BI_SIZE(bio)) -+ return (1); - #endif /* HAVE_2ARGS_BIO_END_IO_T */ -@@ -441,3 +443,3 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error) - if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags)) -- error = -EIO; -+ error = (-EIO); - -@@ -460,3 +462,3 @@ bio_nr_pages(void *bio_ptr, unsigned int bio_size) - return ((((unsigned long)bio_ptr + bio_size + PAGE_SIZE - 1) >> -- PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT)); -+ PAGE_SHIFT) - ((unsigned long)bio_ptr >> PAGE_SHIFT)); - } -@@ -492,3 +494,3 @@ bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) - -- return bio_size; -+ return (bio_size); - } -@@ -497,5 +499,5 @@ static int - __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, -- size_t kbuf_size, uint64_t kbuf_offset, int flags) -+ size_t kbuf_size, uint64_t kbuf_offset, int flags) - { -- dio_request_t *dr; -+ dio_request_t *dr; - caddr_t bio_ptr; -@@ -510,3 +512,3 @@ retry: - if (dr == NULL) -- return ENOMEM; -+ return (ENOMEM); - -@@ -546,6 +548,6 @@ retry: - dr->dr_bio[i] = bio_alloc(GFP_NOIO, -- bio_nr_pages(bio_ptr, bio_size)); -+ bio_nr_pages(bio_ptr, bio_size)); - if (dr->dr_bio[i] == NULL) { - vdev_disk_dio_free(dr); -- return ENOMEM; -+ return (ENOMEM); - } -@@ -556,3 +558,3 @@ retry: - dr->dr_bio[i]->bi_bdev = bdev; -- dr->dr_bio[i]->bi_sector = bio_offset >> 9; -+ BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_rw = dr->dr_rw; -@@ -565,4 +567,4 @@ retry: - /* Advance in buffer and construct another bio if needed */ -- bio_ptr += dr->dr_bio[i]->bi_size; -- bio_offset += dr->dr_bio[i]->bi_size; -+ bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]); -+ bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); - } -@@ -593,5 +595,5 @@ retry: - -- (void)vdev_disk_dio_put(dr); -+ (void) vdev_disk_dio_put(dr); - -- return error; -+ return (error); - } -@@ -600,6 +602,6 @@ int - vdev_disk_physio(struct block_device *bdev, caddr_t kbuf, -- size_t size, uint64_t offset, int flags) -+ size_t size, uint64_t offset, int flags) - { - bio_set_flags_failfast(bdev, &flags); -- return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags); -+ return (__vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags)); - } -@@ -632,3 +634,3 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) - if (!q) -- return ENXIO; -+ return (ENXIO); - -@@ -636,3 +638,3 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) - if (!bio) -- return ENOMEM; -+ return (ENOMEM); - -@@ -643,4 +645,5 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) - submit_bio(VDEV_WRITE_FLUSH_FUA, bio); -+ invalidate_bdev(bdev); - -- return 0; -+ return (0); - } -@@ -658,4 +661,4 @@ vdev_disk_io_start(zio_t *zio) - if (!vdev_readable(v)) { -- zio->io_error = ENXIO; -- return ZIO_PIPELINE_CONTINUE; -+ zio->io_error = SET_ERROR(ENXIO); -+ return (ZIO_PIPELINE_CONTINUE); - } -@@ -669,3 +672,3 @@ vdev_disk_io_start(zio_t *zio) - if (v->vdev_nowritecache) { -- zio->io_error = ENOTSUP; -+ zio->io_error = SET_ERROR(ENOTSUP); - break; -@@ -675,3 +678,3 @@ vdev_disk_io_start(zio_t *zio) - if (error == 0) -- return ZIO_PIPELINE_STOP; -+ return (ZIO_PIPELINE_STOP); - -@@ -684,6 +687,6 @@ vdev_disk_io_start(zio_t *zio) - default: -- zio->io_error = ENOTSUP; -+ zio->io_error = SET_ERROR(ENOTSUP); - } - -- return ZIO_PIPELINE_CONTINUE; -+ return (ZIO_PIPELINE_CONTINUE); - -@@ -698,4 +701,4 @@ vdev_disk_io_start(zio_t 
*zio) - default: -- zio->io_error = ENOTSUP; -- return ZIO_PIPELINE_CONTINUE; -+ zio->io_error = SET_ERROR(ENOTSUP); -+ return (ZIO_PIPELINE_CONTINUE); - } -@@ -703,9 +706,9 @@ vdev_disk_io_start(zio_t *zio) - error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, -- zio->io_size, zio->io_offset, flags); -+ zio->io_size, zio->io_offset, flags); - if (error) { - zio->io_error = error; -- return ZIO_PIPELINE_CONTINUE; -+ return (ZIO_PIPELINE_CONTINUE); - } - -- return ZIO_PIPELINE_STOP; -+ return (ZIO_PIPELINE_STOP); - } -@@ -721,3 +724,3 @@ vdev_disk_io_done(zio_t *zio) - if (zio->io_error == EIO) { -- vdev_t *v = zio->io_vd; -+ vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; -@@ -788,3 +791,3 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) - if (IS_ERR(bdev)) -- return -PTR_ERR(bdev); -+ return (-PTR_ERR(bdev)); - -@@ -793,10 +796,10 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) - vdev_bdev_close(bdev, vdev_bdev_mode(FREAD)); -- return EIO; -+ return (EIO); - } - -- size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t); -- label = vmem_alloc(sizeof(vdev_label_t), KM_PUSHPAGE); -+ size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); -+ label = vmem_alloc(sizeof (vdev_label_t), KM_PUSHPAGE); - - for (i = 0; i < VDEV_LABELS; i++) { -- uint64_t offset, state, txg = 0; -+ uint64_t offset, state, txg = 0; - -@@ -831,6 +834,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) - -- vmem_free(label, sizeof(vdev_label_t)); -+ vmem_free(label, sizeof (vdev_label_t)); - vdev_bdev_close(bdev, vdev_bdev_mode(FREAD)); - -- return 0; -+ return (0); - } -diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c -index 06999a8..858582a 100644 ---- a/module/zfs/vdev_file.c -+++ b/module/zfs/vdev_file.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -64,3 +64,3 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -102,3 +102,3 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; -- return (ENODEV); -+ return (SET_ERROR(ENODEV)); - } -@@ -156,3 +156,3 @@ vdev_file_io_strategy(void *arg) - if (resid != 0 && zio->io_error == 0) -- zio->io_error = ENOSPC; -+ zio->io_error = SET_ERROR(ENOSPC); - -@@ -164,3 +164,2 @@ vdev_file_io_start(zio_t *zio) - { -- spa_t *spa = zio->io_spa; - vdev_t *vd = zio->io_vd; -@@ -171,3 +170,3 @@ vdev_file_io_start(zio_t *zio) - if (!vdev_readable(vd)) { -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - return (ZIO_PIPELINE_CONTINUE); -@@ -181,3 +180,3 @@ vdev_file_io_start(zio_t *zio) - default: -- zio->io_error = ENOTSUP; -+ zio->io_error = SET_ERROR(ENOTSUP); - } -@@ -187,4 +186,4 @@ vdev_file_io_start(zio_t *zio) - -- spa_taskq_dispatch_ent(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, -- vdev_file_io_strategy, zio, 0, &zio->io_tqent); -+ VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, zio, -+ TQ_PUSHPAGE), !=, 0); - -diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c -index 1fe36fe..d5af110 100644 ---- a/module/zfs/vdev_label.c -+++ b/module/zfs/vdev_label.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. 
All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -220,24 +220,19 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, -- vd->vdev_ops->vdev_op_type) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); - if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) -- == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); - - if (vd->vdev_path != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, -- vd->vdev_path) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); - - if (vd->vdev_devid != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, -- vd->vdev_devid) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); - - if (vd->vdev_physpath != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, -- vd->vdev_physpath) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, -+ vd->vdev_physpath); - - if (vd->vdev_fru != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU, -- vd->vdev_fru) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - -@@ -262,4 +257,3 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - */ -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, -- vd->vdev_nparity) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } -@@ -267,10 +261,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (vd->vdev_wholedisk != -1ULL) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, -- vd->vdev_wholedisk) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, -+ vd->vdev_wholedisk); - - if (vd->vdev_not_present) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); - - if (vd->vdev_isspare) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); - -@@ -278,15 +272,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - vd == vd->vdev_top) { -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, -- vd->vdev_ms_array) == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, -- vd->vdev_ms_shift) == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, -- vd->vdev_ashift) == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, -- vd->vdev_asize) == 0); -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, -- vd->vdev_islog) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, -+ vd->vdev_ms_array); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, -+ vd->vdev_ms_shift); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, -+ vd->vdev_asize); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); - if (vd->vdev_removing) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, -- vd->vdev_removing) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, -+ vd->vdev_removing); - } -@@ -294,8 +286,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (vd->vdev_dtl_smo.smo_object != 0) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, -- vd->vdev_dtl_smo.smo_object) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, -+ vd->vdev_dtl_smo.smo_object); - - if 
(vd->vdev_crtxg) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, -- vd->vdev_crtxg) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); - -@@ -306,4 +297,4 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - vdev_get_stats(vd, &vs); -- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, -- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); -+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, -+ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); - -@@ -311,6 +302,5 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (spa_scan_get_stats(spa, &ps) == 0) { -- VERIFY(nvlist_add_uint64_array(nv, -+ fnvlist_add_uint64_array(nv, - ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, -- sizeof (pool_scan_stat_t) / sizeof (uint64_t)) -- == 0); -+ sizeof (pool_scan_stat_t) / sizeof (uint64_t)); - } -@@ -344,4 +334,4 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (idx) { -- VERIFY(nvlist_add_nvlist_array(nv, -- ZPOOL_CONFIG_CHILDREN, child, idx) == 0); -+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, -+ child, idx); - } -@@ -357,22 +347,16 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (vd->vdev_offline && !vd->vdev_tmpoffline) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, -- B_TRUE) == 0); -- if (vd->vdev_resilvering) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); -+ if (vd->vdev_resilver_txg != 0) -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, -+ vd->vdev_resilver_txg); - if (vd->vdev_faulted) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); - if (vd->vdev_degraded) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); - if (vd->vdev_removed) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); - if (vd->vdev_unspare) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); - if (vd->vdev_ishole) -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, -- B_TRUE) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); - -@@ -389,8 +373,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - if (aux != NULL) -- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, -- aux) == 0); -+ fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); - - if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { -- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, -- vd->vdev_orig_guid) == 0); -+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, -+ vd->vdev_orig_guid); - } -@@ -663,3 +646,3 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) - -- if (!vd->vdev_ops->vdev_op_leaf) -+ if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa)) - return (0); -@@ -670,3 +653,3 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) - if (vdev_is_dead(vd)) -- return (EIO); -+ return (SET_ERROR(EIO)); - -@@ -677,3 +660,3 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) - vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - -@@ -1037,2 +1020,3 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) - -+/* Sync the uberblocks to all vdevs in svd[] */ 
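The vdev_label.c hunks above swap the VERIFY(nvlist_add_*(...) == 0) pattern for the fnvlist_* wrappers, which assert success internally and keep each call site to a single line. A small userspace illustration using libnvpair follows; the key names are placeholders rather than ZPOOL_CONFIG_* constants, and the include paths for the nvpair headers vary by platform (link with -lnvpair).

/*
 * Minimal libnvpair sketch of the fnvlist_* style used above: the wrappers
 * abort on failure, so call sites need no return-value boilerplate.
 */
#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nv = fnvlist_alloc();

	fnvlist_add_string(nv, "type", "mirror");
	fnvlist_add_uint64(nv, "id", 0);
	fnvlist_add_uint64(nv, "guid", 123456789ULL);

	(void) printf("guid = %llu\n",
	    (unsigned long long)fnvlist_lookup_uint64(nv, "guid"));

	nvlist_print(stdout, nv);	/* dump the whole list */
	fnvlist_free(nv);
	return (0);
}

fnvlist_lookup_uint64() likewise asserts that the key exists, so the lookup needs no error branch either.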
- int -@@ -1088,3 +1072,3 @@ vdev_label_sync_top_done(zio_t *zio) - if (*good_writes == 0) -- zio->io_error = EIO; -+ zio->io_error = SET_ERROR(EIO); - -@@ -1134,3 +1118,3 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) - -- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE) == 0) { -+ if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_PUSHPAGE)) { - for (; l < VDEV_LABELS; l += 2) { -diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c -index e0884dc..99b35f0 100644 ---- a/module/zfs/vdev_mirror.c -+++ b/module/zfs/vdev_mirror.c -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -91,13 +91,10 @@ vdev_mirror_pending(vdev_t *vd) - { -- vdev_queue_t *vq = &vd->vdev_queue; -- int pending; -- -- mutex_enter(&vq->vq_lock); -- pending = avl_numnodes(&vq->vq_pending_tree); -- mutex_exit(&vq->vq_lock); -- -- return (pending); -+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); - } - --static mirror_map_t * -+/* -+ * Avoid inlining the function to keep vdev_mirror_io_start(), which -+ * is this functions only caller, as small as possible on the stack. -+ */ -+noinline static mirror_map_t * - vdev_mirror_map_alloc(zio_t *zio) -@@ -115,3 +112,4 @@ vdev_mirror_map_alloc(zio_t *zio) - -- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); -+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), -+ KM_PUSHPAGE); - mm->mm_children = c; -@@ -145,3 +143,4 @@ vdev_mirror_map_alloc(zio_t *zio) - -- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); -+ mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), -+ KM_PUSHPAGE); - mm->mm_children = c; -@@ -161,3 +160,3 @@ vdev_mirror_map_alloc(zio_t *zio) - if (!vdev_readable(mc->mc_vd)) { -- mc->mc_error = ENXIO; -+ mc->mc_error = SET_ERROR(ENXIO); - mc->mc_tried = 1; -@@ -207,3 +206,3 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -304,3 +303,3 @@ vdev_mirror_child_select(zio_t *zio) - if (!vdev_readable(mc->mc_vd)) { -- mc->mc_error = ENXIO; -+ mc->mc_error = SET_ERROR(ENXIO); - mc->mc_tried = 1; /* don't even try */ -@@ -311,3 +310,3 @@ vdev_mirror_child_select(zio_t *zio) - return (c); -- mc->mc_error = ESTALE; -+ mc->mc_error = SET_ERROR(ESTALE); - mc->mc_skipped = 1; -@@ -496,3 +495,3 @@ vdev_mirror_io_done(zio_t *zio) - continue; -- mc->mc_error = ESTALE; -+ mc->mc_error = SET_ERROR(ESTALE); - } -@@ -502,3 +501,3 @@ vdev_mirror_io_done(zio_t *zio) - zio->io_data, zio->io_size, -- ZIO_TYPE_WRITE, zio->io_priority, -+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | (unexpected_errors ? -diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c -index 3bd8c90..b9eb99d 100644 ---- a/module/zfs/vdev_missing.c -+++ b/module/zfs/vdev_missing.c -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -71,3 +71,3 @@ vdev_missing_io_start(zio_t *zio) - { -- zio->io_error = ENOTSUP; -+ zio->io_error = SET_ERROR(ENOTSUP); - return (ZIO_PIPELINE_CONTINUE); -diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c -index b2cc6b8..0dc733e 100644 ---- a/module/zfs/vdev_queue.c -+++ b/module/zfs/vdev_queue.c -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. 
All rights reserved. - */ -@@ -31,25 +31,133 @@ - #include -+#include - #include - #include -+#include -+#include -+#include -+#include - - /* -- * These tunables are for performance analysis. -+ * ZFS I/O Scheduler -+ * --------------- -+ * -+ * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The -+ * I/O scheduler determines when and in what order those operations are -+ * issued. The I/O scheduler divides operations into five I/O classes -+ * prioritized in the following order: sync read, sync write, async read, -+ * async write, and scrub/resilver. Each queue defines the minimum and -+ * maximum number of concurrent operations that may be issued to the device. -+ * In addition, the device has an aggregate maximum. Note that the sum of the -+ * per-queue minimums must not exceed the aggregate maximum. If the -+ * sum of the per-queue maximums exceeds the aggregate maximum, then the -+ * number of active i/os may reach zfs_vdev_max_active, in which case no -+ * further i/os will be issued regardless of whether all per-queue -+ * minimums have been met. -+ * -+ * For many physical devices, throughput increases with the number of -+ * concurrent operations, but latency typically suffers. Further, physical -+ * devices typically have a limit at which more concurrent operations have no -+ * effect on throughput or can actually cause it to decrease. -+ * -+ * The scheduler selects the next operation to issue by first looking for an -+ * I/O class whose minimum has not been satisfied. Once all are satisfied and -+ * the aggregate maximum has not been hit, the scheduler looks for classes -+ * whose maximum has not been satisfied. Iteration through the I/O classes is -+ * done in the order specified above. No further operations are issued if the -+ * aggregate maximum number of concurrent operations has been hit or if there -+ * are no operations queued for an I/O class that has not hit its maximum. -+ * Every time an i/o is queued or an operation completes, the I/O scheduler -+ * looks for new operations to issue. -+ * -+ * All I/O classes have a fixed maximum number of outstanding operations -+ * except for the async write class. Asynchronous writes represent the data -+ * that is committed to stable storage during the syncing stage for -+ * transaction groups (see txg.c). Transaction groups enter the syncing state -+ * periodically so the number of queued async writes will quickly burst up and -+ * then bleed down to zero. Rather than servicing them as quickly as possible, -+ * the I/O scheduler changes the maximum number of active async write i/os -+ * according to the amount of dirty data in the pool (see dsl_pool.c). Since -+ * both throughput and latency typically increase with the number of -+ * concurrent operations issued to physical devices, reducing the burstiness -+ * in the number of concurrent operations also stabilizes the response time of -+ * operations from other -- and in particular synchronous -- queues. In broad -+ * strokes, the I/O scheduler will issue more concurrent operations from the -+ * async write queue as there's more dirty data in the pool. -+ * -+ * Async Writes -+ * -+ * The number of concurrent operations issued for the async write I/O class -+ * follows a piece-wise linear function defined by a few adjustable points. 
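With the defaults introduced a little further down in this hunk (async write min/max active of 1 and 10, dirty thresholds of 30% and 60% of zfs_dirty_data_max), the piece-wise function works out as in this standalone sketch, which mirrors the interpolation done by vdev_queue_max_async_writes():

/*
 * Worked example of the async-write ramp described above, using the
 * default tunable values from this patch; plain C, independent of the
 * kernel code.
 */
#include <stdio.h>
#include <stdint.h>

#define	MIN_ACTIVE	1	/* zfs_vdev_async_write_min_active */
#define	MAX_ACTIVE	10	/* zfs_vdev_async_write_max_active */
#define	MIN_DIRTY_PCT	30	/* ..._active_min_dirty_percent */
#define	MAX_DIRTY_PCT	60	/* ..._active_max_dirty_percent */

static int
max_async_writes(uint64_t dirty, uint64_t dirty_max)
{
	uint64_t min_bytes = dirty_max * MIN_DIRTY_PCT / 100;
	uint64_t max_bytes = dirty_max * MAX_DIRTY_PCT / 100;

	if (dirty < min_bytes)
		return (MIN_ACTIVE);
	if (dirty > max_bytes)
		return (MAX_ACTIVE);

	/* linear interpolation between the two thresholds */
	return ((dirty - min_bytes) * (MAX_ACTIVE - MIN_ACTIVE) /
	    (max_bytes - min_bytes) + MIN_ACTIVE);
}

int
main(void)
{
	uint64_t dirty_max = 100;	/* work in percent for readability */
	uint64_t pct;

	for (pct = 0; pct <= 100; pct += 10)
		(void) printf("%3llu%% dirty -> %d async writes\n",
		    (unsigned long long)pct,
		    max_async_writes(pct, dirty_max));
	return (0);
}

At 50% dirty, for example, the queue may issue (50 - 30) * (10 - 1) / (60 - 30) + 1 = 7 concurrent async writes; below 30% it stays at 1 and above 60% it is capped at 10.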
-+ *
-+ *        |                   o---------| <-- zfs_vdev_async_write_max_active
-+ *   ^    |                  /^         |
-+ *   |    |                 / |         |
-+ * active |                /  |         |
-+ *  I/O   |               /   |         |
-+ * count  |              /    |         |
-+ *        |             /     |         |
-+ *        |------------o      |         | <-- zfs_vdev_async_write_min_active
-+ *       0|____________^______|_________|
-+ *        0%           |      |       100% of zfs_dirty_data_max
-+ *                     |      |
-+ *                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
-+ *                     `--------- zfs_vdev_async_write_active_min_dirty_percent
-+ *
-+ * Until the amount of dirty data exceeds a minimum percentage of the dirty
-+ * data allowed in the pool, the I/O scheduler will limit the number of
-+ * concurrent operations to the minimum. As that threshold is crossed, the
-+ * number of concurrent operations issued increases linearly to the maximum at
-+ * the specified maximum percentage of the dirty data allowed in the pool.
-+ *
-+ * Ideally, the amount of dirty data on a busy pool will stay in the sloped
-+ * part of the function between zfs_vdev_async_write_active_min_dirty_percent
-+ * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
-+ * maximum percentage, this indicates that the rate of incoming data is
-+ * greater than the rate that the backend storage can handle. In this case, we
-+ * must further throttle incoming writes (see dmu_tx_delay() for details).
- */
-+
- /*
-- * zfs_vdev_max_pending is the maximum number of i/os concurrently
-- * pending to each device. zfs_vdev_min_pending is the initial number
-- * of i/os pending to each device (before it starts ramping up to
-- * max_pending).
-+ * The maximum number of i/os active to each device. Ideally, this will be >=
-+ * the sum of each queue's max_active. It must be at least the sum of each
-+ * queue's min_active.
- */
--int zfs_vdev_max_pending = 10;
--int zfs_vdev_min_pending = 4;
-+uint32_t zfs_vdev_max_active = 1000;
-
- /*
-- * The deadlines are grouped into buckets based on zfs_vdev_time_shift:
-- * deadline = pri + gethrtime() >> time_shift)
-+ * Per-queue limits on the number of i/os active to each device. If the
-+ * number of active i/os is < zfs_vdev_max_active, then the min_active comes
-+ * into play. We will send min_active from each queue, and then select from
-+ * queues in the order defined by zio_priority_t.
-+ *
-+ * In general, smaller max_active's will lead to lower latency of synchronous
-+ * operations. Larger max_active's may lead to higher overall throughput,
-+ * depending on underlying storage.
-+ *
-+ * The ratio of the queues' max_actives determines the balance of performance
-+ * between reads, writes, and scrubs. E.g., increasing
-+ * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
-+ * more quickly, but reads and writes to have higher latency and lower
-+ * throughput.
- */
--int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */
-+uint32_t zfs_vdev_sync_read_min_active = 10;
-+uint32_t zfs_vdev_sync_read_max_active = 10;
-+uint32_t zfs_vdev_sync_write_min_active = 10;
-+uint32_t zfs_vdev_sync_write_max_active = 10;
-+uint32_t zfs_vdev_async_read_min_active = 1;
-+uint32_t zfs_vdev_async_read_max_active = 3;
-+uint32_t zfs_vdev_async_write_min_active = 1;
-+uint32_t zfs_vdev_async_write_max_active = 10;
-+uint32_t zfs_vdev_scrub_min_active = 1;
-+uint32_t zfs_vdev_scrub_max_active = 2;
-
--/* exponential I/O issue ramp-up rate */
--int zfs_vdev_ramp_rate = 2;
-+/*
-+ * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
-+ * dirty data, use zfs_vdev_async_write_min_active.
When it has more than -+ * zfs_vdev_async_write_active_max_dirty_percent, use -+ * zfs_vdev_async_write_max_active. The value is linearly interpolated -+ * between min and max. -+ */ -+int zfs_vdev_async_write_active_min_dirty_percent = 30; -+int zfs_vdev_async_write_active_max_dirty_percent = 60; - -@@ -65,7 +173,4 @@ int zfs_vdev_write_gap_limit = 4 << 10; - --/* -- * Virtual device vector for disk I/O scheduling. -- */ - int --vdev_queue_deadline_compare(const void *x1, const void *x2) -+vdev_queue_offset_compare(const void *x1, const void *x2) - { -@@ -74,7 +179,2 @@ vdev_queue_deadline_compare(const void *x1, const void *x2) - -- if (z1->io_deadline < z2->io_deadline) -- return (-1); -- if (z1->io_deadline > z2->io_deadline) -- return (1); -- - if (z1->io_offset < z2->io_offset) -@@ -93,3 +193,3 @@ vdev_queue_deadline_compare(const void *x1, const void *x2) - int --vdev_queue_offset_compare(const void *x1, const void *x2) -+vdev_queue_timestamp_compare(const void *x1, const void *x2) - { -@@ -98,5 +198,5 @@ vdev_queue_offset_compare(const void *x1, const void *x2) - -- if (z1->io_offset < z2->io_offset) -+ if (z1->io_timestamp < z2->io_timestamp) - return (-1); -- if (z1->io_offset > z2->io_offset) -+ if (z1->io_timestamp > z2->io_timestamp) - return (1); -@@ -111,2 +211,109 @@ vdev_queue_offset_compare(const void *x1, const void *x2) - -+static int -+vdev_queue_class_min_active(zio_priority_t p) -+{ -+ switch (p) { -+ case ZIO_PRIORITY_SYNC_READ: -+ return (zfs_vdev_sync_read_min_active); -+ case ZIO_PRIORITY_SYNC_WRITE: -+ return (zfs_vdev_sync_write_min_active); -+ case ZIO_PRIORITY_ASYNC_READ: -+ return (zfs_vdev_async_read_min_active); -+ case ZIO_PRIORITY_ASYNC_WRITE: -+ return (zfs_vdev_async_write_min_active); -+ case ZIO_PRIORITY_SCRUB: -+ return (zfs_vdev_scrub_min_active); -+ default: -+ panic("invalid priority %u", p); -+ return (0); -+ } -+} -+ -+static int -+vdev_queue_max_async_writes(uint64_t dirty) -+{ -+ int writes; -+ uint64_t min_bytes = zfs_dirty_data_max * -+ zfs_vdev_async_write_active_min_dirty_percent / 100; -+ uint64_t max_bytes = zfs_dirty_data_max * -+ zfs_vdev_async_write_active_max_dirty_percent / 100; -+ -+ if (dirty < min_bytes) -+ return (zfs_vdev_async_write_min_active); -+ if (dirty > max_bytes) -+ return (zfs_vdev_async_write_max_active); -+ -+ /* -+ * linear interpolation: -+ * slope = (max_writes - min_writes) / (max_bytes - min_bytes) -+ * move right by min_bytes -+ * move up by min_writes -+ */ -+ writes = (dirty - min_bytes) * -+ (zfs_vdev_async_write_max_active - -+ zfs_vdev_async_write_min_active) / -+ (max_bytes - min_bytes) + -+ zfs_vdev_async_write_min_active; -+ ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); -+ ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); -+ return (writes); -+} -+ -+static int -+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) -+{ -+ switch (p) { -+ case ZIO_PRIORITY_SYNC_READ: -+ return (zfs_vdev_sync_read_max_active); -+ case ZIO_PRIORITY_SYNC_WRITE: -+ return (zfs_vdev_sync_write_max_active); -+ case ZIO_PRIORITY_ASYNC_READ: -+ return (zfs_vdev_async_read_max_active); -+ case ZIO_PRIORITY_ASYNC_WRITE: -+ return (vdev_queue_max_async_writes( -+ spa->spa_dsl_pool->dp_dirty_total)); -+ case ZIO_PRIORITY_SCRUB: -+ return (zfs_vdev_scrub_max_active); -+ default: -+ panic("invalid priority %u", p); -+ return (0); -+ } -+} -+ -+/* -+ * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if -+ * there is no eligible class. 
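[Illustrative aside, not part of the patch: the linear interpolation in vdev_queue_max_async_writes() above can be cross-checked with a tiny standalone program that hard-codes the default tunables (1..10 active i/os over the 30%..60% dirty range). All names and numbers below are for illustration only.]

#include <stdint.h>
#include <stdio.h>

/* Defaults taken from the tunables above; purely for illustration. */
#define MIN_ACTIVE      1
#define MAX_ACTIVE      10
#define MIN_DIRTY_PCT   30
#define MAX_DIRTY_PCT   60

static int
interp_async_writes(uint64_t dirty, uint64_t dirty_max)
{
        uint64_t min_bytes = dirty_max * MIN_DIRTY_PCT / 100;
        uint64_t max_bytes = dirty_max * MAX_DIRTY_PCT / 100;

        if (dirty <= min_bytes)
                return (MIN_ACTIVE);
        if (dirty >= max_bytes)
                return (MAX_ACTIVE);
        /* Same linear interpolation as vdev_queue_max_async_writes(). */
        return (MIN_ACTIVE + (int)((dirty - min_bytes) *
            (MAX_ACTIVE - MIN_ACTIVE) / (max_bytes - min_bytes)));
}

int
main(void)
{
        /* Halfway up the slope (45% of a 100-unit dirty maximum) prints 5. */
        printf("%d\n", interp_async_writes(45, 100));
        return (0);
}

[With the defaults above, the sum of the per-class minimums is 10+10+1+1+1 = 23, comfortably below zfs_vdev_max_active (1000), which is the invariant the scheduler comment calls out.]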
-+ */ -+static zio_priority_t -+vdev_queue_class_to_issue(vdev_queue_t *vq) -+{ -+ spa_t *spa = vq->vq_vdev->vdev_spa; -+ zio_priority_t p; -+ -+ if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) -+ return (ZIO_PRIORITY_NUM_QUEUEABLE); -+ -+ /* find a queue that has not reached its minimum # outstanding i/os */ -+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { -+ if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 && -+ vq->vq_class[p].vqc_active < -+ vdev_queue_class_min_active(p)) -+ return (p); -+ } -+ -+ /* -+ * If we haven't found a queue, look for one that hasn't reached its -+ * maximum # outstanding i/os. -+ */ -+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { -+ if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 && -+ vq->vq_class[p].vqc_active < -+ vdev_queue_class_max_active(spa, p)) -+ return (p); -+ } -+ -+ /* No eligible queued i/os */ -+ return (ZIO_PRIORITY_NUM_QUEUEABLE); -+} -+ - void -@@ -115,2 +322,4 @@ vdev_queue_init(vdev_t *vd) - vdev_queue_t *vq = &vd->vdev_queue; -+ int max_active_sum; -+ zio_priority_t p; - int i; -@@ -118,14 +327,21 @@ vdev_queue_init(vdev_t *vd) - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); -+ vq->vq_vdev = vd; - -- avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, -- sizeof (zio_t), offsetof(struct zio, io_deadline_node)); -- -- avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, -- sizeof (zio_t), offsetof(struct zio, io_offset_node)); -- -- avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, -- sizeof (zio_t), offsetof(struct zio, io_offset_node)); -+ avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, -+ sizeof (zio_t), offsetof(struct zio, io_queue_node)); - -- avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, -- sizeof (zio_t), offsetof(struct zio, io_offset_node)); -+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { -+ /* -+ * The synchronous i/o queues are FIFO rather than LBA ordered. -+ * This provides more consistent latency for these i/os, and -+ * they tend to not be tightly clustered anyway so there is -+ * little to no throughput loss. -+ */ -+ boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ || -+ p == ZIO_PRIORITY_SYNC_WRITE); -+ avl_create(&vq->vq_class[p].vqc_queued_tree, -+ fifo ? 
vdev_queue_timestamp_compare : -+ vdev_queue_offset_compare, -+ sizeof (zio_t), offsetof(struct zio, io_queue_node)); -+ } - -@@ -138,3 +354,6 @@ vdev_queue_init(vdev_t *vd) - -- for (i = 0; i < zfs_vdev_max_pending; i++) -+ max_active_sum = zfs_vdev_sync_read_max_active + -+ zfs_vdev_sync_write_max_active + zfs_vdev_async_read_max_active + -+ zfs_vdev_async_write_max_active + zfs_vdev_scrub_max_active; -+ for (i = 0; i < max_active_sum; i++) - list_insert_tail(&vq->vq_io_list, zio_vdev_alloc()); -@@ -147,7 +366,7 @@ vdev_queue_fini(vdev_t *vd) - vdev_io_t *vi; -+ zio_priority_t p; - -- avl_destroy(&vq->vq_deadline_tree); -- avl_destroy(&vq->vq_read_tree); -- avl_destroy(&vq->vq_write_tree); -- avl_destroy(&vq->vq_pending_tree); -+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) -+ avl_destroy(&vq->vq_class[p].vqc_queued_tree); -+ avl_destroy(&vq->vq_active_tree); - -@@ -166,4 +385,13 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) - { -- avl_add(&vq->vq_deadline_tree, zio); -- avl_add(zio->io_vdev_tree, zio); -+ spa_t *spa = zio->io_spa; -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); -+ avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); -+ -+ if (ssh->kstat != NULL) { -+ mutex_enter(&ssh->lock); -+ kstat_waitq_enter(ssh->kstat->ks_data); -+ mutex_exit(&ssh->lock); -+ } - } -@@ -173,4 +401,58 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) - { -- avl_remove(&vq->vq_deadline_tree, zio); -- avl_remove(zio->io_vdev_tree, zio); -+ spa_t *spa = zio->io_spa; -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); -+ avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); -+ -+ if (ssh->kstat != NULL) { -+ mutex_enter(&ssh->lock); -+ kstat_waitq_exit(ssh->kstat->ks_data); -+ mutex_exit(&ssh->lock); -+ } -+} -+ -+static void -+vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) -+{ -+ spa_t *spa = zio->io_spa; -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ ASSERT(MUTEX_HELD(&vq->vq_lock)); -+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); -+ vq->vq_class[zio->io_priority].vqc_active++; -+ avl_add(&vq->vq_active_tree, zio); -+ -+ if (ssh->kstat != NULL) { -+ mutex_enter(&ssh->lock); -+ kstat_runq_enter(ssh->kstat->ks_data); -+ mutex_exit(&ssh->lock); -+ } -+} -+ -+static void -+vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) -+{ -+ spa_t *spa = zio->io_spa; -+ spa_stats_history_t *ssh = &spa->spa_stats.io_history; -+ -+ ASSERT(MUTEX_HELD(&vq->vq_lock)); -+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); -+ vq->vq_class[zio->io_priority].vqc_active--; -+ avl_remove(&vq->vq_active_tree, zio); -+ -+ if (ssh->kstat != NULL) { -+ kstat_io_t *ksio = ssh->kstat->ks_data; -+ -+ mutex_enter(&ssh->lock); -+ kstat_runq_exit(ksio); -+ if (zio->io_type == ZIO_TYPE_READ) { -+ ksio->reads++; -+ ksio->nread += zio->io_size; -+ } else if (zio->io_type == ZIO_TYPE_WRITE) { -+ ksio->writes++; -+ ksio->nwritten += zio->io_size; -+ } -+ mutex_exit(&ssh->lock); -+ } - } -@@ -182,8 +464,10 @@ vdev_queue_agg_io_done(zio_t *aio) - vdev_io_t *vi = aio->io_data; -- zio_t *pio; - -- while ((pio = zio_walk_parents(aio)) != NULL) -- if (aio->io_type == ZIO_TYPE_READ) -+ if (aio->io_type == ZIO_TYPE_READ) { -+ zio_t *pio; -+ while ((pio = zio_walk_parents(aio)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); -+ } -+ } - -@@ -204,24 +488,36 @@ 
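[Illustrative aside, not part of the patch: the two-pass selection performed by vdev_queue_class_to_issue() earlier in this hunk reduces to the sketch below. The struct and function names are invented; only the walk order (minimums first, then maximums, both gated by the aggregate cap) mirrors the real code.]

#include <stddef.h>

#define NCLASSES 5      /* sync read, sync write, async read, async write, scrub */

/* Illustrative per-class state; these field names are not from the patch. */
struct qclass {
        int queued;             /* i/os waiting in this class */
        int active;             /* i/os already issued from this class */
        int min_active;
        int max_active;
};

/*
 * Return the class to issue from, or -1 if nothing is eligible: pass 1
 * brings every class up to its minimum, pass 2 then fills classes (in
 * priority order) up to their maximums, and the aggregate cap is checked
 * before either pass.
 */
static int
pick_class(const struct qclass qc[NCLASSES], int total_active, int aggregate_max)
{
        int p;

        if (total_active >= aggregate_max)
                return (-1);
        for (p = 0; p < NCLASSES; p++)
                if (qc[p].queued > 0 && qc[p].active < qc[p].min_active)
                        return (p);
        for (p = 0; p < NCLASSES; p++)
                if (qc[p].queued > 0 && qc[p].active < qc[p].max_active)
                        return (p);
        return (-1);
}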
vdev_queue_agg_io_done(zio_t *aio) - static zio_t * --vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) -+vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) - { -- zio_t *fio, *lio, *aio, *dio, *nio, *mio; -- avl_tree_t *t; - vdev_io_t *vi; -- int flags; -- uint64_t maxspan = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE); -- uint64_t maxgap; -- int stretch; -+ zio_t *first, *last, *aio, *dio, *mandatory, *nio; -+ uint64_t maxgap = 0; -+ uint64_t size; -+ boolean_t stretch = B_FALSE; -+ vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority]; -+ avl_tree_t *t = &vqc->vqc_queued_tree; -+ enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; -+ -+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) -+ return (NULL); - --again: -- ASSERT(MUTEX_HELD(&vq->vq_lock)); -+ /* -+ * Prevent users from setting the zfs_vdev_aggregation_limit -+ * tuning larger than SPA_MAXBLOCKSIZE. -+ */ -+ zfs_vdev_aggregation_limit = -+ MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE); - -- if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || -- avl_numnodes(&vq->vq_deadline_tree) == 0) -+ /* -+ * The synchronous i/o queues are not sorted by LBA, so we can't -+ * find adjacent i/os. These i/os tend to not be tightly clustered, -+ * or too large to aggregate, so this has little impact on performance. -+ */ -+ if (zio->io_priority == ZIO_PRIORITY_SYNC_READ || -+ zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) - return (NULL); - -- fio = lio = avl_first(&vq->vq_deadline_tree); -+ first = last = zio; - -- t = fio->io_vdev_tree; -- flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; -- maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; -+ if (zio->io_type == ZIO_TYPE_READ) -+ maxgap = zfs_vdev_read_gap_limit; - -@@ -233,85 +529,83 @@ again: - -- if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { -- /* -- * We can aggregate I/Os that are sufficiently adjacent and of -- * the same flavor, as expressed by the AGG_INHERIT flags. -- * The latter requirement is necessary so that certain -- * attributes of the I/O, such as whether it's a normal I/O -- * or a scrub/resilver, can be preserved in the aggregate. -- * We can include optional I/Os, but don't allow them -- * to begin a range as they add no benefit in that situation. -- */ -+ /* -+ * We can aggregate I/Os that are sufficiently adjacent and of -+ * the same flavor, as expressed by the AGG_INHERIT flags. -+ * The latter requirement is necessary so that certain -+ * attributes of the I/O, such as whether it's a normal I/O -+ * or a scrub/resilver, can be preserved in the aggregate. -+ * We can include optional I/Os, but don't allow them -+ * to begin a range as they add no benefit in that situation. -+ */ - -- /* -- * We keep track of the last non-optional I/O. -- */ -- mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; -+ /* -+ * We keep track of the last non-optional I/O. -+ */ -+ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; - -- /* -- * Walk backwards through sufficiently contiguous I/Os -- * recording the last non-option I/O. -- */ -- while ((dio = AVL_PREV(t, fio)) != NULL && -- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && -- IO_SPAN(dio, lio) <= maxspan && -- IO_GAP(dio, fio) <= maxgap) { -- fio = dio; -- if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) -- mio = fio; -- } -+ /* -+ * Walk backwards through sufficiently contiguous I/Os -+ * recording the last non-option I/O. 
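[Illustrative aside, not part of the patch: the backward/forward walks above admit a neighbouring i/o only while the combined extent stays within the aggregation limit and the hole between adjacent i/os stays within the gap limit. The admission test, restated on plain offsets and sizes rather than zio_t; the IO_SPAN()/IO_GAP() arithmetic is paraphrased from memory of the surrounding macros, so treat it as an approximation.]

#include <stdint.h>

/*
 * Can `next` (assumed to start at or after cur_end, the end of the current
 * aggregate) be folded into an aggregate that begins at first_off?
 */
static int
can_aggregate(uint64_t first_off, uint64_t cur_end,
    uint64_t next_off, uint64_t next_size,
    uint64_t agg_limit, uint64_t gap_limit)
{
        uint64_t span = next_off + next_size - first_off;       /* like IO_SPAN() */
        uint64_t gap = next_off - cur_end;                      /* like IO_GAP() */

        return (span <= agg_limit && gap <= gap_limit);
}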
-+ */ -+ while ((dio = AVL_PREV(t, first)) != NULL && -+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && -+ IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && -+ IO_GAP(dio, first) <= maxgap) { -+ first = dio; -+ if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) -+ mandatory = first; -+ } - -- /* -- * Skip any initial optional I/Os. -- */ -- while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { -- fio = AVL_NEXT(t, fio); -- ASSERT(fio != NULL); -- } -+ /* -+ * Skip any initial optional I/Os. -+ */ -+ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { -+ first = AVL_NEXT(t, first); -+ ASSERT(first != NULL); -+ } - -- /* -- * Walk forward through sufficiently contiguous I/Os. -- */ -- while ((dio = AVL_NEXT(t, lio)) != NULL && -- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && -- IO_SPAN(fio, dio) <= maxspan && -- IO_GAP(lio, dio) <= maxgap) { -- lio = dio; -- if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) -- mio = lio; -- } - -- /* -- * Now that we've established the range of the I/O aggregation -- * we must decide what to do with trailing optional I/Os. -- * For reads, there's nothing to do. While we are unable to -- * aggregate further, it's possible that a trailing optional -- * I/O would allow the underlying device to aggregate with -- * subsequent I/Os. We must therefore determine if the next -- * non-optional I/O is close enough to make aggregation -- * worthwhile. -- */ -- stretch = B_FALSE; -- if (t != &vq->vq_read_tree && mio != NULL) { -- nio = lio; -- while ((dio = AVL_NEXT(t, nio)) != NULL && -- IO_GAP(nio, dio) == 0 && -- IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { -- nio = dio; -- if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { -- stretch = B_TRUE; -- break; -- } -+ /* -+ * Walk forward through sufficiently contiguous I/Os. -+ */ -+ while ((dio = AVL_NEXT(t, last)) != NULL && -+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && -+ IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit && -+ IO_GAP(last, dio) <= maxgap) { -+ last = dio; -+ if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) -+ mandatory = last; -+ } -+ -+ /* -+ * Now that we've established the range of the I/O aggregation -+ * we must decide what to do with trailing optional I/Os. -+ * For reads, there's nothing to do. While we are unable to -+ * aggregate further, it's possible that a trailing optional -+ * I/O would allow the underlying device to aggregate with -+ * subsequent I/Os. We must therefore determine if the next -+ * non-optional I/O is close enough to make aggregation -+ * worthwhile. -+ */ -+ if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { -+ zio_t *nio = last; -+ while ((dio = AVL_NEXT(t, nio)) != NULL && -+ IO_GAP(nio, dio) == 0 && -+ IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { -+ nio = dio; -+ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { -+ stretch = B_TRUE; -+ break; - } - } -+ } - -- if (stretch) { -- /* This may be a no-op. */ -- VERIFY((dio = AVL_NEXT(t, lio)) != NULL); -- dio->io_flags &= ~ZIO_FLAG_OPTIONAL; -- } else { -- while (lio != mio && lio != fio) { -- ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); -- lio = AVL_PREV(t, lio); -- ASSERT(lio != NULL); -- } -+ if (stretch) { -+ /* This may be a no-op. 
*/ -+ dio = AVL_NEXT(t, last); -+ dio->io_flags &= ~ZIO_FLAG_OPTIONAL; -+ } else { -+ while (last != mandatory && last != first) { -+ ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); -+ last = AVL_PREV(t, last); -+ ASSERT(last != NULL); - } -@@ -319,44 +613,84 @@ again: - -- if (fio != lio) { -- uint64_t size = IO_SPAN(fio, lio); -- ASSERT(size <= maxspan); -- ASSERT(vi != NULL); -- -- aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, -- vi, size, fio->io_type, ZIO_PRIORITY_AGG, -- flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, -- vdev_queue_agg_io_done, NULL); -- aio->io_timestamp = fio->io_timestamp; -- -- nio = fio; -- do { -- dio = nio; -- nio = AVL_NEXT(t, dio); -- ASSERT(dio->io_type == aio->io_type); -- ASSERT(dio->io_vdev_tree == t); -- -- if (dio->io_flags & ZIO_FLAG_NODATA) { -- ASSERT(dio->io_type == ZIO_TYPE_WRITE); -- bzero((char *)aio->io_data + (dio->io_offset - -- aio->io_offset), dio->io_size); -- } else if (dio->io_type == ZIO_TYPE_WRITE) { -- bcopy(dio->io_data, (char *)aio->io_data + -- (dio->io_offset - aio->io_offset), -- dio->io_size); -- } -+ if (first == last) -+ return (NULL); - -- zio_add_child(dio, aio); -- vdev_queue_io_remove(vq, dio); -- zio_vdev_io_bypass(dio); -- zio_execute(dio); -- } while (dio != lio); -+ ASSERT(vi != NULL); -+ -+ size = IO_SPAN(first, last); -+ ASSERT3U(size, <=, zfs_vdev_aggregation_limit); -+ -+ aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, -+ vi, size, first->io_type, zio->io_priority, -+ flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, -+ vdev_queue_agg_io_done, NULL); -+ aio->io_timestamp = first->io_timestamp; -+ -+ nio = first; -+ do { -+ dio = nio; -+ nio = AVL_NEXT(t, dio); -+ ASSERT3U(dio->io_type, ==, aio->io_type); -+ -+ if (dio->io_flags & ZIO_FLAG_NODATA) { -+ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); -+ bzero((char *)aio->io_data + (dio->io_offset - -+ aio->io_offset), dio->io_size); -+ } else if (dio->io_type == ZIO_TYPE_WRITE) { -+ bcopy(dio->io_data, (char *)aio->io_data + -+ (dio->io_offset - aio->io_offset), -+ dio->io_size); -+ } - -- avl_add(&vq->vq_pending_tree, aio); -- list_remove(&vq->vq_io_list, vi); -+ zio_add_child(dio, aio); -+ vdev_queue_io_remove(vq, dio); -+ zio_vdev_io_bypass(dio); -+ zio_execute(dio); -+ } while (dio != last); -+ -+ list_remove(&vq->vq_io_list, vi); -+ -+ return (aio); -+} - -- return (aio); -+static zio_t * -+vdev_queue_io_to_issue(vdev_queue_t *vq) -+{ -+ zio_t *zio, *aio; -+ zio_priority_t p; -+ avl_index_t idx; -+ vdev_queue_class_t *vqc; -+ zio_t *search; -+ -+again: -+ ASSERT(MUTEX_HELD(&vq->vq_lock)); -+ -+ p = vdev_queue_class_to_issue(vq); -+ -+ if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { -+ /* No eligible queued i/os */ -+ return (NULL); - } - -- ASSERT(fio->io_vdev_tree == t); -- vdev_queue_io_remove(vq, fio); -+ /* -+ * For LBA-ordered queues (async / scrub), issue the i/o which follows -+ * the most recently issued i/o in LBA (offset) order. -+ * -+ * For FIFO queues (sync), issue the i/o with the lowest timestamp. 
-+ */ -+ vqc = &vq->vq_class[p]; -+ search = zio_buf_alloc(sizeof (*search)); -+ search->io_timestamp = 0; -+ search->io_offset = vq->vq_last_offset + 1; -+ VERIFY3P(avl_find(&vqc->vqc_queued_tree, search, &idx), ==, NULL); -+ zio_buf_free(search, sizeof (*search)); -+ zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER); -+ if (zio == NULL) -+ zio = avl_first(&vqc->vqc_queued_tree); -+ ASSERT3U(zio->io_priority, ==, p); -+ -+ aio = vdev_queue_aggregate(vq, zio); -+ if (aio != NULL) -+ zio = aio; -+ else -+ vdev_queue_io_remove(vq, zio); - -@@ -368,6 +702,6 @@ again: - */ -- if (fio->io_flags & ZIO_FLAG_NODATA) { -+ if (zio->io_flags & ZIO_FLAG_NODATA) { - mutex_exit(&vq->vq_lock); -- zio_vdev_io_bypass(fio); -- zio_execute(fio); -+ zio_vdev_io_bypass(zio); -+ zio_execute(zio); - mutex_enter(&vq->vq_lock); -@@ -376,5 +710,6 @@ again: - -- avl_add(&vq->vq_pending_tree, fio); -+ vdev_queue_pending_add(vq, zio); -+ vq->vq_last_offset = zio->io_offset; - -- return (fio); -+ return (zio); - } -@@ -387,4 +722,2 @@ vdev_queue_io(zio_t *zio) - -- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); -- - if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) -@@ -392,19 +725,24 @@ vdev_queue_io(zio_t *zio) - -- zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; -+ /* -+ * Children i/os inherent their parent's priority, which might -+ * not match the child's i/o type. Fix it up here. -+ */ -+ if (zio->io_type == ZIO_TYPE_READ) { -+ if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && -+ zio->io_priority != ZIO_PRIORITY_ASYNC_READ && -+ zio->io_priority != ZIO_PRIORITY_SCRUB) -+ zio->io_priority = ZIO_PRIORITY_ASYNC_READ; -+ } else { -+ ASSERT(zio->io_type == ZIO_TYPE_WRITE); -+ if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && -+ zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE) -+ zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; -+ } - -- if (zio->io_type == ZIO_TYPE_READ) -- zio->io_vdev_tree = &vq->vq_read_tree; -- else -- zio->io_vdev_tree = &vq->vq_write_tree; -+ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; - - mutex_enter(&vq->vq_lock); -- - zio->io_timestamp = gethrtime(); -- zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + -- zio->io_priority; -- - vdev_queue_io_add(vq, zio); -- -- nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); -- -+ nio = vdev_queue_io_to_issue(vq); - mutex_exit(&vq->vq_lock); -@@ -426,3 +764,3 @@ vdev_queue_io_done(zio_t *zio) - vdev_queue_t *vq = &zio->io_vd->vdev_queue; -- int i; -+ zio_t *nio; - -@@ -433,3 +771,3 @@ vdev_queue_io_done(zio_t *zio) - -- avl_remove(&vq->vq_pending_tree, zio); -+ vdev_queue_pending_remove(vq, zio); - -@@ -439,6 +777,3 @@ vdev_queue_io_done(zio_t *zio) - -- for (i = 0; i < zfs_vdev_ramp_rate; i++) { -- zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); -- if (nio == NULL) -- break; -+ while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { - mutex_exit(&vq->vq_lock); -@@ -457,8 +792,2 @@ vdev_queue_io_done(zio_t *zio) - #if defined(_KERNEL) && defined(HAVE_SPL) --module_param(zfs_vdev_max_pending, int, 0644); --MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os"); -- --module_param(zfs_vdev_min_pending, int, 0644); --MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os"); -- - module_param(zfs_vdev_aggregation_limit, int, 0644); -@@ -466,8 +795,2 @@ MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size"); - --module_param(zfs_vdev_time_shift, int, 0644); --MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O"); -- 
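[Illustrative aside, not part of the patch: for the LBA-sorted classes, the avl_find()/avl_nearest(AVL_AFTER) lookup above continues a one-way sweep from the last issued offset and wraps to the lowest queued offset when the sweep runs off the end. The same behaviour on a plain sorted array, purely as a sketch:]

#include <stddef.h>
#include <stdint.h>

/*
 * offsets[] is sorted ascending and assumed non-empty (the caller already
 * knows the queue has work). Return the index of the first offset strictly
 * greater than last_offset, wrapping to 0 when none remains.
 */
static size_t
next_lba_index(const uint64_t *offsets, size_t n, uint64_t last_offset)
{
        size_t i;

        for (i = 0; i < n; i++)
                if (offsets[i] > last_offset)
                        return (i);
        return (0);     /* wrap around and start a new sweep */
}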
--module_param(zfs_vdev_ramp_rate, int, 0644); --MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate"); -- - module_param(zfs_vdev_read_gap_limit, int, 0644); -@@ -477,2 +800,51 @@ module_param(zfs_vdev_write_gap_limit, int, 0644); - MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap"); -+ -+module_param(zfs_vdev_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_max_active, "Maximum number of active I/Os per vdev"); -+ -+module_param(zfs_vdev_async_write_active_max_dirty_percent, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_write_active_max_dirty_percent, -+ "Async write concurrency max threshold"); -+ -+module_param(zfs_vdev_async_write_active_min_dirty_percent, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_write_active_min_dirty_percent, -+ "Async write concurrency min threshold"); -+ -+module_param(zfs_vdev_async_read_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_read_max_active, -+ "Max active async read I/Os per vdev"); -+ -+module_param(zfs_vdev_async_read_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_read_min_active, -+ "Min active async read I/Os per vdev"); -+ -+module_param(zfs_vdev_async_write_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_write_max_active, -+ "Max active async write I/Os per vdev"); -+ -+module_param(zfs_vdev_async_write_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_async_write_min_active, -+ "Min active async write I/Os per vdev"); -+ -+module_param(zfs_vdev_scrub_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_scrub_max_active, "Max active scrub I/Os per vdev"); -+ -+module_param(zfs_vdev_scrub_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_scrub_min_active, "Min active scrub I/Os per vdev"); -+ -+module_param(zfs_vdev_sync_read_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_sync_read_max_active, -+ "Max active sync read I/Os per vdev"); -+ -+module_param(zfs_vdev_sync_read_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_sync_read_min_active, -+ "Min active sync read I/Os per vdev"); -+ -+module_param(zfs_vdev_sync_write_max_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_sync_write_max_active, -+ "Max active sync write I/Os per vdev"); -+ -+module_param(zfs_vdev_sync_write_min_active, int, 0644); -+MODULE_PARM_DESC(zfs_vdev_sync_write_min_active, -+ "Min active sync write I/Osper vdev"); - #endif -diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c -index 3e1878d..4cd21df 100644 ---- a/module/zfs/vdev_raidz.c -+++ b/module/zfs/vdev_raidz.c -@@ -23,3 +23,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -62,2 +62,3 @@ - * o multiplication of A by 2 is defined by the following bitwise expression: -+ * - * (A * 2)_7 = A_6 -@@ -120,3 +121,3 @@ typedef struct raidz_map { - uint64_t rm_nskip; /* Skipped sectors for padding */ -- uint64_t rm_skipstart; /* Column index of padding start */ -+ uint64_t rm_skipstart; /* Column index of padding start */ - void *rm_datacopy; /* rm_asize-buffer of copied data */ -@@ -160,6 +161,3 @@ int vdev_raidz_default_to_general; - --/* -- * These two tables represent powers and logs of 2 in the Galois field defined -- * above. These values were computed by repeatedly multiplying by 2 as above. -- */ -+/* Powers of 2 in the Galois field defined above. 
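[Illustrative aside, not part of the patch: the "multiplication by 2" bit identities quoted above, and the pow2/log2 tables that follow, describe arithmetic in GF(2^8). The reduction constant 0x1d below is inferred from those identities rather than stated in this hunk, so treat it as a hedged reconstruction.]

#include <stdint.h>

/*
 * Multiply by 2 in GF(2^8): shift left, and if the high bit fell off,
 * reduce by the field polynomial (x^8 + x^4 + x^3 + x^2 + 1, i.e. 0x1d).
 */
static uint8_t
gf256_mul2(uint8_t a)
{
        return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
}

[Starting from 1 and applying gf256_mul2() repeatedly walks through every non-zero field element, which is exactly how the removed comment says the 256-entry power table was generated and why a matching log table suffices for general multiplication.]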
*/ - static const uint8_t vdev_raidz_pow2[256] = { -@@ -198,2 +196,3 @@ static const uint8_t vdev_raidz_pow2[256] = { - }; -+/* Logs of 2 in the Galois field defined above. */ - static const uint8_t vdev_raidz_log2[256] = { -@@ -433,3 +432,10 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { - --static raidz_map_t * -+/* -+ * Divides the IO evenly across all child vdevs; usually, dcols is -+ * the number of children in the target vdev. -+ * -+ * Avoid inlining the function to keep vdev_raidz_io_start(), which -+ * is this functions only caller, as small as possible on the stack. -+ */ -+noinline static raidz_map_t * - vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, -@@ -438,5 +444,9 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - raidz_map_t *rm; -+ /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = zio->io_offset >> unit_shift; -+ /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = zio->io_size >> unit_shift; -+ /* The first column for this stripe. */ - uint64_t f = b % dcols; -+ /* The starting byte offset on each child vdev. */ - uint64_t o = (b / dcols) << unit_shift; -@@ -444,8 +454,27 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - -+ /* -+ * "Quotient": The number of data sectors for this stripe on all but -+ * the "big column" child vdevs that also contain "remainder" data. -+ */ - q = s / (dcols - nparity); -+ -+ /* -+ * "Remainder": The number of partial stripe data sectors in this I/O. -+ * This will add a sector to some, but not all, child vdevs. -+ */ - r = s - q * (dcols - nparity); -+ -+ /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); -+ -+ /* -+ * The total number of data and parity sectors associated with -+ * this I/O. -+ */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); - -+ /* acols: The columns that will be accessed. */ -+ /* scols: The columns that will be accessed or skipped. */ - if (q == 0) { -+ /* Our I/O request doesn't span all child vdevs. */ - acols = bc; -@@ -1192,3 +1221,4 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, - uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; -- uint8_t log = 0, val; -+ uint8_t log = 0; -+ uint8_t val; - int ll; -@@ -1458,3 +1488,3 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1522,2 +1552,19 @@ vdev_raidz_child_done(zio_t *zio) - -+/* -+ * Start an IO operation on a RAIDZ VDev -+ * -+ * Outline: -+ * - For write operations: -+ * 1. Generate the parity data -+ * 2. Create child zio write operations to each column's vdev, for both -+ * data and parity. -+ * 3. If the column skips any sectors for padding, create optional dummy -+ * write zio children for those areas to improve aggregation continuity. -+ * - For read operations: -+ * 1. Create child zio read operations to each data column's vdev to read -+ * the range of data required for zio. -+ * 2. If this is a scrub or resilver operation, or if any of the data -+ * vdevs have had errors, then create zio read operations to the parity -+ * columns' VDevs as well. 
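[Illustrative aside, not part of the patch: the q/r/bc/tot bookkeeping documented in vdev_raidz_map_alloc() above is easy to sanity-check by hand. A throwaway example with typical but invented numbers (512-byte sectors, a 6-child RAID-Z2 vdev, one 128K write):]

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t unit_shift = 9;                        /* 512-byte sectors */
        uint64_t dcols = 6, nparity = 2;                /* 6-disk RAID-Z2 */
        uint64_t s = (128ULL << 10) >> unit_shift;      /* 256 data sectors */

        uint64_t q = s / (dcols - nparity);             /* 64 rows on every child */
        uint64_t r = s - q * (dcols - nparity);         /* 0 leftover sectors */
        uint64_t bc = (r == 0 ? 0 : r + nparity);       /* no "big columns" */
        uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));    /* 384 sectors */

        printf("q=%llu r=%llu bc=%llu tot=%llu\n",
            (unsigned long long)q, (unsigned long long)r,
            (unsigned long long)bc, (unsigned long long)tot);
        return (0);
}

[Here the 128K of data occupies 384 sectors (192K) on disk once both parity columns are counted.]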
-+ */ - static int -@@ -1583,3 +1630,3 @@ vdev_raidz_io_start(zio_t *zio) - rm->rm_missingparity++; -- rc->rc_error = ENXIO; -+ rc->rc_error = SET_ERROR(ENXIO); - rc->rc_tried = 1; /* don't even try */ -@@ -1593,3 +1640,3 @@ vdev_raidz_io_start(zio_t *zio) - rm->rm_missingparity++; -- rc->rc_error = ESTALE; -+ rc->rc_error = SET_ERROR(ESTALE); - rc->rc_skipped = 1; -@@ -1684,3 +1731,3 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) - raidz_checksum_error(zio, rc, orig[c]); -- rc->rc_error = ECKSUM; -+ rc->rc_error = SET_ERROR(ECKSUM); - ret++; -@@ -1808,3 +1855,3 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) - orig[i]); -- rc->rc_error = ECKSUM; -+ rc->rc_error = SET_ERROR(ECKSUM); - } -@@ -1865,2 +1912,23 @@ done: - -+/* -+ * Complete an IO operation on a RAIDZ VDev -+ * -+ * Outline: -+ * - For write operations: -+ * 1. Check for errors on the child IOs. -+ * 2. Return, setting an error code if too few child VDevs were written -+ * to reconstruct the data later. Note that partial writes are -+ * considered successful if they can be reconstructed at all. -+ * - For read operations: -+ * 1. Check for errors on the child IOs. -+ * 2. If data errors occurred: -+ * a. Try to reassemble the data from the parity available. -+ * b. If we haven't yet read the parity drives, read them now. -+ * c. If all parity drives have been read but the data still doesn't -+ * reassemble with a correct checksum, then try combinatorial -+ * reconstruction. -+ * d. If that doesn't work, return an error. -+ * 3. If there were unexpected errors or this is a resilver operation, -+ * rewrite the vdevs that had errors. -+ */ - static void -@@ -2084,3 +2152,3 @@ vdev_raidz_io_done(zio_t *zio) - */ -- zio->io_error = ECKSUM; -+ zio->io_error = SET_ERROR(ECKSUM); - -@@ -2122,3 +2190,3 @@ done: - rc->rc_offset, rc->rc_data, rc->rc_size, -- ZIO_TYPE_WRITE, zio->io_priority, -+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_IO_REPAIR | (unexpected_errors ? -diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c -index 5241b02..90250b0 100644 ---- a/module/zfs/vdev_root.c -+++ b/module/zfs/vdev_root.c -@@ -26,3 +26,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -65,3 +65,3 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -diff --git a/module/zfs/zap.c b/module/zfs/zap.c -index a7bae5e..cfae26a 100644 ---- a/module/zfs/zap.c -+++ b/module/zfs/zap.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
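[Illustrative aside, not part of the patch: many hunks in this series wrap bare error returns in SET_ERROR(). The intent is to make the origin of an errno value observable (e.g. to tracing) instead of silently bubbling a plain integer up the stack. The macro below is only a stand-in written for this note, not the real definition from the ZFS headers.]

#include <errno.h>
#include <stdio.h>

/* Illustrative only: record where an error is set, then yield its value. */
#define SET_ERROR_DEMO(err) \
        (fprintf(stderr, "errno %d set at %s:%d\n", (err), __FILE__, __LINE__), \
        (err))

static int
lookup_demo(int found)
{
        if (!found)
                return (SET_ERROR_DEMO(ENOENT));        /* origin is now visible */
        return (0);
}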
- */ -@@ -297,3 +297,4 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) - DMU_READ_NO_PREFETCH); -- dmu_buf_rele(db, FTAG); -+ if (err == 0) -+ dmu_buf_rele(db, FTAG); - } -@@ -327,3 +328,3 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - -@@ -716,3 +717,3 @@ fzap_checkname(zap_name_t *zn) - if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - return (0); -@@ -731,3 +732,3 @@ fzap_checksize(uint64_t integer_size, uint64_t num_integers) - default: -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -807,3 +808,3 @@ retry: - if (err == 0) { -- err = EEXIST; -+ err = SET_ERROR(EEXIST); - goto out; -@@ -994,2 +995,3 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) - -+ err = 0; - for (zap_cursor_init(&zc, os, fromobj); -@@ -997,4 +999,6 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) - (void) zap_cursor_advance(&zc)) { -- if (za.za_integer_length != 8 || za.za_num_integers != 1) -- return (EINVAL); -+ if (za.za_integer_length != 8 || za.za_num_integers != 1) { -+ err = SET_ERROR(EINVAL); -+ break; -+ } - err = zap_add(os, intoobj, za.za_name, -@@ -1002,6 +1006,6 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) - if (err) -- return (err); -+ break; - } - zap_cursor_fini(&zc); -- return (0); -+ return (err); - } -@@ -1016,2 +1020,3 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - -+ err = 0; - for (zap_cursor_init(&zc, os, fromobj); -@@ -1019,4 +1024,6 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - (void) zap_cursor_advance(&zc)) { -- if (za.za_integer_length != 8 || za.za_num_integers != 1) -- return (EINVAL); -+ if (za.za_integer_length != 8 || za.za_num_integers != 1) { -+ err = SET_ERROR(EINVAL); -+ break; -+ } - err = zap_add(os, intoobj, za.za_name, -@@ -1024,6 +1031,6 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, - if (err) -- return (err); -+ break; - } - zap_cursor_fini(&zc); -- return (0); -+ return (err); - } -@@ -1038,2 +1045,3 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - -+ err = 0; - for (zap_cursor_init(&zc, os, fromobj); -@@ -1043,4 +1051,6 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - -- if (za.za_integer_length != 8 || za.za_num_integers != 1) -- return (EINVAL); -+ if (za.za_integer_length != 8 || za.za_num_integers != 1) { -+ err = SET_ERROR(EINVAL); -+ break; -+ } - -@@ -1048,3 +1058,3 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - if (err != 0 && err != ENOENT) -- return (err); -+ break; - delta += za.za_first_integer; -@@ -1052,6 +1062,6 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, - if (err) -- return (err); -+ break; - } - zap_cursor_fini(&zc); -- return (0); -+ return (err); - } -@@ -1252,3 +1262,3 @@ fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) - if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - -diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c -index ad21882..13bc879 100644 ---- a/module/zfs/zap_leaf.c -+++ b/module/zfs/zap_leaf.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. 
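[Illustrative aside, not part of the patch: the zap_join()/zap_join_key()/zap_join_increment() changes above all take the same shape: start with err = 0, break out of the cursor loop on the first failure instead of returning, and fall through so zap_cursor_fini() always runs. The generic pattern, with stand-in names and a FILE standing in for the cursor:]

#include <stdio.h>

static int
copy_entries(const char *path)
{
        FILE *src = fopen(path, "r");
        int err = 0, c;

        if (src == NULL)
                return (-1);
        while ((c = fgetc(src)) != EOF) {
                if (c == '\0') {
                        err = -1;       /* bad entry: stop... */
                        break;          /* ...but fall through to the cleanup */
                }
                /* (copy the entry here) */
        }
        fclose(src);    /* plays the role of zap_cursor_fini() */
        return (err);
}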
All rights reserved. - */ -@@ -436,3 +437,3 @@ again: - -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -494,3 +495,3 @@ zap_entry_read(const zap_entry_handle_t *zeh, - if (le->le_value_intlen > integer_size) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -501,3 +502,3 @@ zap_entry_read(const zap_entry_handle_t *zeh, - if (zeh->zeh_num_integers > num_integers) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - return (0); -@@ -522,3 +523,3 @@ zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, - if (le->le_name_numints > buflen) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - return (0); -@@ -538,3 +539,3 @@ zap_entry_update(zap_entry_handle_t *zeh, - if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - -@@ -628,3 +629,3 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, - if (l->l_phys->l_hdr.lh_nfree < numchunks) -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - -diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c -index 4da7836..555d52f 100644 ---- a/module/zfs/zap_micro.c -+++ b/module/zfs/zap_micro.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -782,3 +782,3 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -791,8 +791,8 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, - if (mze == NULL) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { - if (num_integers < 1) { -- err = EOVERFLOW; -+ err = SET_ERROR(EOVERFLOW); - } else if (integer_size != 8) { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else { -@@ -828,3 +828,3 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -851,3 +851,3 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -886,3 +886,3 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -893,3 +893,3 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, - if (mze == NULL) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { -@@ -920,3 +920,3 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -940,3 +940,4 @@ mzap_addent(zap_name_t *zn, uint64_t value) - for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { -- ASSERTV(mzap_ent_phys_t *mze=&zap->zap_m.zap_phys->mz_chunk[i]); -+ ASSERTV(mzap_ent_phys_t *mze); -+ ASSERT(mze = &zap->zap_m.zap_phys->mz_chunk[i]); - ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); -@@ -989,3 +990,3 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1003,3 +1004,3 @@ zap_add(objset_t *os, uint64_t zapobj, const char *key, - if (mze != NULL) { -- err = EEXIST; -+ err = SET_ERROR(EEXIST); - } else { -@@ -1030,3 +1031,3 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1067,3 +1068,3 @@ 
zap_update(objset_t *os, uint64_t zapobj, const char *name, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1112,3 +1113,3 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1143,3 +1144,3 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1150,3 +1151,3 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, - if (mze == NULL) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } else { -@@ -1177,3 +1178,3 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - zap_unlockdir(zap); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1255,3 +1256,3 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) - if (zc->zc_hash == -1ULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -1281,4 +1282,2 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) - } else { -- err = ENOENT; -- - mze_tofind.mze_hash = zc->zc_hash; -@@ -1305,2 +1304,3 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) - zc->zc_hash = -1ULL; -+ err = SET_ERROR(ENOENT); - } -@@ -1338,3 +1338,3 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) - rw_exit(&zc->zc_zap->zap_rwlock); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1346,3 +1346,3 @@ zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) - if (mze == NULL) { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - goto out; -diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c -index c09b32d..4f4785a 100644 ---- a/module/zfs/zfeature.c -+++ b/module/zfs/zfeature.c -@@ -22,3 +22,3 @@ - /* -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -182,4 +182,4 @@ feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, - -- zc = kmem_alloc(sizeof(zap_cursor_t), KM_SLEEP); -- za = kmem_alloc(sizeof(zap_attribute_t), KM_SLEEP); -+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); -+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); -@@ -206,3 +206,3 @@ feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, - if (zap_lookup(os, desc_obj, za->za_name, -- 1, sizeof (buf), buf) == 0) -+ 1, MAXPATHLEN, buf) == 0) - desc = buf; -@@ -217,4 +217,4 @@ feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj, - kmem_free(buf, MAXPATHLEN); -- kmem_free(za, sizeof(zap_attribute_t)); -- kmem_free(zc, sizeof(zap_cursor_t)); -+ kmem_free(za, sizeof (zap_attribute_t)); -+ kmem_free(zc, sizeof (zap_cursor_t)); - -@@ -236,3 +236,3 @@ feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj, - if (zapobj == 0) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -242,3 +242,3 @@ feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj, - if (err == ENOENT) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - else -@@ -283,5 +283,5 @@ feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj, - if (error == ENOENT) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (refcount == UINT64_MAX) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - refcount++; -@@ -290,5 +290,5 @@ feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj, - if (error == ENOENT) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - if (refcount == 0) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - refcount--; -diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c -index 8ab5abe..89b6245 100644 ---- a/module/zfs/zfs_acl.c -+++ b/module/zfs/zfs_acl.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -678,3 +679,3 @@ zfs_copy_ace_2_fuid(zfs_sb_t *zsb, umode_t obj_mode, zfs_acl_t *aclp, - aceptr->z_hdr.z_flags) != B_TRUE) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -785,3 +786,3 @@ zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, - aceptr->z_flags) != B_TRUE) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1119,3 +1120,3 @@ zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, - if (error == ECKSUM) -- error = EIO; -+ error = SET_ERROR(EIO); - goto done; -@@ -1157,2 +1158,5 @@ zfs_acl_chown_setattr(znode_t *zp) - -+ if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL) -+ return (0); -+ - ASSERT(MUTEX_HELD(&zp->z_lock)); -@@ -1163,2 +1167,14 @@ zfs_acl_chown_setattr(znode_t *zp) - &zp->z_pflags, zp->z_uid, zp->z_gid); -+ -+ /* -+ * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL -+ * nor a DACL_ACES SA in which case ENOENT is returned from -+ * zfs_acl_node_read() when the SA can't be located. -+ * Allow chown/chgrp to succeed in these cases rather than -+ * returning an error that makes no sense in the context of -+ * the caller. 
-+ */ -+ if (error == ENOENT) -+ return (0); -+ - return (error); -@@ -1474,3 +1490,4 @@ zfs_acl_chmod(zfs_sb_t *zsb, uint64_t mode, zfs_acl_t *aclp) - new_bytes += abstract_size; -- } if (deny1) { -+ } -+ if (deny1) { - zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER); -@@ -1871,3 +1888,3 @@ zfs_acl_ids_overquota(zfs_sb_t *zsb, zfs_acl_ids_t *acl_ids) - /* -- * Retrieve a files ACL -+ * Retrieve a file's ACL - */ -@@ -1886,3 +1903,3 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) - if (mask == 0) -- return (ENOSYS); -+ return (SET_ERROR(ENOSYS)); - -@@ -1980,3 +1997,3 @@ zfs_vsec_2_aclp(zfs_sb_t *zsb, umode_t obj_mode, - if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2026,3 +2043,3 @@ zfs_vsec_2_aclp(zfs_sb_t *zsb, umode_t obj_mode, - /* -- * Set a files ACL -+ * Set a file's ACL - */ -@@ -2042,6 +2059,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) - if (mask == 0) -- return (ENOSYS); -+ return (SET_ERROR(ENOSYS)); - - if (zp->z_pflags & ZFS_IMMUTABLE) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -2141,3 +2158,3 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) - (S_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { -- return (EROFS); -+ return (SET_ERROR(EROFS)); - } -@@ -2152,3 +2169,3 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) - (zp->z_pflags & ZFS_IMMUTABLE)))) { -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -2157,3 +2174,3 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) - (zp->z_pflags & ZFS_NOUNLINK)) { -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -2162,3 +2179,3 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) - (zp->z_pflags & ZFS_AV_QUARANTINED))) { -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -2271,3 +2288,3 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - mutex_exit(&zp->z_acl_lock); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -2305,3 +2322,3 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - *working_mode |= deny_mask; -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } else if (*working_mode) { -@@ -2372,3 +2389,3 @@ zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, - if (*working_mode != ACE_WRITE_DATA) -- return (EACCES); -+ return (SET_ERROR(EACCES)); - -@@ -2388,3 +2405,3 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) - if (zdp->z_pflags & ZFS_AV_QUARANTINED) -- return (EACCES); -+ return (SET_ERROR(EACCES)); - -@@ -2447,2 +2464,3 @@ slow: - * Determine whether Access should be granted/denied. 
-+ * - * The least priv subsytem is always consulted as a basic privilege -@@ -2494,3 +2512,3 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) - if (error) -- return (error); -+ return (error); - -@@ -2600,3 +2618,3 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) - if (working_mode & ~(ZFS_CHECKED_MASKS)) { -- error = EACCES; -+ error = SET_ERROR(EACCES); - } -@@ -2654,3 +2672,2 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp, - * -- * - * The following chart is the recommended NFSv4 enforcement for -@@ -2710,3 +2727,3 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) - if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -2777,3 +2794,3 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - if (szp->z_pflags & ZFS_AV_QUARANTINED) -- return (EACCES); -+ return (SET_ERROR(EACCES)); - -diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c -index b35f27d..9652054 100644 ---- a/module/zfs/zfs_ctldir.c -+++ b/module/zfs/zfs_ctldir.c -@@ -29,2 +29,3 @@ - * Brian Behlendorf -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -82,2 +83,3 @@ - #include -+#include - #include -@@ -100,3 +102,3 @@ zfsctl_sep_alloc(void) - { -- return kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); -+ return (kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP)); - } -@@ -255,3 +257,2 @@ zfsctl_inode_destroy(struct inode *ip) - { -- return; - } -@@ -287,3 +288,3 @@ zfsctl_create(zfs_sb_t *zsb) - if (zsb->z_ctldir == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -291,3 +292,3 @@ zfsctl_create(zfs_sb_t *zsb) - #else -- return (EOPNOTSUPP); -+ return (SET_ERROR(EOPNOTSUPP)); - #endif /* CONFIG_64BIT */ -@@ -332,3 +333,3 @@ zfsctl_fid(struct inode *ip, fid_t *fidp) - ZFS_EXIT(zsb); -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - } -@@ -356,3 +357,3 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname) - if (snapshot_namecheck(name, NULL, NULL) != 0) -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - -@@ -360,3 +361,3 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname) - if ((strlen(zname) + 1 + strlen(name)) >= len) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - -@@ -368,2 +369,7 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname) - -+/* -+ * Gets the full dataset name that corresponds to the given snapshot name -+ * Example: -+ * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1" -+ */ - static int -@@ -384,3 +390,3 @@ zfsctl_snapshot_zpath(struct path *path, int len, char *zpath) - if (path_len > len) { -- error = EFAULT; -+ error = SET_ERROR(EFAULT); - goto out; -@@ -422,3 +428,3 @@ zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, - if (*ipp == NULL) -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - -@@ -458,3 +464,3 @@ zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, - } else { -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } -@@ -490,4 +496,4 @@ zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name) - int --zfsctl_snapdir_rename(struct inode *sdip, char *sname, -- struct inode *tdip, char *tname, cred_t *cr, int flags) -+zfsctl_snapdir_rename(struct inode *sdip, char *snm, -+ struct inode *tdip, char *tnm, cred_t *cr, int flags) - { -@@ -496,3 +502,3 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - avl_index_t where; -- char *to, *from, 
*real; -+ char *to, *from, *real, *fsname; - int error; -@@ -504,8 +510,9 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - real = kmem_alloc(MAXNAMELEN, KM_SLEEP); -+ fsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - - if (zsb->z_case == ZFS_CASE_INSENSITIVE) { -- error = dmu_snapshot_realname(zsb->z_os, sname, real, -+ error = dmu_snapshot_realname(zsb->z_os, snm, real, - MAXNAMELEN, NULL); - if (error == 0) { -- sname = real; -+ snm = real; - } else if (error != ENOTSUP) { -@@ -515,8 +522,10 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - -- error = zfsctl_snapshot_zname(sdip, sname, MAXNAMELEN, from); -- if (!error) -- error = zfsctl_snapshot_zname(tdip, tname, MAXNAMELEN, to); -- if (!error) -+ dmu_objset_name(zsb->z_os, fsname); -+ -+ error = zfsctl_snapshot_zname(sdip, snm, MAXNAMELEN, from); -+ if (error == 0) -+ error = zfsctl_snapshot_zname(tdip, tnm, MAXNAMELEN, to); -+ if (error == 0) - error = zfs_secpolicy_rename_perms(from, to, cr); -- if (error) -+ if (error != 0) - goto out; -@@ -527,3 +536,3 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - if (sdip != tdip) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -534,3 +543,3 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - */ -- if (strcmp(sname, tname) == 0) { -+ if (strcmp(snm, tnm) == 0) { - error = 0; -@@ -541,3 +550,3 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - -- error = dmu_objset_rename(from, to, B_FALSE); -+ error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); - if (error) -@@ -545,6 +554,6 @@ zfsctl_snapdir_rename(struct inode *sdip, char *sname, - -- search.se_name = (char *)sname; -+ search.se_name = (char *)snm; - sep = avl_find(&zsb->z_ctldir_snaps, &search, &where); - if (sep) -- zfsctl_rename_snap(zsb, sep, tname); -+ zfsctl_rename_snap(zsb, sep, tnm); - -@@ -556,2 +565,3 @@ out: - kmem_free(real, MAXNAMELEN); -+ kmem_free(fsname, MAXNAMELEN); - -@@ -590,5 +600,5 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) - error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname); -- if (!error) -+ if (error == 0) - error = zfs_secpolicy_destroy_perms(snapname, cr); -- if (error) -+ if (error != 0) - goto out; -@@ -597,3 +607,3 @@ zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) - if ((error == 0) || (error == ENOENT)) -- error = dmu_objset_destroy(snapname, B_FALSE); -+ error = dsl_destroy_snapshot(snapname, B_FALSE); - out: -@@ -623,3 +633,3 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - if (snapshot_namecheck(dirname, NULL, NULL) != 0) { -- error = EILSEQ; -+ error = SET_ERROR(EILSEQ); - goto out; -@@ -630,3 +640,3 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - error = zfs_secpolicy_snapshot_perms(dsname, cr); -- if (error) -+ if (error != 0) - goto out; -@@ -634,5 +644,4 @@ zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - if (error == 0) { -- error = dmu_objset_snapshot(dsname, dirname, -- NULL, NULL, B_FALSE, B_FALSE, -1); -- if (error) -+ error = dmu_objset_snapshot_one(dsname, dirname); -+ if (error != 0) - goto out; -@@ -684,3 +693,3 @@ zfsctl_snapdir_inactive(struct inode *ip) - */ --#define SET_UNMOUNT_CMD \ -+#define SET_UNMOUNT_CMD \ - "exec 0z_ctldir_lock); -@@ -946,3 +963,3 @@ zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp) - } else { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } -@@ -969,3 +986,3 @@ zfsctl_shares_lookup(struct inode *dip, char *name, 
struct inode **ipp, - ZFS_EXIT(zsb); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c -index ad611ac..4f612e1 100644 ---- a/module/zfs/zfs_debug.c -+++ b/module/zfs/zfs_debug.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -27,2 +27,9 @@ - -+#if !defined(_KERNEL) || !defined(__linux__) -+list_t zfs_dbgmsgs; -+int zfs_dbgmsg_size; -+kmutex_t zfs_dbgmsgs_lock; -+int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */ -+#endif -+ - /* -@@ -36,2 +43,4 @@ int zfs_flags = 0; - * set, calls to zfs_panic_recover() will turn into warning messages. -+ * This should only be used as a last resort, as it typically results -+ * in leaked space, or worse. - */ -@@ -59,2 +68,8 @@ zfs_dbgmsg_init(void) - { -+#if !defined(_KERNEL) || !defined(__linux__) -+ list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), -+ offsetof(zfs_dbgmsg_t, zdm_node)); -+ mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); -+#endif -+ - if (zfs_flags == 0) { -@@ -73,5 +88,72 @@ zfs_dbgmsg_fini(void) - { -- return; -+#if !defined(_KERNEL) || !defined(__linux__) -+ zfs_dbgmsg_t *zdm; -+ -+ while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { -+ int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); -+ kmem_free(zdm, size); -+ zfs_dbgmsg_size -= size; -+ } -+ mutex_destroy(&zfs_dbgmsgs_lock); -+ ASSERT0(zfs_dbgmsg_size); -+#endif -+} -+ -+#if !defined(_KERNEL) || !defined(__linux__) -+/* -+ * Print these messages by running: -+ * echo ::zfs_dbgmsg | mdb -k -+ * -+ * Monitor these messages by running: -+ * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' -+ */ -+void -+zfs_dbgmsg(const char *fmt, ...) -+{ -+ int size; -+ va_list adx; -+ zfs_dbgmsg_t *zdm; -+ -+ va_start(adx, fmt); -+ size = vsnprintf(NULL, 0, fmt, adx); -+ va_end(adx); -+ -+ /* -+ * There is one byte of string in sizeof (zfs_dbgmsg_t), used -+ * for the terminating null. -+ */ -+ zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); -+ zdm->zdm_timestamp = gethrestime_sec(); -+ -+ va_start(adx, fmt); -+ (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); -+ va_end(adx); -+ -+ DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); -+ -+ mutex_enter(&zfs_dbgmsgs_lock); -+ list_insert_tail(&zfs_dbgmsgs, zdm); -+ zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; -+ while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { -+ zdm = list_remove_head(&zfs_dbgmsgs); -+ size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); -+ kmem_free(zdm, size); -+ zfs_dbgmsg_size -= size; -+ } -+ mutex_exit(&zfs_dbgmsgs_lock); - } - -+void -+zfs_dbgmsg_print(const char *tag) -+{ -+ zfs_dbgmsg_t *zdm; -+ -+ (void) printf("ZFS_DBGMSG(%s):\n", tag); -+ mutex_enter(&zfs_dbgmsgs_lock); -+ for (zdm = list_head(&zfs_dbgmsgs); zdm; -+ zdm = list_next(&zfs_dbgmsgs, zdm)) -+ (void) printf("%s\n", zdm->zdm_msg); -+ mutex_exit(&zfs_dbgmsgs_lock); -+} -+#endif - -diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c -index 4a4969f..448a872 100644 ---- a/module/zfs/zfs_dir.c -+++ b/module/zfs/zfs_dir.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
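[Illustrative aside, not part of the patch: the user-space zfs_dbgmsg() added above keeps an in-memory log that is appended at the tail and trimmed from the head whenever the running byte total exceeds zfs_dbgmsg_maxsize. Stripped of the kernel list and mutex machinery, the bookkeeping looks roughly like this (all names invented for the sketch):]

#include <stdlib.h>
#include <string.h>

struct msg {
        struct msg *next;
        size_t size;            /* bytes charged against the cap */
        char text[1];           /* one byte already counted, like zfs_dbgmsg_t */
};

static struct msg *head, *tail;
static size_t total, max_total = 1 << 20;       /* 1MB cap, as above */

static void
log_append(const char *s)
{
        size_t sz = sizeof (struct msg) + strlen(s);
        struct msg *m = malloc(sz);

        if (m == NULL)
                return;
        m->next = NULL;
        m->size = sz;
        strcpy(m->text, s);
        if (tail != NULL)
                tail->next = m;
        else
                head = m;
        tail = m;
        total += sz;

        /* Trim the oldest messages until we fit under the cap again. */
        while (total > max_total && head != NULL) {
                struct msg *old = head;
                head = old->next;
                if (head == NULL)
                        tail = NULL;
                total -= old->size;
                free(old);
        }
}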
- */ -@@ -173,3 +174,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - -@@ -244,3 +245,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - rw_exit(&dzp->z_name_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -255,3 +256,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - rw_exit(&dzp->z_name_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -309,3 +310,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - if (error == 0) -- error = (zoid == 0 ? ENOENT : 0); -+ error = (zoid == 0 ? SET_ERROR(ENOENT) : 0); - } else { -@@ -316,3 +317,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - iput(vp); -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } else if (vp) { -@@ -321,3 +322,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - iput(vp); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -343,3 +344,3 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - zfs_dirent_unlock(dl); -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - } -@@ -764,3 +765,3 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) - mutex_exit(&zp->z_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -867,3 +868,3 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, - mutex_exit(&zp->z_lock); -- return (ENOTEMPTY); -+ return (SET_ERROR(ENOTEMPTY)); - } -@@ -971,6 +972,5 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) - zfs_acl_ids_free(&acl_ids); -- return (EDQUOT); -+ return (SET_ERROR(EDQUOT)); - } - --top: - tx = dmu_tx_create(zsb->z_os); -@@ -983,9 +983,4 @@ top: - zfs_fuid_txhold(zsb, tx); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { -- if (error == ERESTART) { -- dmu_tx_wait(tx); -- dmu_tx_abort(tx); -- goto top; -- } - zfs_acl_ids_free(&acl_ids); -@@ -1053,3 +1048,3 @@ top: - zfs_dirent_unlock(dl); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1058,3 +1053,3 @@ top: - zfs_dirent_unlock(dl); -- return (EROFS); -+ return (SET_ERROR(EROFS)); - } -diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c -index af2030a..05ee84c 100644 ---- a/module/zfs/zfs_fm.c -+++ b/module/zfs/zfs_fm.c -@@ -253,2 +253,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - vdev_queue_t *vq = &vd->vdev_queue; -+ vdev_stat_t *vs = &vd->vdev_stat; -+ vdev_t *spare_vd; -+ uint64_t *spare_guids; -+ char **spare_paths; -+ int i, spare_count; - -@@ -284,2 +289,12 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - -+ if (vs != NULL) { -+ fm_payload_set(ereport, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, -+ DATA_TYPE_UINT64, vs->vs_read_errors, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, -+ DATA_TYPE_UINT64, vs->vs_write_errors, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, -+ DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL); -+ } -+ - if (pvd != NULL) { -@@ -300,2 +315,24 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - } -+ -+ spare_count = spa->spa_spares.sav_count; -+ spare_paths = kmem_zalloc(sizeof (char *) * spare_count, -+ KM_PUSHPAGE); -+ spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, -+ KM_PUSHPAGE); -+ -+ for (i = 0; i < 
spare_count; i++) { -+ spare_vd = spa->spa_spares.sav_vdevs[i]; -+ if (spare_vd) { -+ spare_paths[i] = spare_vd->vdev_path; -+ spare_guids[i] = spare_vd->vdev_guid; -+ } -+ } -+ -+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, -+ DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, -+ DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); -+ -+ kmem_free(spare_guids, sizeof (uint64_t) * spare_count); -+ kmem_free(spare_paths, sizeof (char *) * spare_count); - } -@@ -318,4 +355,2 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - DATA_TYPE_UINT64, zio->io_timestamp, NULL); -- fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE, -- DATA_TYPE_UINT64, zio->io_deadline, NULL); - fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, -@@ -838,11 +873,14 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) - ZFS_ERROR_CLASS, name); -- VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); -- VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); -- VERIFY(nvlist_add_uint64(resource, -- FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); -+ VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); -+ VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); -+ VERIFY0(nvlist_add_uint64(resource, -+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); -+ VERIFY0(nvlist_add_int32(resource, -+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); -+ - if (vd) { -- VERIFY(nvlist_add_uint64(resource, -- FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); -- VERIFY(nvlist_add_uint64(resource, -- FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state) == 0); -+ VERIFY0(nvlist_add_uint64(resource, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); -+ VERIFY0(nvlist_add_uint64(resource, -+ FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); - } -diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c -index debb5f8..6ca61b8 100644 ---- a/module/zfs/zfs_fuid.c -+++ b/module/zfs/zfs_fuid.c -@@ -567,5 +567,5 @@ zfs_fuid_create(zfs_sb_t *zsb, uint64_t id, cred_t *cr, - idmap_stat status; -- uint64_t idx; -+ uint64_t idx = 0; - zfs_fuid_t *zfuid = NULL; -- zfs_fuid_info_t *fuidp; -+ zfs_fuid_info_t *fuidp = NULL; - -@@ -594,2 +594,5 @@ zfs_fuid_create(zfs_sb_t *zsb, uint64_t id, cred_t *cr, - -+ VERIFY3U(type, >=, ZFS_OWNER); -+ VERIFY3U(type, <=, ZFS_ACE_GROUP); -+ - switch (type) { -@@ -610,3 +613,3 @@ zfs_fuid_create(zfs_sb_t *zsb, uint64_t id, cred_t *cr, - }; -- domain = fuidp->z_domain_table[idx -1]; -+ domain = fuidp->z_domain_table[idx - 1]; - } else { -diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c -index a9184a1..0dfda1a 100644 ---- a/module/zfs/zfs_ioctl.c -+++ b/module/zfs/zfs_ioctl.c -@@ -27,5 +27,107 @@ - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -+ * Copyright (c) 2013 Steven Hartland. All rights reserved. -+ */ -+ -+/* -+ * ZFS ioctls. -+ * -+ * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage -+ * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. 
-+ * -+ * There are two ways that we handle ioctls: the legacy way where almost -+ * all of the logic is in the ioctl callback, and the new way where most -+ * of the marshalling is handled in the common entry point, zfsdev_ioctl(). -+ * -+ * Non-legacy ioctls should be registered by calling -+ * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked -+ * from userland by lzc_ioctl(). -+ * -+ * The registration arguments are as follows: -+ * -+ * const char *name -+ * The name of the ioctl. This is used for history logging. If the -+ * ioctl returns successfully (the callback returns 0), and allow_log -+ * is true, then a history log entry will be recorded with the input & -+ * output nvlists. The log entry can be printed with "zpool history -i". -+ * -+ * zfs_ioc_t ioc -+ * The ioctl request number, which userland will pass to ioctl(2). -+ * The ioctl numbers can change from release to release, because -+ * the caller (libzfs) must be matched to the kernel. -+ * -+ * zfs_secpolicy_func_t *secpolicy -+ * This function will be called before the zfs_ioc_func_t, to -+ * determine if this operation is permitted. It should return EPERM -+ * on failure, and 0 on success. Checks include determining if the -+ * dataset is visible in this zone, and if the user has either all -+ * zfs privileges in the zone (SYS_MOUNT), or has been granted permission -+ * to do this operation on this dataset with "zfs allow". -+ * -+ * zfs_ioc_namecheck_t namecheck -+ * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool -+ * name, a dataset name, or nothing. If the name is not well-formed, -+ * the ioctl will fail and the callback will not be called. -+ * Therefore, the callback can assume that the name is well-formed -+ * (e.g. is null-terminated, doesn't have more than one '@' character, -+ * doesn't have invalid characters). -+ * -+ * zfs_ioc_poolcheck_t pool_check -+ * This specifies requirements on the pool state. If the pool does -+ * not meet them (is suspended or is readonly), the ioctl will fail -+ * and the callback will not be called. If any checks are specified -+ * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. -+ * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | -+ * POOL_CHECK_READONLY). -+ * -+ * boolean_t smush_outnvlist -+ * If smush_outnvlist is true, then the output is presumed to be a -+ * list of errors, and it will be "smushed" down to fit into the -+ * caller's buffer, by removing some entries and replacing them with a -+ * single "N_MORE_ERRORS" entry indicating how many were removed. See -+ * nvlist_smush() for details. If smush_outnvlist is false, and the -+ * outnvlist does not fit into the userland-provided buffer, then the -+ * ioctl will fail with ENOMEM. -+ * -+ * zfs_ioc_func_t *func -+ * The callback function that will perform the operation. -+ * -+ * The callback should return 0 on success, or an error number on -+ * failure. If the function fails, the userland ioctl will return -1, -+ * and errno will be set to the callback's return value. The callback -+ * will be called with the following arguments: -+ * -+ * const char *name -+ * The name of the pool or dataset to operate on, from -+ * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the -+ * expected type (pool, dataset, or none). -+ * -+ * nvlist_t *innvl -+ * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or -+ * NULL if no input nvlist was provided. Changes to this nvlist are -+ * ignored. 
If the input nvlist could not be deserialized, the -+ * ioctl will fail and the callback will not be called. -+ * -+ * nvlist_t *outnvl -+ * The output nvlist, initially empty. The callback can fill it in, -+ * and it will be returned to userland by serializing it into -+ * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization -+ * fails (e.g. because the caller didn't supply a large enough -+ * buffer), then the overall ioctl will fail. See the -+ * 'smush_nvlist' argument above for additional behaviors. -+ * -+ * There are two typical uses of the output nvlist: -+ * - To return state, e.g. property values. In this case, -+ * smush_outnvlist should be false. If the buffer was not large -+ * enough, the caller will reallocate a larger buffer and try -+ * the ioctl again. -+ * -+ * - To return multiple errors from an ioctl which makes on-disk -+ * changes. In this case, smush_outnvlist should be true. -+ * Ioctls which make on-disk modifications should generally not -+ * use the outnvl if they succeed, because the caller can not -+ * distinguish between the operation failing, and -+ * deserialization failing. - */ -@@ -59,2 +161,3 @@ - #include -+#include - #include -@@ -75,5 +178,7 @@ - #include --#include - #include - -+#include -+#include -+#include - #include -@@ -93,4 +198,9 @@ extern void zfs_fini(void); - --typedef int zfs_ioc_func_t(zfs_cmd_t *); --typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); -+uint_t zfs_fsyncer_key; -+extern uint_t rrw_tsd_key; -+static uint_t zfs_allow_log_key; -+ -+typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); -+typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); -+typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); - -@@ -105,3 +215,3 @@ typedef enum { - POOL_CHECK_SUSPENDED = 1 << 1, -- POOL_CHECK_READONLY = 1 << 2 -+ POOL_CHECK_READONLY = 1 << 2, - } zfs_ioc_poolcheck_t; -@@ -109,2 +219,3 @@ typedef enum { - typedef struct zfs_ioc_vec { -+ zfs_ioc_legacy_func_t *zvec_legacy_func; - zfs_ioc_func_t *zvec_func; -@@ -112,4 +223,6 @@ typedef struct zfs_ioc_vec { - zfs_ioc_namecheck_t zvec_namecheck; -- boolean_t zvec_his_log; -+ boolean_t zvec_allow_log; - zfs_ioc_poolcheck_t zvec_pool_check; -+ boolean_t zvec_smush_outnvlist; -+ const char *zvec_name; - } zfs_ioc_vec_t; -@@ -131,9 +244,6 @@ static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, - boolean_t *); --int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **); -+int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); -+static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); - --static int zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature); --static int zfs_prop_activate_feature_check(void *arg1, void *arg2, -- dmu_tx_t *tx); --static void zfs_prop_activate_feature_sync(void *arg1, void *arg2, -- dmu_tx_t *tx); -+static int zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature); - -@@ -183,5 +293,3 @@ zfs_is_bootfs(const char *name) - /* -- * zfs_earlier_version -- * -- * Return non-zero if the spa version is less than requested version. -+ * Return non-zero if the spa version is less than requested version. - */ -@@ -203,4 +311,2 @@ zfs_earlier_version(const char *name, int version) - /* -- * zpl_earlier_version -- * - * Return TRUE if the ZPL version is less than requested version. 
-@@ -239,3 +345,3 @@ zfs_log_history(zfs_cmd_t *zc) - if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) -- (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); -+ (void) spa_history_log(spa, buf); - spa_close(spa, FTAG); -@@ -251,3 +357,3 @@ zfs_log_history(zfs_cmd_t *zc) - static int --zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -262,3 +368,3 @@ zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -268,3 +374,3 @@ zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) - -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -282,3 +388,3 @@ zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) - !zone_dataset_visible(dataset, &writable)) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -290,3 +396,3 @@ zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) - if (secpolicy_zfs(cr) && zoned) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } else { -@@ -296,3 +402,3 @@ zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) - if (!zoned) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -300,3 +406,3 @@ zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) - if (!writable) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -311,3 +417,3 @@ zfs_dozonecheck(const char *dataset, cred_t *cr) - if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -321,8 +427,4 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) - -- rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); -- if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL)) { -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); -- return (ENOENT); -- } -- rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); -+ if (dsl_prop_get_int_ds(ds, "zoned", &zoned)) -+ return (SET_ERROR(ENOENT)); - -@@ -331,24 +433,7 @@ zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) - --/* -- * If name ends in a '@', then require recursive permissions. 
-- */ --int --zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) -+static int -+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, -+ const char *perm, cred_t *cr) - { - int error; -- boolean_t descendent = B_FALSE; -- dsl_dataset_t *ds; -- char *at; -- -- at = strchr(name, '@'); -- if (at != NULL && at[1] == '\0') { -- *at = '\0'; -- descendent = B_TRUE; -- } -- -- error = dsl_dataset_hold(name, FTAG, &ds); -- if (at != NULL) -- *at = '@'; -- if (error != 0) -- return (error); - -@@ -357,7 +442,5 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) - error = secpolicy_zfs(cr); -- if (error) -- error = dsl_deleg_access_impl(ds, descendent, perm, cr); -+ if (error != 0) -+ error = dsl_deleg_access_impl(ds, perm, cr); - } -- -- dsl_dataset_rele(ds, FTAG); - return (error); -@@ -365,14 +448,23 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) - --int --zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, -- const char *perm, cred_t *cr) -+static int -+zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) - { - int error; -+ dsl_dataset_t *ds; -+ dsl_pool_t *dp; - -- error = zfs_dozonecheck_ds(name, ds, cr); -- if (error == 0) { -- error = secpolicy_zfs(cr); -- if (error) -- error = dsl_deleg_access_impl(ds, B_FALSE, perm, cr); -+ error = dsl_pool_hold(name, FTAG, &dp); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dataset_hold(dp, name, FTAG, &ds); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); - } -+ -+ error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); -+ -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -399,4 +491,4 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - 1, sizeof (ds_hexsl), &ds_hexsl, NULL); -- if (error) -- return (EPERM); -+ if (error != 0) -+ return (SET_ERROR(EPERM)); - -@@ -407,3 +499,3 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -416,3 +508,3 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - if (new_default || !blequal(&new_sl, CR_SL(CRED()))) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - return (0); -@@ -427,6 +519,6 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - if (!zoned) { - if (zfs_check_global_label(name, strval) != 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -449,4 +541,4 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - setsl_tag, &os); -- if (error) -- return (EPERM); -+ if (error != 0) -+ return (SET_ERROR(EPERM)); - -@@ -460,3 +552,3 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) - if (hexstr_to_label(strval, &new_sl) != 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -477,3 +569,3 @@ out_check: - #else -- return ENOTSUP; -+ return (ENOTSUP); - #endif /* HAVE_MLSLABEL */ -@@ -498,3 +590,3 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - if (!INGLOBALZONE(curproc)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - break; -@@ -512,5 +604,5 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - setpoint)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - if (!zoned || strlen(dsname) <= strlen(setpoint)) -- return (EPERM); -+ return 
(SET_ERROR(EPERM)); - } -@@ -520,3 +612,3 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - if (!is_system_labeled()) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -535,4 +627,5 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, - --int --zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) -+/* ARGSUSED */ -+static int -+zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -541,3 +634,3 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) - error = zfs_dozonecheck(zc->zc_name, cr); -- if (error) -+ if (error != 0) - return (error); -@@ -551,4 +644,5 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) - --int --zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) -+/* ARGSUSED */ -+static int -+zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -558,6 +652,6 @@ zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) - --int --zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) -+/* ARGSUSED */ -+static int -+zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- spa_t *spa; - dsl_pool_t *dp; -@@ -573,14 +667,12 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) - if (cp == NULL) -- return (EINVAL); -- error = spa_open(zc->zc_name, &spa, FTAG); -- if (error) -+ return (SET_ERROR(EINVAL)); -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) - return (error); - -- dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- spa_close(spa, FTAG); -- if (error) -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); - return (error); -+ } - -@@ -591,2 +683,3 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) - dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); - -@@ -595,5 +688,14 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ -+static int -+zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -+{ -+ return (zfs_secpolicy_write_perms(zc->zc_name, -+ ZFS_DELEG_PERM_SEND, cr)); -+} -+ - #ifdef HAVE_SMB_SHARE -+/* ARGSUSED */ - static int --zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -612,3 +714,3 @@ zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) - VN_RELE(vp); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -622,3 +724,3 @@ zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) - int --zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -626,3 +728,3 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) - if (!INGLOBALZONE(curproc)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -631,6 +733,6 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) - } else { -- return (zfs_secpolicy_deleg_share(zc, cr)); -+ return (zfs_secpolicy_deleg_share(zc, innvl, cr)); - } - #else -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - #endif /* HAVE_SMB_SHARE */ -@@ -639,3 +741,3 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) - int --zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -643,3 +745,3 @@ zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) - if (!INGLOBALZONE(curproc)) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -648,6 +750,6 @@ zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) - } else { -- return (zfs_secpolicy_deleg_share(zc, cr)); -+ return (zfs_secpolicy_deleg_share(zc, innvl, cr)); - 
} - #else -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - #endif /* HAVE_SMB_SHARE */ -@@ -670,3 +772,3 @@ zfs_get_parent(const char *datasetname, char *parent, int parentsize) - if (cp == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - cp[0] = '\0'; -@@ -689,4 +791,5 @@ zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -697,17 +800,47 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) - * Destroying snapshots with delegated permissions requires -- * descendent mount and destroy permissions. -+ * descendant mount and destroy permissions. - */ -+/* ARGSUSED */ - static int --zfs_secpolicy_destroy_recursive(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- int error; -- char *dsname; -+ nvlist_t *snaps; -+ nvpair_t *pair, *nextpair; -+ int error = 0; - -- dsname = kmem_asprintf("%s@", zc->zc_name); -+ if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) -+ return (SET_ERROR(EINVAL)); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nextpair) { -+ dsl_pool_t *dp; -+ dsl_dataset_t *ds; - -- error = zfs_secpolicy_destroy_perms(dsname, cr); -- if (error == ENOENT) -- error = zfs_secpolicy_destroy_perms(zc->zc_name, cr); -+ error = dsl_pool_hold(nvpair_name(pair), FTAG, &dp); -+ if (error != 0) -+ break; -+ nextpair = nvlist_next_nvpair(snaps, pair); -+ error = dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds); -+ if (error == 0) -+ dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ -+ if (error == 0) { -+ error = zfs_secpolicy_destroy_perms(nvpair_name(pair), -+ cr); -+ } else if (error == ENOENT) { -+ /* -+ * Ignore any snapshots that don't exist (we consider -+ * them "already destroyed"). Remove the name from the -+ * nvl here in case the snapshot is created between -+ * now and when we try to destroy it (in which case -+ * we don't want to destroy it since we haven't -+ * checked for permission). 
-+ */ -+ fnvlist_remove_nvpair(snaps, pair); -+ error = 0; -+ } -+ if (error != 0) -+ break; -+ } - -- strfree(dsname); - return (error); -@@ -744,4 +877,5 @@ zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -750,7 +884,8 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- char parentname[MAXNAMELEN]; -- objset_t *clone; -+ dsl_pool_t *dp; -+ dsl_dataset_t *clone; - int error; -@@ -759,18 +894,22 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) - ZFS_DELEG_PERM_PROMOTE, cr); -- if (error) -+ if (error != 0) -+ return (error); -+ -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) - return (error); - -- error = dmu_objset_hold(zc->zc_name, FTAG, &clone); -+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); - - if (error == 0) { -- dsl_dataset_t *pclone = NULL; -+ char parentname[MAXNAMELEN]; -+ dsl_dataset_t *origin = NULL; - dsl_dir_t *dd; -- dd = clone->os_dsl_dataset->ds_dir; -+ dd = clone->ds_dir; - -- rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dd->dd_pool, -- dd->dd_phys->dd_origin_obj, FTAG, &pclone); -- rw_exit(&dd->dd_pool->dp_config_rwlock); -- if (error) { -- dmu_objset_rele(clone, FTAG); -+ dd->dd_phys->dd_origin_obj, FTAG, &origin); -+ if (error != 0) { -+ dsl_dataset_rele(clone, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -778,12 +917,14 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) - -- error = zfs_secpolicy_write_perms(zc->zc_name, -+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, - ZFS_DELEG_PERM_MOUNT, cr); - -- dsl_dataset_name(pclone, parentname); -- dmu_objset_rele(clone, FTAG); -- dsl_dataset_rele(pclone, FTAG); -- if (error == 0) -- error = zfs_secpolicy_write_perms(parentname, -+ dsl_dataset_name(origin, parentname); -+ if (error == 0) { -+ error = zfs_secpolicy_write_perms_ds(parentname, origin, - ZFS_DELEG_PERM_PROMOTE, cr); -+ } -+ dsl_dataset_rele(clone, FTAG); -+ dsl_dataset_rele(origin, FTAG); - } -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -791,4 +932,5 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -815,7 +957,44 @@ zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) - -+/* -+ * Check for permission to create each snapshot in the nvlist. 
-+ */ -+/* ARGSUSED */ - static int --zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -+ nvlist_t *snaps; -+ int error = 0; -+ nvpair_t *pair; -+ -+ if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) -+ return (SET_ERROR(EINVAL)); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { -+ char *name = nvpair_name(pair); -+ char *atp = strchr(name, '@'); -+ -+ if (atp == NULL) { -+ error = SET_ERROR(EINVAL); -+ break; -+ } -+ *atp = '\0'; -+ error = zfs_secpolicy_snapshot_perms(name, cr); -+ *atp = '@'; -+ if (error != 0) -+ break; -+ } -+ return (error); -+} - -- return (zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); -+/* ARGSUSED */ -+static int -+zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -+{ -+ /* -+ * Even root must have a proper TSD so that we know what pool -+ * to log to. -+ */ -+ if (tsd_get(zfs_allow_log_key) == NULL) -+ return (SET_ERROR(EPERM)); -+ return (0); - } -@@ -823,3 +1002,3 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -827,2 +1006,3 @@ zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) - int error; -+ char *origin; - -@@ -832,7 +1012,6 @@ zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) - -- if (zc->zc_value[0] != '\0') { -- if ((error = zfs_secpolicy_write_perms(zc->zc_value, -- ZFS_DELEG_PERM_CLONE, cr)) != 0) -- return (error); -- } -+ if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && -+ (error = zfs_secpolicy_write_perms(origin, -+ ZFS_DELEG_PERM_CLONE, cr)) != 0) -+ return (error); - -@@ -842,6 +1021,4 @@ zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) - -- error = zfs_secpolicy_write_perms(parentname, -- ZFS_DELEG_PERM_MOUNT, cr); -- -- return (error); -+ return (zfs_secpolicy_write_perms(parentname, -+ ZFS_DELEG_PERM_MOUNT, cr)); - } -@@ -854,6 +1031,6 @@ zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { - if (secpolicy_sys_config(cr, B_FALSE) != 0) -- return (EPERM); -+ return (SET_ERROR(EPERM)); - -@@ -867,3 +1044,3 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -883,3 +1060,3 @@ zfs_secpolicy_diff(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -888,4 +1065,5 @@ zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -895,3 +1073,3 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) - if (!zfs_prop_user(zc->zc_value)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - return (zfs_secpolicy_write_perms(zc->zc_name, -@@ -905,5 +1083,5 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- int err = zfs_secpolicy_read(zc, cr); -+ int err = zfs_secpolicy_read(zc, innvl, cr); - if (err) -@@ -912,3 +1090,3 @@ zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) - if 
(zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -934,5 +1112,5 @@ zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- int err = zfs_secpolicy_read(zc, cr); -+ int err = zfs_secpolicy_read(zc, innvl, cr); - if (err) -@@ -941,3 +1119,3 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -947,4 +1125,5 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -954,14 +1133,47 @@ zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) - -+/* ARGSUSED */ - static int --zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- return (zfs_secpolicy_write_perms(zc->zc_name, -- ZFS_DELEG_PERM_HOLD, cr)); -+ nvpair_t *pair; -+ nvlist_t *holds; -+ int error; -+ -+ error = nvlist_lookup_nvlist(innvl, "holds", &holds); -+ if (error != 0) -+ return (SET_ERROR(EINVAL)); -+ -+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(holds, pair)) { -+ char fsname[MAXNAMELEN]; -+ error = dmu_fsname(nvpair_name(pair), fsname); -+ if (error != 0) -+ return (error); -+ error = zfs_secpolicy_write_perms(fsname, -+ ZFS_DELEG_PERM_HOLD, cr); -+ if (error != 0) -+ return (error); -+ } -+ return (0); - } - -+/* ARGSUSED */ - static int --zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -- return (zfs_secpolicy_write_perms(zc->zc_name, -- ZFS_DELEG_PERM_RELEASE, cr)); -+ nvpair_t *pair; -+ int error; -+ -+ for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(innvl, pair)) { -+ char fsname[MAXNAMELEN]; -+ error = dmu_fsname(nvpair_name(pair), fsname); -+ if (error != 0) -+ return (error); -+ error = zfs_secpolicy_write_perms(fsname, -+ ZFS_DELEG_PERM_RELEASE, cr); -+ if (error != 0) -+ return (error); -+ } -+ return (0); - } -@@ -972,3 +1184,3 @@ zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) - static int --zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr) -+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) - { -@@ -985,9 +1197,9 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, cred_t *cr) - -- error = zfs_secpolicy_snapshot(zc, cr); -- if (!error) -- error = zfs_secpolicy_hold(zc, cr); -- if (!error) -- error = zfs_secpolicy_release(zc, cr); -- if (!error) -- error = zfs_secpolicy_destroy(zc, cr); -+ error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); -+ if (error == 0) -+ error = zfs_secpolicy_hold(zc, innvl, cr); -+ if (error == 0) -+ error = zfs_secpolicy_release(zc, innvl, cr); -+ if (error == 0) -+ error = zfs_secpolicy_destroy(zc, innvl, cr); - return (error); -@@ -1009,3 +1221,3 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) - if (size == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1030,4 +1242,10 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) - -+/* -+ * Reduce the size of this nvlist until it can be serialized in 'max' bytes. 
-+ * Entries will be removed from the end of the nvlist, and one int32 entry -+ * named "N_MORE_ERRORS" will be added indicating how many entries were -+ * removed. -+ */ - static int --fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) -+nvlist_smush(nvlist_t *errors, size_t max) - { -@@ -1035,5 +1253,5 @@ fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) - -- VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); -+ size = fnvlist_size(errors); - -- if (size > zc->zc_nvlist_dst_size) { -+ if (size > max) { - nvpair_t *more_errors; -@@ -1041,21 +1259,19 @@ fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) - -- if (zc->zc_nvlist_dst_size < 1024) -- return (ENOMEM); -+ if (max < 1024) -+ return (SET_ERROR(ENOMEM)); - -- VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0); -- more_errors = nvlist_prev_nvpair(*errors, NULL); -+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); -+ more_errors = nvlist_prev_nvpair(errors, NULL); - - do { -- nvpair_t *pair = nvlist_prev_nvpair(*errors, -+ nvpair_t *pair = nvlist_prev_nvpair(errors, - more_errors); -- VERIFY(nvlist_remove_nvpair(*errors, pair) == 0); -+ fnvlist_remove_nvpair(errors, pair); - n++; -- VERIFY(nvlist_size(*errors, &size, -- NV_ENCODE_NATIVE) == 0); -- } while (size > zc->zc_nvlist_dst_size); -+ size = fnvlist_size(errors); -+ } while (size > max); - -- VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0); -- VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0); -- ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); -- ASSERT(size <= zc->zc_nvlist_dst_size); -+ fnvlist_remove_nvpair(errors, more_errors); -+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); -+ ASSERT3U(fnvlist_size(errors), <=, max); - } -@@ -1072,14 +1288,12 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) - -- VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); -+ size = fnvlist_size(nvl); - - if (size > zc->zc_nvlist_dst_size) { -- error = ENOMEM; -+ error = SET_ERROR(ENOMEM); - } else { -- packed = kmem_alloc(size, KM_SLEEP | KM_NODEBUG); -- VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, -- KM_SLEEP) == 0); -+ packed = fnvlist_pack(nvl, &size); - if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size, zc->zc_iflags) != 0) -- error = EFAULT; -- kmem_free(packed, size); -+ error = SET_ERROR(EFAULT); -+ fnvlist_pack_free(packed, size); - } -@@ -1087,2 +1301,3 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) - zc->zc_nvlist_dst_size = size; -+ zc->zc_nvlist_dst_filled = B_TRUE; - return (error); -@@ -1097,3 +1312,3 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) - error = dmu_objset_hold(dsname, FTAG, &os); -- if (error) -+ if (error != 0) - return (error); -@@ -1101,3 +1316,3 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) - dmu_objset_rele(os, FTAG); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1109,3 +1324,3 @@ get_zfs_sb(const char *dsname, zfs_sb_t **zsbp) - } else { -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } -@@ -1136,6 +1351,6 @@ zfs_sb_hold(const char *name, void *tag, zfs_sb_t **zsbp, boolean_t writer) - * thread should be just about to disassociate the -- * objset from the zfsvfs. -+ * objset from the zsb. 
- */ - rrw_exit(&(*zsbp)->z_teardown_lock, tag); -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - } -@@ -1165,3 +1380,2 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - nvlist_t *zplprops = NULL; -- char *buf; - -@@ -1185,3 +1399,3 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - if (!SPA_VERSION_IS_SUPPORTED(version)) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto pool_props_bad; -@@ -1201,3 +1415,3 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - zplprops, NULL); -- if (error) -+ if (error != 0) - goto pool_props_bad; -@@ -1205,5 +1419,3 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - -- buf = history_str_get(zc); -- -- error = spa_create(zc->zc_name, config, props, buf, zplprops); -+ error = spa_create(zc->zc_name, config, props, zplprops); - -@@ -1216,5 +1428,2 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) - -- if (buf != NULL) -- history_str_free(buf); -- - pool_props_bad: -@@ -1259,3 +1468,3 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) - guid != zc->zc_guid) -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - else -@@ -1299,3 +1508,3 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - -@@ -1363,3 +1572,3 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) - if (config == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1421,3 +1630,3 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc) - spa_close(spa, FTAG); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1439,3 +1648,3 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) - if ((size = zc->zc_history_len) == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1446,3 +1655,3 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) - spa_close(spa, FTAG); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -1479,8 +1688,3 @@ zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) - { -- int error; -- -- if ((error = dsl_dsobj_to_dsname(zc->zc_name,zc->zc_obj,zc->zc_value))) -- return (error); -- -- return (0); -+ return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); - } -@@ -1506,3 +1710,3 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc) - dmu_objset_rele(os, FTAG); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1535,3 +1739,3 @@ zfs_ioc_obj_to_stats(zfs_cmd_t *zc) - dmu_objset_rele(os, FTAG); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1549,4 +1753,3 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) - int error; -- nvlist_t *config, **l2cache, **spares; -- uint_t nl2cache = 0, nspares = 0; -+ nvlist_t *config; - -@@ -1558,24 +1761,2 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) - zc->zc_iflags, &config); -- (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, -- &l2cache, &nl2cache); -- -- (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, -- &spares, &nspares); -- -- /* -- * A root pool with concatenated devices is not supported. -- * Thus, can not add a device to a root pool. -- * -- * Intent log device can not be added to a rootpool because -- * during mountroot, zil is replayed, a seperated log device -- * can not be accessed during the mountroot time. -- * -- * l2cache and spare devices are ok to be added to a rootpool. 
-- */ -- if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { -- nvlist_free(config); -- spa_close(spa, FTAG); -- return (EDOM); -- } -- - if (error == 0) { -@@ -1643,3 +1824,3 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc) - default: -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - } -@@ -1801,11 +1982,10 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) - { -- objset_t *os = NULL; -+ objset_t *os; - int error; - -- if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) -- return (error); -- -- error = zfs_ioc_objset_stats_impl(zc, os); -- -- dmu_objset_rele(os, FTAG); -+ error = dmu_objset_hold(zc->zc_name, FTAG, &os); -+ if (error == 0) { -+ error = zfs_ioc_objset_stats_impl(zc, os); -+ dmu_objset_rele(os, FTAG); -+ } - -@@ -1830,9 +2010,5 @@ zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) - { -- objset_t *os = NULL; -- int error; -+ int error = 0; - nvlist_t *nv; - -- if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) -- return (error); -- - /* -@@ -1842,9 +2018,7 @@ zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) - */ -- if (!dsl_prop_get_hasrecvd(os)) { -- dmu_objset_rele(os, FTAG); -- return (ENOTSUP); -- } -+ if (!dsl_prop_get_hasrecvd(zc->zc_name)) -+ return (SET_ERROR(ENOTSUP)); - - if (zc->zc_nvlist_dst != 0 && -- (error = dsl_prop_get_received(os, &nv)) == 0) { -+ (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { - error = put_nvlist(zc, nv); -@@ -1853,3 +2027,2 @@ zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) - -- dmu_objset_rele(os, FTAG); - return (error); -@@ -1912,3 +2085,3 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) - } else { -- err = ENOENT; -+ err = SET_ERROR(ENOENT); - } -@@ -1918,3 +2091,3 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) - --static boolean_t -+boolean_t - dataset_name_hidden(const char *name) -@@ -1959,3 +2132,3 @@ top: - if (error == ENOENT) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - return (error); -@@ -1968,16 +2141,2 @@ top: - -- /* -- * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 -- * but is not declared void because its called by dmu_objset_find(). -- */ -- if (zc->zc_cookie == 0) { -- uint64_t cookie = 0; -- int len = sizeof (zc->zc_name) - (p - zc->zc_name); -- -- while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) { -- if (!dataset_name_hidden(zc->zc_name)) -- (void) dmu_objset_prefetch(zc->zc_name, NULL); -- } -- } -- - do { -@@ -1987,3 +2146,3 @@ top: - if (error == ENOENT) -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } while (error == 0 && dataset_name_hidden(zc->zc_name)); -@@ -2024,10 +2183,6 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) - --top: -- if (zc->zc_cookie == 0 && !zc->zc_simple) -- (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, -- NULL, DS_FIND_SNAPSHOTS); -- - error = dmu_objset_hold(zc->zc_name, FTAG, &os); -- if (error) -+ if (error != 0) { - return (error == ENOENT ? ESRCH : error); -+ } - -@@ -2039,3 +2194,3 @@ top: - dmu_objset_rele(os, FTAG); -- return (ESRCH); -+ return (SET_ERROR(ESRCH)); - } -@@ -2051,20 +2206,4 @@ top: - -- /* -- * Since we probably don't have a hold on this snapshot, -- * it's possible that the objsetid could have been destroyed -- * and reused for a new objset. It's OK if this happens during -- * a zfs send operation, since the new createtxg will be -- * beyond the range we're interested in. -- */ -- rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- if (error) { -- if (error == ENOENT) { -- /* Racing with destroy, get the next one. 
*/ -- *strchr(zc->zc_name, '@') = '\0'; -- dmu_objset_rele(os, FTAG); -- goto top; -- } -- } else { -+ if (error == 0) { - objset_t *ossnap; -@@ -2077,3 +2216,3 @@ top: - } else if (error == ENOENT) { -- error = ESRCH; -+ error = SET_ERROR(ESRCH); - } -@@ -2082,3 +2221,3 @@ top: - /* if we failed, undo the @ that we tacked on to zc_name */ -- if (error) -+ if (error != 0) - *strchr(zc->zc_name, '@') = '\0'; -@@ -2106,3 +2245,3 @@ zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) - &pair) != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2116,3 +2255,3 @@ zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) - vallen != 3) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2172,3 +2311,3 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - case ZFS_PROP_REFQUOTA: -- err = dsl_dataset_set_quota(dsname, source, intval); -+ err = dsl_dataset_set_refquota(dsname, source, intval); - break; -@@ -2178,3 +2317,3 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - case ZFS_PROP_REFRESERVATION: -- err = dsl_dataset_set_reservation(dsname, source, intval); -+ err = dsl_dataset_set_refreservation(dsname, source, intval); - break; -@@ -2213,3 +2352,2 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - spa_t *spa; -- dsl_pool_t *dp; - -@@ -2218,4 +2356,2 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - -- dp = spa->spa_dsl_pool; -- - /* -@@ -2225,3 +2361,3 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - if (!spa_feature_is_active(spa, feature)) { -- if ((err = zfs_prop_activate_feature(dp, -+ if ((err = zfs_prop_activate_feature(spa, - feature)) != 0) { -@@ -2251,10 +2387,9 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, - * This function is best effort. If it fails to set any of the given properties, -- * it continues to set as many as it can and returns the first error -- * encountered. If the caller provides a non-NULL errlist, it also gives the -- * complete list of names of all the properties it failed to set along with the -- * corresponding error numbers. The caller is responsible for freeing the -- * returned errlist. -+ * it continues to set as many as it can and returns the last error -+ * encountered. If the caller provides a non-NULL errlist, it will be filled in -+ * with the list of names of all the properties that failed along with the -+ * corresponding error numbers. - * -- * If every property is set successfully, zero is returned and the list pointed -- * at by errlist is NULL. -+ * If every property is set successfully, zero is returned and errlist is not -+ * modified. 
- */ -@@ -2262,3 +2397,3 @@ int - zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, -- nvlist_t **errlist) -+ nvlist_t *errlist) - { -@@ -2269,10 +2404,5 @@ zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, - char *strval; -- nvlist_t *genericnvl; -- nvlist_t *errors; -- nvlist_t *retrynvl; -- -- VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); -- VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); -- VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); - -+ nvlist_t *genericnvl = fnvlist_alloc(); -+ nvlist_t *retrynvl = fnvlist_alloc(); - retry: -@@ -2288,6 +2418,6 @@ retry: - nvlist_t *attrs; -- VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); -+ attrs = fnvpair_value_nvlist(pair); - if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &propval) != 0) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } -@@ -2298,3 +2428,3 @@ retry: - if (nvpair_type(propval) != DATA_TYPE_STRING) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else if (zfs_prop_userquota(propname)) { -@@ -2302,5 +2432,5 @@ retry: - DATA_TYPE_UINT64_ARRAY) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } -@@ -2309,3 +2439,3 @@ retry: - if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { -@@ -2313,4 +2443,3 @@ retry: - -- VERIFY(nvpair_value_uint64(propval, -- &intval) == 0); -+ intval = fnvpair_value_uint64(propval); - -@@ -2320,3 +2449,3 @@ retry: - case PROP_TYPE_STRING: -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - break; -@@ -2325,3 +2454,3 @@ retry: - intval, &unused) != 0) -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - break; -@@ -2332,3 +2461,3 @@ retry: - } else { -- err = EINVAL; -+ err = SET_ERROR(EINVAL); - } -@@ -2358,4 +2487,7 @@ retry: - -- if (err != 0) -- VERIFY(nvlist_add_int32(errors, propname, err) == 0); -+ if (err != 0) { -+ if (errlist != NULL) -+ fnvlist_add_int32(errlist, propname, err); -+ rv = err; -+ } - } -@@ -2381,5 +2513,5 @@ retry: - nvlist_t *attrs; -- VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); -- VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, -- &propval) == 0); -+ attrs = fnvpair_value_nvlist(pair); -+ propval = fnvlist_lookup_nvpair(attrs, -+ ZPROP_VALUE); - } -@@ -2387,11 +2519,9 @@ retry: - if (nvpair_type(propval) == DATA_TYPE_STRING) { -- VERIFY(nvpair_value_string(propval, -- &strval) == 0); -- err = dsl_prop_set(dsname, propname, source, 1, -- strlen(strval) + 1, strval); -+ strval = fnvpair_value_string(propval); -+ err = dsl_prop_set_string(dsname, propname, -+ source, strval); - } else { -- VERIFY(nvpair_value_uint64(propval, -- &intval) == 0); -- err = dsl_prop_set(dsname, propname, source, 8, -- 1, &intval); -+ intval = fnvpair_value_uint64(propval); -+ err = dsl_prop_set_int(dsname, propname, source, -+ intval); - } -@@ -2399,4 +2529,7 @@ retry: - if (err != 0) { -- VERIFY(nvlist_add_int32(errors, propname, -- err) == 0); -+ if (errlist != NULL) { -+ fnvlist_add_int32(errlist, propname, -+ err); -+ } -+ rv = err; - } -@@ -2407,14 +2540,2 @@ retry: - -- if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { -- nvlist_free(errors); -- errors = NULL; -- } else { -- VERIFY(nvpair_value_int32(pair, &rv) == 0); -- } -- -- if (errlist == NULL) -- nvlist_free(errors); -- else -- *errlist = errors; -- - return (rv); -@@ -2426,3 +2547,3 @@ retry: - static int --zfs_check_userprops(char *fsname, nvlist_t *nvl) -+zfs_check_userprops(const char 
*fsname, nvlist_t *nvl) - { -@@ -2437,3 +2558,3 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl) - nvpair_type(pair) != DATA_TYPE_STRING) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2444,3 +2565,3 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl) - if (strlen(propname) >= ZAP_MAXNAMELEN) -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - -@@ -2448,3 +2569,3 @@ zfs_check_userprops(char *fsname, nvlist_t *nvl) - if (strlen(valstr) >= ZAP_MAXVALUELEN) -- return (E2BIG); -+ return (SET_ERROR(E2BIG)); - } -@@ -2470,3 +2591,3 @@ props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) - static int --clear_received_props(objset_t *os, const char *fs, nvlist_t *props, -+clear_received_props(const char *dsname, nvlist_t *props, - nvlist_t *skipped) -@@ -2482,4 +2603,4 @@ clear_received_props(objset_t *os, const char *fs, nvlist_t *props, - zprop_source_t flags = (ZPROP_SRC_NONE | -- (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0)); -- err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL); -+ (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); -+ err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); - } -@@ -2506,3 +2627,3 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) - ZPROP_SRC_LOCAL); -- nvlist_t *errors = NULL; -+ nvlist_t *errors; - int error; -@@ -2515,17 +2636,15 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) - nvlist_t *origprops; -- objset_t *os; -- -- if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) { -- if (dsl_prop_get_received(os, &origprops) == 0) { -- (void) clear_received_props(os, -- zc->zc_name, origprops, nvl); -- nvlist_free(origprops); -- } - -- dsl_prop_set_hasrecvd(os); -- dmu_objset_rele(os, FTAG); -+ if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { -+ (void) clear_received_props(zc->zc_name, -+ origprops, nvl); -+ nvlist_free(origprops); - } -+ -+ error = dsl_prop_set_hasrecvd(zc->zc_name); - } - -- error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors); -+ errors = fnvlist_alloc(); -+ if (error == 0) -+ error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); - -@@ -2570,3 +2689,3 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) - if (!zfs_prop_user(propname)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -2575,3 +2694,3 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) - prop == ZFS_PROP_VERSION) { -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } else { -@@ -2592,3 +2711,3 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) - nvlist_free(dummy); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2608,7 +2727,7 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) - if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } - -- /* the property name has been validated by zfs_secpolicy_inherit() */ -- return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL)); -+ /* property name has been validated by zfs_secpolicy_inherit_prop() */ -+ return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); - } -@@ -2685,3 +2804,3 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) - else -- error = EFAULT; -+ error = SET_ERROR(EFAULT); - -@@ -2693,26 +2812,2 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) - * inputs: -- * zc_name name of volume -- * -- * outputs: none -- */ --static int --zfs_ioc_create_minor(zfs_cmd_t *zc) --{ -- return (zvol_create_minor(zc->zc_name)); --} -- --/* -- * inputs: -- * zc_name name of volume -- * -- * outputs: none -- */ --static int --zfs_ioc_remove_minor(zfs_cmd_t *zc) --{ -- return (zvol_remove_minor(zc->zc_name)); --} -- --/* -- * inputs: - * zc_name name of 
filesystem -@@ -2738,3 +2833,3 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) - nvlist_free(fsaclnv); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2748,3 +2843,3 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) - error = secpolicy_zfs(CRED()); -- if (error) { -+ if (error != 0) { - if (zc->zc_perm_action == B_FALSE) { -@@ -2799,6 +2894,6 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) - * inputs: -- * createprops list of properties requested by creator -- * default_zplver zpl version to use if unspecified in createprops -- * fuids_ok fuids allowed in this version of the spa? - * os parent objset pointer (NULL if root fs) -+ * fuids_ok fuids allowed in this version of the spa? -+ * sa_ok SAs allowed in this version of the spa? -+ * createprops list of properties requested by creator - * -@@ -2861,3 +2956,3 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, - sense != ZFS_PROP_UNDEFINED))) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -2957,22 +3052,26 @@ zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, - /* -- * inputs: -- * zc_objset_type type of objset to create (fs vs zvol) -- * zc_name name of new objset -- * zc_value name of snapshot to clone from (may be empty) -- * zc_nvlist_src{_size} nvlist of properties to apply -+ * innvl: { -+ * "type" -> dmu_objset_type_t (int32) -+ * (optional) "props" -> { prop -> value } -+ * } - * -- * outputs: none -+ * outnvl: propname -> error code (int32) - */ - static int --zfs_ioc_create(zfs_cmd_t *zc) -+zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) - { -- objset_t *clone; - int error = 0; -- zfs_creat_t zct; -+ zfs_creat_t zct = { 0 }; - nvlist_t *nvprops = NULL; - void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -- dmu_objset_type_t type = zc->zc_objset_type; -+ int32_t type32; -+ dmu_objset_type_t type; -+ boolean_t is_insensitive = B_FALSE; - -- switch (type) { -+ if (nvlist_lookup_int32(innvl, "type", &type32) != 0) -+ return (SET_ERROR(EINVAL)); -+ type = type32; -+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); - -+ switch (type) { - case DMU_OST_ZFS: -@@ -2989,96 +3088,104 @@ zfs_ioc_create(zfs_cmd_t *zc) - } -- if (strchr(zc->zc_name, '@') || -- strchr(zc->zc_name, '%')) -- return (EINVAL); -- -- if (zc->zc_nvlist_src != 0 && -- (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, -- zc->zc_iflags, &nvprops)) != 0) -- return (error); -+ if (strchr(fsname, '@') || -+ strchr(fsname, '%')) -+ return (SET_ERROR(EINVAL)); - -- zct.zct_zplprops = NULL; - zct.zct_props = nvprops; - -- if (zc->zc_value[0] != '\0') { -- /* -- * We're creating a clone of an existing snapshot. 
-- */ -- zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; -- if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { -- nvlist_free(nvprops); -- return (EINVAL); -- } -+ if (cbfunc == NULL) -+ return (SET_ERROR(EINVAL)); -+ -+ if (type == DMU_OST_ZVOL) { -+ uint64_t volsize, volblocksize; -+ -+ if (nvprops == NULL) -+ return (SET_ERROR(EINVAL)); -+ if (nvlist_lookup_uint64(nvprops, -+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) -+ return (SET_ERROR(EINVAL)); - -- error = dmu_objset_hold(zc->zc_value, FTAG, &clone); -- if (error) { -- nvlist_free(nvprops); -+ if ((error = nvlist_lookup_uint64(nvprops, -+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), -+ &volblocksize)) != 0 && error != ENOENT) -+ return (SET_ERROR(EINVAL)); -+ -+ if (error != 0) -+ volblocksize = zfs_prop_default_numeric( -+ ZFS_PROP_VOLBLOCKSIZE); -+ -+ if ((error = zvol_check_volblocksize( -+ volblocksize)) != 0 || -+ (error = zvol_check_volsize(volsize, -+ volblocksize)) != 0) - return (error); -- } -+ } else if (type == DMU_OST_ZFS) { -+ int error; - -- error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0); -- dmu_objset_rele(clone, FTAG); -- if (error) { -- nvlist_free(nvprops); -+ /* -+ * We have to have normalization and -+ * case-folding flags correct when we do the -+ * file system creation, so go figure them out -+ * now. -+ */ -+ VERIFY(nvlist_alloc(&zct.zct_zplprops, -+ NV_UNIQUE_NAME, KM_SLEEP) == 0); -+ error = zfs_fill_zplprops(fsname, nvprops, -+ zct.zct_zplprops, &is_insensitive); -+ if (error != 0) { -+ nvlist_free(zct.zct_zplprops); - return (error); - } -- } else { -- boolean_t is_insensitive = B_FALSE; -+ } - -- if (cbfunc == NULL) { -- nvlist_free(nvprops); -- return (EINVAL); -- } -+ error = dmu_objset_create(fsname, type, -+ is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); -+ nvlist_free(zct.zct_zplprops); - -- if (type == DMU_OST_ZVOL) { -- uint64_t volsize, volblocksize; -+ /* -+ * It would be nice to do this atomically. 
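The rewritten zfs_ioc_create() above takes its arguments as an nvlist -- an int32 "type" (a dmu_objset_type_t) plus an optional "props" list -- and, for zvols, validates "volsize" and "volblocksize" out of that props list before calling dmu_objset_create(). The sketch below is a hypothetical userland illustration of such an innvl, assuming libnvpair and the ZFS headers that define DMU_OST_ZVOL are installed; it only builds and prints the list, it does not issue the ioctl.

/* cc -o create_innvl create_innvl.c -lnvpair   (add -I paths for the ZFS headers) */
#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>         /* dmu_objset_type_t, DMU_OST_ZVOL */

int
main(void)
{
        nvlist_t *innvl = fnvlist_alloc();
        nvlist_t *props = fnvlist_alloc();

        /* The two properties zfs_ioc_create() checks for DMU_OST_ZVOL. */
        fnvlist_add_uint64(props, "volsize", 1ULL << 30);       /* 1 GiB */
        fnvlist_add_uint64(props, "volblocksize", 8192);

        /* innvl layout from the comment above: "type" (int32), optional "props". */
        fnvlist_add_int32(innvl, "type", DMU_OST_ZVOL);
        fnvlist_add_nvlist(innvl, "props", props);

        nvlist_print(stdout, innvl);

        fnvlist_free(props);
        fnvlist_free(innvl);
        return (0);
}

The zfs_ioc_clone() handler a little further down consumes the same kind of list, with an "origin" snapshot name in place of "type".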
-+ */ -+ if (error == 0) { -+ error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, -+ nvprops, outnvl); -+ if (error != 0) -+ (void) dsl_destroy_head(fsname); -+ } - -- if (nvprops == NULL || -- nvlist_lookup_uint64(nvprops, -- zfs_prop_to_name(ZFS_PROP_VOLSIZE), -- &volsize) != 0) { -- nvlist_free(nvprops); -- return (EINVAL); -- } -+#ifdef _KERNEL -+ if (error == 0 && type == DMU_OST_ZVOL) -+ zvol_create_minors(fsname); -+#endif - -- if ((error = nvlist_lookup_uint64(nvprops, -- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), -- &volblocksize)) != 0 && error != ENOENT) { -- nvlist_free(nvprops); -- return (EINVAL); -- } -+ return (error); -+} - -- if (error != 0) -- volblocksize = zfs_prop_default_numeric( -- ZFS_PROP_VOLBLOCKSIZE); -+/* -+ * innvl: { -+ * "origin" -> name of origin snapshot -+ * (optional) "props" -> { prop -> value } -+ * } -+ * -+ * outputs: -+ * outnvl: propname -> error code (int32) -+ */ -+static int -+zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) -+{ -+ int error = 0; -+ nvlist_t *nvprops = NULL; -+ char *origin_name; - -- if ((error = zvol_check_volblocksize( -- volblocksize)) != 0 || -- (error = zvol_check_volsize(volsize, -- volblocksize)) != 0) { -- nvlist_free(nvprops); -- return (error); -- } -- } else if (type == DMU_OST_ZFS) { -- int error; -+ if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) -+ return (SET_ERROR(EINVAL)); -+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); - -- /* -- * We have to have normalization and -- * case-folding flags correct when we do the -- * file system creation, so go figure them out -- * now. -- */ -- VERIFY(nvlist_alloc(&zct.zct_zplprops, -- NV_UNIQUE_NAME, KM_SLEEP) == 0); -- error = zfs_fill_zplprops(zc->zc_name, nvprops, -- zct.zct_zplprops, &is_insensitive); -- if (error != 0) { -- nvlist_free(nvprops); -- nvlist_free(zct.zct_zplprops); -- return (error); -- } -- } -- error = dmu_objset_create(zc->zc_name, type, -- is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); -- nvlist_free(zct.zct_zplprops); -- } -+ if (strchr(fsname, '@') || -+ strchr(fsname, '%')) -+ return (SET_ERROR(EINVAL)); -+ -+ if (dataset_namecheck(origin_name, NULL, NULL) != 0) -+ return (SET_ERROR(EINVAL)); -+ error = dmu_objset_clone(fsname, origin_name); -+ if (error != 0) -+ return (error); - -@@ -3088,8 +3195,13 @@ zfs_ioc_create(zfs_cmd_t *zc) - if (error == 0) { -- error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, -- nvprops, NULL); -+ error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, -+ nvprops, outnvl); - if (error != 0) -- (void) dmu_objset_destroy(zc->zc_name, B_FALSE); -+ (void) dsl_destroy_head(fsname); - } -- nvlist_free(nvprops); -+ -+#ifdef _KERNEL -+ if (error == 0) -+ zvol_create_minors(fsname); -+#endif -+ - return (error); -@@ -3098,41 +3210,64 @@ zfs_ioc_create(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of filesystem -- * zc_value short name of snapshot -- * zc_cookie recursive flag -- * zc_nvlist_src[_size] property list -+ * innvl: { -+ * "snaps" -> { snapshot1, snapshot2 } -+ * (optional) "props" -> { prop -> value (string) } -+ * } - * -- * outputs: -- * zc_value short snapname (i.e. 
part after the '@') -+ * outnvl: snapshot -> error code (int32) - */ - static int --zfs_ioc_snapshot(zfs_cmd_t *zc) -+zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) - { -- nvlist_t *nvprops = NULL; -- int error; -- boolean_t recursive = zc->zc_cookie; -- -- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) -- return (EINVAL); -+ nvlist_t *snaps; -+ nvlist_t *props = NULL; -+ int error, poollen; -+ nvpair_t *pair, *pair2; - -- if (zc->zc_nvlist_src != 0 && -- (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, -- zc->zc_iflags, &nvprops)) != 0) -+ (void) nvlist_lookup_nvlist(innvl, "props", &props); -+ if ((error = zfs_check_userprops(poolname, props)) != 0) - return (error); - -- error = zfs_check_userprops(zc->zc_name, nvprops); -- if (error) -- goto out; -+ if (!nvlist_empty(props) && -+ zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) -+ return (SET_ERROR(ENOTSUP)); - -- if (!nvlist_empty(nvprops) && -- zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { -- error = ENOTSUP; -- goto out; -+ if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) -+ return (SET_ERROR(EINVAL)); -+ poollen = strlen(poolname); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { -+ const char *name = nvpair_name(pair); -+ const char *cp = strchr(name, '@'); -+ -+ /* -+ * The snap name must contain an @, and the part after it must -+ * contain only valid characters. -+ */ -+ if (cp == NULL || snapshot_namecheck(cp + 1, NULL, NULL) != 0) -+ return (SET_ERROR(EINVAL)); -+ -+ /* -+ * The snap must be in the specified pool. -+ */ -+ if (strncmp(name, poolname, poollen) != 0 || -+ (name[poollen] != '/' && name[poollen] != '@')) -+ return (SET_ERROR(EXDEV)); -+ -+ /* This must be the only snap of this fs. */ -+ for (pair2 = nvlist_next_nvpair(snaps, pair); -+ pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { -+ if (strncmp(name, nvpair_name(pair2), cp - name + 1) -+ == 0) { -+ return (SET_ERROR(EXDEV)); -+ } -+ } - } - -- error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, NULL, -- nvprops, recursive, B_FALSE, -1); -+ error = dsl_dataset_snapshot(snaps, props, outnvl); -+ -+#ifdef _KERNEL -+ if (error == 0) -+ zvol_create_minors(poolname); -+#endif - --out: -- nvlist_free(nvprops); - return (error); -@@ -3141,8 +3276,59 @@ out: - /* -- * inputs: -- * name dataset name, or when 'arg == NULL' the full snapshot name -- * arg short snapshot name (i.e. part after the '@') -+ * innvl: "message" -> string -+ */ -+/* ARGSUSED */ -+static int -+zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) -+{ -+ char *message; -+ spa_t *spa; -+ int error; -+ char *poolname; -+ -+ /* -+ * The poolname in the ioctl is not set, we get it from the TSD, -+ * which was set at the end of the last successful ioctl that allows -+ * logging. The secpolicy func already checked that it is set. -+ * Only one log ioctl is allowed after each successful ioctl, so -+ * we clear the TSD here. 
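The new zfs_ioc_snapshot() replaces the zc_name/zc_value pair with a "snaps" nvlist of full snapshot names -- each must contain an '@', live in the pool the ioctl names, and name any one filesystem at most once -- plus optional user "props" applied to every snapshot. A hypothetical sketch of a valid innvl follows, assuming libnvpair; the pool and dataset names are made up.

/* cc -o snapshot_innvl snapshot_innvl.c -lnvpair */
#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
        nvlist_t *innvl = fnvlist_alloc();
        nvlist_t *snaps = fnvlist_alloc();
        nvlist_t *props = fnvlist_alloc();

        /*
         * Full snapshot names, all in one pool ("tank") and at most one per
         * filesystem -- the checks zfs_ioc_snapshot() performs before calling
         * dsl_dataset_snapshot().
         */
        fnvlist_add_boolean(snaps, "tank/fs1@nightly");
        fnvlist_add_boolean(snaps, "tank/fs2@nightly");

        /* Optional user properties, validated by zfs_check_userprops(). */
        fnvlist_add_string(props, "com.example:note", "nightly backup");

        fnvlist_add_nvlist(innvl, "snaps", snaps);
        fnvlist_add_nvlist(innvl, "props", props);
        nvlist_print(stdout, innvl);

        fnvlist_free(props);
        fnvlist_free(snaps);
        fnvlist_free(innvl);
        return (0);
}

Per-snapshot failures come back through outnvl as snapshot -> error code (int32), as the handler's block comment states.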
-+ */ -+ poolname = tsd_get(zfs_allow_log_key); -+ (void) tsd_set(zfs_allow_log_key, NULL); -+ error = spa_open(poolname, &spa, FTAG); -+ strfree(poolname); -+ if (error != 0) -+ return (error); -+ -+ if (nvlist_lookup_string(innvl, "message", &message) != 0) { -+ spa_close(spa, FTAG); -+ return (SET_ERROR(EINVAL)); -+ } -+ -+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { -+ spa_close(spa, FTAG); -+ return (SET_ERROR(ENOTSUP)); -+ } -+ -+ error = spa_history_log(spa, message); -+ spa_close(spa, FTAG); -+ return (error); -+} -+ -+/* -+ * The dp_config_rwlock must not be held when calling this, because the -+ * unmount may need to write out data. -+ * -+ * This function is best-effort. Callers must deal gracefully if it -+ * remains mounted (or is remounted after this call). -+ * -+ * XXX: This function should detect a failure to unmount a snapdir of a dataset -+ * and return the appropriate error code when it is mounted. Its Illumos and -+ * FreeBSD counterparts do this. We do not do this on Linux because there is no -+ * clear way to access the mount information that FreeBSD and Illumos use to -+ * distinguish between things with mounted snapshot directories, and things -+ * without mounted snapshot directories, which include zvols. Returning a -+ * failure for the latter causes `zfs destroy` to fail on zvol snapshots. - */ - int --zfs_unmount_snap(const char *name, void *arg) -+zfs_unmount_snap(const char *snapname) - { -@@ -3150,38 +3336,29 @@ zfs_unmount_snap(const char *name, void *arg) - char *dsname; -- char *snapname; - char *fullname; - char *ptr; -- int error; - -- if (arg) { -- dsname = strdup(name); -- snapname = strdup(arg); -- } else { -- ptr = strchr(name, '@'); -- if (ptr) { -- dsname = strdup(name); -- dsname[ptr - name] = '\0'; -- snapname = strdup(ptr + 1); -- } else { -- return (0); -- } -- } -+ if ((ptr = strchr(snapname, '@')) == NULL) -+ return (0); - -- fullname = kmem_asprintf("%s@%s", dsname, snapname); -+ dsname = kmem_alloc(ptr - snapname + 1, KM_SLEEP); -+ strlcpy(dsname, snapname, ptr - snapname + 1); -+ fullname = strdup(snapname); - -- error = zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE); -- if (error == 0) { -- error = zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE); -+ if (zfs_sb_hold(dsname, FTAG, &zsb, B_FALSE) == 0) { -+ ASSERT(!dsl_pool_config_held(dmu_objset_pool(zsb->z_os))); -+ (void) zfsctl_unmount_snapshot(zsb, fullname, MNT_FORCE); - zfs_sb_rele(zsb, FTAG); -- -- /* Allow ENOENT for consistency with upstream */ -- if (error == ENOENT) -- error = 0; - } - -- strfree(dsname); -- strfree(snapname); -+ kmem_free(dsname, ptr - snapname + 1); - strfree(fullname); - -- return (error); -+ return (0); -+} -+ -+/* ARGSUSED */ -+static int -+zfs_unmount_snap_cb(const char *snapname, void *arg) -+{ -+ return (zfs_unmount_snap(snapname)); - } -@@ -3189,36 +3366,62 @@ zfs_unmount_snap(const char *name, void *arg) - /* -- * inputs: -- * zc_name name of filesystem, snaps must be under it -- * zc_nvlist_src[_size] full names of snapshots to destroy -- * zc_defer_destroy mark for deferred destroy -+ * When a clone is destroyed, its origin may also need to be destroyed, -+ * in which case it must be unmounted. This routine will do that unmount -+ * if necessary. 
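zfs_ioc_log_history() expects nothing but a "message" string; the pool it logs against comes from the zfs_allow_log_key TSD set by the previous loggable ioctl, not from the nvlist. A minimal sketch of that innvl, assuming libnvpair:

/* cc -o log_history_innvl log_history_innvl.c -lnvpair */
#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
        nvlist_t *innvl = fnvlist_alloc();

        /* The only key zfs_ioc_log_history() looks up. */
        fnvlist_add_string(innvl, "message", "example annotation for the pool history");

        nvlist_print(stdout, innvl);
        fnvlist_free(innvl);
        return (0);
}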
-+ */ -+void -+zfs_destroy_unmount_origin(const char *fsname) -+{ -+ int error; -+ objset_t *os; -+ dsl_dataset_t *ds; -+ -+ error = dmu_objset_hold(fsname, FTAG, &os); -+ if (error != 0) -+ return; -+ ds = dmu_objset_ds(os); -+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { -+ char originname[MAXNAMELEN]; -+ dsl_dataset_name(ds->ds_prev, originname); -+ dmu_objset_rele(os, FTAG); -+ (void) zfs_unmount_snap(originname); -+ } else { -+ dmu_objset_rele(os, FTAG); -+ } -+} -+ -+/* -+ * innvl: { -+ * "snaps" -> { snapshot1, snapshot2 } -+ * (optional boolean) "defer" -+ * } - * -- * outputs: -- * zc_name on failure, name of failed snapshot -+ * outnvl: snapshot -> error code (int32) - */ - static int --zfs_ioc_destroy_snaps_nvl(zfs_cmd_t *zc) -+zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) - { -- int err, len; -- nvlist_t *nvl; -+ int error, poollen; -+ nvlist_t *snaps; - nvpair_t *pair; -+ boolean_t defer; - -- if ((err = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, -- zc->zc_iflags, &nvl)) != 0) -- return (err); -+ if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) -+ return (SET_ERROR(EINVAL)); -+ defer = nvlist_exists(innvl, "defer"); - -- len = strlen(zc->zc_name); -- for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; -- pair = nvlist_next_nvpair(nvl, pair)) { -+ poollen = strlen(poolname); -+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; -+ pair = nvlist_next_nvpair(snaps, pair)) { - const char *name = nvpair_name(pair); -+ - /* -- * The snap name must be underneath the zc_name. This ensures -- * that our permission checks were legitimate. -+ * The snap must be in the specified pool. - */ -- if (strncmp(zc->zc_name, name, len) != 0 || -- (name[len] != '@' && name[len] != '/')) { -- nvlist_free(nvl); -- return (EINVAL); -- } -+ if (strncmp(name, poolname, poollen) != 0 || -+ (name[poollen] != '/' && name[poollen] != '@')) -+ return (SET_ERROR(EXDEV)); - -- (void) zfs_unmount_snap(name, NULL); -+ error = zfs_unmount_snap(name); -+ if (error != 0) -+ return (error); - (void) zvol_remove_minor(name); -@@ -3226,6 +3429,3 @@ zfs_ioc_destroy_snaps_nvl(zfs_cmd_t *zc) - -- err = dmu_snapshots_destroy_nvl(nvl, zc->zc_defer_destroy, -- zc->zc_name); -- nvlist_free(nvl); -- return (err); -+ return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); - } -@@ -3244,5 +3444,6 @@ zfs_ioc_destroy(zfs_cmd_t *zc) - int err; -- if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { -- err = zfs_unmount_snap(zc->zc_name, NULL); -- if (err) -+ -+ if (zc->zc_objset_type == DMU_OST_ZFS) { -+ err = zfs_unmount_snap(zc->zc_name); -+ if (err != 0) - return (err); -@@ -3250,3 +3451,6 @@ zfs_ioc_destroy(zfs_cmd_t *zc) - -- err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); -+ if (strchr(zc->zc_name, '@')) -+ err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); -+ else -+ err = dsl_destroy_head(zc->zc_name); - if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) -@@ -3257,47 +3461,17 @@ zfs_ioc_destroy(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of dataset to rollback (to most recent snapshot) -+ * fsname is name of dataset to rollback (to most recent snapshot) - * -- * outputs: none -+ * innvl is not used. 
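zfs_ioc_destroy_snaps() takes the same style of "snaps" list, again restricted to a single pool, plus an optional bare "defer" boolean whose mere presence selects deferred destroy; per-snapshot errors are returned in outnvl. A hypothetical innvl, assuming libnvpair and made-up snapshot names:

/* cc -o destroy_snaps_innvl destroy_snaps_innvl.c -lnvpair */
#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
        nvlist_t *innvl = fnvlist_alloc();
        nvlist_t *snaps = fnvlist_alloc();

        /* Snapshots to destroy; both must be in the pool the ioctl names. */
        fnvlist_add_boolean(snaps, "tank/fs1@old");
        fnvlist_add_boolean(snaps, "tank/fs2@old");
        fnvlist_add_nvlist(innvl, "snaps", snaps);

        /* Presence of "defer" is all zfs_ioc_destroy_snaps() checks for. */
        fnvlist_add_boolean(innvl, "defer");

        nvlist_print(stdout, innvl);
        fnvlist_free(snaps);
        fnvlist_free(innvl);
        return (0);
}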
-+ * -+ * outnvl: "target" -> name of most recent snapshot -+ * } - */ -+/* ARGSUSED */ - static int --zfs_ioc_rollback(zfs_cmd_t *zc) -+zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl) - { -- dsl_dataset_t *ds, *clone; -- int error; - zfs_sb_t *zsb; -- char *clone_name; -- -- error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); -- if (error) -- return (error); -- -- /* must not be a snapshot */ -- if (dsl_dataset_is_snapshot(ds)) { -- dsl_dataset_rele(ds, FTAG); -- return (EINVAL); -- } -- -- /* must have a most recent snapshot */ -- if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { -- dsl_dataset_rele(ds, FTAG); -- return (EINVAL); -- } -- -- /* -- * Create clone of most recent snapshot. -- */ -- clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); -- error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); -- if (error) -- goto out; -- -- error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); -- if (error) -- goto out; -+ int error; - -- /* -- * Do clone swap. -- */ -- if (get_zfs_sb(zc->zc_name, &zsb) == 0) { -+ if (get_zfs_sb(fsname, &zsb) == 0) { - error = zfs_suspend_fs(zsb); -@@ -3306,11 +3480,4 @@ zfs_ioc_rollback(zfs_cmd_t *zc) - -- if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { -- error = dsl_dataset_clone_swap(clone, ds, -- B_TRUE); -- dsl_dataset_disown(ds, FTAG); -- ds = NULL; -- } else { -- error = EBUSY; -- } -- resume_err = zfs_resume_fs(zsb, zc->zc_name); -+ error = dsl_dataset_rollback(fsname, zsb, outnvl); -+ resume_err = zfs_resume_fs(zsb, fsname); - error = error ? error : resume_err; -@@ -3319,20 +3486,18 @@ zfs_ioc_rollback(zfs_cmd_t *zc) - } else { -- if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { -- error = dsl_dataset_clone_swap(clone, ds, B_TRUE); -- dsl_dataset_disown(ds, FTAG); -- ds = NULL; -- } else { -- error = EBUSY; -- } -+ error = dsl_dataset_rollback(fsname, NULL, outnvl); - } -+ return (error); -+} - -- /* -- * Destroy clone (which also closes it). -- */ -- (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); -+static int -+recursive_unmount(const char *fsname, void *arg) -+{ -+ const char *snapname = arg; -+ char *fullname; -+ int error; -+ -+ fullname = kmem_asprintf("%s@%s", fsname, snapname); -+ error = zfs_unmount_snap(fullname); -+ strfree(fullname); - --out: -- strfree(clone_name); -- if (ds) -- dsl_dataset_rele(ds, FTAG); - return (error); -@@ -3352,3 +3517,3 @@ zfs_ioc_rename(zfs_cmd_t *zc) - boolean_t recursive = zc->zc_cookie & 1; -- int err; -+ char *at; - -@@ -3357,23 +3522,29 @@ zfs_ioc_rename(zfs_cmd_t *zc) - strchr(zc->zc_value, '%')) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -- /* -- * Unmount snapshot unless we're doing a recursive rename, -- * in which case the dataset code figures out which snapshots -- * to unmount. -- */ -- if (!recursive && strchr(zc->zc_name, '@') != NULL && -- zc->zc_objset_type == DMU_OST_ZFS) { -- err = zfs_unmount_snap(zc->zc_name, NULL); -- if (err) -- return (err); -- } -+ at = strchr(zc->zc_name, '@'); -+ if (at != NULL) { -+ /* snaps must be in same fs */ -+ int error; - -- err = dmu_objset_rename(zc->zc_name, zc->zc_value, recursive); -- if ((err == 0) && (zc->zc_objset_type == DMU_OST_ZVOL)) { -- (void) zvol_remove_minor(zc->zc_name); -- (void) zvol_create_minor(zc->zc_value); -- } -+ if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) -+ return (SET_ERROR(EXDEV)); -+ *at = '\0'; -+ if (zc->zc_objset_type == DMU_OST_ZFS) { -+ error = dmu_objset_find(zc->zc_name, -+ recursive_unmount, at + 1, -+ recursive ? 
DS_FIND_CHILDREN : 0); -+ if (error != 0) { -+ *at = '@'; -+ return (error); -+ } -+ } -+ error = dsl_dataset_rename_snapshot(zc->zc_name, -+ at + 1, strchr(zc->zc_value, '@') + 1, recursive); -+ *at = '@'; - -- return (err); -+ return (error); -+ } else { -+ return (dsl_dir_rename(zc->zc_name, zc->zc_value)); -+ } - } -@@ -3412,3 +3583,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - /* USERUSED and GROUPUSED are read-only */ -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -3420,3 +3591,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -3424,3 +3595,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - if (issnap) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -3453,3 +3624,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - SPA_VERSION_GZIP_COMPRESSION)) { -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -3459,3 +3630,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - SPA_VERSION_ZLE_COMPRESSION)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -3472,3 +3643,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - spa_close(spa, FTAG); -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -3486,3 +3657,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - !BOOTFS_COMPRESS_VALID(intval)) { -- return (ERANGE); -+ return (SET_ERROR(ERANGE)); - } -@@ -3493,3 +3664,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - break; -@@ -3498,3 +3669,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - break; -@@ -3503,3 +3674,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - break; -@@ -3512,3 +3683,3 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - SPA_VERSION_PASSTHROUGH_X)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -3523,22 +3694,2 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) - /* -- * Activates a feature on a pool in response to a property setting. This -- * creates a new sync task which modifies the pool to reflect the feature -- * as being active. 
-- */ --static int --zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature) --{ -- int err; -- -- /* EBUSY here indicates that the feature is already active */ -- err = dsl_sync_task_do(dp, zfs_prop_activate_feature_check, -- zfs_prop_activate_feature_sync, dp->dp_spa, feature, 2); -- -- if (err != 0 && err != EBUSY) -- return (err); -- else -- return (0); --} -- --/* - * Checks for a race condition to make sure we don't increment a feature flag -@@ -3546,8 +3697,7 @@ zfs_prop_activate_feature(dsl_pool_t *dp, zfeature_info_t *feature) - */ --/*ARGSUSED*/ - static int --zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) -+zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - -@@ -3556,3 +3706,3 @@ zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) - else -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - } -@@ -3564,6 +3714,6 @@ zfs_prop_activate_feature_check(void *arg1, void *arg2, dmu_tx_t *tx) - static void --zfs_prop_activate_feature_sync(void *arg1, void *arg2, dmu_tx_t *tx) -+zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) - { -- spa_t *spa = arg1; -- zfeature_info_t *feature = arg2; -+ spa_t *spa = dmu_tx_pool(tx)->dp_spa; -+ zfeature_info_t *feature = arg; - -@@ -3573,2 +3723,23 @@ zfs_prop_activate_feature_sync(void *arg1, void *arg2, dmu_tx_t *tx) - /* -+ * Activates a feature on a pool in response to a property setting. This -+ * creates a new sync task which modifies the pool to reflect the feature -+ * as being active. -+ */ -+static int -+zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature) -+{ -+ int err; -+ -+ /* EBUSY here indicates that the feature is already active */ -+ err = dsl_sync_task(spa_name(spa), -+ zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, -+ feature, 2); -+ -+ if (err != 0 && err != EBUSY) -+ return (err); -+ else -+ return (0); -+} -+ -+/* - * Removes properties from the given props list that fail permission checks -@@ -3607,3 +3778,3 @@ zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) - if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || -- (err = zfs_secpolicy_inherit(zc, CRED())) != 0) { -+ (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { - VERIFY(nvlist_remove_nvpair(props, pair) == 0); -@@ -3726,3 +3897,2 @@ zfs_ioc_recv(zfs_cmd_t *zc) - file_t *fp; -- objset_t *os; - dmu_recv_cookie_t drc; -@@ -3736,3 +3906,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - nvlist_t *origprops = NULL; /* existing properties */ -- objset_t *origin = NULL; -+ char *origin = NULL; - char *tosnap; -@@ -3744,3 +3914,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - strchr(zc->zc_value, '%')) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -3759,3 +3929,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - nvlist_free(props); -- return (EBADF); -+ return (SET_ERROR(EBADF)); - } -@@ -3764,7 +3934,20 @@ zfs_ioc_recv(zfs_cmd_t *zc) - -- if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) { -- if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) && -- !dsl_prop_get_hasrecvd(os)) { -+ if (zc->zc_string[0]) -+ origin = zc->zc_string; -+ -+ error = dmu_recv_begin(tofs, tosnap, -+ &zc->zc_begin_record, force, origin, &drc); -+ if (error != 0) -+ goto out; -+ -+ /* -+ * Set properties before we receive the stream so that they are applied -+ * to the new data. Note that we must call dmu_recv_stream() if -+ * dmu_recv_begin() succeeds. 
-+ */ -+ if (props != NULL && !drc.drc_newfs) { -+ if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= -+ SPA_VERSION_RECVD_PROPS && -+ !dsl_prop_get_hasrecvd(tofs)) - first_recvd_props = B_TRUE; -- } - -@@ -3775,3 +3958,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - */ -- if (dsl_prop_get_received(os, &origprops) == 0) { -+ if (dsl_prop_get_received(tofs, &origprops) == 0) { - nvlist_t *errlist = NULL; -@@ -3787,56 +3970,26 @@ zfs_ioc_recv(zfs_cmd_t *zc) - props_reduce(props, origprops); -- if (zfs_check_clearable(tofs, origprops, -- &errlist) != 0) -+ if (zfs_check_clearable(tofs, origprops, &errlist) != 0) - (void) nvlist_merge(errors, errlist, 0); - nvlist_free(errlist); -- } -- -- dmu_objset_rele(os, FTAG); -- } -- -- if (zc->zc_string[0]) { -- error = dmu_objset_hold(zc->zc_string, FTAG, &origin); -- if (error) -- goto out; -- } - -- error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds, -- &zc->zc_begin_record, force, origin, &drc); -- if (origin) -- dmu_objset_rele(origin, FTAG); -- if (error) -- goto out; -- -- /* -- * Set properties before we receive the stream so that they are applied -- * to the new data. Note that we must call dmu_recv_stream() if -- * dmu_recv_begin() succeeds. -- */ -- if (props) { -- nvlist_t *errlist; -- -- if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) { -- if (drc.drc_newfs) { -- if (spa_version(os->os_spa) >= -- SPA_VERSION_RECVD_PROPS) -- first_recvd_props = B_TRUE; -- } else if (origprops != NULL) { -- if (clear_received_props(os, tofs, origprops, -- first_recvd_props ? NULL : props) != 0) -- zc->zc_obj |= ZPROP_ERR_NOCLEAR; -- } else { -+ if (clear_received_props(tofs, origprops, -+ first_recvd_props ? NULL : props) != 0) - zc->zc_obj |= ZPROP_ERR_NOCLEAR; -- } -- dsl_prop_set_hasrecvd(os); -- } else if (!drc.drc_newfs) { -+ } else { - zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } -+ } - -- (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, -- props, &errlist); -- (void) nvlist_merge(errors, errlist, 0); -- nvlist_free(errlist); -+ if (props != NULL) { -+ props_error = dsl_prop_set_hasrecvd(tofs); -+ -+ if (props_error == 0) { -+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, -+ props, errors); -+ } - } - -- if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) { -+ if (zc->zc_nvlist_dst_size != 0 && -+ (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || -+ put_nvlist(zc, errors) != 0)) { - /* -@@ -3845,3 +3998,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - */ -- props_error = EINVAL; -+ props_error = SET_ERROR(EINVAL); - } -@@ -3864,3 +4017,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - */ -- end_err = dmu_recv_end(&drc); -+ end_err = dmu_recv_end(&drc, zsb); - if (error == 0) -@@ -3870,3 +4023,3 @@ zfs_ioc_recv(zfs_cmd_t *zc) - } else { -- error = dmu_recv_end(&drc); -+ error = dmu_recv_end(&drc, NULL); - } -@@ -3884,2 +4037,8 @@ zfs_ioc_recv(zfs_cmd_t *zc) - #endif -+ -+#ifdef _KERNEL -+ if (error == 0) -+ zvol_create_minors(tofs); -+#endif -+ - /* -@@ -3887,18 +4046,12 @@ zfs_ioc_recv(zfs_cmd_t *zc) - */ -- if (error && props) { -- if (dmu_objset_hold(tofs, FTAG, &os) == 0) { -- if (clear_received_props(os, tofs, props, NULL) != 0) { -- /* -- * We failed to clear the received properties. -- * Since we may have left a $recvd value on the -- * system, we can't clear the $hasrecvd flag. -- */ -- zc->zc_obj |= ZPROP_ERR_NORESTORE; -- } else if (first_recvd_props) { -- dsl_prop_unset_hasrecvd(os); -- } -- dmu_objset_rele(os, FTAG); -- } else if (!drc.drc_newfs) { -- /* We failed to clear the received properties. 
*/ -+ if (error != 0 && props != NULL && !drc.drc_newfs) { -+ if (clear_received_props(tofs, props, NULL) != 0) { -+ /* -+ * We failed to clear the received properties. -+ * Since we may have left a $recvd value on the -+ * system, we can't clear the $hasrecvd flag. -+ */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; -+ } else if (first_recvd_props) { -+ dsl_prop_unset_hasrecvd(tofs); - } -@@ -3954,66 +4107,66 @@ zfs_ioc_send(zfs_cmd_t *zc) - { -- objset_t *fromsnap = NULL; -- objset_t *tosnap; - int error; - offset_t off; -- dsl_dataset_t *ds; -- dsl_dataset_t *dsfrom = NULL; -- spa_t *spa; -- dsl_pool_t *dp; - boolean_t estimate = (zc->zc_guid != 0); - -- error = spa_open(zc->zc_name, &spa, FTAG); -- if (error) -- return (error); -+ if (zc->zc_obj != 0) { -+ dsl_pool_t *dp; -+ dsl_dataset_t *tosnap; - -- dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- if (error) { -- spa_close(spa, FTAG); -- return (error); -- } -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) -+ return (error); - -- error = dmu_objset_from_ds(ds, &tosnap); -- if (error) { -- dsl_dataset_rele(ds, FTAG); -- spa_close(spa, FTAG); -- return (error); -+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ -+ if (dsl_dir_is_clone(tosnap->ds_dir)) -+ zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj; -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); - } - -- if (zc->zc_fromobj != 0) { -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &dsfrom); -- rw_exit(&dp->dp_config_rwlock); -- spa_close(spa, FTAG); -- if (error) { -- dsl_dataset_rele(ds, FTAG); -+ if (estimate) { -+ dsl_pool_t *dp; -+ dsl_dataset_t *tosnap; -+ dsl_dataset_t *fromsnap = NULL; -+ -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) - return (error); -- } -- error = dmu_objset_from_ds(dsfrom, &fromsnap); -- if (error) { -- dsl_dataset_rele(dsfrom, FTAG); -- dsl_dataset_rele(ds, FTAG); -+ -+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); - return (error); - } -- } else { -- spa_close(spa, FTAG); -- } - -- if (estimate) { -- error = dmu_send_estimate(tosnap, fromsnap, zc->zc_obj, -+ if (zc->zc_fromobj != 0) { -+ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, -+ FTAG, &fromsnap); -+ if (error != 0) { -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ } -+ -+ error = dmu_send_estimate(tosnap, fromsnap, - &zc->zc_objset_type); -+ -+ if (fromsnap != NULL) -+ dsl_dataset_rele(fromsnap, FTAG); -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); - } else { - file_t *fp = getf(zc->zc_cookie); -- if (fp == NULL) { -- dsl_dataset_rele(ds, FTAG); -- if (dsfrom) -- dsl_dataset_rele(dsfrom, FTAG); -- return (EBADF); -- } -+ if (fp == NULL) -+ return (SET_ERROR(EBADF)); - - off = fp->f_offset; -- error = dmu_send(tosnap, fromsnap, zc->zc_obj, -- zc->zc_cookie, fp->f_vnode, &off); -+ error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, -+ zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off); - -@@ -4023,5 +4176,2 @@ zfs_ioc_send(zfs_cmd_t *zc) - } -- if (dsfrom) -- dsl_dataset_rele(dsfrom, FTAG); -- dsl_dataset_rele(ds, FTAG); - return (error); -@@ -4040,2 +4190,3 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) - { -+ dsl_pool_t *dp; - dsl_dataset_t *ds; -@@ -4044,5 
+4195,12 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) - -- if ((error = dsl_dataset_hold(zc->zc_name, FTAG, &ds)) != 0) -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); -+ if (error != 0) - return (error); - -+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ - mutex_enter(&ds->ds_sendstream_lock); -@@ -4066,3 +4224,3 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) - else -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - -@@ -4070,2 +4228,3 @@ zfs_ioc_send_progress(zfs_cmd_t *zc) - dsl_dataset_rele(ds, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -4143,3 +4302,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - mutex_exit(&spa_namespace_lock); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -4159,3 +4318,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - if (zc->zc_nvlist_src == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -4176,3 +4335,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - -- if (error) -+ if (error != 0) - return (error); -@@ -4188,3 +4347,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - spa_close(spa, FTAG); -- return (ENODEV); -+ return (SET_ERROR(ENODEV)); - } -@@ -4200,3 +4359,3 @@ zfs_ioc_clear(zfs_cmd_t *zc) - if (zio_resume(spa) != 0) -- error = EIO; -+ error = SET_ERROR(EIO); - -@@ -4214,3 +4373,3 @@ zfs_ioc_pool_reopen(zfs_cmd_t *zc) - error = spa_open(zc->zc_name, &spa, FTAG); -- if (error) -+ if (error != 0) - return (error); -@@ -4254,3 +4413,3 @@ zfs_ioc_promote(zfs_cmd_t *zc) - (void) dmu_objset_find(zc->zc_value, -- zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); -+ zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); - return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); -@@ -4277,6 +4436,6 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) - if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE); -- if (error) -+ if (error != 0) - return (error); -@@ -4310,6 +4469,6 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc) - if (bufsize <= 0) -- return (ENOMEM); -+ return (SET_ERROR(ENOMEM)); - - error = zfs_sb_hold(zc->zc_name, FTAG, &zsb, B_FALSE); -- if (error) -+ if (error != 0) - return (error); -@@ -4354,4 +4513,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) - error = zfs_suspend_fs(zsb); -- if (error == 0) -+ if (error == 0) { -+ dmu_objset_refresh_ownership(zsb->z_os, -+ zsb); - error = zfs_resume_fs(zsb, zc->zc_name); -+ } - } -@@ -4363,3 +4525,3 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) - error = dmu_objset_hold(zc->zc_name, FTAG, &os); -- if (error) -+ if (error != 0) - return (error); -@@ -4376,3 +4538,3 @@ zfs_ioc_share(zfs_cmd_t *zc) - { -- return (ENOSYS); -+ return (SET_ERROR(ENOSYS)); - } -@@ -4398,3 +4560,3 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) - error = dmu_objset_hold(zc->zc_name, FTAG, &os); -- if (error) -+ if (error != 0) - return (error); -@@ -4415,2 +4577,3 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) - * outputs: -+ * zc_value short name of new snapshot - */ -@@ -4420,3 +4583,9 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) - char *snap_name; -+ char *hold_name; - int error; -+ minor_t minor; -+ -+ error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); -+ if (error != 0) -+ return (error); - -@@ -4424,18 +4593,12 @@ zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) - (u_longlong_t)ddi_get_lbolt64()); -+ hold_name = kmem_asprintf("%%%s", zc->zc_value); - -- if (strlen(snap_name) >= MAXNAMELEN) { -- strfree(snap_name); -- return (E2BIG); -- } -- -- error = dmu_objset_snapshot(zc->zc_name, snap_name, snap_name, -- NULL, B_FALSE, B_TRUE, zc->zc_cleanup_fd); 
-- if (error != 0) { -- strfree(snap_name); -- return (error); -- } -- -- (void) strcpy(zc->zc_value, snap_name); -+ error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, -+ hold_name); -+ if (error == 0) -+ (void) strcpy(zc->zc_value, snap_name); - strfree(snap_name); -- return (0); -+ strfree(hold_name); -+ zfs_onexit_fd_rele(zc->zc_cleanup_fd); -+ return (error); - } -@@ -4454,4 +4617,2 @@ zfs_ioc_diff(zfs_cmd_t *zc) - { -- objset_t *fromsnap; -- objset_t *tosnap; - file_t *fp; -@@ -4460,18 +4621,5 @@ zfs_ioc_diff(zfs_cmd_t *zc) - -- error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); -- if (error) -- return (error); -- -- error = dmu_objset_hold(zc->zc_value, FTAG, &fromsnap); -- if (error) { -- dmu_objset_rele(tosnap, FTAG); -- return (error); -- } -- - fp = getf(zc->zc_cookie); -- if (fp == NULL) { -- dmu_objset_rele(fromsnap, FTAG); -- dmu_objset_rele(tosnap, FTAG); -- return (EBADF); -- } -+ if (fp == NULL) -+ return (SET_ERROR(EBADF)); - -@@ -4479,3 +4627,3 @@ zfs_ioc_diff(zfs_cmd_t *zc) - -- error = dmu_diff(tosnap, fromsnap, fp->f_vnode, &off); -+ error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); - -@@ -4485,4 +4633,2 @@ zfs_ioc_diff(zfs_cmd_t *zc) - -- dmu_objset_rele(fromsnap, FTAG); -- dmu_objset_rele(tosnap, FTAG); - return (error); -@@ -4539,3 +4685,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - VN_RELE(vp); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4558,3 +4704,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - error = dmu_tx_assign(tx, TXG_WAIT); -- if (error) { -+ if (error != 0) { - dmu_tx_abort(tx); -@@ -4564,3 +4710,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - } -- if (error) { -+ if (error != 0) { - mutex_exit(&zsb->z_lock); -@@ -4629,3 +4775,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - default: -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - break; -@@ -4640,3 +4786,3 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - #else -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - #endif /* HAVE_SMB_SHARE */ -@@ -4645,21 +4791,18 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of filesystem -- * zc_value short name of snap -- * zc_string user-supplied tag for this hold -- * zc_cookie recursive flag -- * zc_temphold set if hold is temporary -- * zc_cleanup_fd cleanup-on-exit file descriptor for calling process -- * zc_sendobj if non-zero, the objid for zc_name@zc_value -- * zc_createtxg if zc_sendobj is non-zero, snap must have zc_createtxg -+ * innvl: { -+ * "holds" -> { snapname -> holdname (string), ... } -+ * (optional) "cleanup_fd" -> fd (int32) -+ * } - * -- * outputs: none -+ * outnvl: { -+ * snapname -> error value (int32) -+ * ... 
-+ * } - */ -+/* ARGSUSED */ - static int --zfs_ioc_hold(zfs_cmd_t *zc) -+zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) - { -- boolean_t recursive = zc->zc_cookie; -- spa_t *spa; -- dsl_pool_t *dp; -- dsl_dataset_t *ds; -+ nvlist_t *holds; -+ int cleanup_fd = -1; - int error; -@@ -4667,55 +4810,15 @@ zfs_ioc_hold(zfs_cmd_t *zc) - -- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) -- return (EINVAL); -- -- if (zc->zc_sendobj == 0) { -- return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, -- zc->zc_string, recursive, zc->zc_temphold, -- zc->zc_cleanup_fd)); -- } -- -- if (recursive) -- return (EINVAL); -- -- error = spa_open(zc->zc_name, &spa, FTAG); -- if (error) -- return (error); -- -- dp = spa_get_dsl(spa); -- rw_enter(&dp->dp_config_rwlock, RW_READER); -- error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); -- rw_exit(&dp->dp_config_rwlock); -- spa_close(spa, FTAG); -- if (error) -- return (error); -- -- /* -- * Until we have a hold on this snapshot, it's possible that -- * zc_sendobj could've been destroyed and reused as part -- * of a later txg. Make sure we're looking at the right object. -- */ -- if (zc->zc_createtxg != ds->ds_phys->ds_creation_txg) { -- dsl_dataset_rele(ds, FTAG); -- return (ENOENT); -- } -+ error = nvlist_lookup_nvlist(args, "holds", &holds); -+ if (error != 0) -+ return (SET_ERROR(EINVAL)); - -- if (zc->zc_cleanup_fd != -1 && zc->zc_temphold) { -- error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); -- if (error) { -- dsl_dataset_rele(ds, FTAG); -+ if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { -+ error = zfs_onexit_fd_hold(cleanup_fd, &minor); -+ if (error != 0) - return (error); -- } -- } -- -- error = dsl_dataset_user_hold_for_send(ds, zc->zc_string, -- zc->zc_temphold); -- if (minor != 0) { -- if (error == 0) { -- dsl_register_onexit_hold_cleanup(ds, zc->zc_string, -- minor); -- } -- zfs_onexit_fd_rele(zc->zc_cleanup_fd); - } -- dsl_dataset_rele(ds, FTAG); - -+ error = dsl_dataset_user_hold(holds, minor, errlist); -+ if (minor != 0) -+ zfs_onexit_fd_rele(cleanup_fd); - return (error); -@@ -4724,20 +4827,14 @@ zfs_ioc_hold(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of dataset from which we're releasing a user hold -- * zc_value short name of snap -- * zc_string user-supplied tag for this hold -- * zc_cookie recursive flag -+ * innvl is not used. - * -- * outputs: none -+ * outnvl: { -+ * holdname -> time added (uint64 seconds since epoch) -+ * ... -+ * } - */ -+/* ARGSUSED */ - static int --zfs_ioc_release(zfs_cmd_t *zc) -+zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) - { -- boolean_t recursive = zc->zc_cookie; -- -- if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) -- return (EINVAL); -- -- return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, -- zc->zc_string, recursive)); -+ return (dsl_dataset_get_holds(snapname, outnvl)); - } -@@ -4745,20 +4842,17 @@ zfs_ioc_release(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name name of filesystem -+ * innvl: { -+ * snapname -> { holdname, ... } -+ * ... -+ * } - * -- * outputs: -- * zc_nvlist_src{_size} nvlist of snapshot holds -+ * outnvl: { -+ * snapname -> error value (int32) -+ * ... 
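zfs_ioc_hold() now takes a "holds" nvlist mapping full snapshot names to hold tags, plus an optional "cleanup_fd" so temporary holds are released when that descriptor is closed, while zfs_ioc_release() takes, for each snapshot, a nested list of the tags to drop. The sketch below shows both argument lists; it assumes libnvpair, and the names and the fd value are illustrative only (a real cleanup_fd is an open descriptor on the ZFS control device).

/* cc -o hold_release_innvl hold_release_innvl.c -lnvpair */
#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
        nvlist_t *hold_args = fnvlist_alloc();
        nvlist_t *holds = fnvlist_alloc();
        nvlist_t *release_args = fnvlist_alloc();
        nvlist_t *tags = fnvlist_alloc();

        /* ZFS_IOC_HOLD: snapshot -> tag, plus an optional cleanup fd. */
        fnvlist_add_string(holds, "tank/fs1@nightly", "example-tag");
        fnvlist_add_nvlist(hold_args, "holds", holds);
        fnvlist_add_int32(hold_args, "cleanup_fd", 7);  /* illustrative value */

        /* ZFS_IOC_RELEASE: snapshot -> { tag, ... } to release. */
        fnvlist_add_boolean(tags, "example-tag");
        fnvlist_add_nvlist(release_args, "tank/fs1@nightly", tags);

        nvlist_print(stdout, hold_args);
        nvlist_print(stdout, release_args);

        fnvlist_free(tags);
        fnvlist_free(release_args);
        fnvlist_free(holds);
        fnvlist_free(hold_args);
        return (0);
}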
-+ * } - */ -+/* ARGSUSED */ - static int --zfs_ioc_get_holds(zfs_cmd_t *zc) -+zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) - { -- nvlist_t *nvp; -- int error; -- -- if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { -- error = put_nvlist(zc, nvp); -- nvlist_free(nvp); -- } -- -- return (error); -+ return (dsl_dataset_user_release(holds, errlist)); - } -@@ -4768,2 +4862,3 @@ zfs_ioc_get_holds(zfs_cmd_t *zc) - * zc_guid flags (ZEVENT_NONBLOCK) -+ * zc_cleanup_fd zevent file descriptor - * -@@ -4772,3 +4867,2 @@ zfs_ioc_get_holds(zfs_cmd_t *zc) - * zc_cookie dropped events since last get -- * zc_cleanup_fd cleanup-on-exit file descriptor - */ -@@ -4803,3 +4897,3 @@ zfs_ioc_events_next(zfs_cmd_t *zc) - error = zfs_zevent_wait(ze); -- if (error) -+ if (error != 0) - break; -@@ -4824,3 +4918,25 @@ zfs_ioc_events_clear(zfs_cmd_t *zc) - -- return 0; -+ return (0); -+} -+ -+/* -+ * inputs: -+ * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END -+ * zc_cleanup zevent file descriptor -+ */ -+static int -+zfs_ioc_events_seek(zfs_cmd_t *zc) -+{ -+ zfs_zevent_t *ze; -+ minor_t minor; -+ int error; -+ -+ error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); -+ if (error != 0) -+ return (error); -+ -+ error = zfs_zevent_seek(ze, zc->zc_guid); -+ zfs_zevent_fd_rele(zc->zc_cleanup_fd); -+ -+ return (error); - } -@@ -4841,10 +4957,17 @@ zfs_ioc_space_written(zfs_cmd_t *zc) - int error; -+ dsl_pool_t *dp; - dsl_dataset_t *new, *old; - -- error = dsl_dataset_hold(zc->zc_name, FTAG, &new); -+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp); - if (error != 0) - return (error); -- error = dsl_dataset_hold(zc->zc_value, FTAG, &old); -+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -4856,2 +4979,3 @@ zfs_ioc_space_written(zfs_cmd_t *zc) - dsl_dataset_rele(new, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -4860,23 +4984,37 @@ zfs_ioc_space_written(zfs_cmd_t *zc) - /* -- * inputs: -- * zc_name full name of last snapshot -- * zc_value full name of first snapshot -+ * innvl: { -+ * "firstsnap" -> snapshot name -+ * } - * -- * outputs: -- * zc_cookie space in bytes -- * zc_objset_type compressed space in bytes -- * zc_perm_action uncompressed space in bytes -+ * outnvl: { -+ * "used" -> space in bytes -+ * "compressed" -> compressed space in bytes -+ * "uncompressed" -> uncompressed space in bytes -+ * } - */ - static int --zfs_ioc_space_snaps(zfs_cmd_t *zc) -+zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) - { - int error; -+ dsl_pool_t *dp; - dsl_dataset_t *new, *old; -+ char *firstsnap; -+ uint64_t used, comp, uncomp; -+ -+ if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) -+ return (SET_ERROR(EINVAL)); - -- error = dsl_dataset_hold(zc->zc_name, FTAG, &new); -+ error = dsl_pool_hold(lastsnap, FTAG, &dp); - if (error != 0) - return (error); -- error = dsl_dataset_hold(zc->zc_value, FTAG, &old); -+ -+ error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); - if (error != 0) { - dsl_dataset_rele(new, FTAG); -+ dsl_pool_rele(dp, FTAG); - return (error); -@@ -4884,6 +5022,9 @@ zfs_ioc_space_snaps(zfs_cmd_t *zc) - -- error = dsl_dataset_space_wouldfree(old, new, 
&zc->zc_cookie, -- &zc->zc_objset_type, &zc->zc_perm_action); -+ error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); - dsl_dataset_rele(old, FTAG); - dsl_dataset_rele(new, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ fnvlist_add_uint64(outnvl, "used", used); -+ fnvlist_add_uint64(outnvl, "compressed", comp); -+ fnvlist_add_uint64(outnvl, "uncompressed", uncomp); - return (error); -@@ -4892,142 +5033,389 @@ zfs_ioc_space_snaps(zfs_cmd_t *zc) - /* -- * pool create, destroy, and export don't log the history as part of -- * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export -- * do the logging of those commands. -+ * innvl: { -+ * "fd" -> file descriptor to write stream to (int32) -+ * (optional) "fromsnap" -> full snap name to send an incremental from -+ * } -+ * -+ * outnvl is unused - */ --static zfs_ioc_vec_t zfs_ioc_vec[] = { -- { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_scan, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, -- POOL_CHECK_READONLY }, -- { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_create_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_remove_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { 
zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_NONE }, -- { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_destroy_snaps_nvl, zfs_secpolicy_destroy_recursive, -- DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_obj_to_path, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, DATASET_NAME, -- B_FALSE, POOL_CHECK_NONE }, -- { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, DATASET_NAME, -- B_FALSE, POOL_CHECK_NONE }, -- { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, -- DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_next_obj, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_diff, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, DATASET_NAME, -- B_FALSE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_obj_to_stats, zfs_secpolicy_diff, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_events_next, 
zfs_secpolicy_config, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, -- POOL_CHECK_NONE }, -- { zfs_ioc_pool_reguid, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY }, -- { zfs_ioc_space_written, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, B_TRUE, -- POOL_CHECK_SUSPENDED }, -- { zfs_ioc_send_progress, zfs_secpolicy_read, DATASET_NAME, B_FALSE, -- POOL_CHECK_NONE } --}; -+/* ARGSUSED */ -+static int -+zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) -+{ -+ int error; -+ offset_t off; -+ char *fromname = NULL; -+ int fd; -+ file_t *fp; -+ -+ error = nvlist_lookup_int32(innvl, "fd", &fd); -+ if (error != 0) -+ return (SET_ERROR(EINVAL)); -+ -+ (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); -+ -+ if ((fp = getf(fd)) == NULL) -+ return (SET_ERROR(EBADF)); -+ -+ off = fp->f_offset; -+ error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off); -+ -+ if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) -+ fp->f_offset = off; -+ -+ releasef(fd); -+ return (error); -+} -+ -+/* -+ * Determine approximately how large a zfs send stream will be -- the number -+ * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). -+ * -+ * innvl: { -+ * (optional) "fromsnap" -> full snap name to send an incremental from -+ * } -+ * -+ * outnvl: { -+ * "space" -> bytes of space (uint64) -+ * } -+ */ -+static int -+zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) -+{ -+ dsl_pool_t *dp; -+ dsl_dataset_t *fromsnap = NULL; -+ dsl_dataset_t *tosnap; -+ int error; -+ char *fromname; -+ uint64_t space; -+ -+ error = dsl_pool_hold(snapname, FTAG, &dp); -+ if (error != 0) -+ return (error); -+ -+ error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); -+ if (error != 0) { -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ -+ error = nvlist_lookup_string(innvl, "fromsnap", &fromname); -+ if (error == 0) { -+ error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); -+ if (error != 0) { -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+ } -+ } -+ -+ error = dmu_send_estimate(tosnap, fromsnap, &space); -+ fnvlist_add_uint64(outnvl, "space", space); -+ -+ if (fromsnap != NULL) -+ dsl_dataset_rele(fromsnap, FTAG); -+ dsl_dataset_rele(tosnap, FTAG); -+ dsl_pool_rele(dp, FTAG); -+ return (error); -+} -+ -+ -+static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; -+ -+static void -+zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, -+ boolean_t log_history, zfs_ioc_poolcheck_t pool_check) -+{ -+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; -+ -+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST); -+ ASSERT3U(ioc, <, ZFS_IOC_LAST); -+ ASSERT3P(vec->zvec_legacy_func, ==, NULL); -+ ASSERT3P(vec->zvec_func, ==, NULL); -+ -+ vec->zvec_legacy_func = func; -+ vec->zvec_secpolicy = secpolicy; -+ vec->zvec_namecheck = namecheck; -+ vec->zvec_allow_log = log_history; -+ vec->zvec_pool_check = pool_check; -+} -+ -+/* -+ * See the block comment at the beginning of this file for details on -+ * each argument to this function. 
-+ */ -+static void -+zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, -+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, -+ zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, -+ boolean_t allow_log) -+{ -+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; -+ -+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST); -+ ASSERT3U(ioc, <, ZFS_IOC_LAST); -+ ASSERT3P(vec->zvec_legacy_func, ==, NULL); -+ ASSERT3P(vec->zvec_func, ==, NULL); -+ -+ /* if we are logging, the name must be valid */ -+ ASSERT(!allow_log || namecheck != NO_NAME); -+ -+ vec->zvec_name = name; -+ vec->zvec_func = func; -+ vec->zvec_secpolicy = secpolicy; -+ vec->zvec_namecheck = namecheck; -+ vec->zvec_pool_check = pool_check; -+ vec->zvec_smush_outnvlist = smush_outnvlist; -+ vec->zvec_allow_log = allow_log; -+} -+ -+static void -+zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy, boolean_t log_history, -+ zfs_ioc_poolcheck_t pool_check) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ POOL_NAME, log_history, pool_check); -+} -+ -+static void -+zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ DATASET_NAME, B_FALSE, pool_check); -+} -+ -+static void -+zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) -+{ -+ zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, -+ POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -+} -+ -+static void -+zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ NO_NAME, B_FALSE, POOL_CHECK_NONE); -+} -+ -+static void -+zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, -+ zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); -+} -+ -+static void -+zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) -+{ -+ zfs_ioctl_register_dataset_read_secpolicy(ioc, func, -+ zfs_secpolicy_read); -+} -+ -+static void -+zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, -+ zfs_secpolicy_func_t *secpolicy) -+{ -+ zfs_ioctl_register_legacy(ioc, func, secpolicy, -+ DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -+} -+ -+static void -+zfs_ioctl_init(void) -+{ -+ zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, -+ zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, -+ zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, -+ zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, -+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, -+ zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, -+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, -+ zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, -+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("create", ZFS_IOC_CREATE, -+ zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, -+ POOL_CHECK_SUSPENDED | 
POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("clone", ZFS_IOC_CLONE, -+ zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, -+ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("hold", ZFS_IOC_HOLD, -+ zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ zfs_ioctl_register("release", ZFS_IOC_RELEASE, -+ zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); -+ -+ zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, -+ zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, -+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); -+ -+ zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, -+ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); -+ -+ /* IOCTLS that use the legacy function signature */ -+ -+ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, -+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, -+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, -+ zfs_ioc_pool_scan); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, -+ zfs_ioc_pool_upgrade); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, -+ zfs_ioc_vdev_add); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, -+ zfs_ioc_vdev_remove); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, -+ zfs_ioc_vdev_set_state); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, -+ zfs_ioc_vdev_attach); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, -+ zfs_ioc_vdev_detach); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, -+ zfs_ioc_vdev_setpath); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, -+ zfs_ioc_vdev_setfru); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, -+ zfs_ioc_pool_set_props); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, -+ zfs_ioc_vdev_split); -+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, -+ zfs_ioc_pool_reguid); -+ -+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, -+ zfs_ioc_pool_configs, zfs_secpolicy_none); -+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, -+ zfs_ioc_pool_tryimport, zfs_secpolicy_config); -+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, -+ zfs_ioc_inject_fault, zfs_secpolicy_inject); -+ zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, -+ zfs_ioc_clear_fault, zfs_secpolicy_inject); -+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, -+ zfs_ioc_inject_list_next, zfs_secpolicy_inject); -+ -+ /* -+ * pool destroy, and export don't log the history as part of -+ * zfsdev_ioctl, but rather zfs_ioc_pool_export -+ * does the logging of those commands. 
-+ */ -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, -+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, -+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, -+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, -+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, -+ zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); -+ zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, -+ zfs_ioc_dsobj_to_dsname, -+ zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, -+ zfs_ioc_pool_get_history, -+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, -+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); -+ -+ zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, -+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); -+ zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, -+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); -+ -+ zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, -+ zfs_ioc_space_written); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, -+ zfs_ioc_objset_recvd_props); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, -+ zfs_ioc_next_obj); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, -+ zfs_ioc_get_fsacl); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, -+ zfs_ioc_objset_stats); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, -+ zfs_ioc_objset_zplprops); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, -+ zfs_ioc_dataset_list_next); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, -+ zfs_ioc_snapshot_list_next); -+ zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, -+ zfs_ioc_send_progress); -+ -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, -+ zfs_ioc_diff, zfs_secpolicy_diff); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, -+ zfs_ioc_obj_to_stats, zfs_secpolicy_diff); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, -+ zfs_ioc_obj_to_path, zfs_secpolicy_diff); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, -+ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, -+ zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); -+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, -+ zfs_ioc_send, zfs_secpolicy_send); -+ -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, -+ zfs_secpolicy_none); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, -+ zfs_secpolicy_destroy); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, -+ zfs_secpolicy_rename); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, -+ zfs_secpolicy_recv); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, -+ zfs_secpolicy_promote); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, -+ zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); -+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, -+ zfs_secpolicy_set_fsacl); -+ -+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, -+ zfs_secpolicy_share, POOL_CHECK_NONE); -+ 
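For illustration only, a minimal sketch of how a new-style nvlist ioctl plugs into the registration table introduced by the hunks above; the handler name, the ZFS_IOC_EXAMPLE constant, and the "value"/"doubled" nvlist keys are invented placeholders and are not part of this patch:

/*
 * Hypothetical new-style handler.  The signature follows the
 * (const char *, nvlist_t *innvl, nvlist_t *outnvl) form used by
 * zfs_ioc_send_space() above: inputs arrive pre-unpacked in innvl and
 * results are handed back to user space through outnvl by zfsdev_ioctl().
 */
static int
zfs_ioc_example(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	uint64_t value;

	/* required input argument */
	if (nvlist_lookup_uint64(innvl, "value", &value) != 0)
		return (SET_ERROR(EINVAL));

	/* result returned through the output nvlist */
	fnvlist_add_uint64(outnvl, "doubled", value * 2);
	return (0);
}

/* registered from zfs_ioctl_init(), like the handlers above */
zfs_ioctl_register("example", ZFS_IOC_EXAMPLE,
    zfs_ioc_example, zfs_secpolicy_read, DATASET_NAME,
    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);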
zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, -+ zfs_secpolicy_smb_acl, POOL_CHECK_NONE); -+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, -+ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, -+ zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, -+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); -+ -+ /* -+ * ZoL functions -+ */ -+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, -+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); -+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, -+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); -+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, -+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); -+} - -@@ -5048,5 +5436,5 @@ pool_status_check(const char *name, zfs_ioc_namecheck_t type, - if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) -- error = EAGAIN; -+ error = SET_ERROR(EAGAIN); - else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) -- error = EROFS; -+ error = SET_ERROR(EROFS); - spa_close(spa, FTAG); -@@ -5064,8 +5452,11 @@ zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) - for (zs = list_head(&zfsdev_state_list); zs != NULL; -- zs = list_next(&zfsdev_state_list, zs)) { -+ zs = list_next(&zfsdev_state_list, zs)) { - if (zs->zs_minor == minor) { - switch (which) { -- case ZST_ONEXIT: return (zs->zs_onexit); -- case ZST_ZEVENT: return (zs->zs_zevent); -- case ZST_ALL: return (zs); -+ case ZST_ONEXIT: -+ return (zs->zs_onexit); -+ case ZST_ZEVENT: -+ return (zs->zs_zevent); -+ case ZST_ALL: -+ return (zs); - } -@@ -5074,3 +5465,3 @@ zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) - -- return NULL; -+ return (NULL); - } -@@ -5086,3 +5477,3 @@ zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) - -- return ptr; -+ return (ptr); - } -@@ -5130,7 +5521,7 @@ zfsdev_state_init(struct file *filp) - -- minor = zfsdev_minor_alloc(); -- if (minor == 0) -- return (ENXIO); -+ minor = zfsdev_minor_alloc(); -+ if (minor == 0) -+ return (SET_ERROR(ENXIO)); - -- zs = kmem_zalloc( sizeof(zfsdev_state_t), KM_SLEEP); -+ zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); - -@@ -5161,5 +5552,5 @@ zfsdev_state_destroy(struct file *filp) - list_remove(&zfsdev_state_list, zs); -- kmem_free(zs, sizeof(zfsdev_state_t)); -+ kmem_free(zs, sizeof (zfsdev_state_t)); - -- return 0; -+ return (0); - } -@@ -5194,8 +5585,19 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - zfs_cmd_t *zc; -- uint_t vec; -- int error, rc, flag = 0; -+ uint_t vecnum; -+ int error, rc, len = 0, flag = 0; -+ const zfs_ioc_vec_t *vec; -+ char *saved_poolname = NULL; -+ nvlist_t *innvl = NULL; -+ -+ vecnum = cmd - ZFS_IOC_FIRST; -+ if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) -+ return (-SET_ERROR(EINVAL)); -+ vec = &zfs_ioc_vec[vecnum]; - -- vec = cmd - ZFS_IOC; -- if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) -- return (-EINVAL); -+ /* -+ * The registered ioctl list may be sparse, verify that either -+ * a normal or legacy handler are registered. 
-+ */ -+ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) -+ return (-SET_ERROR(EINVAL)); - -@@ -5204,7 +5606,14 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); -- if (error != 0) -- error = EFAULT; -+ if (error != 0) { -+ error = SET_ERROR(EFAULT); -+ goto out; -+ } - -- if ((error == 0) && !(flag & FKIOCTL)) -- error = zfs_ioc_vec[vec].zvec_secpolicy(zc, CRED()); -+ zc->zc_iflags = flag & FKIOCTL; -+ if (zc->zc_nvlist_src_size != 0) { -+ error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, -+ zc->zc_iflags, &innvl); -+ if (error != 0) -+ goto out; -+ } - -@@ -5214,36 +5623,103 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - */ -- if (error == 0) { -- zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; -- zc->zc_iflags = flag & FKIOCTL; -- switch (zfs_ioc_vec[vec].zvec_namecheck) { -- case POOL_NAME: -- if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) -- error = EINVAL; -+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; -+ switch (vec->zvec_namecheck) { -+ case POOL_NAME: -+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) -+ error = SET_ERROR(EINVAL); -+ else - error = pool_status_check(zc->zc_name, -- zfs_ioc_vec[vec].zvec_namecheck, -- zfs_ioc_vec[vec].zvec_pool_check); -- break; -+ vec->zvec_namecheck, vec->zvec_pool_check); -+ break; - -- case DATASET_NAME: -- if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) -- error = EINVAL; -+ case DATASET_NAME: -+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) -+ error = SET_ERROR(EINVAL); -+ else - error = pool_status_check(zc->zc_name, -- zfs_ioc_vec[vec].zvec_namecheck, -- zfs_ioc_vec[vec].zvec_pool_check); -- break; -+ vec->zvec_namecheck, vec->zvec_pool_check); -+ break; - -- case NO_NAME: -- break; -- } -+ case NO_NAME: -+ break; - } - -- if (error == 0) -- error = zfs_ioc_vec[vec].zvec_func(zc); - -+ if (error == 0 && !(flag & FKIOCTL)) -+ error = vec->zvec_secpolicy(zc, innvl, CRED()); -+ -+ if (error != 0) -+ goto out; -+ -+ /* legacy ioctls can modify zc_name */ -+ len = strcspn(zc->zc_name, "/@#") + 1; -+ saved_poolname = kmem_alloc(len, KM_SLEEP); -+ (void) strlcpy(saved_poolname, zc->zc_name, len); -+ -+ if (vec->zvec_func != NULL) { -+ nvlist_t *outnvl; -+ int puterror = 0; -+ spa_t *spa; -+ nvlist_t *lognv = NULL; -+ -+ ASSERT(vec->zvec_legacy_func == NULL); -+ -+ /* -+ * Add the innvl to the lognv before calling the func, -+ * in case the func changes the innvl. 
-+ */ -+ if (vec->zvec_allow_log) { -+ lognv = fnvlist_alloc(); -+ fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, -+ vec->zvec_name); -+ if (!nvlist_empty(innvl)) { -+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, -+ innvl); -+ } -+ } -+ -+ VERIFY0(nvlist_alloc(&outnvl, NV_UNIQUE_NAME, KM_PUSHPAGE)); -+ error = vec->zvec_func(zc->zc_name, innvl, outnvl); -+ -+ if (error == 0 && vec->zvec_allow_log && -+ spa_open(zc->zc_name, &spa, FTAG) == 0) { -+ if (!nvlist_empty(outnvl)) { -+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, -+ outnvl); -+ } -+ (void) spa_history_log_nvl(spa, lognv); -+ spa_close(spa, FTAG); -+ } -+ fnvlist_free(lognv); -+ -+ if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { -+ int smusherror = 0; -+ if (vec->zvec_smush_outnvlist) { -+ smusherror = nvlist_smush(outnvl, -+ zc->zc_nvlist_dst_size); -+ } -+ if (smusherror == 0) -+ puterror = put_nvlist(zc, outnvl); -+ } -+ -+ if (puterror != 0) -+ error = puterror; -+ -+ nvlist_free(outnvl); -+ } else { -+ error = vec->zvec_legacy_func(zc); -+ } -+ -+out: -+ nvlist_free(innvl); - rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); -- if (error == 0) { -- if (rc != 0) -- error = EFAULT; -- if (zfs_ioc_vec[vec].zvec_his_log) -- zfs_log_history(zc); -+ if (error == 0 && rc != 0) -+ error = SET_ERROR(EFAULT); -+ if (error == 0 && vec->zvec_allow_log) { -+ char *s = tsd_get(zfs_allow_log_key); -+ if (s != NULL) -+ strfree(s); -+ (void) tsd_set(zfs_allow_log_key, saved_poolname); -+ } else { -+ if (saved_poolname != NULL) -+ kmem_free(saved_poolname, len); - } -@@ -5258,6 +5734,6 @@ zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - { -- return zfsdev_ioctl(filp, cmd, arg); -+ return (zfsdev_ioctl(filp, cmd, arg)); - } - #else --#define zfsdev_compat_ioctl NULL -+#define zfsdev_compat_ioctl NULL - #endif -@@ -5265,7 +5741,7 @@ zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) - static const struct file_operations zfsdev_fops = { -- .open = zfsdev_open, -- .release = zfsdev_release, -- .unlocked_ioctl = zfsdev_ioctl, -- .compat_ioctl = zfsdev_compat_ioctl, -- .owner = THIS_MODULE, -+ .open = zfsdev_open, -+ .release = zfsdev_release, -+ .unlocked_ioctl = zfsdev_ioctl, -+ .compat_ioctl = zfsdev_compat_ioctl, -+ .owner = THIS_MODULE, - }; -@@ -5273,5 +5749,5 @@ static const struct file_operations zfsdev_fops = { - static struct miscdevice zfs_misc = { -- .minor = MISC_DYNAMIC_MINOR, -- .name = ZFS_DRIVER, -- .fops = &zfsdev_fops, -+ .minor = MISC_DYNAMIC_MINOR, -+ .name = ZFS_DRIVER, -+ .fops = &zfsdev_fops, - }; -@@ -5288,3 +5764,3 @@ zfs_attach(void) - error = misc_register(&zfs_misc); -- if (error) { -+ if (error != 0) { - printk(KERN_INFO "ZFS: misc_register() failed %d\n", error); -@@ -5302,3 +5778,3 @@ zfs_detach(void) - error = misc_deregister(&zfs_misc); -- if (error) -+ if (error != 0) - printk(KERN_INFO "ZFS: misc_deregister() failed %d\n", error); -@@ -5309,9 +5785,13 @@ zfs_detach(void) - --uint_t zfs_fsyncer_key; --extern uint_t rrw_tsd_key; -+static void -+zfs_allow_log_destroy(void *arg) -+{ -+ char *poolname = arg; -+ strfree(poolname); -+} - - #ifdef DEBUG --#define ZFS_DEBUG_STR " (DEBUG mode)" -+#define ZFS_DEBUG_STR " (DEBUG mode)" - #else --#define ZFS_DEBUG_STR "" -+#define ZFS_DEBUG_STR "" - #endif -@@ -5329,2 +5809,4 @@ _init(void) - -+ zfs_ioctl_init(); -+ - if ((error = zfs_attach()) != 0) -@@ -5333,8 +5815,12 @@ _init(void) - tsd_create(&zfs_fsyncer_key, NULL); -- tsd_create(&rrw_tsd_key, NULL); -+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy); -+ 
tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); - - printk(KERN_NOTICE "ZFS: Loaded module v%s-%s%s, " -- "ZFS pool version %s, ZFS filesystem version %s\n", -- ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, -- SPA_VERSION_STRING, ZPL_VERSION_STRING); -+ "ZFS pool version %s, ZFS filesystem version %s\n", -+ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, -+ SPA_VERSION_STRING, ZPL_VERSION_STRING); -+#ifndef CONFIG_FS_POSIX_ACL -+ printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n"); -+#endif /* CONFIG_FS_POSIX_ACL */ - -@@ -5348,4 +5834,4 @@ out1: - printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s" -- ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE, -- ZFS_DEBUG_STR, error); -+ ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE, -+ ZFS_DEBUG_STR, error); - -@@ -5364,5 +5850,6 @@ _fini(void) - tsd_destroy(&rrw_tsd_key); -+ tsd_destroy(&zfs_allow_log_key); - - printk(KERN_NOTICE "ZFS: Unloaded module v%s-%s%s\n", -- ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); -+ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); - -@@ -5378,2 +5865,3 @@ MODULE_AUTHOR(ZFS_META_AUTHOR); - MODULE_LICENSE(ZFS_META_LICENSE); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - #endif /* HAVE_SPL */ -diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c -index cbd6f1c..cfce831 100644 ---- a/module/zfs/zfs_log.c -+++ b/module/zfs/zfs_log.c -@@ -214,5 +214,4 @@ zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) - /* -- * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, -- * TX_MKDIR_ATTR and TX_MKXATTR -- * transactions. -+ * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and -+ * TK_MKXATTR transactions. - * -@@ -241,3 +240,2 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - lr_acl_create_t *lracl; -- xvattr_t *xvap = (xvattr_t *)vap; - size_t aclsize = 0; -@@ -245,2 +243,3 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - size_t txsize; -+ xvattr_t *xvap = (xvattr_t *)vap; - void *end; -@@ -271,3 +270,2 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - } else { -- aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0; - txsize = -@@ -344,3 +342,3 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. -+ * Handles both TX_REMOVE and TX_RMDIR transactions. - */ -@@ -368,3 +366,3 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_link() handles TX_LINK transactions. -+ * Handles TX_LINK transactions. - */ -@@ -391,3 +389,3 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_symlink() handles TX_SYMLINK transactions. -+ * Handles TX_SYMLINK transactions. - */ -@@ -423,3 +421,3 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_rename() handles TX_RENAME transactions. -+ * Handles TX_RENAME transactions. - */ -@@ -449,3 +447,5 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - /* -- * zfs_log_write() handles TX_WRITE transactions. -+ * zfs_log_write() handles TX_WRITE transactions. The specified callback is -+ * called as soon as the write is on stable storage (be it via a DMU sync or a -+ * ZIL commit). 
- */ -@@ -455,3 +455,4 @@ void - zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, -- znode_t *zp, offset_t off, ssize_t resid, int ioflag) -+ znode_t *zp, offset_t off, ssize_t resid, int ioflag, -+ zil_callback_t callback, void *callback_data) - { -@@ -462,4 +463,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - -- if (zil_replaying(zilog, tx) || zp->z_unlinked) -+ if (zil_replaying(zilog, tx) || zp->z_unlinked) { -+ if (callback != NULL) -+ callback(callback_data); - return; -+ } - -@@ -520,2 +524,4 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - -+ itx->itx_callback = callback; -+ itx->itx_callback_data = callback_data; - zil_itx_assign(zilog, itx, tx); -@@ -528,3 +534,3 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - /* -- * zfs_log_truncate() handles TX_TRUNCATE transactions. -+ * Handles TX_TRUNCATE transactions. - */ -@@ -551,3 +557,3 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - /* -- * zfs_log_setattr() handles TX_SETATTR transactions. -+ * Handles TX_SETATTR transactions. - */ -@@ -613,3 +619,3 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - /* -- * zfs_log_acl() handles TX_ACL transactions. -+ * Handles TX_ACL transactions. - */ -diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c -index 2f60b5e..2b286e7 100644 ---- a/module/zfs/zfs_onexit.c -+++ b/module/zfs/zfs_onexit.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -111,3 +112,3 @@ zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) - if (*zo == NULL) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -130,3 +131,3 @@ zfs_onexit_fd_hold(int fd, minor_t *minorp) - if (fp == NULL) -- return (EBADF); -+ return (SET_ERROR(EBADF)); - -@@ -157,3 +158,3 @@ zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - -- ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); -+ ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_PUSHPAGE); - list_link_init(&ap->za_link); -@@ -213,3 +214,3 @@ zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) - mutex_exit(&zo->zo_lock); -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } -@@ -242,3 +243,3 @@ zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) - else -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - mutex_exit(&zo->zo_lock); -diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c -index 813250c..6ac10e2 100644 ---- a/module/zfs/zfs_replay.c -+++ b/module/zfs/zfs_replay.c -@@ -23,2 +23,3 @@ - * Copyright (c) 2012 Cyril Plisko. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -75,3 +76,3 @@ zfs_replay_error(zfs_sb_t *zsb, lr_t *lr, boolean_t byteswap) - { -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -389,3 +390,3 @@ zfs_replay_create_acl(zfs_sb_t *zsb, lr_acl_create_t *lracl, boolean_t byteswap) - default: -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } -@@ -515,3 +516,3 @@ zfs_replay_create(zfs_sb_t *zsb, lr_create_t *lr, boolean_t byteswap) - default: -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } -@@ -555,3 +556,3 @@ zfs_replay_remove(zfs_sb_t *zsb, lr_remove_t *lr, boolean_t byteswap) - default: -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - } -@@ -679,3 +680,3 @@ zfs_replay_write(zfs_sb_t *zsb, lr_write_t *lr, boolean_t byteswap) - else if (written < length) -- error = EIO; /* short write */ -+ error = SET_ERROR(EIO); /* short write */ - -diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c -index 136972b..2533ced 100644 ---- a/module/zfs/zfs_rlock.c -+++ b/module/zfs/zfs_rlock.c -@@ -30,3 +30,3 @@ - * This file contains the code to implement file range locking in -- * ZFS, although there isn't much specific to ZFS (all that comes to mind -+ * ZFS, although there isn't much specific to ZFS (all that comes to mind is - * support for growing the blocksize). -@@ -552,3 +552,3 @@ zfs_range_unlock(rl_t *rl) - ASSERT(!rl->r_proxy); -- list_create(&free_list, sizeof(rl_t), offsetof(rl_t, rl_node)); -+ list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node)); - -diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c -index 621c5f9..ebe92bb 100644 ---- a/module/zfs/zfs_sa.c -+++ b/module/zfs/zfs_sa.c -@@ -266,3 +266,3 @@ out: - * since the SA code can read both old/new znode formats -- * with probably little to know performance difference. -+ * with probably little to no performance difference. - * -@@ -312,3 +312,3 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) - /* First do a bulk query of the attributes that aren't cached */ -- bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 20, KM_SLEEP); -+ bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 20, KM_SLEEP); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16); -@@ -326,3 +326,3 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) - if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) { -- kmem_free(bulk, sizeof(sa_bulk_attr_t) * 20); -+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * 20); - goto done; -@@ -335,3 +335,3 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) - count = 0; -- sa_attrs = kmem_zalloc(sizeof(sa_bulk_attr_t) * 20, KM_SLEEP); -+ sa_attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * 20, KM_SLEEP); - SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zsb), NULL, &mode, 8); -@@ -392,4 +392,4 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) - zp->z_is_sa = B_TRUE; -- kmem_free(sa_attrs, sizeof(sa_bulk_attr_t) * 20); -- kmem_free(bulk, sizeof(sa_bulk_attr_t) * 20); -+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * 20); -+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * 20); - done: -diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c -index 9ae7ab5..a27ac69 100644 ---- a/module/zfs/zfs_vfsops.c -+++ b/module/zfs/zfs_vfsops.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */ -@@ -139,2 +140,8 @@ atime_changed_cb(void *arg, uint64_t newval) - static void -+relatime_changed_cb(void *arg, uint64_t newval) -+{ -+ ((zfs_sb_t *)arg)->z_relatime = newval; -+} -+ -+static void - xattr_changed_cb(void *arg, uint64_t newval) -@@ -156,2 +163,26 @@ xattr_changed_cb(void *arg, uint64_t newval) - static void -+acltype_changed_cb(void *arg, uint64_t newval) -+{ -+ zfs_sb_t *zsb = arg; -+ -+ switch (newval) { -+ case ZFS_ACLTYPE_OFF: -+ zsb->z_acl_type = ZFS_ACLTYPE_OFF; -+ zsb->z_sb->s_flags &= ~MS_POSIXACL; -+ break; -+ case ZFS_ACLTYPE_POSIXACL: -+#ifdef CONFIG_FS_POSIX_ACL -+ zsb->z_acl_type = ZFS_ACLTYPE_POSIXACL; -+ zsb->z_sb->s_flags |= MS_POSIXACL; -+#else -+ zsb->z_acl_type = ZFS_ACLTYPE_OFF; -+ zsb->z_sb->s_flags &= ~MS_POSIXACL; -+#endif /* CONFIG_FS_POSIX_ACL */ -+ break; -+ default: -+ break; -+ } -+} -+ -+static void - blksz_changed_cb(void *arg, uint64_t newval) -@@ -249,24 +280,30 @@ zfs_register_callbacks(zfs_sb_t *zsb) - ds = dmu_objset_ds(os); -+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG); - error = dsl_prop_register(ds, -- "atime", atime_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zsb); -+ error = error ? error : dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "xattr", xattr_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "recordsize", blksz_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "readonly", readonly_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "devices", devices_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "setuid", setuid_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "exec", exec_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "snapdir", snapdir_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "aclinherit", acl_inherit_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "vscan", vscan_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zsb); - error = error ? error : dsl_prop_register(ds, -- "nbmand", nbmand_changed_cb, zsb); -+ zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zsb); -+ error = error ? 
error : dsl_prop_register(ds, -+ zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zsb); -+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG); - if (error) -@@ -285,14 +322,28 @@ unregister: - */ -- (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, -- zsb); -- (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zsb); -- (void) dsl_prop_unregister(ds, "nbmand", nbmand_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME), -+ atime_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RELATIME), -+ relatime_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR), -+ xattr_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), -+ blksz_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY), -+ readonly_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), -+ devices_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID), -+ setuid_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC), -+ exec_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), -+ snapdir_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), -+ acltype_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), -+ acl_inherit_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), -+ vscan_changed_cb, zsb); -+ (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), -+ nbmand_changed_cb, zsb); - -@@ -306,4 +357,2 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - { -- int error = 0; -- - /* -@@ -312,3 +361,3 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -321,3 +370,3 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - if (data == NULL) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - -@@ -364,3 +413,3 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - } -- return (error); -+ return (0); - } -@@ -397,3 +446,3 @@ zfs_userquota_prop_to_obj(zfs_sb_t *zsb, zfs_userquota_prop_t type) - default: -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -413,3 +462,3 @@ zfs_userspace_many(zfs_sb_t *zsb, zfs_userquota_prop_t type, - if (!dmu_objset_userspace_present(zsb->z_os)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -458,3 +507,3 @@ id_to_fuidstr(zfs_sb_t *zsb, const char *domain, uid_t rid, - if (domainid == -1) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -476,3 +525,3 @@ zfs_userspace_one(zfs_sb_t *zsb, zfs_userquota_prop_t type, - if (!dmu_objset_userspace_present(zsb->z_os)) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -504,6 +553,6 @@ 
zfs_set_userquota(zfs_sb_t *zsb, zfs_userquota_prop_t type, - if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (zsb->z_version < ZPL_VERSION_USERSPACE) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -638,3 +687,3 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp) - (u_longlong_t)spa_version(dmu_objset_spa(os))); -- error = ENOTSUP; -+ error = SET_ERROR(ENOTSUP); - goto out; -@@ -653,2 +702,6 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp) - -+ if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &zval)) != 0) -+ goto out; -+ zsb->z_acl_type = (uint_t)zval; -+ - /* -@@ -727,3 +780,3 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp) - offsetof(znode_t, z_link_node)); -- rrw_init(&zsb->z_teardown_lock); -+ rrw_init(&zsb->z_teardown_lock, B_FALSE); - rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); -@@ -873,2 +926,5 @@ zfs_unregister_callbacks(zfs_sb_t *zsb) - -+ VERIFY(dsl_prop_unregister(ds, "relatime", relatime_changed_cb, -+ zsb) == 0); -+ - VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, -@@ -894,2 +950,5 @@ zfs_unregister_callbacks(zfs_sb_t *zsb) - -+ VERIFY(dsl_prop_unregister(ds, "acltype", acltype_changed_cb, -+ zsb) == 0); -+ - VERIFY(dsl_prop_unregister(ds, "aclinherit", -@@ -908,9 +967,8 @@ EXPORT_SYMBOL(zfs_unregister_callbacks); - /* -- * zfs_check_global_label: -- * Check that the hex label string is appropriate for the dataset -- * being mounted into the global_zone proper. -+ * Check that the hex label string is appropriate for the dataset being -+ * mounted into the global_zone proper. - * -- * Return an error if the hex label string is not default or -- * admin_low/admin_high. For admin_low labels, the corresponding -- * dataset must be readonly. -+ * Return an error if the hex label string is not default or -+ * admin_low/admin_high. For admin_low labels, the corresponding -+ * dataset must be readonly. - */ -@@ -929,6 +987,6 @@ zfs_check_global_label(const char *dsname, const char *hexsl) - zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) -- return (EACCES); -+ return (SET_ERROR(EACCES)); - return (rdonly ? 0 : EACCES); - } -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -1048,2 +1106,10 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - -+ /* -+ * If someone has not already unmounted this file system, -+ * drain the iput_taskq to ensure all active references to the -+ * zfs_sb_t have been handled only then can it be safely destroyed. -+ */ -+ if (zsb->z_os) -+ taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool(zsb->z_os))); -+ - rrw_enter(&zsb->z_teardown_lock, RW_WRITER, FTAG); -@@ -1062,10 +1128,2 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - /* -- * If someone has not already unmounted this file system, -- * drain the iput_taskq to ensure all active references to the -- * zfs_sb_t have been handled only then can it be safely destroyed. -- */ -- if (zsb->z_os) -- taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool(zsb->z_os))); -- -- /* - * Close the zil. 
NB: Can't close the zil while zfs_inactive -@@ -1088,3 +1146,3 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - rrw_exit(&zsb->z_teardown_lock, FTAG); -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -1101,6 +1159,4 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - zp = list_next(&zsb->z_all_znodes, zp)) { -- if (zp->z_sa_hdl) { -- ASSERT(atomic_read(&ZTOI(zp)->i_count) > 0); -+ if (zp->z_sa_hdl) - zfs_znode_dmu_fini(zp); -- } - } -@@ -1139,3 +1195,3 @@ zfs_sb_teardown(zfs_sb_t *zsb, boolean_t unmounting) - txg_wait_synced(dmu_objset_pool(zsb->z_os), 0); -- (void) dmu_objset_evict_dbufs(zsb->z_os); -+ dmu_objset_evict_dbufs(zsb->z_os); - -@@ -1208,5 +1264,10 @@ zfs_domount(struct super_block *sb, void *data, int silent) - readonly_changed_cb(zsb, B_TRUE); -- if ((error = dsl_prop_get_integer(osname,"xattr",&pval,NULL))) -+ if ((error = dsl_prop_get_integer(osname, -+ "xattr", &pval, NULL))) - goto out; - xattr_changed_cb(zsb, pval); -+ if ((error = dsl_prop_get_integer(osname, -+ "acltype", &pval, NULL))) -+ goto out; -+ acltype_changed_cb(zsb, pval); - zsb->z_issnap = B_TRUE; -@@ -1232,3 +1293,3 @@ zfs_domount(struct super_block *sb, void *data, int silent) - (void) zfs_umount(sb); -- error = ENOMEM; -+ error = SET_ERROR(ENOMEM); - goto out; -@@ -1347,3 +1408,3 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) - if (err) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1362,3 +1423,3 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1396,3 +1457,3 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1412,3 +1473,5 @@ EXPORT_SYMBOL(zfs_vget); - * Note, if successful, then we return with the 'z_teardown_lock' and -- * 'z_teardown_inactive_lock' write held. -+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying -+ * dataset and objset intact so that they can be atomically handed off during -+ * a subsequent rollback or recv operation and the resume thereafter. - */ -@@ -1422,4 +1485,2 @@ zfs_suspend_fs(zfs_sb_t *zsb) - -- dmu_objset_disown(zsb->z_os, zsb); -- - return (0); -@@ -1435,2 +1496,4 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname) - int err, err2; -+ znode_t *zp; -+ uint64_t sa_obj = 0; - -@@ -1439,43 +1502,59 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname) - -- err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zsb, &zsb->z_os); -- if (err) { -- zsb->z_os = NULL; -- } else { -- znode_t *zp; -- uint64_t sa_obj = 0; -+ /* -+ * We already own this, so just hold and rele it to update the -+ * objset_t, as the one we had before may have been evicted. 
-+ */ -+ VERIFY0(dmu_objset_hold(osname, zsb, &zsb->z_os)); -+ VERIFY3P(zsb->z_os->os_dsl_dataset->ds_owner, ==, zsb); -+ VERIFY(dsl_dataset_long_held(zsb->z_os->os_dsl_dataset)); -+ dmu_objset_rele(zsb->z_os, zsb); - -- err2 = zap_lookup(zsb->z_os, MASTER_NODE_OBJ, -- ZFS_SA_ATTRS, 8, 1, &sa_obj); -+ /* -+ * Make sure version hasn't changed -+ */ - -- if ((err || err2) && zsb->z_version >= ZPL_VERSION_SA) -- goto bail; -+ err = zfs_get_zplprop(zsb->z_os, ZFS_PROP_VERSION, -+ &zsb->z_version); - -+ if (err) -+ goto bail; - -- if ((err = sa_setup(zsb->z_os, sa_obj, -- zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0) -- goto bail; -+ err = zap_lookup(zsb->z_os, MASTER_NODE_OBJ, -+ ZFS_SA_ATTRS, 8, 1, &sa_obj); - -- VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0); -- zsb->z_rollback_time = jiffies; -+ if (err && zsb->z_version >= ZPL_VERSION_SA) -+ goto bail; - -- /* -- * Attempt to re-establish all the active inodes with their -- * dbufs. If a zfs_rezget() fails, then we unhash the inode -- * and mark it stale. This prevents a collision if a new -- * inode/object is created which must use the same inode -- * number. The stale inode will be be released when the -- * VFS prunes the dentry holding the remaining references -- * on the stale inode. -- */ -- mutex_enter(&zsb->z_znodes_lock); -- for (zp = list_head(&zsb->z_all_znodes); zp; -- zp = list_next(&zsb->z_all_znodes, zp)) { -- err2 = zfs_rezget(zp); -- if (err2) { -- remove_inode_hash(ZTOI(zp)); -- zp->z_is_stale = B_TRUE; -- } -+ if ((err = sa_setup(zsb->z_os, sa_obj, -+ zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0) -+ goto bail; -+ -+ if (zsb->z_version >= ZPL_VERSION_SA) -+ sa_register_update_callback(zsb->z_os, -+ zfs_sa_upgrade); -+ -+ VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0); -+ -+ zfs_set_fuid_feature(zsb); -+ zsb->z_rollback_time = jiffies; -+ -+ /* -+ * Attempt to re-establish all the active inodes with their -+ * dbufs. If a zfs_rezget() fails, then we unhash the inode -+ * and mark it stale. This prevents a collision if a new -+ * inode/object is created which must use the same inode -+ * number. The stale inode will be be released when the -+ * VFS prunes the dentry holding the remaining references -+ * on the stale inode. -+ */ -+ mutex_enter(&zsb->z_znodes_lock); -+ for (zp = list_head(&zsb->z_all_znodes); zp; -+ zp = list_next(&zsb->z_all_znodes, zp)) { -+ err2 = zfs_rezget(zp); -+ if (err2) { -+ remove_inode_hash(ZTOI(zp)); -+ zp->z_is_stale = B_TRUE; - } -- mutex_exit(&zsb->z_znodes_lock); - } -+ mutex_exit(&zsb->z_znodes_lock); - -@@ -1488,4 +1567,4 @@ bail: - /* -- * Since we couldn't reopen zfs_sb_t or, setup the -- * sa framework, force unmount this file system. -+ * Since we couldn't setup the sa framework, try to force -+ * unmount this file system. 
- */ -@@ -1506,6 +1585,6 @@ zfs_set_version(zfs_sb_t *zsb, uint64_t newvers) - if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (newvers < zsb->z_version) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1513,3 +1592,3 @@ zfs_set_version(zfs_sb_t *zsb, uint64_t newvers) - spa_version(dmu_objset_spa(zsb->z_os))) -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - -@@ -1552,5 +1631,4 @@ zfs_set_version(zfs_sb_t *zsb, uint64_t newvers) - -- spa_history_log_internal(LOG_DS_UPGRADE, -- dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu", -- zsb->z_version, newvers, dmu_objset_id(os)); -+ spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, -+ "from %llu to %llu", zsb->z_version, newvers); - -@@ -1560,4 +1638,3 @@ zfs_set_version(zfs_sb_t *zsb, uint64_t newvers) - -- if (zsb->z_version >= ZPL_VERSION_FUID) -- zfs_set_fuid_feature(zsb); -+ zfs_set_fuid_feature(zsb); - -@@ -1574,3 +1651,3 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) - const char *pname; -- int error = ENOENT; -+ int error = SET_ERROR(ENOENT); - -@@ -1601,2 +1678,5 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) - break; -+ case ZFS_PROP_ACLTYPE: -+ *value = ZFS_ACLTYPE_OFF; -+ break; - default: -@@ -1623,2 +1703,3 @@ zfs_fini(void) - { -+ taskq_wait(system_taskq); - unregister_filesystem(&zpl_fs_type); -diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c -index 876d44b..91f743a 100644 ---- a/module/zfs/zfs_vnops.c -+++ b/module/zfs/zfs_vnops.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -108,7 +108,14 @@ - * -- * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). -- * This is critical because we don't want to block while holding locks. -- * Note, in particular, that if a lock is sometimes acquired before -- * the tx assigns, and sometimes after (e.g. z_lock), then failing to -- * use a non-blocking assign can deadlock the system. The scenario: -+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to -+ * dmu_tx_assign(). This is critical because we don't want to block -+ * while holding locks. -+ * -+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This -+ * reduces lock contention and CPU usage when we must wait (note that if -+ * throughput is constrained by the storage, nearly every transaction -+ * must wait). -+ * -+ * Note, in particular, that if a lock is sometimes acquired before -+ * the tx assigns, and sometimes after (e.g. z_lock), then failing -+ * to use a non-blocking assign can deadlock the system. The scenario: - * -@@ -120,3 +127,7 @@ - * If dmu_tx_assign() returns ERESTART and zsb->z_assign is TXG_NOWAIT, -- * then drop all locks, call dmu_tx_wait(), and try again. -+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent -+ * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, -+ * to indicate that this operation has already called dmu_tx_wait(). -+ * This will ensure that we don't retry forever, waiting a short bit -+ * each time. - * -@@ -125,3 +136,3 @@ - * in the intent log matches the order in which they actually occurred. 
-- * During ZIL replay the zfs_log_* functions will update the sequence -+ * During ZIL replay the zfs_log_* functions will update the sequence - * number to indicate the zil transaction has replayed. -@@ -142,3 +153,3 @@ - * dmu_tx_hold_*(); // hold each object you might modify -- * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign -+ * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - * if (error) { -@@ -148,2 +159,3 @@ - * if (error == ERESTART) { -+ * waited = B_TRUE; - * dmu_tx_wait(tx); -@@ -194,3 +206,3 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -202,3 +214,3 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -225,9 +237,5 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) - -- /* -- * Zero the synchronous opens in the znode. Under Linux the -- * zfs_close() hook is not symmetric with zfs_open(), it is -- * only called once when the last reference is dropped. -- */ -+ /* Decrement the synchronous opens in the znode */ - if (flag & O_SYNC) -- zp->z_sync_cnt = 0; -+ atomic_dec_32(&zp->z_sync_cnt); - -@@ -258,3 +266,3 @@ zfs_holey_common(struct inode *ip, int cmd, loff_t *off) - if (noff >= file_sz) { -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -277,3 +285,3 @@ zfs_holey_common(struct inode *ip, int cmd, loff_t *off) - } -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -359,3 +367,3 @@ update_pages(struct inode *ip, int64_t start, int len, - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when -- * the file is memory mapped. -+ * the file is memory mapped. - */ -@@ -420,4 +428,3 @@ unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. 
- * -@@ -446,3 +453,3 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -454,3 +461,3 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -471,3 +478,3 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - } -@@ -540,3 +547,3 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - if (error == ECKSUM) -- error = EIO; -+ error = SET_ERROR(EIO); - break; -@@ -550,3 +557,2 @@ out: - ZFS_ACCESSTIME_STAMP(zsb, zp); -- zfs_inode_update(zp); - ZFS_EXIT(zsb); -@@ -629,3 +635,3 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -640,3 +646,3 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -649,6 +655,5 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EAGAIN); -+ return (SET_ERROR(EAGAIN)); - } - --#ifdef HAVE_UIO_ZEROCOPY - /* -@@ -658,2 +663,3 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - */ -+#ifdef HAVE_UIO_ZEROCOPY - if ((uio->uio_extflg == UIO_XUIO) && -@@ -662,4 +668,4 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - else -+#endif - uio_prefaultpages(MIN(n, max_blksz), uio); --#endif /* HAVE_UIO_ZEROCOPY */ - -@@ -696,3 +702,3 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - ZFS_EXIT(zsb); -- return (EFBIG); -+ return (SET_ERROR(EFBIG)); - } -@@ -715,3 +721,2 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) - woff = uio->uio_loffset; --again: - if (zfs_owner_overquota(zsb, zp, B_FALSE) || -@@ -720,3 +725,3 @@ again: - dmu_return_arcbuf(abuf); -- error = EDQUOT; -+ error = SET_ERROR(EDQUOT); - break; -@@ -765,9 +770,4 @@ again: - zfs_sa_upgrade_txholds(tx, zp); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { -- if (error == ERESTART) { -- dmu_tx_wait(tx); -- dmu_tx_abort(tx); -- goto again; -- } - dmu_tx_abort(tx); -@@ -895,3 +895,4 @@ again: - -- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); -+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, -+ NULL, NULL); - dmu_tx_commit(tx); -@@ -989,3 +990,3 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (zfs_zget(zsb, object, &zp) != 0) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - if (zp->z_unlinked) { -@@ -996,3 +997,3 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os))); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -@@ -1014,3 +1015,3 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (offset >= zp->z_size) { -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - } else { -@@ -1041,6 +1042,6 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (lr->lr_offset >= zp->z_size) -- error = ENOENT; -+ error = SET_ERROR(ENOENT); - #ifdef DEBUG - if (zil_fault_io) { -- error = EIO; -+ error = SET_ERROR(EIO); - zil_fault_io = 0; -@@ -1053,2 +1054,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (error == 0) { -+ blkptr_t *obp = dmu_buf_get_blkptr(db); -+ if (obp) { -+ ASSERT(BP_IS_HOLE(bp)); -+ *bp = *obp; -+ } -+ - zgd->zgd_db = 
db; -@@ -1118,4 +1125,3 @@ EXPORT_SYMBOL(zfs_access); - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -1137,5 +1143,5 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - if (!S_ISDIR(dip->i_mode)) { -- return (ENOTDIR); -+ return (SET_ERROR(ENOTDIR)); - } else if (zdp->z_sa_hdl == NULL) { -- return (EIO); -+ return (SET_ERROR(EIO)); - } -@@ -1162,3 +1168,3 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - iput(tvp); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } else { -@@ -1184,3 +1190,3 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1208,3 +1214,3 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - ZFS_EXIT(zsb); -- return (ENOTDIR); -+ return (SET_ERROR(ENOTDIR)); - } -@@ -1223,3 +1229,3 @@ zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -1251,4 +1257,3 @@ EXPORT_SYMBOL(zfs_lookup); - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -1276,2 +1281,3 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, - boolean_t have_acl = B_FALSE; -+ boolean_t waited = B_FALSE; - -@@ -1287,3 +1293,3 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1297,3 +1303,3 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -1331,3 +1337,3 @@ top: - if (strcmp(name, "..") == 0) -- error = EISDIR; -+ error = SET_ERROR(EISDIR); - ZFS_EXIT(zsb); -@@ -1358,3 +1364,3 @@ top: - zfs_acl_ids_free(&acl_ids); -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -1369,3 +1375,3 @@ top: - zfs_acl_ids_free(&acl_ids); -- error = EDQUOT; -+ error = SET_ERROR(EDQUOT); - goto out; -@@ -1388,3 +1394,3 @@ top: - } -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -1392,2 +1398,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -1428,3 +1435,3 @@ top: - if (excl) { -- error = EEXIST; -+ error = SET_ERROR(EEXIST); - goto out; -@@ -1435,3 +1442,3 @@ top: - if (S_ISDIR(ZTOI(zp)->i_mode)) { -- error = EISDIR; -+ error = SET_ERROR(EISDIR); - goto out; -@@ -1521,2 +1528,3 @@ zfs_remove(struct inode *dip, char *name, cred_t *cr) - int zflg = ZEXISTS; -+ boolean_t waited = B_FALSE; - -@@ -1560,3 +1568,3 @@ top: - if (S_ISDIR(ip->i_mode)) { -- error = EPERM; -+ error = SET_ERROR(EPERM); - goto out; -@@ -1596,3 +1604,3 @@ top: - -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? 
TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -1603,2 +1611,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -1707,2 +1716,3 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - boolean_t fuid_dirtied; -+ boolean_t waited = B_FALSE; - -@@ -1718,3 +1728,3 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -1726,3 +1736,3 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -1732,3 +1742,3 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -1778,3 +1788,3 @@ top: - ZFS_EXIT(zsb); -- return (EDQUOT); -+ return (SET_ERROR(EDQUOT)); - } -@@ -1798,3 +1808,3 @@ top: - -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -1802,2 +1812,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -1860,4 +1871,3 @@ EXPORT_SYMBOL(zfs_mkdir); - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -1880,2 +1890,3 @@ zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr, - int zflg = ZEXISTS; -+ boolean_t waited = B_FALSE; - -@@ -1906,3 +1917,3 @@ top: - if (!S_ISDIR(ip->i_mode)) { -- error = ENOTDIR; -+ error = SET_ERROR(ENOTDIR); - goto out; -@@ -1911,3 +1922,3 @@ top: - if (ip == cwd) { -- error = EINVAL; -+ error = SET_ERROR(EINVAL); - goto out; -@@ -1933,3 +1944,3 @@ top: - zfs_sa_upgrade_txholds(tx, dzp); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? 
TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -1940,2 +1951,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -2006,8 +2018,8 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - zap_attribute_t zap; -- int outcount; - int error; - uint8_t prefetch; -+ uint8_t type; - int done = 0; - uint64_t parent; -- loff_t *pos = &(ctx->pos); -+ uint64_t offset; /* must be unsigned; checks for < 1 */ - -@@ -2023,3 +2035,2 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- error = 0; - if (zp->z_unlinked) -@@ -2027,3 +2038,5 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - -+ error = 0; - os = zsb->z_os; -+ offset = ctx->pos; - prefetch = zp->z_zn_prefetch; -@@ -2033,3 +2046,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- if (*pos <= 3) { -+ if (offset <= 3) { - /* -@@ -2042,3 +2055,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- zap_cursor_init_serialized(&zc, os, zp->z_id, *pos); -+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); - } -@@ -2048,4 +2061,2 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- outcount = 0; -- - while (!done) { -@@ -2055,3 +2066,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - */ -- if (*pos == 0) { -+ if (offset == 0) { - (void) strcpy(zap.za_name, "."); -@@ -2059,3 +2070,4 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - objnum = zp->z_id; -- } else if (*pos == 1) { -+ type = DT_DIR; -+ } else if (offset == 1) { - (void) strcpy(zap.za_name, ".."); -@@ -2063,3 +2075,4 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - objnum = parent; -- } else if (*pos == 2 && zfs_show_ctldir(zp)) { -+ type = DT_DIR; -+ } else if (offset == 2 && zfs_show_ctldir(zp)) { - (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); -@@ -2067,2 +2080,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - objnum = ZFSCTL_INO_ROOT; -+ type = DT_DIR; - } else { -@@ -2091,6 +2105,6 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - (u_longlong_t)zp->z_id, -- (u_longlong_t)*pos, -+ (u_longlong_t)offset, - zap.za_integer_length, - (u_longlong_t)zap.za_num_integers); -- error = ENXIO; -+ error = SET_ERROR(ENXIO); - goto update; -@@ -2099,2 +2113,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); -+ type = ZFS_DIRENT_TYPE(zap.za_first_integer); - } -@@ -2102,3 +2117,3 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - done = !dir_emit(ctx, zap.za_name, strlen(zap.za_name), -- objnum, ZFS_DIRENT_TYPE(zap.za_first_integer)); -+ objnum, type); - if (done) -@@ -2111,8 +2126,12 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) - -- if (*pos > 2 || (*pos == 2 && !zfs_show_ctldir(zp))) { -+ /* -+ * Move to the next entry, fill in the previous offset. 
-+ */ -+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { - zap_cursor_advance(&zc); -- *pos = zap_cursor_serialize(&zc); -+ offset = zap_cursor_serialize(&zc); - } else { -- (*pos)++; -+ offset += 1; - } -+ ctx->pos = offset; - } -@@ -2126,3 +2145,2 @@ update: - ZFS_ACCESSTIME_STAMP(zsb, zp); -- zfs_inode_update(zp); - -@@ -2376,2 +2394,4 @@ zfs_getattr_fast(struct inode *ip, struct kstat *sp) - zfs_sb_t *zsb = ITOZSB(ip); -+ uint32_t blksize; -+ u_longlong_t nblocks; - -@@ -2385,3 +2405,6 @@ zfs_getattr_fast(struct inode *ip, struct kstat *sp) - -- sa_object_size(zp->z_sa_hdl, (uint32_t *)&sp->blksize, &sp->blocks); -+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); -+ sp->blksize = blksize; -+ sp->blocks = nblocks; -+ - if (unlikely(zp->z_blksz == 0)) { -@@ -2429,3 +2452,3 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - uint_t mask = vap->va_mask; -- uint_t saved_mask; -+ uint_t saved_mask = 0; - int trim_mask = 0; -@@ -2465,3 +2488,3 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2470,3 +2493,3 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - ZFS_EXIT(zsb); -- return (EISDIR); -+ return (SET_ERROR(EISDIR)); - } -@@ -2475,3 +2498,3 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -2484,7 +2507,7 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - -- tmpxvattr = kmem_alloc(sizeof(xvattr_t), KM_SLEEP); -+ tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); - xva_init(tmpxvattr); - -- bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 7, KM_SLEEP); -- xattr_bulk = kmem_alloc(sizeof(sa_bulk_attr_t) * 7, KM_SLEEP); -+ bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP); -+ xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 7, KM_SLEEP); - -@@ -2512,4 +2535,6 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) - if (mask & (ATTR_ATIME | ATTR_MTIME)) { -- if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || -- ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { -+ if (((mask & ATTR_ATIME) && -+ TIMESPEC_OVERFLOW(&vap->va_atime)) || -+ ((mask & ATTR_MTIME) && -+ TIMESPEC_OVERFLOW(&vap->va_mtime))) { - err = EOVERFLOW; -@@ -2825,8 +2850,5 @@ top: - -- err = dmu_tx_assign(tx, TXG_NOWAIT); -- if (err) { -- if (err == ERESTART) -- dmu_tx_wait(tx); -+ err = dmu_tx_assign(tx, TXG_WAIT); -+ if (err) - goto out; -- } - -@@ -3027,5 +3049,5 @@ out2: - out3: -- kmem_free(xattr_bulk, sizeof(sa_bulk_attr_t) * 7); -- kmem_free(bulk, sizeof(sa_bulk_attr_t) * 7); -- kmem_free(tmpxvattr, sizeof(xvattr_t)); -+ kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * 7); -+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * 7); -+ kmem_free(tmpxvattr, sizeof (xvattr_t)); - ZFS_EXIT(zsb); -@@ -3111,3 +3133,3 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) - if (oidp == szp->z_id) /* We're a descendant of szp */ -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -3143,4 +3165,3 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. 
- * -@@ -3164,2 +3185,3 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, - int zflg = 0; -+ boolean_t waited = B_FALSE; - -@@ -3169,5 +3191,5 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, - -- if (tdip->i_sb != sdip->i_sb) { -+ if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) { - ZFS_EXIT(zsb); -- return (EXDEV); -+ return (SET_ERROR(EXDEV)); - } -@@ -3179,3 +3201,3 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -3197,3 +3219,3 @@ top: - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -3342,3 +3364,3 @@ top: - if (!S_ISDIR(ZTOI(tzp)->i_mode)) { -- error = ENOTDIR; -+ error = SET_ERROR(ENOTDIR); - goto out; -@@ -3347,3 +3369,3 @@ top: - if (S_ISDIR(ZTOI(tzp)->i_mode)) { -- error = EISDIR; -+ error = SET_ERROR(EISDIR); - goto out; -@@ -3378,3 +3400,3 @@ top: - dmu_tx_hold_zap(tx, zsb->z_unlinkedobj, FALSE, NULL); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -3392,2 +3414,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -3478,4 +3501,3 @@ EXPORT_SYMBOL(zfs_rename); - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -3500,2 +3522,3 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, - uint64_t txtype = TX_SYMLINK; -+ boolean_t waited = B_FALSE; - -@@ -3510,3 +3533,3 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -3517,3 +3540,3 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, - ZFS_EXIT(zsb); -- return (ENAMETOOLONG); -+ return (SET_ERROR(ENAMETOOLONG)); - } -@@ -3549,3 +3572,3 @@ top: - ZFS_EXIT(zsb); -- return (EDQUOT); -+ return (SET_ERROR(EDQUOT)); - } -@@ -3564,3 +3587,3 @@ top: - zfs_fuid_txhold(zsb, tx); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? 
TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -3568,2 +3591,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -3660,3 +3684,2 @@ zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) - ZFS_ACCESSTIME_STAMP(zsb, zp); -- zfs_inode_update(zp); - ZFS_EXIT(zsb); -@@ -3695,2 +3718,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - uid_t owner; -+ boolean_t waited = B_FALSE; - -@@ -3708,8 +3732,8 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } - -- if (sip->i_sb != tdip->i_sb) { -+ if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) { - ZFS_EXIT(zsb); -- return (EXDEV); -+ return (SET_ERROR(EXDEV)); - } -@@ -3728,3 +3752,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -3734,3 +3758,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EILSEQ); -+ return (SET_ERROR(EILSEQ)); - } -@@ -3749,3 +3773,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -3755,3 +3779,3 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr) - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -3778,3 +3802,3 @@ top: - zfs_sa_upgrade_txholds(tx, dzp); -- error = dmu_tx_assign(tx, TXG_NOWAIT); -+ error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); - if (error) { -@@ -3782,2 +3806,3 @@ top: - if (error == ERESTART) { -+ waited = B_TRUE; - dmu_tx_wait(tx); -@@ -3817,3 +3842,3 @@ EXPORT_SYMBOL(zfs_link); - static void --zfs_putpage_commit_cb(void *arg, int error) -+zfs_putpage_commit_cb(void *arg) - { -@@ -3821,11 +3846,3 @@ zfs_putpage_commit_cb(void *arg, int error) - -- if (error) { -- __set_page_dirty_nobuffers(pp); -- -- if (error != ECANCELED) -- SetPageError(pp); -- } else { -- ClearPageError(pp); -- } -- -+ ClearPageError(pp); - end_page_writeback(pp); -@@ -3863,3 +3880,2 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - int cnt = 0; -- int sync; - -@@ -3870,5 +3886,5 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- pgoff = page_offset(pp); /* Page byte-offset in file */ -- offset = i_size_read(ip); /* File length in bytes */ -- pglen = MIN(PAGE_CACHE_SIZE, /* Page length in bytes */ -+ pgoff = page_offset(pp); /* Page byte-offset in file */ -+ offset = i_size_read(ip); /* File length in bytes */ -+ pglen = MIN(PAGE_CACHE_SIZE, /* Page length in bytes */ - P2ROUNDUP(offset, PAGE_CACHE_SIZE)-pgoff); -@@ -3904,7 +3920,2 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- sync = ((zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) || -- (wbc->sync_mode == WB_SYNC_ALL)); -- if (!sync) -- dmu_tx_callback_register(tx, zfs_putpage_commit_cb, pp); -- - dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); -@@ -3918,12 +3929,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- /* Will call all registered commit callbacks */ - dmu_tx_abort(tx); -- -- /* -- * For the synchronous case the commit callback must be -- * explicitly called because there is no registered callback. 
-- */ -- if (sync) -- zfs_putpage_commit_cb(pp, ECANCELED); -- -+ __set_page_dirty_nobuffers(pp); -+ ClearPageError(pp); -+ end_page_writeback(pp); - zfs_range_unlock(rl); -@@ -3950,3 +3955,4 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0); -+ zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, -+ zfs_putpage_commit_cb, pp); - dmu_tx_commit(tx); -@@ -3955,5 +3961,9 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - -- if (sync) { -+ if (wbc->sync_mode != WB_SYNC_NONE) { -+ /* -+ * Note that this is rarely called under writepages(), because -+ * writepages() normally handles the entire commit for -+ * performance reasons. -+ */ - zil_commit(zsb->z_log, zp->z_id); -- zfs_putpage_commit_cb(pp, err); - } -@@ -3966,3 +3976,3 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) - * Update the system attributes when the inode has been dirtied. For the -- * moment we're conservative and only update the atime, mtime, and ctime. -+ * moment we only update the mode, atime, mtime, and ctime. - */ -@@ -3974,4 +3984,4 @@ zfs_dirty_inode(struct inode *ip, int flags) - dmu_tx_t *tx; -- uint64_t atime[2], mtime[2], ctime[2]; -- sa_bulk_attr_t bulk[3]; -+ uint64_t mode, atime[2], mtime[2], ctime[2]; -+ sa_bulk_attr_t bulk[4]; - int error; -@@ -3994,2 +4004,3 @@ zfs_dirty_inode(struct inode *ip, int flags) - mutex_enter(&zp->z_lock); -+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zsb), NULL, &mode, 8); - SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zsb), NULL, &atime, 16); -@@ -3998,3 +4009,3 @@ zfs_dirty_inode(struct inode *ip, int flags) - -- /* Preserve the mtime and ctime provided by the inode */ -+ /* Preserve the mode, mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_atime, atime); -@@ -4002,2 +4013,5 @@ zfs_dirty_inode(struct inode *ip, int flags) - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); -+ mode = ip->i_mode; -+ -+ zp->z_mode = mode; - zp->z_atime_dirty = 0; -@@ -4083,13 +4097,13 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) - { -- znode_t *zp = ITOZ(ip); -- zfs_sb_t *zsb = ITOZSB(ip); -- objset_t *os; -+ znode_t *zp = ITOZ(ip); -+ zfs_sb_t *zsb = ITOZSB(ip); -+ objset_t *os; - struct page *cur_pp; -- u_offset_t io_off, total; -- size_t io_len; -- loff_t i_size; -- unsigned page_idx; -- int err; -+ u_offset_t io_off, total; -+ size_t io_len; -+ loff_t i_size; -+ unsigned page_idx; -+ int err; - -- os = zsb->z_os; -+ os = zsb->z_os; - io_len = nr_pages << PAGE_CACHE_SHIFT; -@@ -4116,3 +4130,3 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) - if (err == ECKSUM) -- err = EIO; -+ err = SET_ERROR(EIO); - return (err); -@@ -4132,4 +4146,3 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. 
- * -@@ -4188,3 +4201,3 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, - ZFS_EXIT(zsb); -- return (EPERM); -+ return (SET_ERROR(EPERM)); - } -@@ -4194,3 +4207,3 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, - ZFS_EXIT(zsb); -- return (EACCES); -+ return (SET_ERROR(EACCES)); - } -@@ -4199,3 +4212,3 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, - ZFS_EXIT(zsb); -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - } -@@ -4232,3 +4245,3 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) - default: -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4236,3 +4249,3 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) - if (lckdat->l_start < 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -4248,3 +4261,3 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) - default: -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4268,4 +4281,3 @@ convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) - * -- * RETURN: 0 if success -- * error code if failure -+ * RETURN: 0 on success, error code on failure. - * -@@ -4289,3 +4301,3 @@ zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4299,3 +4311,3 @@ zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4350,3 +4362,3 @@ zfs_fid(struct inode *ip, fid_t *fidp) - ZFS_EXIT(zsb); -- return (ENOSPC); -+ return (SET_ERROR(ENOSPC)); - } -@@ -4454,3 +4466,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - if (xuio->xu_type != UIOTYPE_ZEROCOPY) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -4467,3 +4479,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4532,3 +4544,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4541,3 +4553,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -@@ -4546,3 +4558,3 @@ zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) - ZFS_EXIT(zsb); -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - } -diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c -index aaf17e1..2ab896f 100644 ---- a/module/zfs/zfs_znode.c -+++ b/module/zfs/zfs_znode.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
- */
-@@ -357,2 +357,3 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
- struct inode *ip;
-+ uint64_t mode;
- uint64_t parent;
-@@ -388,3 +389,3 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
-
-- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, &zp->z_mode, 8);
-+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zsb), NULL, &mode, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zsb), NULL, &zp->z_gen, 8);
-@@ -408,2 +409,4 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
-
-+ zp->z_mode = mode;
-+
- /*
-@@ -442,3 +445,3 @@ error:
- iput(ip);
-- return NULL;
-+ return (NULL);
- }
-@@ -649,3 +652,3 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- */
-- sa_attrs = kmem_alloc(sizeof(sa_bulk_attr_t) * ZPL_END, KM_PUSHPAGE);
-+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_PUSHPAGE);
-
-@@ -751,3 +754,3 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- }
-- kmem_free(sa_attrs, sizeof(sa_bulk_attr_t) * ZPL_END);
-+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
- ZFS_OBJ_HOLD_EXIT(zsb, obj);
-@@ -756,5 +759,4 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
- /*
-- * zfs_xvattr_set only updates the in-core attributes
-- * it is assumed the caller will be doing an sa_bulk_update
-- * to push the changes out
-+ * Update in-core attributes. It is assumed the caller will be doing an
-+ * sa_bulk_update to push the changes out.
- */
-@@ -859,3 +861,2 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp)
- sa_handle_t *hdl;
-- struct inode *ip;
-
-@@ -864,4 +865,2 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp)
- again:
-- ip = ilookup(zsb->z_sb, obj_num);
--
- ZFS_OBJ_HOLD_ENTER(zsb, obj_num);
-@@ -871,3 +870,2 @@ again:
- ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-- iput(ip);
- return (err);
-@@ -882,4 +880,3 @@ again:
- ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-- iput(ip);
-- return (EINVAL);
-+ return (SET_ERROR(EINVAL));
- }
-@@ -888,18 +885,5 @@ again:
- if (hdl != NULL) {
-- if (ip == NULL) {
-- /*
-- * ilookup returned NULL, which means
-- * the znode is dying - but the SA handle isn't
-- * quite dead yet, we need to drop any locks
-- * we're holding, re-schedule the task and try again.
-- */
-- sa_buf_rele(db, NULL);
-- ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
--
-- schedule();
-- goto again;
-- }
--
- zp = sa_get_userdata(hdl);
-
-+
- /*
-@@ -915,5 +899,24 @@ again:
- if (zp->z_unlinked) {
-- err = ENOENT;
-+ err = SET_ERROR(ENOENT);
- } else {
-- igrab(ZTOI(zp));
-+ /*
-+ * If igrab() returns NULL the VFS has independently
-+ * determined the inode should be evicted and has
-+ * called iput_final() to start the eviction process.
-+ * The SA handle is still valid but because the VFS
-+ * requires that the eviction succeed we must drop
-+ * our locks and references to allow the eviction to
-+ * complete. The zfs_zget() may then be retried.
-+ *
-+ * This unlikely case could be optimized by registering
-+ * a sops->drop_inode() callback. The callback would
-+ * need to detect the active SA hold thereby informing
-+ * the VFS that this inode should not be evicted.
-+ */
-+ if (igrab(ZTOI(zp)) == NULL) {
-+ mutex_exit(&zp->z_lock);
-+ sa_buf_rele(db, NULL);
-+ ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-+ goto again;
-+ }
- *zpp = zp;
-@@ -921,6 +924,5 @@ again:
- }
-- sa_buf_rele(db, NULL);
- mutex_exit(&zp->z_lock);
-+ sa_buf_rele(db, NULL);
- ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-- iput(ip);
- return (err);
-@@ -928,4 +930,2 @@ again:
-
-- ASSERT3P(ip, ==, NULL);
--
- /*
-@@ -943,3 +943,3 @@ again:
- if (zp == NULL) {
-- err = ENOENT;
-+ err = SET_ERROR(ENOENT);
- } else {
-@@ -999,3 +999,3 @@ zfs_rezget(znode_t *zp)
- ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-- return (EINVAL);
-+ return (SET_ERROR(EINVAL));
- }
-@@ -1025,3 +1025,3 @@ zfs_rezget(znode_t *zp)
- ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-- return (EIO);
-+ return (SET_ERROR(EIO));
- }
-@@ -1033,3 +1033,3 @@ zfs_rezget(znode_t *zp)
- ZFS_OBJ_HOLD_EXIT(zsb, obj_num);
-- return (EIO);
-+ return (SET_ERROR(EIO));
- }
-@@ -1109,2 +1109,59 @@ zfs_zinactive(znode_t *zp)
-
-+static inline int
-+zfs_compare_timespec(struct timespec *t1, struct timespec *t2)
-+{
-+ if (t1->tv_sec < t2->tv_sec)
-+ return (-1);
-+
-+ if (t1->tv_sec > t2->tv_sec)
-+ return (1);
-+
-+ return (t1->tv_nsec - t2->tv_nsec);
-+}
-+
-+/*
-+ * Determine whether the znode's atime must be updated. The logic mostly
-+ * duplicates the Linux kernel's relatime_need_update() functionality.
-+ * This function is only called if the underlying filesystem actually has
-+ * atime updates enabled.
-+ */
-+static inline boolean_t
-+zfs_atime_need_update(znode_t *zp, timestruc_t *now)
-+{
-+ if (!ZTOZSB(zp)->z_relatime)
-+ return (B_TRUE);
-+
-+ /*
-+ * In relatime mode, only update the atime if the previous atime
-+ * is earlier than either the ctime or mtime or if at least a day
-+ * has passed since the last update of atime.
-+ */
-+ if (zfs_compare_timespec(&ZTOI(zp)->i_mtime, &ZTOI(zp)->i_atime) >= 0)
-+ return (B_TRUE);
-+
-+ if (zfs_compare_timespec(&ZTOI(zp)->i_ctime, &ZTOI(zp)->i_atime) >= 0)
-+ return (B_TRUE);
-+
-+ if ((long)now->tv_sec - ZTOI(zp)->i_atime.tv_sec >= 24*60*60)
-+ return (B_TRUE);
-+
-+ return (B_FALSE);
-+}
-+
-+/*
-+ * Prepare to update znode time stamps.
-+ *
-+ * IN: zp - znode requiring timestamp update
-+ * flag - ATTR_MTIME, ATTR_CTIME, ATTR_ATIME flags
-+ * have_tx - true of caller is creating a new txg
-+ *
-+ * OUT: zp - new atime (via underlying inode's i_atime)
-+ * mtime - new mtime
-+ * ctime - new ctime
-+ *
-+ * NOTE: The arguments are somewhat redundant. The following condition
-+ * is always true:
-+ *
-+ * have_tx == !(flag & ATTR_ATIME)
-+ */
- void
-@@ -1115,13 +1172,22 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
-
-+ ASSERT(have_tx == !(flag & ATTR_ATIME));
- gethrestime(&now);
-
-- if (have_tx) { /* will sa_bulk_update happen really soon? */
-+ /*
-+ * NOTE: The following test intentionally does not update z_atime_dirty
-+ * in the case where an ATIME update has been requested but for which
-+ * the update is omitted due to relatime logic. The rationale being
-+ * that if the flag was set somewhere else, we should leave it alone
-+ * here.
-+ */
-+ if (flag & ATTR_ATIME) {
-+ if (zfs_atime_need_update(zp, &now)) {
-+ ZFS_TIME_ENCODE(&now, zp->z_atime);
-+ ZTOI(zp)->i_atime.tv_sec = zp->z_atime[0];
-+ ZTOI(zp)->i_atime.tv_nsec = zp->z_atime[1];
-+ zp->z_atime_dirty = 1;
-+ }
-+ } else {
- zp->z_atime_dirty = 0;
- zp->z_seq++;
-- } else {
-- zp->z_atime_dirty = 1;
-- }
--
-- if (flag & ATTR_ATIME) {
-- ZFS_TIME_ENCODE(&now, zp->z_atime);
- }
-@@ -1185,4 +1251,3 @@ zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
- *
-- * RETURN: 0 if success
-- * error code if failure
-+ * RETURN: 0 on success, error code on failure
- */
-@@ -1209,3 +1274,2 @@ zfs_extend(znode_t *zp, uint64_t end)
- }
--top:
- tx = dmu_tx_create(zsb->z_os);
-@@ -1229,9 +1293,4 @@ top:
-
-- error = dmu_tx_assign(tx, TXG_NOWAIT);
-+ error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
-- if (error == ERESTART) {
-- dmu_tx_wait(tx);
-- dmu_tx_abort(tx);
-- goto top;
-- }
- dmu_tx_abort(tx);
-@@ -1263,4 +1322,3 @@ top:
- *
-- * RETURN: 0 if success
-- * error code if failure
-+ * RETURN: 0 on success, error code on failure
- */
-@@ -1302,4 +1360,3 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
- *
-- * RETURN: 0 if success
-- * error code if failure
-+ * RETURN: 0 on success, error code on failure
- */
-@@ -1376,4 +1433,3 @@ top:
- *
-- * RETURN: 0 if success
-- * error code if failure
-+ * RETURN: 0 on success, error code on failure
- */
-@@ -1410,3 +1466,3 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
- if (!lock_may_write(ip, off, length))
-- return (EAGAIN);
-+ return (SET_ERROR(EAGAIN));
- }
-@@ -1426,9 +1482,4 @@ log:
- zfs_sa_upgrade_txholds(tx, zp);
-- error = dmu_tx_assign(tx, TXG_NOWAIT);
-+ error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
-- if (error == ERESTART) {
-- dmu_tx_wait(tx);
-- dmu_tx_abort(tx);
-- goto log;
-- }
- dmu_tx_abort(tx);
-@@ -1631,3 +1682,3 @@ zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
- sa_buf_rele(*db, tag);
-- return (ENOTSUP);
-+ return (SET_ERROR(ENOTSUP));
- }
-@@ -1718,6 +1769,6 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
- for (;;) {
-- uint64_t pobj;
-+ uint64_t pobj = 0;
- char component[MAXNAMELEN + 2];
- size_t complen;
-- int is_xattrdir;
-+ int is_xattrdir = 0;
-
-diff --git a/module/zfs/zil.c b/module/zfs/zil.c
-index c179693..b69a7bf 100644
---- a/module/zfs/zil.c
-+++ b/module/zfs/zil.c
-@@ -22,3 +22,3 @@
- * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
-- * Copyright (c) 2012 by Delphix. All rights reserved.
-+ * Copyright (c) 2013 by Delphix. All rights reserved.
- */ -@@ -72,15 +72,15 @@ - zil_stats_t zil_stats = { -- { "zil_commit_count", KSTAT_DATA_UINT64 }, -- { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, -- { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, -- { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, -- { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, -- { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, -- { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_commit_count", KSTAT_DATA_UINT64 }, -+ { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, -+ { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, -+ { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, - }; -@@ -90,5 +90,5 @@ static kstat_t *zil_ksp; - /* -- * This global ZIL switch affects all pools -+ * Disable intent logging replay. This global ZIL switch affects all pools. - */ --int zil_replay_disable = 0; /* disable intent logging replay */ -+int zil_replay_disable = 0; - -@@ -166,3 +166,3 @@ zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) - if (avl_find(t, dva, &where) != NULL) -- return (EEXIST); -+ return (SET_ERROR(EEXIST)); - -@@ -237,3 +237,3 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { -- error = ECKSUM; -+ error = SET_ERROR(ECKSUM); - } else { -@@ -251,3 +251,3 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, - (zilc->zc_nused > (size - sizeof (*zilc)))) { -- error = ECKSUM; -+ error = SET_ERROR(ECKSUM); - } else { -@@ -259,3 +259,3 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, - -- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); -+ VERIFY(arc_buf_remove_ref(abuf, &abuf)); - } -@@ -321,3 +321,3 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - -- bzero(&next_blk, sizeof(blkptr_t)); -+ bzero(&next_blk, sizeof (blkptr_t)); - -@@ -358,3 +358,3 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); -- if (error) -+ if (error != 0) - break; -@@ -494,3 +494,3 @@ zilog_dirty(zilog_t *zilog, uint64_t txg) - -- if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg) == 0) { -+ if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { - /* up the hold count until we can be written out */ -@@ -660,4 +660,4 @@ zil_claim(const char *osname, void *txarg) - -- error = dmu_objset_hold(osname, FTAG, &os); -- if (error) { -+ error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os); -+ if (error != 0) { - cmn_err(CE_WARN, "can't open objset for %s", osname); -@@ -674,3 +674,3 @@ zil_claim(const char *osname, void *txarg) - dsl_dataset_dirty(dmu_objset_ds(os), tx); -- dmu_objset_rele(os, FTAG); -+ dmu_objset_disown(os, FTAG); - return (0); -@@ 
-699,3 +699,3 @@ zil_claim(const char *osname, void *txarg) - ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); -- dmu_objset_rele(os, FTAG); -+ dmu_objset_disown(os, FTAG); - return (0); -@@ -719,3 +719,3 @@ zil_check_log_chain(const char *osname, void *tx) - error = dmu_objset_hold(osname, FTAG, &os); -- if (error) { -+ if (error != 0) { - cmn_err(CE_WARN, "can't open objset for %s", osname); -@@ -915,3 +915,3 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) - 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), -- zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, -+ zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | -@@ -924,2 +924,3 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) - * Define a limited set of intent log block sizes. -+ * - * These must be a multiple of 4KB. Note only the amount used (again -@@ -1016,10 +1017,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) - use_slog = USE_SLOG(zilog); -- error = zio_alloc_zil(spa, txg, bp, zil_blksz, USE_SLOG(zilog)); -- if (use_slog) -- { -+ error = zio_alloc_zil(spa, txg, bp, zil_blksz, -+ USE_SLOG(zilog)); -+ if (use_slog) { - ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); - ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused); -- } -- else -- { -+ } else { - ZIL_STAT_BUMP(zil_itx_metaslab_normal_count); -@@ -1027,3 +1026,3 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) - } -- if (!error) { -+ if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); -@@ -1134,3 +1133,4 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) - ZIL_STAT_BUMP(zil_itx_needcopy_count); -- ZIL_STAT_INCR(zil_itx_needcopy_bytes, lrw->lr_length); -+ ZIL_STAT_INCR(zil_itx_needcopy_bytes, -+ lrw->lr_length); - } else { -@@ -1139,3 +1139,4 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) - ZIL_STAT_BUMP(zil_itx_indirect_count); -- ZIL_STAT_INCR(zil_itx_indirect_bytes, lrw->lr_length); -+ ZIL_STAT_INCR(zil_itx_indirect_bytes, -+ lrw->lr_length); - } -@@ -1147,3 +1148,3 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) - } -- if (error) { -+ if (error != 0) { - ASSERT(error == ENOENT || error == EEXIST || -@@ -1184,2 +1185,4 @@ zil_itx_create(uint64_t txtype, size_t lrsize) - itx->itx_sync = B_TRUE; /* default is synchronous */ -+ itx->itx_callback = NULL; -+ itx->itx_callback_data = NULL; - -@@ -1209,2 +1212,4 @@ zil_itxg_clean(itxs_t *itxs) - while ((itx = list_head(list)) != NULL) { -+ if (itx->itx_callback != NULL) -+ itx->itx_callback(itx->itx_callback_data); - list_remove(list, itx); -@@ -1219,2 +1224,4 @@ zil_itxg_clean(itxs_t *itxs) - while ((itx = list_head(list)) != NULL) { -+ if (itx->itx_callback != NULL) -+ itx->itx_callback(itx->itx_callback_data); - list_remove(list, itx); -@@ -1285,2 +1292,4 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) - while ((itx = list_head(&clean_list)) != NULL) { -+ if (itx->itx_callback != NULL) -+ itx->itx_callback(itx->itx_callback_data); - list_remove(&clean_list, itx); -@@ -1336,3 +1345,4 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) - itxg->itxg_txg = txg; -- itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_PUSHPAGE); -+ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), -+ KM_PUSHPAGE); - -@@ -1356,3 +1366,4 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) - if (ian == NULL) { -- ian = kmem_alloc(sizeof (itx_async_node_t), KM_PUSHPAGE); -+ ian = kmem_alloc(sizeof (itx_async_node_t), -+ KM_PUSHPAGE); - list_create(&ian->ia_list, sizeof (itx_t), -@@ -1530,3 +1541,4 @@ 
zil_commit_writer(zilog_t *zilog) - DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); -- while ((itx = list_head(&zilog->zl_itx_commit_list))) { -+ for (itx = list_head(&zilog->zl_itx_commit_list); itx != NULL; -+ itx = list_next(&zilog->zl_itx_commit_list, itx)) { - txg = itx->itx_lr.lrc_txg; -@@ -1536,5 +1548,2 @@ zil_commit_writer(zilog_t *zilog) - lwb = zil_lwb_commit(zilog, itx, lwb); -- list_remove(&zilog->zl_itx_commit_list, itx); -- kmem_free(itx, offsetof(itx_t, itx_lr) -- + itx->itx_lr.lrc_reclen); - } -@@ -1560,2 +1569,13 @@ zil_commit_writer(zilog_t *zilog) - -+ while ((itx = list_head(&zilog->zl_itx_commit_list))) { -+ txg = itx->itx_lr.lrc_txg; -+ ASSERT(txg); -+ -+ if (itx->itx_callback != NULL) -+ itx->itx_callback(itx->itx_callback_data); -+ list_remove(&zilog->zl_itx_commit_list, itx); -+ kmem_free(itx, offsetof(itx_t, itx_lr) -+ + itx->itx_lr.lrc_reclen); -+ } -+ - mutex_enter(&zilog->zl_lock); -@@ -1727,3 +1747,3 @@ zil_init(void) - zil_ksp = kstat_create("zfs", 0, "zil", "misc", -- KSTAT_TYPE_NAMED, sizeof(zil_stats) / sizeof(kstat_named_t), -+ KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); -@@ -1809,2 +1829,5 @@ zil_free(zilog_t *zilog) - -+ ASSERT0(zilog->zl_suspend); -+ ASSERT0(zilog->zl_suspending); -+ - ASSERT(list_is_empty(&zilog->zl_lwb_list)); -@@ -1907,2 +1930,4 @@ zil_close(zilog_t *zilog) - -+static char *suspend_tag = "zil suspending"; -+ - /* -@@ -1910,20 +1935,67 @@ zil_close(zilog_t *zilog) - * synchronous semantics, but we rely on txg_wait_synced() to do it. -- * We suspend the log briefly when taking a snapshot so that the snapshot -- * contains all the data it's supposed to, and has an empty intent log. -+ * On old version pools, we suspend the log briefly when taking a -+ * snapshot so that it will have an empty intent log. -+ * -+ * Long holds are not really intended to be used the way we do here -- -+ * held for such a short time. A concurrent caller of dsl_dataset_long_held() -+ * could fail. Therefore we take pains to only put a long hold if it is -+ * actually necessary. Fortunately, it will only be necessary if the -+ * objset is currently mounted (or the ZVOL equivalent). In that case it -+ * will already have a long hold, so we are not really making things any worse. -+ * -+ * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or -+ * zvol_state_t), and use their mechanism to prevent their hold from being -+ * dropped (e.g. VFS_HOLD()). However, that would be even more pain for -+ * very little gain. -+ * -+ * if cookiep == NULL, this does both the suspend & resume. -+ * Otherwise, it returns with the dataset "long held", and the cookie -+ * should be passed into zil_resume(). - */ - int --zil_suspend(zilog_t *zilog) -+zil_suspend(const char *osname, void **cookiep) - { -- const zil_header_t *zh = zilog->zl_header; -+ objset_t *os; -+ zilog_t *zilog; -+ const zil_header_t *zh; -+ int error; -+ -+ error = dmu_objset_hold(osname, suspend_tag, &os); -+ if (error != 0) -+ return (error); -+ zilog = dmu_objset_zil(os); - - mutex_enter(&zilog->zl_lock); -+ zh = zilog->zl_header; -+ - if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ - mutex_exit(&zilog->zl_lock); -- return (EBUSY); -+ dmu_objset_rele(os, suspend_tag); -+ return (SET_ERROR(EBUSY)); - } -- if (zilog->zl_suspend++ != 0) { -+ -+ /* -+ * Don't put a long hold in the cases where we can avoid it. This -+ * is when there is no cookie so we are doing a suspend & resume -+ * (i.e. 
called from zil_vdev_offline()), and there's nothing to do -+ * for the suspend because it's already suspended, or there's no ZIL. -+ */ -+ if (cookiep == NULL && !zilog->zl_suspending && -+ (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { -+ mutex_exit(&zilog->zl_lock); -+ dmu_objset_rele(os, suspend_tag); -+ return (0); -+ } -+ -+ dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); -+ dsl_pool_rele(dmu_objset_pool(os), suspend_tag); -+ -+ zilog->zl_suspend++; -+ -+ if (zilog->zl_suspend > 1) { - /* -- * Someone else already began a suspend. -+ * Someone else is already suspending it. - * Just wait for them to finish. - */ -+ - while (zilog->zl_suspending) -@@ -1931,4 +2003,23 @@ zil_suspend(zilog_t *zilog) - mutex_exit(&zilog->zl_lock); -+ -+ if (cookiep == NULL) -+ zil_resume(os); -+ else -+ *cookiep = os; -+ return (0); -+ } -+ -+ /* -+ * If there is no pointer to an on-disk block, this ZIL must not -+ * be active (e.g. filesystem not mounted), so there's nothing -+ * to clean up. -+ */ -+ if (BP_IS_HOLE(&zh->zh_log)) { -+ ASSERT(cookiep != NULL); /* fast path already handled */ -+ -+ *cookiep = os; -+ mutex_exit(&zilog->zl_lock); - return (0); - } -+ - zilog->zl_suspending = B_TRUE; -@@ -1945,2 +2036,6 @@ zil_suspend(zilog_t *zilog) - -+ if (cookiep == NULL) -+ zil_resume(os); -+ else -+ *cookiep = os; - return (0); -@@ -1949,4 +2044,7 @@ zil_suspend(zilog_t *zilog) - void --zil_resume(zilog_t *zilog) -+zil_resume(void *cookie) - { -+ objset_t *os = cookie; -+ zilog_t *zilog = dmu_objset_zil(os); -+ - mutex_enter(&zilog->zl_lock); -@@ -1955,2 +2053,4 @@ zil_resume(zilog_t *zilog) - mutex_exit(&zilog->zl_lock); -+ dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); -+ dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); - } -@@ -2027,3 +2127,3 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) - zr->zr_lr + reclen); -- if (error) -+ if (error != 0) - return (zil_replay_error(zilog, lr, error)); -@@ -2048,3 +2148,3 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); -- if (error) { -+ if (error != 0) { - /* -@@ -2058,3 +2158,3 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); -- if (error) -+ if (error != 0) - return (zil_replay_error(zilog, lr, error)); -@@ -2130,17 +2230,8 @@ zil_vdev_offline(const char *osname, void *arg) - { -- objset_t *os; -- zilog_t *zilog; - int error; - -- error = dmu_objset_hold(osname, FTAG, &os); -- if (error) -- return (error); -- -- zilog = dmu_objset_zil(os); -- if (zil_suspend(zilog) != 0) -- error = EEXIST; -- else -- zil_resume(zilog); -- dmu_objset_rele(os, FTAG); -- return (error); -+ error = zil_suspend(osname, NULL); -+ if (error != 0) -+ return (SET_ERROR(EEXIST)); -+ return (0); - } -diff --git a/module/zfs/zio.c b/module/zfs/zio.c -index ccefaf8..97f2549 100644 ---- a/module/zfs/zio.c -+++ b/module/zfs/zio.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
-@@ -41,22 +41,2 @@ - * ========================================================================== -- * I/O priority table -- * ========================================================================== -- */ --uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { -- 0, /* ZIO_PRIORITY_NOW */ -- 0, /* ZIO_PRIORITY_SYNC_READ */ -- 0, /* ZIO_PRIORITY_SYNC_WRITE */ -- 0, /* ZIO_PRIORITY_LOG_WRITE */ -- 1, /* ZIO_PRIORITY_CACHE_FILL */ -- 1, /* ZIO_PRIORITY_AGG */ -- 4, /* ZIO_PRIORITY_FREE */ -- 4, /* ZIO_PRIORITY_ASYNC_WRITE */ -- 6, /* ZIO_PRIORITY_ASYNC_READ */ -- 10, /* ZIO_PRIORITY_RESILVER */ -- 20, /* ZIO_PRIORITY_SCRUB */ -- 2, /* ZIO_PRIORITY_DDT_PREFETCH */ --}; -- --/* -- * ========================================================================== - * I/O type descriptions -@@ -64,3 +44,3 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { - */ --char *zio_type_name[ZIO_TYPES] = { -+const char *zio_type_name[ZIO_TYPES] = { - "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl" -@@ -154,3 +134,3 @@ zio_init(void) - sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_KMEM); -- zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof(vdev_io_t), -+ zio_vdev_cache = kmem_cache_create("zio_vdev_cache", sizeof (vdev_io_t), - PAGESIZE, NULL, NULL, NULL, NULL, NULL, KMC_VMEM); -@@ -171,7 +151,17 @@ zio_init(void) - -+#ifndef _KERNEL -+ /* -+ * If we are using watchpoints, put each buffer on its own page, -+ * to eliminate the performance overhead of trapping to the -+ * kernel when modifying a non-watched buffer that shares the -+ * page with a watched buffer. -+ */ -+ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) -+ continue; -+#endif - if (size <= 4 * SPA_MINBLOCKSIZE) { - align = SPA_MINBLOCKSIZE; -- } else if (P2PHASE(size, PAGESIZE) == 0) { -+ } else if (IS_P2ALIGNED(size, PAGESIZE)) { - align = PAGESIZE; -- } else if (P2PHASE(size, p2 >> 2) == 0) { -+ } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = p2 >> 2; -@@ -219,3 +209,4 @@ zio_init(void) - */ -- zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); -+ if (zfs_mg_alloc_failures == 0) -+ zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); - -@@ -397,3 +388,3 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) - zio->io_data, data, zio->io_size, size) != 0) -- zio->io_error = EIO; -+ zio->io_error = SET_ERROR(EIO); - } -@@ -540,3 +531,6 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) - ASSERT3U(*countp, >, 0); -- if (--*countp == 0 && pio->io_stall == countp) { -+ -+ (*countp)--; -+ -+ if (*countp == 0 && pio->io_stall == countp) { - pio->io_stall = NULL; -@@ -564,3 +558,3 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, -- zio_type_t type, int priority, enum zio_flag flags, -+ zio_type_t type, zio_priority_t priority, enum zio_flag flags, - vdev_t *vd, uint64_t offset, const zbookmark_t *zb, -@@ -611,2 +605,3 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio->io_ready = NULL; -+ zio->io_physdone = NULL; - zio->io_done = done; -@@ -620,3 +615,2 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio->io_offset = offset; -- zio->io_deadline = 0; - zio->io_timestamp = 0; -@@ -637,2 +631,3 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio->io_child_count = 0; -+ zio->io_phys_children = 0; - zio->io_parent_count = 0; -@@ -697,3 +692,3 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - void *data, uint64_t size, 
zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, const zbookmark_t *zb) -+ zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb) - { -@@ -713,4 +708,5 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, -- zio_done_func_t *ready, zio_done_func_t *done, void *private, -- int priority, enum zio_flag flags, const zbookmark_t *zb) -+ zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, -+ void *private, -+ zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb) - { -@@ -725,5 +721,3 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zp->zp_copies > 0 && -- zp->zp_copies <= spa_max_replication(spa) && -- zp->zp_dedup <= 1 && -- zp->zp_dedup_verify <= 1); -+ zp->zp_copies <= spa_max_replication(spa)); - -@@ -735,2 +729,3 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio->io_ready = ready; -+ zio->io_physdone = physdone; - zio->io_prop = *zp; -@@ -742,4 +737,4 @@ zio_t * - zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, -- uint64_t size, zio_done_func_t *done, void *private, int priority, -- enum zio_flag flags, zbookmark_t *zb) -+ uint64_t size, zio_done_func_t *done, void *private, -+ zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb) - { -@@ -755,3 +750,3 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, - void --zio_write_override(zio_t *zio, blkptr_t *bp, int copies) -+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) - { -@@ -762,2 +757,9 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies) - -+ /* -+ * We must reset the io_prop to match the values that existed -+ * when the bp was first written by dmu_sync() keeping in mind -+ * that nopwrite and dedup are mutually exclusive. -+ */ -+ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; -+ zio->io_prop.zp_nopwrite = nopwrite; - zio->io_prop.zp_copies = copies; -@@ -769,3 +771,17 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) - { -- bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); -+ metaslab_check_free(spa, bp); -+ -+ /* -+ * Frees that are for the currently-syncing txg, are not going to be -+ * deferred, and which will not need to do a read (i.e. not GANG or -+ * DEDUP), can be processed immediately. Otherwise, put them on the -+ * in-memory list for later processing. -+ */ -+ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || -+ txg != spa->spa_syncing_txg || -+ spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { -+ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); -+ } else { -+ VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0))); -+ } - } -@@ -777,2 +793,3 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio_t *zio; -+ enum zio_stage stage = ZIO_FREE_PIPELINE; - -@@ -785,7 +802,16 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - -+ metaslab_check_free(spa, bp); - arc_freed(spa, bp); - -+ /* -+ * GANG and DEDUP blocks can induce a read (for the gang block header, -+ * or the DDT), so issue them asynchronously so that this thread is -+ * not tied up. 
-+ */
-+ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
-+ stage |= ZIO_STAGE_ISSUE_ASYNC;
-+
- zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
-- NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
-- NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
-+ NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
-+ NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
-
-@@ -825,3 +851,3 @@ zio_t *
- zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-- zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
-+ zio_done_func_t *done, void *private, enum zio_flag flags)
- {
-@@ -832,3 +858,3 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
-- ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
-+ ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
- ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-@@ -841,3 +867,3 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
-- done, private, priority, flags));
-+ done, private, flags));
- }
-@@ -850,3 +876,3 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
-- int priority, enum zio_flag flags, boolean_t labels)
-+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
- {
-@@ -871,3 +897,3 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
-- int priority, enum zio_flag flags, boolean_t labels)
-+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
- {
-@@ -906,4 +932,4 @@ zio_t *
- zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-- void *data, uint64_t size, int type, int priority, enum zio_flag flags,
-- zio_done_func_t *done, void *private)
-+ void *data, uint64_t size, int type, zio_priority_t priority,
-+ enum zio_flag flags, zio_done_func_t *done, void *private)
- {
-@@ -942,2 +968,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-
-+ zio->io_physdone = pio->io_physdone;
-+ if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
-+ zio->io_logical->io_phys_children++;
-+
- return (zio);
-@@ -947,3 +977,3 @@ zio_t *
- zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
-- int type, int priority, enum zio_flag flags,
-+ int type, zio_priority_t priority, enum zio_flag flags,
- zio_done_func_t *done, void *private)
-@@ -956,3 +986,3 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
- data, size, done, private, type, priority,
-- flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
-+ flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
- vd, offset, NULL,
-@@ -967,3 +997,3 @@ zio_flush(zio_t *zio, vdev_t *vd)
- zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
-- NULL, NULL, ZIO_PRIORITY_NOW,
-+ NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
-@@ -1051,2 +1081,15 @@ zio_write_bp_init(zio_t *zio)
-
-+ /*
-+ * If we've been overridden and nopwrite is set then
-+ * set the flag accordingly to indicate that a nopwrite
-+ * has already occurred.
-+ */
-+ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
-+ ASSERT(!zp->zp_dedup);
-+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
-+ return (ZIO_PIPELINE_CONTINUE);
-+ }
-+
-+ ASSERT(!zp->zp_nopwrite);
-+
- if (BP_IS_HOLE(bp) || !zp->zp_dedup)
-@@ -1138,2 +1181,7 @@ zio_write_bp_init(zio_t *zio)
- }
-+ if (zp->zp_nopwrite) {
-+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
-+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
-+ zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
-+ }
- }
-@@ -1404,2 +1452,3 @@ zio_reexecute(zio_t *pio)
- pio->io_reexecute = 0;
-+ pio->io_flags |= ZIO_FLAG_REEXECUTED;
- pio->io_error = 0;
-@@ -1805,3 +1854,2 @@ zio_write_gang_member_ready(zio_t *zio)
- zio_t *pio = zio_unique_parent(zio);
-- ASSERTV(zio_t *gio = zio->io_gang_leader;)
- dva_t *cdva = zio->io_bp->blk_dva;
-@@ -1810,2 +1858,3 @@ zio_write_gang_member_ready(zio_t *zio)
- int d;
-+ ASSERTV(zio_t *gio = zio->io_gang_leader);
-
-@@ -1887,4 +1936,5 @@ zio_write_gang_block(zio_t *pio)
- zp.zp_copies = gio->io_prop.zp_copies;
-- zp.zp_dedup = 0;
-- zp.zp_dedup_verify = 0;
-+ zp.zp_dedup = B_FALSE;
-+ zp.zp_dedup_verify = B_FALSE;
-+ zp.zp_nopwrite = B_FALSE;
-
-@@ -1892,3 +1942,3 @@ zio_write_gang_block(zio_t *pio)
- (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
-- zio_write_gang_member_ready, NULL, &gn->gn_child[g],
-+ zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
- pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
-@@ -1913,2 +1963,58 @@ zio_write_gang_block(zio_t *pio)
- /*
-+ * The zio_nop_write stage in the pipeline determines if allocating
-+ * a new bp is necessary. By leveraging a cryptographically secure checksum,
-+ * such as SHA256, we can compare the checksums of the new data and the old
-+ * to determine if allocating a new block is required. The nopwrite
-+ * feature can handle writes in either syncing or open context (i.e. zil
-+ * writes) and as a result is mutually exclusive with dedup.
-+ */
-+static int
-+zio_nop_write(zio_t *zio)
-+{
-+ blkptr_t *bp = zio->io_bp;
-+ blkptr_t *bp_orig = &zio->io_bp_orig;
-+ zio_prop_t *zp = &zio->io_prop;
-+
-+ ASSERT(BP_GET_LEVEL(bp) == 0);
-+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
-+ ASSERT(zp->zp_nopwrite);
-+ ASSERT(!zp->zp_dedup);
-+ ASSERT(zio->io_bp_override == NULL);
-+ ASSERT(IO_IS_ALLOCATING(zio));
-+
-+ /*
-+ * Check to see if the original bp and the new bp have matching
-+ * characteristics (i.e. same checksum, compression algorithms, etc).
-+ * If they don't then just continue with the pipeline which will
-+ * allocate a new bp.
-+ */
-+ if (BP_IS_HOLE(bp_orig) ||
-+ !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
-+ BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
-+ BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
-+ BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
-+ zp->zp_copies != BP_GET_NDVAS(bp_orig))
-+ return (ZIO_PIPELINE_CONTINUE);
-+
-+ /*
-+ * If the checksums match then reset the pipeline so that we
-+ * avoid allocating a new bp and issuing any I/O.
-+ */ -+ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { -+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); -+ ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); -+ ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); -+ ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); -+ ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, -+ sizeof (uint64_t)) == 0); -+ -+ *bp = *bp_orig; -+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; -+ zio->io_flags |= ZIO_FLAG_NOPWRITE; -+ } -+ -+ return (ZIO_PIPELINE_CONTINUE); -+} -+ -+/* - * ========================================================================== -@@ -2061,4 +2167,4 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) - zio->io_orig_size) != 0) -- error = EEXIST; -- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); -+ error = SET_ERROR(EEXIST); -+ VERIFY(arc_buf_remove_ref(abuf, &abuf)); - } -@@ -2186,3 +2292,3 @@ zio_ddt_write(zio_t *zio) - } else { -- zp->zp_dedup = 0; -+ zp->zp_dedup = B_FALSE; - } -@@ -2220,3 +2326,3 @@ zio_ddt_write(zio_t *zio) - dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, -- zio->io_orig_size, &czp, NULL, -+ zio->io_orig_size, &czp, NULL, NULL, - zio_ddt_ditto_write_done, dde, zio->io_priority, -@@ -2242,3 +2348,3 @@ zio_ddt_write(zio_t *zio) - cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, -- zio->io_orig_size, zp, zio_ddt_child_write_ready, -+ zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, - zio_ddt_child_write_done, dde, zio->io_priority, -@@ -2404,3 +2510,3 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, - new_bp, 1, txg, NULL, -- METASLAB_FASTWRITE | METASLAB_GANG_AVOID); -+ METASLAB_FASTWRITE); - } -@@ -2529,3 +2635,3 @@ zio_vdev_io_start(zio_t *zio) - if (!vdev_accessible(vd, zio)) { -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); -@@ -2566,3 +2672,3 @@ zio_vdev_io_done(zio_t *zio) - if (!vdev_accessible(vd, zio)) { -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - } else { -@@ -2651,3 +2757,3 @@ zio_vdev_io_assess(zio_t *zio) - !vdev_accessible(vd, zio)) -- zio->io_error = ENXIO; -+ zio->io_error = SET_ERROR(ENXIO); - -@@ -2658,4 +2764,5 @@ zio_vdev_io_assess(zio_t *zio) - if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && -- vd != NULL && !vd->vdev_ops->vdev_op_leaf) -+ vd != NULL && !vd->vdev_ops->vdev_op_leaf) { - vd->vdev_cant_write = B_TRUE; -+ } - -@@ -2664,2 +2771,9 @@ zio_vdev_io_assess(zio_t *zio) - -+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf && -+ zio->io_physdone != NULL) { -+ ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); -+ ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); -+ zio->io_physdone(zio->io_logical); -+ } -+ - return (ZIO_PIPELINE_CONTINUE); -@@ -2814,3 +2928,4 @@ zio_ready(zio_t *zio) - ASSERT(IO_IS_ALLOCATING(zio)); -- ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); -+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || -+ (zio->io_flags & ZIO_FLAG_NOPWRITE)); - ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); -@@ -2882,3 +2997,4 @@ zio_done(zio_t *zio) - ASSERT(zio->io_bp->blk_pad[1] == 0); -- ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || -+ ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, -+ sizeof (blkptr_t)) == 0 || - (zio->io_bp == zio_unique_parent(zio)->io_bp)); -@@ -2888,6 +3004,10 @@ zio_done(zio_t *zio) - ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp)); -- ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); -+ ASSERT3U(zio->io_prop.zp_copies, <=, -+ BP_GET_NDVAS(zio->io_bp)); - 
ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || -- (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); -+ (BP_COUNT_GANG(zio->io_bp) == -+ BP_GET_NDVAS(zio->io_bp))); - } -+ if (zio->io_flags & ZIO_FLAG_NOPWRITE) -+ VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); - } -@@ -2915,3 +3035,3 @@ zio_done(zio_t *zio) - bcopy(zio->io_data, abuf, zio->io_size); -- bzero(abuf + zio->io_size, asize - zio->io_size); -+ bzero(abuf+zio->io_size, asize-zio->io_size); - } -@@ -2940,3 +3060,3 @@ zio_done(zio_t *zio) - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, -- zio->io_vd, zio, 0, 0); -+ zio->io_vd, zio, 0, 0); - } -@@ -2963,4 +3083,4 @@ zio_done(zio_t *zio) - spa_log_error(zio->io_spa, zio); -- zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, zio, -- 0, 0); -+ zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, -+ NULL, zio, 0, 0); - } -@@ -3014,3 +3134,3 @@ zio_done(zio_t *zio) - IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && -- !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) -+ !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) - zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); -@@ -3111,3 +3231,3 @@ zio_done(zio_t *zio) - if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && -- !BP_IS_HOLE(zio->io_bp)) { -+ !BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { - metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); -@@ -3158,2 +3278,3 @@ static zio_pipe_stage_t *zio_pipeline[] = { - zio_checksum_generate, -+ zio_nop_write, - zio_ddt_read_start, -@@ -3226,3 +3347,2 @@ EXPORT_SYMBOL(zio_handle_device_injection); - EXPORT_SYMBOL(zio_handle_label_injection); --EXPORT_SYMBOL(zio_priority_table); - EXPORT_SYMBOL(zio_type_name); -@@ -3240,3 +3360,3 @@ module_param(zfs_sync_pass_deferred_free, int, 0644); - MODULE_PARM_DESC(zfs_sync_pass_deferred_free, -- "defer frees starting in this pass"); -+ "Defer frees starting in this pass"); - -@@ -3244,3 +3364,3 @@ module_param(zfs_sync_pass_dont_compress, int, 0644); - MODULE_PARM_DESC(zfs_sync_pass_dont_compress, -- "don't compress starting in this pass"); -+ "Don't compress starting in this pass"); - -@@ -3248,3 +3368,3 @@ module_param(zfs_sync_pass_rewrite, int, 0644); - MODULE_PARM_DESC(zfs_sync_pass_rewrite, -- "rewrite new bps starting in this pass"); -+ "Rewrite new bps starting in this pass"); - #endif -diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c -index c8fe20f..bc73317 100644 ---- a/module/zfs/zio_checksum.c -+++ b/module/zfs/zio_checksum.c -@@ -22,2 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -203,3 +204,3 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) - if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -218,6 +219,6 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) - else -- return (ECKSUM); -+ return (SET_ERROR(ECKSUM)); - - if (nused > size) -- return (ECKSUM); -+ return (SET_ERROR(ECKSUM)); - -@@ -263,3 +264,3 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) - if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) -- return (ECKSUM); -+ return (SET_ERROR(ECKSUM)); - -diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c -index 1dc780d..5b63f0a 100644 ---- a/module/zfs/zio_compress.c -+++ b/module/zfs/zio_compress.c -@@ -29,2 +29,6 @@ - -+/* -+ * Copyright (c) 2013 by Delphix. All rights reserved. 
-+ */ -+ - #include -@@ -55,3 +59,3 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {zle_compress, zle_decompress, 64, "zle"}, -- {lz4_compress, lz4_decompress, 0, "lz4"}, -+ {lz4_compress_zfs, lz4_decompress_zfs, 0, "lz4"}, - }; -@@ -132,3 +136,3 @@ zio_decompress_data(enum zio_compress c, void *src, void *dst, - if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c -index eb589c4..39ec590 100644 ---- a/module/zfs/zio_inject.c -+++ b/module/zfs/zio_inject.c -@@ -22,3 +22,3 @@ - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -- * Copyright (c) 2012 by Delphix. All rights reserved. -+ * Copyright (c) 2013 by Delphix. All rights reserved. - */ -@@ -278,3 +278,3 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) - if (handler->zi_record.zi_error == ENXIO) { -- ret = EIO; -+ ret = SET_ERROR(EIO); - break; -@@ -418,3 +418,3 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) - if ((spa = spa_inject_addref(name)) == NULL) -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - -@@ -470,3 +470,3 @@ zio_inject_list_next(int *id, char *name, size_t buflen, - } else { -- ret = ENOENT; -+ ret = SET_ERROR(ENOENT); - } -@@ -497,3 +497,3 @@ zio_clear_fault(int id) - rw_exit(&inject_lock); -- return (ENOENT); -+ return (SET_ERROR(ENOENT)); - } -diff --git a/module/zfs/zpl_ctldir.c b/module/zfs/zpl_ctldir.c -index 1bb646f..9e587e3 100644 ---- a/module/zfs/zpl_ctldir.c -+++ b/module/zfs/zpl_ctldir.c -@@ -45,3 +45,3 @@ zpl_common_open(struct inode *ip, struct file *filp) - -- return generic_file_open(ip, filp); -+ return (generic_file_open(ip, filp)); - } -@@ -131,8 +131,8 @@ zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) - if (error == -ENOENT) -- return d_splice_alias(NULL, dentry); -+ return (d_splice_alias(NULL, dentry)); - else -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - } - -- return d_splice_alias(ip, dentry); -+ return (d_splice_alias(ip, dentry)); - } -@@ -176,3 +176,3 @@ zpl_snapdir_automount(struct path *path) - if (error) -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - -@@ -200,3 +200,3 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) - { -- return 0; -+ return (0); - } -@@ -239,3 +239,3 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, - if (error && error != -ENOENT) -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - -@@ -245,3 +245,3 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, - -- return d_splice_alias(ip, dentry); -+ return (d_splice_alias(ip, dentry)); - } -@@ -263,4 +263,6 @@ zpl_snapdir_iterate(struct file *filp, struct dir_context *ctx) - while (error == 0) { -+ dsl_pool_config_enter(dmu_objset_pool(zsb->z_os), FTAG); - error = -dmu_snapshot_list_next(zsb->z_os, MAXNAMELEN, -- snapname, &id, &(ctx->pos), &case_conflict); -+ snapname, &id, &ctx->pos, &case_conflict); -+ dsl_pool_config_exit(dmu_objset_pool(zsb->z_os), FTAG); - if (error) -@@ -334,3 +336,3 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dip, mode | S_IFDIR, cr); -@@ -344,3 +346,3 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof 
(vattr_t)); - ASSERT3S(error, <=, 0); -@@ -423,8 +425,8 @@ zpl_shares_lookup(struct inode *dip, struct dentry *dentry, - if (error == -ENOENT) -- return d_splice_alias(NULL, dentry); -+ return (d_splice_alias(NULL, dentry)); - else -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - } - -- return d_splice_alias(ip, dentry); -+ return (d_splice_alias(ip, dentry)); - } -@@ -497,6 +499,7 @@ zpl_shares_getattr(struct vfsmount *mnt, struct dentry *dentry, - error = -zfs_zget(zsb, zsb->z_shares_dir, &dzp); -- if (error == 0) -- error = -zfs_getattr_fast(dentry->d_inode, stat); -+ if (error == 0) { -+ error = -zfs_getattr_fast(ZTOI(dzp), stat); -+ iput(ZTOI(dzp)); -+ } - -- iput(ZTOI(dzp)); - ZFS_EXIT(zsb); -diff --git a/module/zfs/zpl_export.c b/module/zfs/zpl_export.c -index 94625e1..ac94494 100644 ---- a/module/zfs/zpl_export.c -+++ b/module/zfs/zpl_export.c -@@ -47,3 +47,3 @@ zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) - if (len_bytes < offsetof(fid_t, fid_data)) -- return 255; -+ return (255); - -@@ -78,3 +78,3 @@ zpl_dentry_obtain_alias(struct inode *ip) - -- return result; -+ return (result); - } -@@ -94,3 +94,3 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, - len_bytes < offsetof(fid_t, fid_data) + fid->fid_len) -- return ERR_PTR(-EINVAL); -+ return (ERR_PTR(-EINVAL)); - -@@ -99,3 +99,3 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, - if (rc != 0) -- return ERR_PTR(-rc); -+ return (ERR_PTR(-rc)); - -@@ -103,3 +103,3 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, - -- return zpl_dentry_obtain_alias(ip); -+ return (zpl_dentry_obtain_alias(ip)); - } -@@ -119,5 +119,5 @@ zpl_get_parent(struct dentry *child) - if (error) -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - -- return zpl_dentry_obtain_alias(ip); -+ return (zpl_dentry_obtain_alias(ip)); - } -@@ -136,3 +136,3 @@ zpl_commit_metadata(struct inode *inode) - -- return error; -+ return (error); - } -@@ -141,7 +141,7 @@ zpl_commit_metadata(struct inode *inode) - const struct export_operations zpl_export_operations = { -- .encode_fh = zpl_encode_fh, -- .fh_to_dentry = zpl_fh_to_dentry, -- .get_parent = zpl_get_parent, -+ .encode_fh = zpl_encode_fh, -+ .fh_to_dentry = zpl_fh_to_dentry, -+ .get_parent = zpl_get_parent, - #ifdef HAVE_COMMIT_METADATA -- .commit_metadata= zpl_commit_metadata, -+ .commit_metadata = zpl_commit_metadata, - #endif /* HAVE_COMMIT_METADATA */ -diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c -index 6598c17..3737bb5 100644 ---- a/module/zfs/zpl_file.c -+++ b/module/zfs/zpl_file.c -@@ -25,2 +25,3 @@ - -+#include - #include -@@ -37,2 +38,6 @@ zpl_open(struct inode *ip, struct file *filp) - -+ error = generic_file_open(ip, filp); -+ if (error) -+ return (error); -+ - crhold(cr); -@@ -42,6 +47,3 @@ zpl_open(struct inode *ip, struct file *filp) - -- if (error) -- return (error); -- -- return generic_file_open(ip, filp); -+ return (error); - } -@@ -169,5 +171,6 @@ ssize_t - zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t pos, -- uio_seg_t segment, int flags, cred_t *cr) -+ uio_seg_t segment, int flags, cred_t *cr) - { - int error; -+ ssize_t read; - struct iovec iov; -@@ -189,3 +192,6 @@ zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t pos, - -- return (len - uio.uio_resid); -+ read = len - uio.uio_resid; -+ task_io_account_read(read); -+ -+ return (read); - } -@@ -215,2 +221,3 @@ zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t pos, - int error; -+ ssize_t wrote; - 
struct iovec iov; -@@ -232,3 +239,6 @@ zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t pos, - -- return (len - uio.uio_resid); -+ wrote = len - uio.uio_resid; -+ task_io_account_write(wrote); -+ -+ return (wrote); - } -@@ -272,3 +282,3 @@ zpl_llseek(struct file *filp, loff_t offset, int whence) - -- return generic_file_llseek(filp, offset, whence); -+ return (generic_file_llseek(filp, offset, whence)); - } -@@ -373,3 +383,3 @@ zpl_readpage(struct file *filp, struct page *pp) - unlock_page(pp); -- return error; -+ return (error); - } -@@ -414,3 +424,39 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) - { -- return write_cache_pages(mapping, wbc, zpl_putpage, mapping); -+ znode_t *zp = ITOZ(mapping->host); -+ zfs_sb_t *zsb = ITOZSB(mapping->host); -+ enum writeback_sync_modes sync_mode; -+ int result; -+ -+ ZFS_ENTER(zsb); -+ if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) -+ wbc->sync_mode = WB_SYNC_ALL; -+ ZFS_EXIT(zsb); -+ sync_mode = wbc->sync_mode; -+ -+ /* -+ * We don't want to run write_cache_pages() in SYNC mode here, because -+ * that would make putpage() wait for a single page to be committed to -+ * disk every single time, resulting in atrocious performance. Instead -+ * we run it once in non-SYNC mode so that the ZIL gets all the data, -+ * and then we commit it all in one go. -+ */ -+ wbc->sync_mode = WB_SYNC_NONE; -+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); -+ if (sync_mode != wbc->sync_mode) { -+ ZFS_ENTER(zsb); -+ ZFS_VERIFY_ZP(zp); -+ zil_commit(zsb->z_log, zp->z_id); -+ ZFS_EXIT(zsb); -+ -+ /* -+ * We need to call write_cache_pages() again (we can't just -+ * return after the commit) because the previous call in -+ * non-SYNC mode does not guarantee that we got all the dirty -+ * pages (see the implementation of write_cache_pages() for -+ * details). That being said, this is a no-op in most cases. 
-+ */ -+ wbc->sync_mode = sync_mode; -+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); -+ } -+ return (result); - } -@@ -426,3 +472,6 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) - { -- return zpl_putpage(pp, wbc, pp->mapping); -+ if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) -+ wbc->sync_mode = WB_SYNC_ALL; -+ -+ return (zpl_putpage(pp, wbc, pp->mapping)); - } -@@ -489,3 +538,3 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) - { -- return zpl_ioctl(filp, cmd, arg); -+ return (zpl_ioctl(filp, cmd, arg)); - } -@@ -498,3 +547,3 @@ const struct address_space_operations zpl_address_space_operations = { - .writepage = zpl_writepage, -- .writepages = zpl_writepages, -+ .writepages = zpl_writepages, - }; -@@ -510,7 +559,7 @@ const struct file_operations zpl_file_operations = { - #ifdef HAVE_FILE_FALLOCATE -- .fallocate = zpl_fallocate, -+ .fallocate = zpl_fallocate, - #endif /* HAVE_FILE_FALLOCATE */ -- .unlocked_ioctl = zpl_ioctl, -+ .unlocked_ioctl = zpl_ioctl, - #ifdef CONFIG_COMPAT -- .compat_ioctl = zpl_compat_ioctl, -+ .compat_ioctl = zpl_compat_ioctl, - #endif -diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c -index ab1fe68..c009807 100644 ---- a/module/zfs/zpl_inode.c -+++ b/module/zfs/zpl_inode.c -@@ -44,3 +44,3 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) - if (dlen(dentry) > ZFS_MAXNAMELEN) -- return ERR_PTR(-ENAMETOOLONG); -+ return (ERR_PTR(-ENAMETOOLONG)); - -@@ -60,8 +60,8 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) - if (error == -ENOENT) -- return d_splice_alias(NULL, dentry); -+ return (d_splice_alias(NULL, dentry)); - else -- return ERR_PTR(error); -+ return (ERR_PTR(error)); - } - -- return d_splice_alias(ip, dentry); -+ return (d_splice_alias(ip, dentry)); - } -@@ -99,3 +99,3 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); -@@ -104,4 +104,4 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - if (error == 0) { -- error = zpl_xattr_security_init(ip, dir, &dentry->d_name); -- VERIFY3S(error, ==, 0); -+ VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name)); -+ VERIFY0(zpl_init_acl(ip, dir)); - d_instantiate(dentry, ip); -@@ -109,3 +109,3 @@ zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -133,3 +133,3 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); -@@ -138,6 +138,9 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); -- if (error == 0) -+ if (error == 0) { -+ VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name)); -+ VERIFY0(zpl_init_acl(ip, dir)); - d_instantiate(dentry, ip); -+ } - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -145,3 +148,3 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, - -- return (-error); -+ return (error); - } -@@ -171,3 +174,3 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) - crhold(cr); -- vap = 
kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode | S_IFDIR, cr); -@@ -175,6 +178,9 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) - error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL); -- if (error == 0) -+ if (error == 0) { -+ VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name)); -+ VERIFY0(zpl_init_acl(ip, dir)); - d_instantiate(dentry, ip); -+ } - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -225,2 +231,3 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) - { -+ struct inode *ip = dentry->d_inode; - cred_t *cr = CRED(); -@@ -229,3 +236,3 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) - -- error = inode_change_ok(dentry->d_inode, ia); -+ error = inode_change_ok(ip, ia); - if (error) -@@ -234,3 +241,3 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; -@@ -244,5 +251,7 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) - -- error = -zfs_setattr(dentry->d_inode, vap, 0, cr); -+ error = -zfs_setattr(ip, vap, 0, cr); -+ if (!error && (ia->ia_valid & ATTR_MODE)) -+ error = zpl_chmod_acl(ip); - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -277,3 +286,3 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) - crhold(cr); -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); -@@ -281,6 +290,8 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) - error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0); -- if (error == 0) -+ if (error == 0) { -+ VERIFY0(zpl_xattr_security_init(ip, dir, &dentry->d_name)); - d_instantiate(dentry, ip); -+ } - -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - crfree(cr); -@@ -340,3 +351,3 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) - if (ip->i_nlink >= ZFS_LINK_MAX) -- return -EMLINK; -+ return (-EMLINK); - -@@ -362,3 +373,3 @@ out: - static void --zpl_truncate_range(struct inode* ip, loff_t start, loff_t end) -+zpl_truncate_range(struct inode *ip, loff_t start, loff_t end) - { -@@ -393,3 +404,3 @@ zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len) - { -- return zpl_fallocate_common(ip, mode, offset, len); -+ return (zpl_fallocate_common(ip, mode, offset, len)); - } -@@ -457,2 +468,11 @@ const struct inode_operations zpl_inode_operations = { - #endif /* HAVE_INODE_FALLOCATE */ -+#if defined(CONFIG_FS_POSIX_ACL) -+#if defined(HAVE_GET_ACL) -+ .get_acl = zpl_get_acl, -+#elif defined(HAVE_CHECK_ACL) -+ .check_acl = zpl_check_acl, -+#elif defined(HAVE_PERMISSION) -+ .permission = zpl_permission, -+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ -+#endif /* CONFIG_FS_POSIX_ACL */ - }; -@@ -475,2 +495,11 @@ const struct inode_operations zpl_dir_inode_operations = { - .listxattr = zpl_xattr_list, -+#if defined(CONFIG_FS_POSIX_ACL) -+#if defined(HAVE_GET_ACL) -+ .get_acl = zpl_get_acl, -+#elif defined(HAVE_CHECK_ACL) -+ .check_acl = zpl_check_acl, -+#elif defined(HAVE_PERMISSION) -+ .permission = zpl_permission, -+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ -+#endif /* CONFIG_FS_POSIX_ACL */ - }; -@@ -496,2 +525,11 @@ const struct 
inode_operations zpl_special_inode_operations = { - .listxattr = zpl_xattr_list, -+#if defined(CONFIG_FS_POSIX_ACL) -+#if defined(HAVE_GET_ACL) -+ .get_acl = zpl_get_acl, -+#elif defined(HAVE_CHECK_ACL) -+ .check_acl = zpl_check_acl, -+#elif defined(HAVE_PERMISSION) -+ .permission = zpl_permission, -+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ -+#endif /* CONFIG_FS_POSIX_ACL */ - }; -diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c -index eee4a50..45639a6 100644 ---- a/module/zfs/zpl_super.c -+++ b/module/zfs/zpl_super.c -@@ -46,3 +46,3 @@ zpl_inode_destroy(struct inode *ip) - { -- ASSERT(atomic_read(&ip->i_count) == 0); -+ ASSERT(atomic_read(&ip->i_count) == 0); - zfs_inode_destroy(ip); -@@ -181,15 +181,41 @@ zpl_umount_begin(struct super_block *sb) - /* -- * The Linux VFS automatically handles the following flags: -- * MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, MNT_NOATIME, MNT_READONLY -+ * ZFS specific features must be explicitly handled here, the VFS will -+ * automatically handled the following generic functionality. -+ * -+ * MNT_NOSUID, -+ * MNT_NODEV, -+ * MNT_NOEXEC, -+ * MNT_NOATIME, -+ * MNT_NODIRATIME, -+ * MNT_READONLY, -+ * MNT_STRICTATIME, -+ * MS_SYNCHRONOUS, -+ * MS_DIRSYNC, -+ * MS_MANDLOCK. - */ --#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY - static int --zpl_show_options(struct seq_file *seq, struct dentry *root) -+__zpl_show_options(struct seq_file *seq, zfs_sb_t *zsb) - { -- zfs_sb_t *zsb = root->d_sb->s_fs_info; -- - seq_printf(seq, ",%s", zsb->z_flags & ZSB_XATTR ? "xattr" : "noxattr"); - -+#ifdef CONFIG_FS_POSIX_ACL -+ switch (zsb->z_acl_type) { -+ case ZFS_ACLTYPE_POSIXACL: -+ seq_puts(seq, ",posixacl"); -+ break; -+ default: -+ seq_puts(seq, ",noacl"); -+ break; -+ } -+#endif /* CONFIG_FS_POSIX_ACL */ -+ - return (0); - } -+ -+#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY -+static int -+zpl_show_options(struct seq_file *seq, struct dentry *root) -+{ -+ return (__zpl_show_options(seq, root->d_sb->s_fs_info)); -+} - #else -@@ -198,7 +224,3 @@ zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp) - { -- zfs_sb_t *zsb = vfsp->mnt_sb->s_fs_info; -- -- seq_printf(seq, ",%s", zsb->z_flags & ZSB_XATTR ? 
"xattr" : "noxattr"); -- -- return (0); -+ return (__zpl_show_options(seq, vfsp->mnt_sb->s_fs_info)); - } -@@ -224,3 +246,3 @@ zpl_mount(struct file_system_type *fs_type, int flags, - -- return mount_nodev(fs_type, flags, &zmd, zpl_fill_super); -+ return (mount_nodev(fs_type, flags, &zmd, zpl_fill_super)); - } -@@ -233,3 +255,3 @@ zpl_get_sb(struct file_system_type *fs_type, int flags, - -- return get_sb_nodev(fs_type, flags, &zmd, zpl_fill_super, mnt); -+ return (get_sb_nodev(fs_type, flags, &zmd, zpl_fill_super, mnt)); - } -@@ -267,4 +289,2 @@ zpl_prune_sb(struct super_block *sb, void *arg) - ASSERT3S(error, <=, 0); -- -- return; - } -@@ -274,3 +294,3 @@ zpl_prune_sbs(int64_t bytes_to_scan, void *private) - { -- unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t)); -+ unsigned long nr_to_scan = (bytes_to_scan / sizeof (znode_t)); - -@@ -291,7 +311,7 @@ zpl_prune_sbs(int64_t bytes_to_scan, void *private) - { -- unsigned long nr_to_scan = (bytes_to_scan / sizeof(znode_t)); -+ unsigned long nr_to_scan = (bytes_to_scan / sizeof (znode_t)); - -- shrink_dcache_memory(nr_to_scan, GFP_KERNEL); -- shrink_icache_memory(nr_to_scan, GFP_KERNEL); -- kmem_reap(); -+ shrink_dcache_memory(nr_to_scan, GFP_KERNEL); -+ shrink_icache_memory(nr_to_scan, GFP_KERNEL); -+ kmem_reap(); - } -@@ -324,3 +344,3 @@ zpl_free_cached_objects(struct super_block *sb, int nr_to_scan) - { -- arc_adjust_meta(nr_to_scan * sizeof(znode_t), B_FALSE); -+ /* noop */ - } -diff --git a/module/zfs/zpl_xattr.c b/module/zfs/zpl_xattr.c -index d79d35b..c5c15a2 100644 ---- a/module/zfs/zpl_xattr.c -+++ b/module/zfs/zpl_xattr.c -@@ -96,3 +96,3 @@ zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) - { -- if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) -+ if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) - if (!(ITOZSB(xf->inode)->z_flags & ZSB_XATTR)) -@@ -100,3 +100,3 @@ zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) - -- if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) -+ if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) - if (!capable(CAP_SYS_ADMIN)) -@@ -196,3 +196,3 @@ zpl_xattr_list_sa(xattr_filldir_t *xf) - error = zpl_xattr_filldir(xf, nvpair_name(nvp), -- strlen(nvpair_name(nvp))); -+ strlen(nvpair_name(nvp))); - if (error) -@@ -357,8 +357,16 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, - ssize_t wrote; -- int error; -+ int lookup_flags, error; - const int xattr_mode = S_IFREG | 0644; - -- /* Lookup the xattr directory and create it if required. */ -- error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR | CREATE_XATTR_DIR, -- cr, NULL, NULL); -+ /* -+ * Lookup the xattr directory. When we're adding an entry pass -+ * CREATE_XATTR_DIR to ensure the xattr directory is created. -+ * When removing an entry this flag is not passed to avoid -+ * unnecessarily creating a new xattr directory. 
-+ */ -+ lookup_flags = LOOKUP_XATTR; -+ if (value != NULL) -+ lookup_flags |= CREATE_XATTR_DIR; -+ -+ error = -zfs_lookup(ip, NULL, &dxip, lookup_flags, cr, NULL, NULL); - if (error) -@@ -383,3 +391,3 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, - if (xip == NULL) { -- vap = kmem_zalloc(sizeof(vattr_t), KM_SLEEP); -+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - vap->va_mode = xattr_mode; -@@ -407,3 +415,3 @@ out: - if (vap) -- kmem_free(vap, sizeof(vattr_t)); -+ kmem_free(vap, sizeof (vattr_t)); - -@@ -440,6 +448,2 @@ zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, - } else { -- /* Do not allow SA xattrs in symlinks (issue #1648) */ -- if (S_ISLNK(ip->i_mode)) -- return (-EMLINK); -- - /* Limited to 32k to keep nvpair memory allocations small */ -@@ -495,3 +499,8 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value, - -- if ((error == -ENODATA) && (flags & XATTR_REPLACE)) -+ if (flags & XATTR_REPLACE) -+ goto out; -+ -+ /* The xattr to be removed already doesn't exist */ -+ error = 0; -+ if (value == NULL) - goto out; -@@ -527,6 +536,6 @@ __zpl_xattr_user_get(struct inode *ip, const char *name, - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - - if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) -- return -EOPNOTSUPP; -+ return (-EOPNOTSUPP); - -@@ -548,6 +557,6 @@ __zpl_xattr_user_set(struct inode *ip, const char *name, - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - - if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) -- return -EOPNOTSUPP; -+ return (-EOPNOTSUPP); - -@@ -575,6 +584,6 @@ __zpl_xattr_trusted_get(struct inode *ip, const char *name, - if (!capable(CAP_SYS_ADMIN)) -- return -EACCES; -+ return (-EACCES); - - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - -@@ -596,6 +605,6 @@ __zpl_xattr_trusted_set(struct inode *ip, const char *name, - if (!capable(CAP_SYS_ADMIN)) -- return -EACCES; -+ return (-EACCES); - - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - -@@ -623,3 +632,3 @@ __zpl_xattr_security_get(struct inode *ip, const char *name, - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - -@@ -641,3 +650,3 @@ __zpl_xattr_security_set(struct inode *ip, const char *name, - if (strcmp(name, "") == 0) -- return -EINVAL; -+ return (-EINVAL); - -@@ -689,6 +698,7 @@ zpl_xattr_security_init(struct inode *ip, struct inode *dip, - error = zpl_security_inode_init_security(ip, dip, qstr, -- &name, &value, &len); -+ &name, &value, &len); - if (error) { - if (error == -EOPNOTSUPP) -- return 0; -+ return (0); -+ - return (error); -@@ -711,2 +721,471 @@ xattr_handler_t zpl_xattr_security_handler = { - -+#ifdef CONFIG_FS_POSIX_ACL -+ -+int -+zpl_set_acl(struct inode *ip, int type, struct posix_acl *acl) -+{ -+ struct super_block *sb = ITOZSB(ip)->z_sb; -+ char *name, *value = NULL; -+ int error = 0; -+ size_t size = 0; -+ -+ if (S_ISLNK(ip->i_mode)) -+ return (-EOPNOTSUPP); -+ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ name = POSIX_ACL_XATTR_ACCESS; -+ if (acl) { -+ zpl_equivmode_t mode = ip->i_mode; -+ error = posix_acl_equiv_mode(acl, &mode); -+ if (error < 0) { -+ return (error); -+ } else { -+ /* -+ * The mode bits will have been set by -+ * ->zfs_setattr()->zfs_acl_chmod_setattr() -+ * using the ZFS ACL conversion. If they -+ * differ from the Posix ACL conversion dirty -+ * the inode to write the Posix mode bits. 
-+ */ -+ if (ip->i_mode != mode) { -+ ip->i_mode = mode; -+ ip->i_ctime = current_fs_time(sb); -+ mark_inode_dirty(ip); -+ } -+ -+ if (error == 0) -+ acl = NULL; -+ } -+ } -+ break; -+ -+ case ACL_TYPE_DEFAULT: -+ name = POSIX_ACL_XATTR_DEFAULT; -+ if (!S_ISDIR(ip->i_mode)) -+ return (acl ? -EACCES : 0); -+ break; -+ -+ default: -+ return (-EINVAL); -+ } -+ -+ if (acl) { -+ size = posix_acl_xattr_size(acl->a_count); -+ value = kmem_alloc(size, KM_SLEEP); -+ -+ error = zpl_acl_to_xattr(acl, value, size); -+ if (error < 0) { -+ kmem_free(value, size); -+ return (error); -+ } -+ } -+ -+ error = zpl_xattr_set(ip, name, value, size, 0); -+ if (value) -+ kmem_free(value, size); -+ -+ if (!error) { -+ if (acl) -+ zpl_set_cached_acl(ip, type, acl); -+ else -+ zpl_forget_cached_acl(ip, type); -+ } -+ -+ return (error); -+} -+ -+struct posix_acl * -+zpl_get_acl(struct inode *ip, int type) -+{ -+ struct posix_acl *acl; -+ void *value = NULL; -+ char *name; -+ int size; -+ -+#ifdef HAVE_POSIX_ACL_CACHING -+ acl = get_cached_acl(ip, type); -+ if (acl != ACL_NOT_CACHED) -+ return (acl); -+#endif /* HAVE_POSIX_ACL_CACHING */ -+ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ name = POSIX_ACL_XATTR_ACCESS; -+ break; -+ case ACL_TYPE_DEFAULT: -+ name = POSIX_ACL_XATTR_DEFAULT; -+ break; -+ default: -+ return (ERR_PTR(-EINVAL)); -+ } -+ -+ size = zpl_xattr_get(ip, name, NULL, 0); -+ if (size > 0) { -+ value = kmem_alloc(size, KM_PUSHPAGE); -+ size = zpl_xattr_get(ip, name, value, size); -+ } -+ -+ if (size > 0) { -+ acl = zpl_acl_from_xattr(value, size); -+ } else if (size == -ENODATA || size == -ENOSYS) { -+ acl = NULL; -+ } else { -+ acl = ERR_PTR(-EIO); -+ } -+ -+ if (size > 0) -+ kmem_free(value, size); -+ -+ if (!IS_ERR(acl)) -+ zpl_set_cached_acl(ip, type, acl); -+ -+ return (acl); -+} -+ -+#if !defined(HAVE_GET_ACL) -+static int -+__zpl_check_acl(struct inode *ip, int mask) -+{ -+ struct posix_acl *acl; -+ int error; -+ -+ acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); -+ if (IS_ERR(acl)) -+ return (PTR_ERR(acl)); -+ -+ if (acl) { -+ error = posix_acl_permission(ip, acl, mask); -+ zpl_posix_acl_release(acl); -+ return (error); -+ } -+ -+ return (-EAGAIN); -+} -+ -+#if defined(HAVE_CHECK_ACL_WITH_FLAGS) -+int -+zpl_check_acl(struct inode *ip, int mask, unsigned int flags) -+{ -+ return (__zpl_check_acl(ip, mask)); -+} -+#elif defined(HAVE_CHECK_ACL) -+int -+zpl_check_acl(struct inode *ip, int mask) -+{ -+ return (__zpl_check_acl(ip, mask)); -+} -+#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA) -+int -+zpl_permission(struct inode *ip, int mask, struct nameidata *nd) -+{ -+ return (generic_permission(ip, mask, __zpl_check_acl)); -+} -+#elif defined(HAVE_PERMISSION) -+int -+zpl_permission(struct inode *ip, int mask) -+{ -+ return (generic_permission(ip, mask, __zpl_check_acl)); -+} -+#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */ -+#endif /* !HAVE_GET_ACL */ -+ -+int -+zpl_init_acl(struct inode *ip, struct inode *dir) -+{ -+ struct posix_acl *acl = NULL; -+ int error = 0; -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (0); -+ -+ if (!S_ISLNK(ip->i_mode)) { -+ if (ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) { -+ acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT); -+ if (IS_ERR(acl)) -+ return (PTR_ERR(acl)); -+ } -+ -+ if (!acl) { -+ ip->i_mode &= ~current_umask(); -+ ip->i_ctime = current_fs_time(ITOZSB(ip)->z_sb); -+ mark_inode_dirty(ip); -+ return (0); -+ } -+ } -+ -+ if ((ITOZSB(ip)->z_acl_type == ZFS_ACLTYPE_POSIXACL) && acl) { -+ umode_t mode; -+ -+ if (S_ISDIR(ip->i_mode)) { -+ error = 
zpl_set_acl(ip, ACL_TYPE_DEFAULT, acl); -+ if (error) -+ goto out; -+ } -+ -+ mode = ip->i_mode; -+ error = __posix_acl_create(&acl, GFP_KERNEL, &mode); -+ if (error >= 0) { -+ ip->i_mode = mode; -+ mark_inode_dirty(ip); -+ if (error > 0) -+ error = zpl_set_acl(ip, ACL_TYPE_ACCESS, acl); -+ } -+ } -+out: -+ zpl_posix_acl_release(acl); -+ -+ return (error); -+} -+ -+int -+zpl_chmod_acl(struct inode *ip) -+{ -+ struct posix_acl *acl; -+ int error; -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (0); -+ -+ if (S_ISLNK(ip->i_mode)) -+ return (-EOPNOTSUPP); -+ -+ acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); -+ if (IS_ERR(acl) || !acl) -+ return (PTR_ERR(acl)); -+ -+ error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); -+ if (!error) -+ error = zpl_set_acl(ip, ACL_TYPE_ACCESS, acl); -+ -+ zpl_posix_acl_release(acl); -+ -+ return (error); -+} -+ -+static size_t -+zpl_xattr_acl_list(struct inode *ip, char *list, size_t list_size, -+ const char *name, size_t name_len, int type) -+{ -+ char *xattr_name; -+ size_t xattr_size; -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (0); -+ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ xattr_name = POSIX_ACL_XATTR_ACCESS; -+ xattr_size = sizeof (xattr_name); -+ break; -+ case ACL_TYPE_DEFAULT: -+ xattr_name = POSIX_ACL_XATTR_DEFAULT; -+ xattr_size = sizeof (xattr_name); -+ break; -+ default: -+ return (0); -+ } -+ -+ if (list && xattr_size <= list_size) -+ memcpy(list, xattr_name, xattr_size); -+ -+ return (xattr_size); -+} -+ -+#ifdef HAVE_DENTRY_XATTR_LIST -+static size_t -+zpl_xattr_acl_list_access(struct dentry *dentry, char *list, -+ size_t list_size, const char *name, size_t name_len, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_ACCESS); -+ return zpl_xattr_acl_list(dentry->d_inode, -+ list, list_size, name, name_len, type); -+} -+ -+static size_t -+zpl_xattr_acl_list_default(struct dentry *dentry, char *list, -+ size_t list_size, const char *name, size_t name_len, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_DEFAULT); -+ return zpl_xattr_acl_list(dentry->d_inode, -+ list, list_size, name, name_len, type); -+} -+ -+#else -+ -+static size_t -+zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, -+ const char *name, size_t name_len) -+{ -+ return zpl_xattr_acl_list(ip, -+ list, list_size, name, name_len, ACL_TYPE_ACCESS); -+} -+ -+static size_t -+zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, -+ const char *name, size_t name_len) -+{ -+ return zpl_xattr_acl_list(ip, -+ list, list_size, name, name_len, ACL_TYPE_DEFAULT); -+} -+#endif /* HAVE_DENTRY_XATTR_LIST */ -+ -+static int -+zpl_xattr_acl_get(struct inode *ip, const char *name, -+ void *buffer, size_t size, int type) -+{ -+ struct posix_acl *acl; -+ int error; -+ -+ if (strcmp(name, "") != 0) -+ return (-EINVAL); -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (-EOPNOTSUPP); -+ -+ acl = zpl_get_acl(ip, type); -+ if (IS_ERR(acl)) -+ return (PTR_ERR(acl)); -+ if (acl == NULL) -+ return (-ENODATA); -+ -+ error = zpl_acl_to_xattr(acl, buffer, size); -+ zpl_posix_acl_release(acl); -+ -+ return (error); -+} -+ -+#ifdef HAVE_DENTRY_XATTR_GET -+static int -+zpl_xattr_acl_get_access(struct dentry *dentry, const char *name, -+ void *buffer, size_t size, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_ACCESS); -+ return (zpl_xattr_acl_get(dentry->d_inode, name, buffer, size, type)); -+} -+ -+static int -+zpl_xattr_acl_get_default(struct dentry *dentry, const char *name, -+ void *buffer, size_t size, int type) 
-+{ -+ ASSERT3S(type, ==, ACL_TYPE_DEFAULT); -+ return (zpl_xattr_acl_get(dentry->d_inode, name, buffer, size, type)); -+} -+ -+#else -+ -+static int -+zpl_xattr_acl_get_access(struct inode *ip, const char *name, -+ void *buffer, size_t size) -+{ -+ return (zpl_xattr_acl_get(ip, name, buffer, size, ACL_TYPE_ACCESS)); -+} -+ -+static int -+zpl_xattr_acl_get_default(struct inode *ip, const char *name, -+ void *buffer, size_t size) -+{ -+ return (zpl_xattr_acl_get(ip, name, buffer, size, ACL_TYPE_DEFAULT)); -+} -+#endif /* HAVE_DENTRY_XATTR_GET */ -+ -+static int -+zpl_xattr_acl_set(struct inode *ip, const char *name, -+ const void *value, size_t size, int flags, int type) -+{ -+ struct posix_acl *acl; -+ int error = 0; -+ -+ if (strcmp(name, "") != 0) -+ return (-EINVAL); -+ -+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) -+ return (-EOPNOTSUPP); -+ -+ if (!zpl_inode_owner_or_capable(ip)) -+ return (-EPERM); -+ -+ if (value) { -+ acl = zpl_acl_from_xattr(value, size); -+ if (IS_ERR(acl)) -+ return (PTR_ERR(acl)); -+ else if (acl) { -+ error = posix_acl_valid(acl); -+ if (error) { -+ zpl_posix_acl_release(acl); -+ return (error); -+ } -+ } -+ } else { -+ acl = NULL; -+ } -+ -+ error = zpl_set_acl(ip, type, acl); -+ zpl_posix_acl_release(acl); -+ -+ return (error); -+} -+ -+#ifdef HAVE_DENTRY_XATTR_SET -+static int -+zpl_xattr_acl_set_access(struct dentry *dentry, const char *name, -+ const void *value, size_t size, int flags, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_ACCESS); -+ return (zpl_xattr_acl_set(dentry->d_inode, -+ name, value, size, flags, type)); -+} -+ -+static int -+zpl_xattr_acl_set_default(struct dentry *dentry, const char *name, -+ const void *value, size_t size, int flags, int type) -+{ -+ ASSERT3S(type, ==, ACL_TYPE_DEFAULT); -+ return zpl_xattr_acl_set(dentry->d_inode, -+ name, value, size, flags, type); -+} -+ -+#else -+ -+static int -+zpl_xattr_acl_set_access(struct inode *ip, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ return zpl_xattr_acl_set(ip, -+ name, value, size, flags, ACL_TYPE_ACCESS); -+} -+ -+static int -+zpl_xattr_acl_set_default(struct inode *ip, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ return zpl_xattr_acl_set(ip, -+ name, value, size, flags, ACL_TYPE_DEFAULT); -+} -+#endif /* HAVE_DENTRY_XATTR_SET */ -+ -+struct xattr_handler zpl_xattr_acl_access_handler = -+{ -+ .prefix = POSIX_ACL_XATTR_ACCESS, -+ .list = zpl_xattr_acl_list_access, -+ .get = zpl_xattr_acl_get_access, -+ .set = zpl_xattr_acl_set_access, -+#ifdef HAVE_DENTRY_XATTR_LIST -+ .flags = ACL_TYPE_ACCESS, -+#endif /* HAVE_DENTRY_XATTR_LIST */ -+}; -+ -+struct xattr_handler zpl_xattr_acl_default_handler = -+{ -+ .prefix = POSIX_ACL_XATTR_DEFAULT, -+ .list = zpl_xattr_acl_list_default, -+ .get = zpl_xattr_acl_get_default, -+ .set = zpl_xattr_acl_set_default, -+#ifdef HAVE_DENTRY_XATTR_LIST -+ .flags = ACL_TYPE_DEFAULT, -+#endif /* HAVE_DENTRY_XATTR_LIST */ -+}; -+ -+#endif /* CONFIG_FS_POSIX_ACL */ -+ - xattr_handler_t *zpl_xattr_handlers[] = { -@@ -715,6 +1194,6 @@ xattr_handler_t *zpl_xattr_handlers[] = { - &zpl_xattr_user_handler, --#ifdef HAVE_POSIX_ACLS -+#ifdef CONFIG_FS_POSIX_ACL - &zpl_xattr_acl_access_handler, - &zpl_xattr_acl_default_handler, --#endif /* HAVE_POSIX_ACLS */ -+#endif /* CONFIG_FS_POSIX_ACL */ - NULL -diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c -index b516156..fa5c7eb 100644 ---- a/module/zfs/zvol.c -+++ b/module/zfs/zvol.c -@@ -37,2 +37,3 @@ - -+#include - #include -@@ -63,4 +64,4 @@ typedef struct 
zvol_state { - char zv_name[MAXNAMELEN]; /* name */ -- uint64_t zv_volsize; /* advertised space */ -- uint64_t zv_volblocksize;/* volume block size */ -+ uint64_t zv_volsize; /* advertised space */ -+ uint64_t zv_volblocksize; /* volume block size */ - objset_t *zv_objset; /* objset handle */ -@@ -95,3 +96,3 @@ zvol_find_minor(unsigned *minor) - for (zv = list_head(&zvol_state_list); zv != NULL; -- zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) { -+ zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) { - if (MINOR(zv->zv_dev) != MINOR(*minor)) -@@ -102,5 +103,5 @@ zvol_find_minor(unsigned *minor) - if (*minor >= (1 << MINORBITS)) -- return ENXIO; -+ return (SET_ERROR(ENXIO)); - -- return 0; -+ return (0); - } -@@ -117,8 +118,8 @@ zvol_find_by_dev(dev_t dev) - for (zv = list_head(&zvol_state_list); zv != NULL; -- zv = list_next(&zvol_state_list, zv)) { -+ zv = list_next(&zvol_state_list, zv)) { - if (zv->zv_dev == dev) -- return zv; -+ return (zv); - } - -- return NULL; -+ return (NULL); - } -@@ -135,8 +136,8 @@ zvol_find_by_name(const char *name) - for (zv = list_head(&zvol_state_list); zv != NULL; -- zv = list_next(&zvol_state_list, zv)) { -- if (!strncmp(zv->zv_name, name, MAXNAMELEN)) -- return zv; -+ zv = list_next(&zvol_state_list, zv)) { -+ if (strncmp(zv->zv_name, name, MAXNAMELEN) == 0) -+ return (zv); - } - -- return NULL; -+ return (NULL); - } -@@ -161,3 +162,3 @@ zvol_is_zvol(const char *device) - if (major == zvol_major) -- return (B_TRUE); -+ return (B_TRUE); - -@@ -216,6 +217,6 @@ zvol_get_stats(objset_t *os, nvlist_t *nv) - if (error) -- return (error); -+ return (SET_ERROR(error)); - - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); -- doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); -+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); - error = dmu_object_info(os, ZVOL_OBJ, doi); -@@ -227,5 +228,30 @@ zvol_get_stats(objset_t *os, nvlist_t *nv) - -- kmem_free(doi, sizeof(dmu_object_info_t)); -+ kmem_free(doi, sizeof (dmu_object_info_t)); - -- return (error); -+ return (SET_ERROR(error)); -+} -+ -+static void -+zvol_size_changed(zvol_state_t *zv, uint64_t volsize) -+{ -+ struct block_device *bdev; -+ -+ bdev = bdget_disk(zv->zv_disk, 0); -+ if (bdev == NULL) -+ return; -+/* -+ * 2.6.28 API change -+ * Added check_disk_size_change() helper function. 
-+ */ -+#ifdef HAVE_CHECK_DISK_SIZE_CHANGE -+ set_capacity(zv->zv_disk, volsize >> 9); -+ zv->zv_volsize = volsize; -+ check_disk_size_change(zv->zv_disk, bdev); -+#else -+ zv->zv_volsize = volsize; -+ zv->zv_changed = 1; -+ (void) check_disk_change(bdev); -+#endif /* HAVE_CHECK_DISK_SIZE_CHANGE */ -+ -+ bdput(bdev); - } -@@ -239,6 +265,6 @@ zvol_check_volsize(uint64_t volsize, uint64_t blocksize) - if (volsize == 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - - if (volsize % blocksize != 0) -- return (EINVAL); -+ return (SET_ERROR(EINVAL)); - -@@ -246,3 +272,3 @@ zvol_check_volsize(uint64_t volsize, uint64_t blocksize) - if (volsize - 1 > MAXOFFSET_T) -- return (EOVERFLOW); -+ return (SET_ERROR(EOVERFLOW)); - #endif -@@ -255,5 +281,4 @@ zvol_check_volsize(uint64_t volsize, uint64_t blocksize) - static int --zvol_update_volsize(zvol_state_t *zv, uint64_t volsize, objset_t *os) -+zvol_update_volsize(uint64_t volsize, objset_t *os) - { -- struct block_device *bdev; - dmu_tx_t *tx; -@@ -268,3 +293,3 @@ zvol_update_volsize(zvol_state_t *zv, uint64_t volsize, objset_t *os) - dmu_tx_abort(tx); -- return (error); -+ return (SET_ERROR(error)); - } -@@ -275,28 +300,19 @@ zvol_update_volsize(zvol_state_t *zv, uint64_t volsize, objset_t *os) - -- if (error) -- return (error); -+ if (error == 0) -+ error = dmu_free_long_range(os, -+ ZVOL_OBJ, volsize, DMU_OBJECT_END); - -- error = dmu_free_long_range(os, -- ZVOL_OBJ, volsize, DMU_OBJECT_END); -- if (error) -- return (error); -+ return (error); -+} - -- bdev = bdget_disk(zv->zv_disk, 0); -- if (!bdev) -- return (EIO); --/* -- * 2.6.28 API change -- * Added check_disk_size_change() helper function. -- */ --#ifdef HAVE_CHECK_DISK_SIZE_CHANGE -- set_capacity(zv->zv_disk, volsize >> 9); -- zv->zv_volsize = volsize; -- check_disk_size_change(zv->zv_disk, bdev); --#else -- zv->zv_volsize = volsize; -- zv->zv_changed = 1; -- (void) check_disk_change(bdev); --#endif /* HAVE_CHECK_DISK_SIZE_CHANGE */ -+static int -+zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize) -+{ -+ zvol_size_changed(zv, volsize); - -- bdput(bdev); -+ /* -+ * We should post a event here describing the expansion. However, -+ * the zfs_ereport_post() interface doesn't nicely support posting -+ * events for zvols, it assumes events relate to vdevs or zios. 
-+ */ - -@@ -311,46 +327,50 @@ zvol_set_volsize(const char *name, uint64_t volsize) - { -- zvol_state_t *zv; -- dmu_object_info_t *doi; -+ zvol_state_t *zv = NULL; - objset_t *os = NULL; -- uint64_t readonly; - int error; -+ dmu_object_info_t *doi; -+ uint64_t readonly; -+ boolean_t owned = B_FALSE; - -- mutex_enter(&zvol_state_lock); -+ error = dsl_prop_get_integer(name, -+ zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); -+ if (error != 0) -+ return (SET_ERROR(error)); -+ if (readonly) -+ return (SET_ERROR(EROFS)); - -+ mutex_enter(&zvol_state_lock); - zv = zvol_find_by_name(name); -- if (zv == NULL) { -- error = ENXIO; -- goto out; -- } -- -- doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); - -- error = dmu_objset_hold(name, FTAG, &os); -- if (error) -- goto out_doi; -+ if (zv == NULL || zv->zv_objset == NULL) { -+ if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, -+ FTAG, &os)) != 0) { -+ mutex_exit(&zvol_state_lock); -+ return (SET_ERROR(error)); -+ } -+ owned = B_TRUE; -+ if (zv != NULL) -+ zv->zv_objset = os; -+ } else { -+ os = zv->zv_objset; -+ } - -- if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) != 0 || -- (error = zvol_check_volsize(volsize,doi->doi_data_block_size)) != 0) -- goto out_doi; -+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); - -- VERIFY(dsl_prop_get_integer(name, "readonly", &readonly, NULL) == 0); -- if (readonly) { -- error = EROFS; -- goto out_doi; -- } -+ if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || -+ (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) -+ goto out; - -- if (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY)) { -- error = EROFS; -- goto out_doi; -- } -+ error = zvol_update_volsize(volsize, os); -+ kmem_free(doi, sizeof (dmu_object_info_t)); - -- error = zvol_update_volsize(zv, volsize, os); --out_doi: -- kmem_free(doi, sizeof(dmu_object_info_t)); -+ if (error == 0 && zv != NULL) -+ error = zvol_update_live_volsize(zv, volsize); - out: -- if (os) -- dmu_objset_rele(os, FTAG); -- -+ if (owned) { -+ dmu_objset_disown(os, FTAG); -+ if (zv != NULL) -+ zv->zv_objset = NULL; -+ } - mutex_exit(&zvol_state_lock); -- - return (error); -@@ -367,3 +387,3 @@ zvol_check_volblocksize(uint64_t volblocksize) - !ISP2(volblocksize)) -- return (EDOM); -+ return (SET_ERROR(EDOM)); - -@@ -386,3 +406,3 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) - if (zv == NULL) { -- error = ENXIO; -+ error = SET_ERROR(ENXIO); - goto out; -@@ -390,4 +410,4 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) - -- if (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY)) { -- error = EROFS; -+ if (zv->zv_flags & ZVOL_RDONLY) { -+ error = SET_ERROR(EROFS); - goto out; -@@ -404,3 +424,3 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize) - if (error == ENOTSUP) -- error = EBUSY; -+ error = SET_ERROR(EBUSY); - dmu_tx_commit(tx); -@@ -412,3 +432,3 @@ out: - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -442,3 +462,3 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -448,3 +468,3 @@ zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) - { -- return (ENOTSUP); -+ return (SET_ERROR(ENOTSUP)); - } -@@ -480,4 +500,4 @@ ssize_t zvol_immediate_write_sz = 32768; - static void --zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, -- uint64_t offset, uint64_t size, int sync) -+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, -+ uint64_t size, int sync) - { -@@ -656,3 +676,3 
@@ zvol_discard(void *arg) - -- error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end - start); -+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, end-start); - -@@ -700,3 +720,3 @@ zvol_read(void *arg) - if (error == ECKSUM) -- error = EIO; -+ error = SET_ERROR(EIO); - -@@ -744,6 +764,6 @@ zvol_request(struct request_queue *q) - printk(KERN_INFO -- "%s: bad access: block=%llu, count=%lu\n", -- req->rq_disk->disk_name, -- (long long unsigned)blk_rq_pos(req), -- (long unsigned)blk_rq_sectors(req)); -+ "%s: bad access: block=%llu, count=%lu\n", -+ req->rq_disk->disk_name, -+ (long long unsigned)blk_rq_pos(req), -+ (long unsigned)blk_rq_sectors(req)); - __blk_end_request(req, -EIO, size); -@@ -754,3 +774,3 @@ zvol_request(struct request_queue *q) - printk(KERN_INFO "%s: non-fs cmd\n", -- req->rq_disk->disk_name); -+ req->rq_disk->disk_name); - __blk_end_request(req, -EIO, size); -@@ -764,4 +784,3 @@ zvol_request(struct request_queue *q) - case WRITE: -- if (unlikely(get_disk_ro(zv->zv_disk)) || -- unlikely(zv->zv_flags & ZVOL_RDONLY)) { -+ if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { - __blk_end_request(req, -EROFS, size); -@@ -781,3 +800,3 @@ zvol_request(struct request_queue *q) - printk(KERN_INFO "%s: unknown cmd: %d\n", -- req->rq_disk->disk_name, (int)rq_data_dir(req)); -+ req->rq_disk->disk_name, (int)rq_data_dir(req)); - __blk_end_request(req, -EIO, size); -@@ -810,4 +829,6 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - objset_t *os = zv->zv_objset; -+ uint64_t object = ZVOL_OBJ; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; -+ blkptr_t *bp = &lr->lr_blkptr; - dmu_buf_t *db; -@@ -831,3 +852,3 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - if (buf != NULL) { /* immediate write */ -- error = dmu_read(os, ZVOL_OBJ, offset, size, buf, -+ error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); -@@ -836,5 +857,11 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - offset = P2ALIGN_TYPED(offset, size, uint64_t); -- error = dmu_buf_hold(os, ZVOL_OBJ, offset, zgd, &db, -+ error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - if (error == 0) { -+ blkptr_t *obp = dmu_buf_get_blkptr(db); -+ if (obp) { -+ ASSERT(BP_IS_HOLE(bp)); -+ *bp = *obp; -+ } -+ - zgd->zgd_db = db; -@@ -856,3 +883,3 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -870,3 +897,3 @@ zvol_insert(zvol_state_t *zv_insert) - for (zv = list_head(&zvol_state_list); zv != NULL; -- zv = list_next(&zvol_state_list, zv)) { -+ zv = list_next(&zvol_state_list, zv)) { - if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev)) -@@ -917,3 +944,3 @@ zvol_first_open(zvol_state_t *zv) - if (!locked) -- return (-ERESTARTSYS); -+ return (-SET_ERROR(ERESTARTSYS)); - } -@@ -956,3 +983,3 @@ out_mutex: - -- return (-error); -+ return (SET_ERROR(-error)); - } -@@ -1005,4 +1032,3 @@ zvol_open(struct block_device *bdev, fmode_t flag) - -- if ((flag & FMODE_WRITE) && -- (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY))) { -+ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - error = -EROFS; -@@ -1023,3 +1049,3 @@ out_mutex: - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1057,3 +1083,3 @@ static int - zvol_ioctl(struct block_device *bdev, fmode_t mode, -- unsigned int cmd, unsigned long arg) -+ unsigned int cmd, unsigned long arg) - { -@@ -1063,3 +1089,3 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, - if 
(zv == NULL) -- return (-ENXIO); -+ return (SET_ERROR(-ENXIO)); - -@@ -1079,3 +1105,3 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1085,8 +1111,8 @@ static int - zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, -- unsigned cmd, unsigned long arg) -+ unsigned cmd, unsigned long arg) - { -- return zvol_ioctl(bdev, mode, cmd, arg); -+ return (zvol_ioctl(bdev, mode, cmd, arg)); - } - #else --#define zvol_compat_ioctl NULL -+#define zvol_compat_ioctl NULL - #endif -@@ -1097,3 +1123,3 @@ static int zvol_media_changed(struct gendisk *disk) - -- return zv->zv_changed; -+ return (zv->zv_changed); - } -@@ -1107,3 +1133,3 @@ static int zvol_revalidate_disk(struct gendisk *disk) - -- return 0; -+ return (0); - } -@@ -1133,3 +1159,3 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) - -- return 0; -+ return (0); - } -@@ -1147,3 +1173,3 @@ zvol_probe(dev_t dev, int *part, void *arg) - -- return kobj; -+ return (kobj); - } -@@ -1152,10 +1178,10 @@ zvol_probe(dev_t dev, int *part, void *arg) - static struct block_device_operations zvol_ops = { -- .open = zvol_open, -- .release = zvol_release, -- .ioctl = zvol_ioctl, -- .compat_ioctl = zvol_compat_ioctl, -- .media_changed = zvol_media_changed, -- .revalidate_disk = zvol_revalidate_disk, -- .getgeo = zvol_getgeo, -- .owner = THIS_MODULE, -+ .open = zvol_open, -+ .release = zvol_release, -+ .ioctl = zvol_ioctl, -+ .compat_ioctl = zvol_compat_ioctl, -+ .media_changed = zvol_media_changed, -+ .revalidate_disk = zvol_revalidate_disk, -+ .getgeo = zvol_getgeo, -+ .owner = THIS_MODULE, - }; -@@ -1167,3 +1193,3 @@ zvol_open_by_inode(struct inode *inode, struct file *file) - { -- return zvol_open(inode->i_bdev, file->f_mode); -+ return (zvol_open(inode->i_bdev, file->f_mode)); - } -@@ -1173,3 +1199,3 @@ zvol_release_by_inode(struct inode *inode, struct file *file) - { -- return zvol_release(inode->i_bdev->bd_disk, file->f_mode); -+ return (zvol_release(inode->i_bdev->bd_disk, file->f_mode)); - } -@@ -1178,32 +1204,34 @@ static int - zvol_ioctl_by_inode(struct inode *inode, struct file *file, -- unsigned int cmd, unsigned long arg) -+ unsigned int cmd, unsigned long arg) - { - if (file == NULL || inode == NULL) -- return -EINVAL; -- return zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg); -+ return (SET_ERROR(-EINVAL)); -+ -+ return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg)); - } - --# ifdef CONFIG_COMPAT -+#ifdef CONFIG_COMPAT - static long - zvol_compat_ioctl_by_inode(struct file *file, -- unsigned int cmd, unsigned long arg) -+ unsigned int cmd, unsigned long arg) - { - if (file == NULL) -- return -EINVAL; -- return zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev, -- file->f_mode, cmd, arg); -+ return (SET_ERROR(-EINVAL)); -+ -+ return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev, -+ file->f_mode, cmd, arg)); - } --# else --# define zvol_compat_ioctl_by_inode NULL --# endif -+#else -+#define zvol_compat_ioctl_by_inode NULL -+#endif - - static struct block_device_operations zvol_ops = { -- .open = zvol_open_by_inode, -- .release = zvol_release_by_inode, -- .ioctl = zvol_ioctl_by_inode, -- .compat_ioctl = zvol_compat_ioctl_by_inode, -- .media_changed = zvol_media_changed, -- .revalidate_disk = zvol_revalidate_disk, -- .getgeo = zvol_getgeo, -- .owner = THIS_MODULE, -+ .open = zvol_open_by_inode, -+ .release = zvol_release_by_inode, -+ .ioctl = zvol_ioctl_by_inode, -+ .compat_ioctl = zvol_compat_ioctl_by_inode, -+ .media_changed = zvol_media_changed, -+ 
.revalidate_disk = zvol_revalidate_disk, -+ .getgeo = zvol_getgeo, -+ .owner = THIS_MODULE, - }; -@@ -1221,3 +1249,3 @@ zvol_alloc(dev_t dev, const char *name) - -- zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); -+ zv = kmem_zalloc(sizeof (zvol_state_t), KM_PUSHPAGE); - -@@ -1267,3 +1295,3 @@ zvol_alloc(dev_t dev, const char *name) - -- return zv; -+ return (zv); - -@@ -1274,3 +1302,3 @@ out_kmem: - -- return NULL; -+ return (NULL); - } -@@ -1296,18 +1324,20 @@ __zvol_snapdev_hidden(const char *name) - { -- uint64_t snapdev; -- char *parent; -- char *atp; -- int error = 0; -- -- parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); -- (void) strlcpy(parent, name, MAXPATHLEN); -- -- if ((atp = strrchr(parent, '@')) != NULL) { -- *atp = '\0'; -- error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL); -- if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN)) -- error = ENODEV; -- } -- kmem_free(parent, MAXPATHLEN); -- return (error); -+ uint64_t snapdev; -+ char *parent; -+ char *atp; -+ int error = 0; -+ -+ parent = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); -+ (void) strlcpy(parent, name, MAXPATHLEN); -+ -+ if ((atp = strrchr(parent, '@')) != NULL) { -+ *atp = '\0'; -+ error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL); -+ if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN)) -+ error = SET_ERROR(ENODEV); -+ } -+ -+ kmem_free(parent, MAXPATHLEN); -+ -+ return (SET_ERROR(error)); - } -@@ -1328,3 +1358,3 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) - if (zv) { -- error = EEXIST; -+ error = SET_ERROR(EEXIST); - goto out; -@@ -1338,3 +1368,3 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) - -- doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); -+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_PUSHPAGE); - -@@ -1358,3 +1388,3 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) - if (zv == NULL) { -- error = EAGAIN; -+ error = SET_ERROR(EAGAIN); - goto out_dmu_objset_disown; -@@ -1397,3 +1427,3 @@ out_dmu_objset_disown: - out_doi: -- kmem_free(doi, sizeof(dmu_object_info_t)); -+ kmem_free(doi, sizeof (dmu_object_info_t)); - out: -@@ -1405,3 +1435,3 @@ out: - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1422,3 +1452,3 @@ zvol_create_minor(const char *name) - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1434,6 +1464,6 @@ __zvol_remove_minor(const char *name) - if (zv == NULL) -- return (ENXIO); -+ return (SET_ERROR(ENXIO)); - - if (zv->zv_open_count > 0) -- return (EBUSY); -+ return (SET_ERROR(EBUSY)); - -@@ -1457,3 +1487,27 @@ zvol_remove_minor(const char *name) - -- return (error); -+ return (SET_ERROR(error)); -+} -+ -+/* -+ * Rename a block device minor mode for the specified volume. -+ */ -+static void -+__zvol_rename_minor(zvol_state_t *zv, const char *newname) -+{ -+ int readonly = get_disk_ro(zv->zv_disk); -+ -+ ASSERT(MUTEX_HELD(&zvol_state_lock)); -+ -+ strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); -+ -+ /* -+ * The block device's read-only state is briefly changed causing -+ * a KOBJ_CHANGE uevent to be issued. This ensures udev detects -+ * the name change and fixes the symlinks. This does not change -+ * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never -+ * changes. This would normally be done using kobject_uevent() but -+ * that is a GPL-only symbol which is why we need this workaround. 
-+ */ -+ set_disk_ro(zv->zv_disk, !readonly); -+ set_disk_ro(zv->zv_disk, readonly); - } -@@ -1461,9 +1515,6 @@ zvol_remove_minor(const char *name) - static int --zvol_create_minors_cb(spa_t *spa, uint64_t dsobj, -- const char *dsname, void *arg) -+zvol_create_minors_cb(const char *dsname, void *arg) - { -- if (strchr(dsname, '/') == NULL) -- return 0; -+ (void) zvol_create_minor(dsname); - -- (void) __zvol_create_minor(dsname, B_FALSE); - return (0); -@@ -1472,32 +1523,42 @@ zvol_create_minors_cb(spa_t *spa, uint64_t dsobj, - /* -- * Create minors for specified pool, if pool is NULL create minors -- * for all available pools. -+ * Create minors for specified dataset including children and snapshots. - */ - int --zvol_create_minors(const char *pool) -+zvol_create_minors(const char *name) - { -- spa_t *spa = NULL; - int error = 0; - -+ if (!zvol_inhibit_dev) -+ error = dmu_objset_find((char *)name, zvol_create_minors_cb, -+ NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -+ -+ return (SET_ERROR(error)); -+} -+ -+/* -+ * Remove minors for specified dataset including children and snapshots. -+ */ -+void -+zvol_remove_minors(const char *name) -+{ -+ zvol_state_t *zv, *zv_next; -+ int namelen = ((name) ? strlen(name) : 0); -+ - if (zvol_inhibit_dev) -- return (0); -+ return; - - mutex_enter(&zvol_state_lock); -- if (pool) { -- error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb, -- NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -- } else { -- mutex_enter(&spa_namespace_lock); -- while ((spa = spa_next(spa)) != NULL) { -- error = dmu_objset_find_spa(NULL, -- spa_name(spa), zvol_create_minors_cb, NULL, -- DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); -- if (error) -- break; -+ -+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { -+ zv_next = list_next(&zvol_state_list, zv); -+ -+ if (name == NULL || strcmp(zv->zv_name, name) == 0 || -+ (strncmp(zv->zv_name, name, namelen) == 0 && -+ zv->zv_name[namelen] == '/')) { -+ zvol_remove(zv); -+ zvol_free(zv); - } -- mutex_exit(&spa_namespace_lock); - } -- mutex_exit(&zvol_state_lock); - -- return error; -+ mutex_exit(&zvol_state_lock); - } -@@ -1505,9 +1566,10 @@ zvol_create_minors(const char *pool) - /* -- * Remove minors for specified pool, if pool is NULL remove all minors. -+ * Rename minors for specified dataset including children and snapshots. 
- */ - void --zvol_remove_minors(const char *pool) -+zvol_rename_minors(const char *oldname, const char *newname) - { - zvol_state_t *zv, *zv_next; -- char *str; -+ int oldnamelen, newnamelen; -+ char *name; - -@@ -1516,9 +1578,8 @@ zvol_remove_minors(const char *pool) - -- str = kmem_zalloc(MAXNAMELEN, KM_SLEEP); -- if (pool) { -- (void) strncpy(str, pool, strlen(pool)); -- (void) strcat(str, "/"); -- } -+ oldnamelen = strlen(oldname); -+ newnamelen = strlen(newname); -+ name = kmem_alloc(MAXNAMELEN, KM_PUSHPAGE); - - mutex_enter(&zvol_state_lock); -+ - for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { -@@ -1526,9 +1587,17 @@ zvol_remove_minors(const char *pool) - -- if (pool == NULL || !strncmp(str, zv->zv_name, strlen(str))) { -- zvol_remove(zv); -- zvol_free(zv); -+ if (strcmp(zv->zv_name, oldname) == 0) { -+ __zvol_rename_minor(zv, newname); -+ } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && -+ (zv->zv_name[oldnamelen] == '/' || -+ zv->zv_name[oldnamelen] == '@')) { -+ snprintf(name, MAXNAMELEN, "%s%c%s", newname, -+ zv->zv_name[oldnamelen], -+ zv->zv_name + oldnamelen + 1); -+ __zvol_rename_minor(zv, name); - } - } -+ - mutex_exit(&zvol_state_lock); -- kmem_free(str, MAXNAMELEN); -+ -+ kmem_free(name, MAXNAMELEN); - } -@@ -1540,3 +1609,3 @@ snapdev_snapshot_changed_cb(const char *dsname, void *arg) { - if (strchr(dsname, '@') == NULL) -- return 0; -+ return (0); - -@@ -1552,3 +1621,4 @@ snapdev_snapshot_changed_cb(const char *dsname, void *arg) { - } -- return 0; -+ -+ return (0); - } -@@ -1563,3 +1633,2 @@ zvol_set_snapdev(const char *dsname, uint64_t snapdev) { - -- - int -@@ -1570,3 +1639,4 @@ zvol_init(void) - list_create(&zvol_state_list, sizeof (zvol_state_t), -- offsetof(zvol_state_t, zv_next)); -+ offsetof(zvol_state_t, zv_next)); -+ - mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); -@@ -1574,3 +1644,3 @@ zvol_init(void) - zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri, -- zvol_threads, INT_MAX, TASKQ_PREPOPULATE); -+ zvol_threads, INT_MAX, TASKQ_PREPOPULATE); - if (zvol_taskq == NULL) { -@@ -1588,3 +1658,3 @@ zvol_init(void) - blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, -- THIS_MODULE, zvol_probe, NULL, NULL); -+ THIS_MODULE, zvol_probe, NULL, NULL); - -@@ -1598,3 +1668,3 @@ out1: - -- return (error); -+ return (SET_ERROR(error)); - } -@@ -1622,2 +1692,2 @@ MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device"); - module_param(zvol_max_discard_blocks, ulong, 0444); --MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard at once"); -+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); -diff --git a/module/zpios/pios.c b/module/zpios/pios.c -index 53cc77b..f0bad6c 100644 ---- a/module/zpios/pios.c -+++ b/module/zpios/pios.c -@@ -1,2 +1,2 @@ --/*****************************************************************************\ -+/* - * ZPIOS is a heavily modified version of the original PIOS test code. -@@ -31,3 +31,3 @@ - * with ZPIOS. If not, see . 
--\*****************************************************************************/ -+ */ - -@@ -36,2 +36,3 @@ - #include -+#include - #include -@@ -44,15 +45,16 @@ static char *zpios_tag = "zpios_tag"; - --static --int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) -+static int -+zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) - { -- /* This is stack heavy but it should be OK since we are only -+ /* -+ * This is stack heavy but it should be OK since we are only - * making the upcall between tests when the stack is shallow. - */ -- char id[16], chunk_size[16], region_size[16], thread_count[16]; -+ char id[16], chunk_size[16], region_size[16], thread_count[16]; - char region_count[16], offset[16], region_noise[16], chunk_noise[16]; -- char thread_delay[16], flags[16], result[8]; -- char *argv[16], *envp[4]; -+ char thread_delay[16], flags[16], result[8]; -+ char *argv[16], *envp[4]; - - if ((path == NULL) || (strlen(path) == 0)) -- return -ENOENT; -+ return (-ENOENT); - -@@ -60,3 +62,3 @@ int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) - snprintf(chunk_size, 15, "%lu", (long unsigned)run_args->chunk_size); -- snprintf(region_size, 15, "%lu",(long unsigned) run_args->region_size); -+ snprintf(region_size, 15, "%lu", (long unsigned) run_args->region_size); - snprintf(thread_count, 15, "%u", run_args->thread_count); -@@ -71,3 +73,3 @@ int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) - /* Passing 15 args to registered pre/post upcall */ -- argv[0] = path; -+ argv[0] = path; - argv[1] = phase; -@@ -89,8 +91,37 @@ int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) - /* Passing environment for user space upcall */ -- envp[0] = "HOME=/"; -- envp[1] = "TERM=linux"; -- envp[2] = "PATH=/sbin:/usr/sbin:/bin:/usr/bin"; -- envp[3] = NULL; -+ envp[0] = "HOME=/"; -+ envp[1] = "TERM=linux"; -+ envp[2] = "PATH=/sbin:/usr/sbin:/bin:/usr/bin"; -+ envp[3] = NULL; - -- return call_usermodehelper(path, argv, envp, UMH_WAIT_PROC); -+ return (call_usermodehelper(path, argv, envp, UMH_WAIT_PROC)); -+} -+ -+static int -+zpios_print(struct file *file, const char *format, ...) 
-+{ -+ zpios_info_t *info = (zpios_info_t *)file->private_data; -+ va_list adx; -+ int rc; -+ -+ ASSERT(info); -+ ASSERT(info->info_buffer); -+ -+ va_start(adx, format); -+ spin_lock(&info->info_lock); -+ -+ /* Don't allow the kernel to start a write in the red zone */ -+ if ((int)(info->info_head - info->info_buffer) > -+ (info->info_size - ZPIOS_INFO_BUFFER_REDZONE)) { -+ rc = -EOVERFLOW; -+ } else { -+ rc = vsprintf(info->info_head, format, adx); -+ if (rc >= 0) -+ info->info_head += rc; -+ } -+ -+ spin_unlock(&info->info_lock); -+ va_end(adx); -+ -+ return (rc); - } -@@ -101,3 +132,3 @@ zpios_dmu_object_create(run_args_t *run_args, objset_t *os) - struct dmu_tx *tx; -- uint64_t obj = 0ULL; -+ uint64_t obj = 0ULL; - int rc; -@@ -109,9 +140,8 @@ zpios_dmu_object_create(run_args_t *run_args, objset_t *os) - zpios_print(run_args->file, -- "dmu_tx_assign() failed: %d\n", rc); -+ "dmu_tx_assign() failed: %d\n", rc); - dmu_tx_abort(tx); -- return obj; -+ return (obj); - } - -- obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, -- DMU_OT_NONE, 0, tx); -+ obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, DMU_OT_NONE, 0, tx); - rc = dmu_object_set_blocksize(os, obj, 128ULL << 10, 0, tx); -@@ -120,4 +150,4 @@ zpios_dmu_object_create(run_args_t *run_args, objset_t *os) - "dmu_object_set_blocksize() failed: %d\n", rc); -- dmu_tx_abort(tx); -- return obj; -+ dmu_tx_abort(tx); -+ return (obj); - } -@@ -126,3 +156,3 @@ zpios_dmu_object_create(run_args_t *run_args, objset_t *os) - -- return obj; -+ return (obj); - } -@@ -136,3 +166,3 @@ zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) - tx = dmu_tx_create(os); -- dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); -+ dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); - rc = dmu_tx_assign(tx, TXG_WAIT); -@@ -142,3 +172,3 @@ zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) - dmu_tx_abort(tx); -- return rc; -+ return (rc); - } -@@ -149,4 +179,4 @@ zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) - "dmu_object_free() failed: %d\n", rc); -- dmu_tx_abort(tx); -- return rc; -+ dmu_tx_abort(tx); -+ return (rc); - } -@@ -155,3 +185,3 @@ zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) - -- return 0; -+ return (0); - } -@@ -167,6 +197,6 @@ zpios_dmu_setup(run_args_t *run_args) - -- (void)zpios_upcall(run_args->pre, PHASE_PRE_CREATE, run_args, 0); -+ (void) zpios_upcall(run_args->pre, PHASE_PRE_CREATE, run_args, 0); - t->start = zpios_timespec_now(); - -- (void)snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); -+ (void) snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); - rc = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL); -@@ -178,4 +208,4 @@ zpios_dmu_setup(run_args_t *run_args) - -- rc = dmu_objset_own(name, DMU_OST_OTHER, 0, zpios_tag, &os); -- if (rc) { -+ rc = dmu_objset_own(name, DMU_OST_OTHER, 0, zpios_tag, &os); -+ if (rc) { - zpios_print(run_args->file, "Error dmu_objset_own(%s, ...) 
" -@@ -183,3 +213,3 @@ zpios_dmu_setup(run_args_t *run_args) - goto out_destroy; -- } -+ } - -@@ -199,3 +229,3 @@ zpios_dmu_setup(run_args_t *run_args) - region = &run_args->regions[i]; -- mutex_init(®ion->lock, NULL, MUTEX_DEFAULT, NULL); -+ mutex_init(®ion->lock, NULL, MUTEX_DEFAULT, NULL); - -@@ -210,3 +240,3 @@ zpios_dmu_setup(run_args_t *run_args) - region->max_offset = run_args->offset + -- run_args->region_size; -+ run_args->region_size; - } else { -@@ -219,3 +249,3 @@ zpios_dmu_setup(run_args_t *run_args) - region->max_offset = run_args->offset * -- i + run_args->region_size; -+ i + run_args->region_size; - } -@@ -226,5 +256,5 @@ out_destroy: - if (rc) { -- rc2 = dmu_objset_destroy(name, B_FALSE); -+ rc2 = dsl_destroy_head(name); - if (rc2) -- zpios_print(run_args->file, "Error dmu_objset_destroy" -+ zpios_print(run_args->file, "Error dsl_destroy_head" - "(%s, ...) failed: %d\n", name, rc2); -@@ -234,5 +264,5 @@ out: - t->delta = zpios_timespec_sub(t->stop, t->start); -- (void)zpios_upcall(run_args->post, PHASE_POST_CREATE, run_args, rc); -+ (void) zpios_upcall(run_args->post, PHASE_POST_CREATE, run_args, rc); - -- return rc; -+ return (rc); - } -@@ -245,3 +275,3 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) - -- size = sizeof(*ra) + kcmd->cmd_region_count * sizeof(zpios_region_t); -+ size = sizeof (*ra) + kcmd->cmd_region_count * sizeof (zpios_region_t); - -@@ -251,3 +281,3 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) - "for regions\n", size); -- return -ENOMEM; -+ return (-ENOMEM); - } -@@ -259,22 +289,22 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) - strncpy(ra->log, kcmd->cmd_log, ZPIOS_PATH_SIZE - 1); -- ra->id = kcmd->cmd_id; -- ra->chunk_size = kcmd->cmd_chunk_size; -- ra->thread_count = kcmd->cmd_thread_count; -- ra->region_count = kcmd->cmd_region_count; -- ra->region_size = kcmd->cmd_region_size; -- ra->offset = kcmd->cmd_offset; -- ra->region_noise = kcmd->cmd_region_noise; -- ra->chunk_noise = kcmd->cmd_chunk_noise; -- ra->thread_delay = kcmd->cmd_thread_delay; -- ra->flags = kcmd->cmd_flags; -- ra->stats.wr_data = 0; -- ra->stats.wr_chunks = 0; -- ra->stats.rd_data = 0; -- ra->stats.rd_chunks = 0; -- ra->region_next = 0; -- ra->file = file; -- mutex_init(&ra->lock_work, NULL, MUTEX_DEFAULT, NULL); -- mutex_init(&ra->lock_ctl, NULL, MUTEX_DEFAULT, NULL); -- -- (void)zpios_upcall(ra->pre, PHASE_PRE_RUN, ra, 0); -+ ra->id = kcmd->cmd_id; -+ ra->chunk_size = kcmd->cmd_chunk_size; -+ ra->thread_count = kcmd->cmd_thread_count; -+ ra->region_count = kcmd->cmd_region_count; -+ ra->region_size = kcmd->cmd_region_size; -+ ra->offset = kcmd->cmd_offset; -+ ra->region_noise = kcmd->cmd_region_noise; -+ ra->chunk_noise = kcmd->cmd_chunk_noise; -+ ra->thread_delay = kcmd->cmd_thread_delay; -+ ra->flags = kcmd->cmd_flags; -+ ra->stats.wr_data = 0; -+ ra->stats.wr_chunks = 0; -+ ra->stats.rd_data = 0; -+ ra->stats.rd_chunks = 0; -+ ra->region_next = 0; -+ ra->file = file; -+ mutex_init(&ra->lock_work, NULL, MUTEX_DEFAULT, NULL); -+ mutex_init(&ra->lock_ctl, NULL, MUTEX_DEFAULT, NULL); -+ -+ (void) zpios_upcall(ra->pre, PHASE_PRE_RUN, ra, 0); - -@@ -282,4 +312,4 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) - if (rc) { -- mutex_destroy(&ra->lock_ctl); -- mutex_destroy(&ra->lock_work); -+ mutex_destroy(&ra->lock_ctl); -+ mutex_destroy(&ra->lock_work); - vmem_free(ra, size); -@@ -288,3 +318,3 @@ zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, 
struct file *file) - -- return rc; -+ return (rc); - } -@@ -298,3 +328,3 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - -- get_random_bytes(&random_int, sizeof(unsigned int)); -+ get_random_bytes(&random_int, sizeof (unsigned int)); - -@@ -303,3 +333,4 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - -- /* XXX: I don't much care for this chunk selection mechansim -+ /* -+ * XXX: I don't much care for this chunk selection mechansim - * there's the potential to burn a lot of time here doing nothing -@@ -341,4 +372,5 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - if (run_args->region_noise) { -- get_random_bytes(&random_int, sizeof(unsigned int)); -- run_args->region_next += random_int % run_args->region_noise; -+ get_random_bytes(&random_int, sizeof (unsigned int)); -+ run_args->region_next += -+ random_int % run_args->region_noise; - } else { -@@ -348,3 +380,3 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - mutex_exit(&run_args->lock_work); -- return 1; -+ return (1); - } -@@ -354,3 +386,3 @@ zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - -- return 0; -+ return (0); - } -@@ -365,6 +397,6 @@ zpios_remove_objset(run_args_t *run_args) - -- (void)zpios_upcall(run_args->pre, PHASE_PRE_REMOVE, run_args, 0); -+ (void) zpios_upcall(run_args->pre, PHASE_PRE_REMOVE, run_args, 0); - t->start = zpios_timespec_now(); - -- (void)snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); -+ (void) snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); - -@@ -375,8 +407,7 @@ zpios_remove_objset(run_args_t *run_args) - rc = zpios_dmu_object_free(run_args, -- region->obj.os, -- region->obj.obj); -+ region->obj.os, region->obj.obj); - if (rc) -- zpios_print(run_args->file, "Error " -- "removing object %d, %d\n", -- (int)region->obj.obj, rc); -+ zpios_print(run_args->file, -+ "Error removing object %d, %d\n", -+ (int)region->obj.obj, rc); - } -@@ -385,8 +416,7 @@ zpios_remove_objset(run_args_t *run_args) - rc = zpios_dmu_object_free(run_args, -- region->obj.os, -- region->obj.obj); -+ region->obj.os, region->obj.obj); - if (rc) -- zpios_print(run_args->file, "Error " -- "removing object %d, %d\n", -- (int)region->obj.obj, rc); -+ zpios_print(run_args->file, -+ "Error removing object %d, %d\n", -+ (int)region->obj.obj, rc); - } -@@ -397,6 +427,6 @@ zpios_remove_objset(run_args_t *run_args) - if (run_args->flags & DMU_REMOVE) { -- rc = dmu_objset_destroy(name, B_FALSE); -+ rc = dsl_destroy_head(name); - if (rc) -- zpios_print(run_args->file, "Error dmu_objset_destroy" -- "(%s, ...) failed: %d\n", name, rc); -+ zpios_print(run_args->file, "Error dsl_destroy_head" -+ "(%s, ...) 
failed: %d\n", name, rc); - } -@@ -405,3 +435,3 @@ zpios_remove_objset(run_args_t *run_args) - t->delta = zpios_timespec_sub(t->stop, t->start); -- (void)zpios_upcall(run_args->post, PHASE_POST_REMOVE, run_args, rc); -+ (void) zpios_upcall(run_args->post, PHASE_POST_REMOVE, run_args, rc); - } -@@ -421,3 +451,3 @@ zpios_cleanup_run(run_args_t *run_args) - kmem_free(run_args->threads[i], -- sizeof(thread_data_t)); -+ sizeof (thread_data_t)); - } -@@ -426,3 +456,3 @@ zpios_cleanup_run(run_args_t *run_args) - kmem_free(run_args->threads, -- sizeof(thread_data_t *) * run_args->thread_count); -+ sizeof (thread_data_t *) * run_args->thread_count); - } -@@ -434,5 +464,5 @@ zpios_cleanup_run(run_args_t *run_args) - mutex_destroy(&run_args->lock_ctl); -- size = run_args->region_count * sizeof(zpios_region_t); -+ size = run_args->region_count * sizeof (zpios_region_t); - -- vmem_free(run_args, sizeof(*run_args) + size); -+ vmem_free(run_args, sizeof (*run_args) + size); - } -@@ -464,3 +494,3 @@ zpios_dmu_write(run_args_t *run_args, objset_t *os, uint64_t object, - dmu_tx_abort(tx); -- return rc; -+ return (rc); - } -@@ -475,3 +505,3 @@ zpios_dmu_write(run_args_t *run_args, objset_t *os, uint64_t object, - -- return 0; -+ return (0); - } -@@ -480,3 +510,3 @@ static int - zpios_dmu_read(run_args_t *run_args, objset_t *os, uint64_t object, -- uint64_t offset, uint64_t size, void *buf) -+ uint64_t offset, uint64_t size, void *buf) - { -@@ -490,3 +520,3 @@ zpios_dmu_read(run_args_t *run_args, objset_t *os, uint64_t object, - -- return dmu_read(os, object, offset, size, buf, flags); -+ return (dmu_read(os, object, offset, size, buf, flags)); - } -@@ -512,3 +542,3 @@ zpios_thread_main(void *data) - if (chunk_noise) { -- get_random_bytes(&random_int, sizeof(unsigned int)); -+ get_random_bytes(&random_int, sizeof (unsigned int)); - chunk_noise_tmp = (random_int % (chunk_noise * 2))-chunk_noise; -@@ -516,3 +546,4 @@ zpios_thread_main(void *data) - -- /* It's OK to vmem_alloc() this memory because it will be copied -+ /* -+ * It's OK to vmem_alloc() this memory because it will be copied - * in to the slab and pointers to the slab copy will be setup in -@@ -536,5 +567,5 @@ zpios_thread_main(void *data) - while (zpios_get_work_item(run_args, &obj, &offset, -- &chunk_size, &region, DMU_WRITE)) { -+ &chunk_size, &region, DMU_WRITE)) { - if (thread_delay) { -- get_random_bytes(&random_int, sizeof(unsigned int)); -+ get_random_bytes(&random_int, sizeof (unsigned int)); - thread_delay_tmp = random_int % thread_delay; -@@ -546,3 +577,3 @@ zpios_thread_main(void *data) - rc = zpios_dmu_write(run_args, obj.os, obj.obj, -- offset, chunk_size, buf); -+ offset, chunk_size, buf); - t.stop = zpios_timespec_now(); -@@ -560,3 +591,3 @@ zpios_thread_main(void *data) - thr->stats.wr_time.delta = zpios_timespec_add( -- thr->stats.wr_time.delta, t.delta); -+ thr->stats.wr_time.delta, t.delta); - mutex_exit(&thr->lock); -@@ -567,3 +598,3 @@ zpios_thread_main(void *data) - region->stats.wr_time.delta = zpios_timespec_add( -- region->stats.wr_time.delta, t.delta); -+ region->stats.wr_time.delta, t.delta); - -@@ -602,5 +633,5 @@ zpios_thread_main(void *data) - while (zpios_get_work_item(run_args, &obj, &offset, -- &chunk_size, &region, DMU_READ)) { -+ &chunk_size, &region, DMU_READ)) { - if (thread_delay) { -- get_random_bytes(&random_int, sizeof(unsigned int)); -+ get_random_bytes(&random_int, sizeof (unsigned int)); - thread_delay_tmp = random_int % thread_delay; -@@ -630,5 +661,5 @@ zpios_thread_main(void *data) - zpios_print(run_args->file, --
"IO verify error: %d/%d/%d\n", -- (int)obj.obj, (int)offset, -- (int)chunk_size); -+ "IO verify error: %d/%d/%d\n", -+ (int)obj.obj, (int)offset, -+ (int)chunk_size); - break; -@@ -642,3 +673,3 @@ zpios_thread_main(void *data) - thr->stats.rd_time.delta = zpios_timespec_add( -- thr->stats.rd_time.delta, t.delta); -+ thr->stats.rd_time.delta, t.delta); - mutex_exit(&thr->lock); -@@ -649,3 +680,3 @@ zpios_thread_main(void *data) - region->stats.rd_time.delta = zpios_timespec_add( -- region->stats.rd_time.delta, t.delta); -+ region->stats.rd_time.delta, t.delta); - -@@ -672,3 +703,3 @@ out: - -- return rc; /* Unreachable, due to do_exit() */ -+ return (rc); /* Unreachable, due to do_exit() */ - } -@@ -692,3 +723,3 @@ zpios_threads_run(run_args_t *run_args) - -- tsks = kmem_zalloc(sizeof(struct task_struct *) * tc, KM_SLEEP); -+ tsks = kmem_zalloc(sizeof (struct task_struct *) * tc, KM_SLEEP); - if (tsks == NULL) { -@@ -698,3 +729,3 @@ zpios_threads_run(run_args_t *run_args) - -- run_args->threads = kmem_zalloc(sizeof(thread_data_t *) * tc, KM_SLEEP); -+ run_args->threads = kmem_zalloc(sizeof (thread_data_t *)*tc, KM_SLEEP); - if (run_args->threads == NULL) { -@@ -709,3 +740,3 @@ zpios_threads_run(run_args_t *run_args) - for (i = 0; i < tc; i++) { -- thr = kmem_zalloc(sizeof(thread_data_t), KM_SLEEP); -+ thr = kmem_zalloc(sizeof (thread_data_t), KM_SLEEP); - if (thr == NULL) { -@@ -722,3 +753,3 @@ zpios_threads_run(run_args_t *run_args) - tsk = kthread_create(zpios_thread_main, (void *)thr, -- "%s/%d", "zpios_io", i); -+ "%s/%d", "zpios_io", i); - if (IS_ERR(tsk)) { -@@ -734,3 +765,3 @@ zpios_threads_run(run_args_t *run_args) - /* Wake up all threads for write phase */ -- (void)zpios_upcall(run_args->pre, PHASE_PRE_WRITE, run_args, 0); -+ (void) zpios_upcall(run_args->pre, PHASE_PRE_WRITE, run_args, 0); - for (i = 0; i < tc; i++) -@@ -742,3 +773,3 @@ zpios_threads_run(run_args_t *run_args) - tw->stop = zpios_timespec_now(); -- (void)zpios_upcall(run_args->post, PHASE_POST_WRITE, run_args, rc); -+ (void) zpios_upcall(run_args->post, PHASE_POST_WRITE, run_args, rc); - -@@ -775,4 +806,4 @@ zpios_threads_run(run_args_t *run_args) - /* Wake up all threads for read phase */ -- (void)zpios_upcall(run_args->pre, PHASE_PRE_READ, run_args, 0); -- for (i = 0; i < tc; i++) -+ (void) zpios_upcall(run_args->pre, PHASE_PRE_READ, run_args, 0); -+ for (i = 0; i < tc; i++) - wake_up_process(tsks[i]); -@@ -783,3 +814,3 @@ zpios_threads_run(run_args_t *run_args) - tr->stop = zpios_timespec_now(); -- (void)zpios_upcall(run_args->post, PHASE_POST_READ, run_args, rc); -+ (void) zpios_upcall(run_args->post, PHASE_POST_READ, run_args, rc); - -@@ -804,6 +835,6 @@ out: - cleanup: -- kmem_free(tsks, sizeof(struct task_struct *) * tc); -+ kmem_free(tsks, sizeof (struct task_struct *) * tc); - cleanup2: - /* Returns first encountered thread error (if any) */ -- return rc; -+ return (rc); - -@@ -820,3 +851,3 @@ static int - zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, -- int data_size, void *data) -+ int data_size, void *data) - { -@@ -829,4 +860,4 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - zpios_print(file, "Invalid chunk_size, region_size, " -- "thread_count, or region_count, %d\n", -EINVAL); -- return -EINVAL; -+ "thread_count, or region_count, %d\n", -EINVAL); -+ return (-EINVAL); - } -@@ -836,4 +867,4 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - zpios_print(file, "Invalid flags, minimally DMU_WRITE " -- "and DMU_READ must be set, %d\n", -EINVAL); -- return -EINVAL; -+ "and 
DMU_READ must be set, %d\n", -EINVAL); -+ return (-EINVAL); - } -@@ -843,8 +874,9 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - zpios_print(file, "Invalid flags, DMU_*_ZC incompatible " -- "with DMU_VERIFY, used for performance analysis " -- "only, %d\n", -EINVAL); -- return -EINVAL; -+ "with DMU_VERIFY, used for performance analysis " -+ "only, %d\n", -EINVAL); -+ return (-EINVAL); - } - -- /* Opaque data on return contains structs of the following form: -+ /* -+ * Opaque data on return contains structs of the following form: - * -@@ -857,9 +889,9 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - */ -- size = (sizeof(zpios_stats_t) + -- (kcmd->cmd_thread_count * sizeof(zpios_stats_t)) + -- (kcmd->cmd_region_count * sizeof(zpios_stats_t))); -+ size = (sizeof (zpios_stats_t) + -+ (kcmd->cmd_thread_count * sizeof (zpios_stats_t)) + -+ (kcmd->cmd_region_count * sizeof (zpios_stats_t))); - if (data_size < size) { - zpios_print(file, "Invalid size, command data buffer " -- "size too small, (%d < %d)\n", data_size, size); -- return -ENOSPC; -+ "size too small, (%d < %d)\n", data_size, size); -+ return (-ENOSPC); - } -@@ -868,5 +900,5 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - if (rc) -- return rc; -+ return (rc); - -- rc = zpios_threads_run(run_args); -+ rc = zpios_threads_run(run_args); - zpios_remove_objset(run_args); -@@ -888,7 +920,7 @@ zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd, - cleanup: -- zpios_cleanup_run(run_args); -+ zpios_cleanup_run(run_args); - -- (void)zpios_upcall(kcmd->cmd_post, PHASE_POST_RUN, run_args, 0); -+ (void) zpios_upcall(kcmd->cmd_post, PHASE_POST_RUN, run_args, 0); - -- return rc; -+ return (rc); - } -@@ -902,7 +934,7 @@ zpios_open(struct inode *inode, struct file *file) - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -- info = (zpios_info_t *)kmem_alloc(sizeof(*info), KM_SLEEP); -+ info = (zpios_info_t *)kmem_alloc(sizeof (*info), KM_SLEEP); - if (info == NULL) -- return -ENOMEM; -+ return (-ENOMEM); - -@@ -910,6 +942,7 @@ zpios_open(struct inode *inode, struct file *file) - info->info_size = ZPIOS_INFO_BUFFER_SIZE; -- info->info_buffer = (char *)vmem_alloc(ZPIOS_INFO_BUFFER_SIZE,KM_SLEEP); -+ info->info_buffer = -+ (char *) vmem_alloc(ZPIOS_INFO_BUFFER_SIZE, KM_SLEEP); - if (info->info_buffer == NULL) { -- kmem_free(info, sizeof(*info)); -- return -ENOMEM; -+ kmem_free(info, sizeof (*info)); -+ return (-ENOMEM); - } -@@ -919,3 +952,3 @@ zpios_open(struct inode *inode, struct file *file) - -- return 0; -+ return (0); - } -@@ -929,3 +962,3 @@ zpios_release(struct inode *inode, struct file *file) - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -935,5 +968,5 @@ zpios_release(struct inode *inode, struct file *file) - vmem_free(info->info_buffer, ZPIOS_INFO_BUFFER_SIZE); -- kmem_free(info, sizeof(*info)); -+ kmem_free(info, sizeof (*info)); - -- return 0; -+ return (0); - } -@@ -953,3 +986,3 @@ zpios_buffer_clear(struct file *file, zpios_cfg_t *kcfg, unsigned long arg) - -- return 0; -+ return (0); - } -@@ -988,3 +1021,4 @@ zpios_buffer_size(struct file *file, zpios_cfg_t *kcfg, unsigned long arg) - -- if (copy_to_user((struct zpios_cfg_t __user *)arg, kcfg, sizeof(*kcfg))) -+ if (copy_to_user((struct zpios_cfg_t __user *)arg, -+ kcfg, sizeof (*kcfg))) - rc = -EFAULT; -@@ -993,3 +1027,3 @@ out: - -- return rc; -+ return (rc); - } -@@ -1002,4 +1036,4 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - -- if (copy_from_user(&kcfg, (zpios_cfg_t *)arg, sizeof(kcfg))) -- 
return -EFAULT; -+ if (copy_from_user(&kcfg, (zpios_cfg_t *)arg, sizeof (kcfg))) -+ return (-EFAULT); - -@@ -1007,4 +1041,4 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - zpios_print(file, "Bad config magic 0x%x != 0x%x\n", -- kcfg.cfg_magic, ZPIOS_CFG_MAGIC); -- return -EINVAL; -+ kcfg.cfg_magic, ZPIOS_CFG_MAGIC); -+ return (-EINVAL); - } -@@ -1013,3 +1047,4 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - case ZPIOS_CFG_BUFFER_CLEAR: -- /* cfg_arg1 - Unused -+ /* -+ * cfg_arg1 - Unused - * cfg_rc1 - Unused -@@ -1019,3 +1054,4 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - case ZPIOS_CFG_BUFFER_SIZE: -- /* cfg_arg1 - 0 - query size; >0 resize -+ /* -+ * cfg_arg1 - 0 - query size; >0 resize - * cfg_rc1 - Set to current buffer size -@@ -1031,3 +1067,3 @@ zpios_ioctl_cfg(struct file *file, unsigned long arg) - -- return rc; -+ return (rc); - } -@@ -1041,10 +1077,10 @@ zpios_ioctl_cmd(struct file *file, unsigned long arg) - -- kcmd = kmem_alloc(sizeof(zpios_cmd_t), KM_SLEEP); -+ kcmd = kmem_alloc(sizeof (zpios_cmd_t), KM_SLEEP); - if (kcmd == NULL) { - zpios_print(file, "Unable to kmem_alloc() %ld byte for " -- "zpios_cmd_t\n", (long int)sizeof(zpios_cmd_t)); -- return -ENOMEM; -+ "zpios_cmd_t\n", (long int)sizeof (zpios_cmd_t)); -+ return (-ENOMEM); - } - -- rc = copy_from_user(kcmd, (zpios_cfg_t *)arg, sizeof(zpios_cmd_t)); -+ rc = copy_from_user(kcmd, (zpios_cfg_t *)arg, sizeof (zpios_cmd_t)); - if (rc) { -@@ -1057,4 +1093,4 @@ zpios_ioctl_cmd(struct file *file, unsigned long arg) - zpios_print(file, "Bad command magic 0x%x != 0x%x\n", -- kcmd->cmd_magic, ZPIOS_CFG_MAGIC); -- rc = -EINVAL; -+ kcmd->cmd_magic, ZPIOS_CFG_MAGIC); -+ rc = (-EINVAL); - goto out_cmd; -@@ -1074,3 +1110,3 @@ zpios_ioctl_cmd(struct file *file, unsigned long arg) - rc = copy_from_user(data, (void *)(arg + offsetof(zpios_cmd_t, -- cmd_data_str)), kcmd->cmd_data_size); -+ cmd_data_str)), kcmd->cmd_data_size); - if (rc) { -@@ -1090,3 +1126,3 @@ zpios_ioctl_cmd(struct file *file, unsigned long arg) - rc = copy_to_user((void *)(arg + offsetof(zpios_cmd_t, -- cmd_data_str)), data, kcmd->cmd_data_size); -+ cmd_data_str)), data, kcmd->cmd_data_size); - if (rc) { -@@ -1101,5 +1137,5 @@ out_data: - out_cmd: -- kmem_free(kcmd, sizeof(zpios_cmd_t)); -+ kmem_free(kcmd, sizeof (zpios_cmd_t)); - -- return rc; -+ return (rc); - } -@@ -1109,3 +1145,3 @@ zpios_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - { -- unsigned int minor = iminor(file->f_dentry->d_inode); -+ unsigned int minor = iminor(file->f_dentry->d_inode); - int rc = 0; -@@ -1114,6 +1150,6 @@ zpios_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - if ((cmd & 0xffffff00) == ((int)'T') << 8) -- return -ENOTTY; -+ return (-ENOTTY); - - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -1132,3 +1168,3 @@ zpios_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - -- return rc; -+ return (rc); - } -@@ -1140,3 +1176,3 @@ zpios_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - { -- return zpios_unlocked_ioctl(file, cmd, arg); -+ return (zpios_unlocked_ioctl(file, cmd, arg)); - } -@@ -1144,3 +1180,4 @@ zpios_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) - --/* I'm not sure why you would want to write in to this buffer from -+/* -+ * I'm not sure why you would want to write in to this buffer from - * user space since its principle use is to pass test status info -@@ -1150,5 +1187,5 @@ static ssize_t - 
zpios_write(struct file *file, const char __user *buf, -- size_t count, loff_t *ppos) -+ size_t count, loff_t *ppos) - { -- unsigned int minor = iminor(file->f_dentry->d_inode); -+ unsigned int minor = iminor(file->f_dentry->d_inode); - zpios_info_t *info = (zpios_info_t *)file->private_data; -@@ -1157,3 +1194,3 @@ zpios_write(struct file *file, const char __user *buf, - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -1183,3 +1220,3 @@ out: - spin_unlock(&info->info_lock); -- return rc; -+ return (rc); - } -@@ -1187,6 +1224,5 @@ out: - static ssize_t --zpios_read(struct file *file, char __user *buf, -- size_t count, loff_t *ppos) -+zpios_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) - { -- unsigned int minor = iminor(file->f_dentry->d_inode); -+ unsigned int minor = iminor(file->f_dentry->d_inode); - zpios_info_t *info = (zpios_info_t *)file->private_data; -@@ -1195,3 +1231,3 @@ zpios_read(struct file *file, char __user *buf, - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -1219,3 +1255,3 @@ out: - spin_unlock(&info->info_lock); -- return rc; -+ return (rc); - } -@@ -1224,3 +1260,3 @@ static loff_t zpios_seek(struct file *file, loff_t offset, int origin) - { -- unsigned int minor = iminor(file->f_dentry->d_inode); -+ unsigned int minor = iminor(file->f_dentry->d_inode); - zpios_info_t *info = (zpios_info_t *)file->private_data; -@@ -1229,3 +1265,3 @@ static loff_t zpios_seek(struct file *file, loff_t offset, int origin) - if (minor >= ZPIOS_MINORS) -- return -ENXIO; -+ return (-ENXIO); - -@@ -1255,3 +1291,3 @@ static loff_t zpios_seek(struct file *file, loff_t offset, int origin) - -- return rc; -+ return (rc); - } -@@ -1304,7 +1340,8 @@ zpios_init(void) - zpios_device = spl_device_create(zpios_class, NULL, -- dev, NULL, ZPIOS_NAME); -- return 0; -+ dev, NULL, ZPIOS_NAME); -+ -+ return (0); - error: - printk(KERN_ERR "ZPIOS: Error registering zpios device, %d\n", rc); -- return rc; -+ return (rc); - } -@@ -1321,3 +1358,3 @@ zpios_fini(void) - -- return 0; -+ return (0); - } -@@ -1330 +1367,2 @@ MODULE_DESCRIPTION("Kernel PIOS implementation"); - MODULE_LICENSE("GPL"); -+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); -diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in -index a4b0b36..412feaf 100644 ---- a/rpm/generic/zfs-dkms.spec.in -+++ b/rpm/generic/zfs-dkms.spec.in -@@ -16,7 +16,3 @@ BuildArch: noarch - --%if 0%{?dkms_version:1} --Requires: dkms = %{dkms_version} --%else --Requires: dkms >= 2.2.0.2 --%endif -+Requires: dkms >= 2.2.0.3-20 - Requires: spl-dkms = %{version} -diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in -index 0797124..5c2196f 100644 ---- a/rpm/generic/zfs.spec.in -+++ b/rpm/generic/zfs.spec.in -@@ -7,3 +7,3 @@ - %global _udevdir /lib/udev --%global _dracutdir /lib/dracut -+%global _dracutdir %{_prefix}/share/dracut - %endif -@@ -12,4 +12,20 @@ - %bcond_with blkid --%bcond_with selinux -+%bcond_with systemd - -+# Generic enable switch for systemd -+%if %{with systemd} -+%define _systemd 1 -+%endif -+ -+# Fedora >= 15 comes with systemd, but only >= 18 has -+# the proper macros -+%if 0%{?fedora} >= 18 -+%define _systemd 1 -+%endif -+ -+# opensuse >= 12.1 comes with systemd, but only >= 13.1 -+# has the proper macros -+%if 0%{?suse_version} >= 1310 -+%define _systemd 1 -+%endif - -@@ -31,5 +47,9 @@ ExcludeArch: ppc ppc64 - Requires: spl = %{version} --Requires: %{name}-kmod >= %{version} -+Requires: %{name}-kmod = %{version} - Provides: %{name}-kmod-common 
= %{version} - -+# zfs-fuse provides the same commands and man pages that ZoL does. Renaming -+# those on either side would conflict with all available documentation. -+Conflicts: zfs-fuse -+ - %if 0%{?rhel}%{?fedora}%{?suse_version} -@@ -40,5 +60,8 @@ BuildRequires: libblkid-devel - %endif --%if %{with selinux} --BuildRequires: libselinux-devel - %endif -+%if 0%{?_systemd} -+Requires(post): systemd -+Requires(preun): systemd -+Requires(postun): systemd -+BuildRequires: systemd - %endif -@@ -91,6 +114,6 @@ image which is ZFS aware. - %endif --%if %{with selinux} -- %define selinux --with-selinux -+%if 0%{?_systemd} -+ %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --disable-sysvinit - %else -- %define selinux --without-selinux -+ %define systemd --enable-sysvinit --disable-systemd - %endif -@@ -107,3 +130,3 @@ image which is ZFS aware. - %{blkid} \ -- %{selinux} -+ %{systemd} - make %{?_smp_mflags} -@@ -117,3 +140,7 @@ find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; - /sbin/ldconfig -+%if 0%{?_systemd} -+%systemd_post zfs.target -+%else - [ -x /sbin/chkconfig ] && /sbin/chkconfig --add zfs -+%endif - exit 0 -@@ -121,2 +148,5 @@ exit 0 - %preun -+%if 0%{?_systemd} -+%systemd_preun zfs.target -+%else - if [ $1 -eq 0 ] ; then -@@ -124,5 +154,10 @@ if [ $1 -eq 0 ] ; then - fi -+%endif - exit 0 - --%postun -p /sbin/ldconfig -+%postun -+/sbin/ldconfig -+%if 0%{?_systemd} -+%systemd_postun zfs.target -+%endif - -@@ -133,3 +168,4 @@ exit 0 - %{_bindir}/* --%{_libdir}/*.so.1* -+%{_libdir}/*.so.* -+%{_libexecdir}/%{name} - %{_mandir}/man1/* -@@ -141,3 +177,9 @@ exit 0 - %config(noreplace) %{_sysconfdir}/%{name} -+%if 0%{?_systemd} -+/usr/lib/modules-load.d/* -+%{_unitdir}/* -+%{_presetdir}/* -+%else - %{_sysconfdir}/init.d/* -+%endif - -diff --git a/scripts/Makefile.am b/scripts/Makefile.am -index 08a32b4..7894db4 100644 ---- a/scripts/Makefile.am -+++ b/scripts/Makefile.am -@@ -2,3 +2,3 @@ SUBDIRS = zpool-config zpios-test zpios-profile - --EXTRA_DIST = dkms.mkconf dkms.postinst kmodtool -+EXTRA_DIST = dkms.mkconf dkms.postinst kmodtool zfs2zol-patch.sed cstyle.pl - -@@ -9,2 +9,3 @@ dist_pkgdata_SCRIPTS = \ - $(top_srcdir)/scripts/zfault.sh \ -+ $(top_srcdir)/scripts/zimport.sh \ - $(top_srcdir)/scripts/zfs.sh \ -@@ -19,2 +20,3 @@ ZCONFIG=$(top_builddir)/scripts/zconfig.sh - ZFAULT=$(top_builddir)/scripts/zfault.sh -+ZIMPORT=$(top_builddir)/scripts/zimport.sh - ZTEST=$(top_builddir)/cmd/ztest/ztest -diff --git a/scripts/common.sh.in b/scripts/common.sh.in -index 29b85d3..2fac2a9 100644 ---- a/scripts/common.sh.in -+++ b/scripts/common.sh.in -@@ -40,2 +40,3 @@ udevruledir=@udevruledir@ - sysconfdir=@sysconfdir@ -+localstatedir=@localstatedir@ - -@@ -74,2 +75,4 @@ AWK=${AWK:-/usr/bin/awk} - -+ZED_PIDFILE=${ZED_PIDFILE:-${localstatedir}/run/zed.pid} -+ - COLOR_BLACK="\033[0;30m" -@@ -206,3 +209,7 @@ load_module() { - -- ${LDMOD} $* &>/dev/null || ERROR="Failed to load $1" return 1 -+ ${LDMOD} $* &>/dev/null -+ if [ $? -ne 0 ]; then -+ echo "Failed to load ${NAME} ($@)" -+ return 1 -+ fi - -@@ -215,3 +222,3 @@ load_modules() { - for MOD in ${KERNEL_MODULES[*]}; do -- load_module ${MOD} -+ load_module ${MOD} >/dev/null - done -@@ -287,14 +294,41 @@ check_loop_utils() { - # --# Find and return an unused loopback device. -+# Find and return an unused loop device. A new /dev/loopN node will be -+# created if required. 
The kernel loop driver will automatically register -+# the minor as long as it's less than /sys/module/loop/parameters/max_loop. - # - unused_loop_device() { -- for DEVICE in `ls -1 /dev/loop[0-9]* 2>/dev/null`; do -- ${LOSETUP} ${DEVICE} &>/dev/null -- if [ $? -ne 0 ]; then -- echo ${DEVICE} -- return -+ local DEVICE=`${LOSETUP} -f` -+ local MAX_LOOP_PATH="/sys/module/loop/parameters/max_loop" -+ local MAX_LOOP; -+ -+ # An existing /dev/loopN device was available. -+ if [ -n "${DEVICE}" ]; then -+ echo "${DEVICE}" -+ return 0 -+ fi -+ -+ # Create a new /dev/loopN provided we are not at MAX_LOOP. -+ if [ -f "${MAX_LOOP_PATH}" ]; then -+ MAX_LOOP=`cat /sys/module/loop/parameters/max_loop` -+ if [ ${MAX_LOOP} -eq 0 ]; then -+ MAX_LOOP=255 - fi -- done - -- die "Error: Unable to find unused loopback device" -+ for (( i=0; i<=${MAX_LOOP}; i++ )); do -+ DEVICE="/dev/loop$i" -+ -+ if [ -b "${DEVICE}" ]; then -+ continue -+ else -+ mknod -m660 "${DEVICE}" b 7 $i -+ chown root.disk "${DEVICE}" -+ chmod 666 "${DEVICE}" -+ -+ echo "${DEVICE}" -+ return 0 -+ fi -+ done -+ fi -+ -+ die "Error: Unable to create new loopback device" - } -@@ -305,4 +339,4 @@ unused_loop_device() { - # in use we will not be able to remove them, and we only remove --# devices which include 'zpool' in the name. So any damage we might --# do should be limited to other zfs related testing. -+# devices which include 'zpool' or 'deleted' in the name. So any -+# damage we might do should be limited to other zfs related testing. - # -@@ -313,4 +347,4 @@ cleanup_loop_devices() { - ${AWK} -F":" -v losetup="$LOSETUP" \ -- '/zpool/ { system("losetup -d "$1) }' ${TMP_FILE} -- ${AWK} -F" " '/zpool/ { system("rm -f "$3) }' ${TMP_FILE} -+ '/zpool/ || /deleted/ { system("losetup -d "$1) }' ${TMP_FILE} -+ ${AWK} -F" " '/zpool/ || /deleted/ { system("rm -f "$3) }' ${TMP_FILE} - -@@ -335,3 +369,3 @@ destroy_loop_devices() { - # --# Create a device label. -+# Create a device label taking care to briefly wait if udev needs to settle. - # -@@ -341,3 +375,4 @@ label() { - -- ${PARTED} ${DEVICE} --script -- mklabel ${LABEL} || return 1 -+ wait_udev ${DEVICE} 30 || return 1 -+ ${PARTED} ${DEVICE} --script -- mklabel ${LABEL} || return 2 - -@@ -746 +781,7 @@ stack_check() { - } -+ -+kill_zed() { -+ if [ -f $ZED_PIDFILE ]; then -+ kill $(cat $ZED_PIDFILE) -+ fi -+} -diff --git a/scripts/cstyle.pl b/scripts/cstyle.pl -new file mode 100755 -index 0000000..083b30f ---- /dev/null -+++ b/scripts/cstyle.pl -@@ -0,0 +1,950 @@ -+#!/usr/bin/perl -w -+# -+# CDDL HEADER START -+# -+# The contents of this file are subject to the terms of the -+# Common Development and Distribution License (the "License"). -+# You may not use this file except in compliance with the License. -+# -+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -+# or http://www.opensolaris.org/os/licensing. -+# See the License for the specific language governing permissions -+# and limitations under the License. -+# -+# When distributing Covered Code, include this CDDL HEADER in each -+# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -+# If applicable, add the following below this CDDL HEADER, with the -+# fields enclosed by brackets "[]" replaced with your own identifying -+# information: Portions Copyright [yyyy] [name of copyright owner] -+# -+# CDDL HEADER END -+# -+# -+# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -+# Use is subject to license terms. 
-+# -+# @(#)cstyle 1.58 98/09/09 (from shannon) -+#ident "%Z%%M% %I% %E% SMI" -+# -+# cstyle - check for some common stylistic errors. -+# -+# cstyle is a sort of "lint" for C coding style. -+# It attempts to check for the style used in the -+# kernel, sometimes known as "Bill Joy Normal Form". -+# -+# There's a lot this can't check for, like proper indentation -+# of code blocks. There's also a lot more this could check for. -+# -+# A note to the non perl literate: -+# -+# perl regular expressions are pretty much like egrep -+# regular expressions, with the following special symbols -+# -+# \s any space character -+# \S any non-space character -+# \w any "word" character [a-zA-Z0-9_] -+# \W any non-word character -+# \d a digit [0-9] -+# \D a non-digit -+# \b word boundary (between \w and \W) -+# \B non-word boundary -+# -+ -+require 5.0; -+use IO::File; -+use Getopt::Std; -+use strict; -+ -+my $usage = -+"usage: cstyle [-chpvCP] [-o constructs] file ... -+ -c check continuation indentation inside functions -+ -h perform heuristic checks that are sometimes wrong -+ -p perform some of the more picky checks -+ -v verbose -+ -C don't check anything in header block comments -+ -P check for use of non-POSIX types -+ -o constructs -+ allow a comma-seperated list of optional constructs: -+ doxygen allow doxygen-style block comments (/** /*!) -+ splint allow splint-style lint comments (/*@ ... @*/) -+"; -+ -+my %opts; -+ -+if (!getopts("cho:pvCP", \%opts)) { -+ print $usage; -+ exit 2; -+} -+ -+my $check_continuation = $opts{'c'}; -+my $heuristic = $opts{'h'}; -+my $picky = $opts{'p'}; -+my $verbose = $opts{'v'}; -+my $ignore_hdr_comment = $opts{'C'}; -+my $check_posix_types = $opts{'P'}; -+ -+my $doxygen_comments = 0; -+my $splint_comments = 0; -+ -+if (defined($opts{'o'})) { -+ for my $x (split /,/, $opts{'o'}) { -+ if ($x eq "doxygen") { -+ $doxygen_comments = 1; -+ } elsif ($x eq "splint") { -+ $splint_comments = 1; -+ } else { -+ print "cstyle: unrecognized construct \"$x\"\n"; -+ print $usage; -+ exit 2; -+ } -+ } -+} -+ -+my ($filename, $line, $prev); # shared globals -+ -+my $fmt; -+my $hdr_comment_start; -+ -+if ($verbose) { -+ $fmt = "%s: %d: %s\n%s\n"; -+} else { -+ $fmt = "%s: %d: %s\n"; -+} -+ -+if ($doxygen_comments) { -+ # doxygen comments look like "/*!" or "/**"; allow them. -+ $hdr_comment_start = qr/^\s*\/\*[\!\*]?$/; -+} else { -+ $hdr_comment_start = qr/^\s*\/\*$/; -+} -+ -+# Note, following must be in single quotes so that \s and \w work right. -+my $typename = '(int|char|short|long|unsigned|float|double' . -+ '|\w+_t|struct\s+\w+|union\s+\w+|FILE)'; -+ -+# mapping of old types to POSIX compatible types -+my %old2posix = ( -+ 'unchar' => 'uchar_t', -+ 'ushort' => 'ushort_t', -+ 'uint' => 'uint_t', -+ 'ulong' => 'ulong_t', -+ 'u_int' => 'uint_t', -+ 'u_short' => 'ushort_t', -+ 'u_long' => 'ulong_t', -+ 'u_char' => 'uchar_t', -+ 'quad' => 'quad_t' -+); -+ -+my $lint_re = qr/\/\*(?: -+ ARGSUSED[0-9]*|NOTREACHED|LINTLIBRARY|VARARGS[0-9]*| -+ CONSTCOND|CONSTANTCOND|CONSTANTCONDITION|EMPTY| -+ FALLTHRU|FALLTHROUGH|LINTED.*?|PRINTFLIKE[0-9]*| -+ PROTOLIB[0-9]*|SCANFLIKE[0-9]*|CSTYLED.*? 
-+ )\*\//x; -+ -+my $splint_re = qr/\/\*@.*?@\*\//x; -+ -+my $warlock_re = qr/\/\*\s*(?: -+ VARIABLES\ PROTECTED\ BY| -+ MEMBERS\ PROTECTED\ BY| -+ ALL\ MEMBERS\ PROTECTED\ BY| -+ READ-ONLY\ VARIABLES:| -+ READ-ONLY\ MEMBERS:| -+ VARIABLES\ READABLE\ WITHOUT\ LOCK:| -+ MEMBERS\ READABLE\ WITHOUT\ LOCK:| -+ LOCKS\ COVERED\ BY| -+ LOCK\ UNNEEDED\ BECAUSE| -+ LOCK\ NEEDED:| -+ LOCK\ HELD\ ON\ ENTRY:| -+ READ\ LOCK\ HELD\ ON\ ENTRY:| -+ WRITE\ LOCK\ HELD\ ON\ ENTRY:| -+ LOCK\ ACQUIRED\ AS\ SIDE\ EFFECT:| -+ READ\ LOCK\ ACQUIRED\ AS\ SIDE\ EFFECT:| -+ WRITE\ LOCK\ ACQUIRED\ AS\ SIDE\ EFFECT:| -+ LOCK\ RELEASED\ AS\ SIDE\ EFFECT:| -+ LOCK\ UPGRADED\ AS\ SIDE\ EFFECT:| -+ LOCK\ DOWNGRADED\ AS\ SIDE\ EFFECT:| -+ FUNCTIONS\ CALLED\ THROUGH\ POINTER| -+ FUNCTIONS\ CALLED\ THROUGH\ MEMBER| -+ LOCK\ ORDER: -+ )/x; -+ -+my $err_stat = 0; # exit status -+ -+if ($#ARGV >= 0) { -+ foreach my $arg (@ARGV) { -+ my $fh = new IO::File $arg, "r"; -+ if (!defined($fh)) { -+ printf "%s: can not open\n", $arg; -+ } else { -+ &cstyle($arg, $fh); -+ close $fh; -+ } -+ } -+} else { -+ &cstyle("", *STDIN); -+} -+exit $err_stat; -+ -+my $no_errs = 0; # set for CSTYLED-protected lines -+ -+sub err($) { -+ my ($error) = @_; -+ unless ($no_errs) { -+ printf $fmt, $filename, $., $error, $line; -+ $err_stat = 1; -+ } -+} -+ -+sub err_prefix($$) { -+ my ($prevline, $error) = @_; -+ my $out = $prevline."\n".$line; -+ unless ($no_errs) { -+ printf $fmt, $filename, $., $error, $out; -+ $err_stat = 1; -+ } -+} -+ -+sub err_prev($) { -+ my ($error) = @_; -+ unless ($no_errs) { -+ printf $fmt, $filename, $. - 1, $error, $prev; -+ $err_stat = 1; -+ } -+} -+ -+sub cstyle($$) { -+ -+my ($fn, $filehandle) = @_; -+$filename = $fn; # share it globally -+ -+my $in_cpp = 0; -+my $next_in_cpp = 0; -+ -+my $in_comment = 0; -+my $in_header_comment = 0; -+my $comment_done = 0; -+my $in_warlock_comment = 0; -+my $in_function = 0; -+my $in_function_header = 0; -+my $in_declaration = 0; -+my $note_level = 0; -+my $nextok = 0; -+my $nocheck = 0; -+ -+my $in_string = 0; -+ -+my ($okmsg, $comment_prefix); -+ -+$line = ''; -+$prev = ''; -+reset_indent(); -+ -+line: while (<$filehandle>) { -+ s/\r?\n$//; # strip return and newline -+ -+ # save the original line, then remove all text from within -+ # double or single quotes, we do not want to check such text. -+ -+ $line = $_; -+ -+ # -+ # C allows strings to be continued with a backslash at the end of -+ # the line. We translate that into a quoted string on the previous -+ # line followed by an initial quote on the next line. -+ # -+ # (we assume that no-one will use backslash-continuation with character -+ # constants) -+ # -+ $_ = '"' . $_ if ($in_string && !$nocheck && !$in_comment); -+ -+ # -+ # normal strings and characters -+ # -+ s/'([^\\']|\\[^xX0]|\\0[0-9]*|\\[xX][0-9a-fA-F]*)'/''/g; -+ s/"([^\\"]|\\.)*"/\"\"/g; -+ -+ # -+ # detect string continuation -+ # -+ if ($nocheck || $in_comment) { -+ $in_string = 0; -+ } else { -+ # -+ # Now that all full strings are replaced with "", we check -+ # for unfinished strings continuing onto the next line. -+ # -+ $in_string = -+ (s/([^"](?:"")*)"([^\\"]|\\.)*\\$/$1""/ || -+ s/^("")*"([^\\"]|\\.)*\\$/""/); -+ } -+ -+ # -+ # figure out if we are in a cpp directive -+ # -+ $in_cpp = $next_in_cpp || /^\s*#/; # continued or started -+ $next_in_cpp = $in_cpp && /\\$/; # only if continued -+ -+ # strip off trailing backslashes, which appear in long macros -+ s/\s*\\$//; -+ -+ # an /* END CSTYLED */ comment ends a no-check block. 
-+ if ($nocheck) { -+ if (/\/\* *END *CSTYLED *\*\//) { -+ $nocheck = 0; -+ } else { -+ reset_indent(); -+ next line; -+ } -+ } -+ -+ # a /*CSTYLED*/ comment indicates that the next line is ok. -+ if ($nextok) { -+ if ($okmsg) { -+ err($okmsg); -+ } -+ $nextok = 0; -+ $okmsg = 0; -+ if (/\/\* *CSTYLED.*\*\//) { -+ /^.*\/\* *CSTYLED *(.*) *\*\/.*$/; -+ $okmsg = $1; -+ $nextok = 1; -+ } -+ $no_errs = 1; -+ } elsif ($no_errs) { -+ $no_errs = 0; -+ } -+ -+ # check length of line. -+ # first, a quick check to see if there is any chance of being too long. -+ if (($line =~ tr/\t/\t/) * 7 + length($line) > 80) { -+ # yes, there is a chance. -+ # replace tabs with spaces and check again. -+ my $eline = $line; -+ 1 while $eline =~ -+ s/\t+/' ' x (length($&) * 8 - length($`) % 8)/e; -+ if (length($eline) > 80) { -+ err("line > 80 characters"); -+ } -+ } -+ -+ # ignore NOTE(...) annotations (assumes NOTE is on lines by itself). -+ if ($note_level || /\b_?NOTE\s*\(/) { # if in NOTE or this is NOTE -+ s/[^()]//g; # eliminate all non-parens -+ $note_level += s/\(//g - length; # update paren nest level -+ next; -+ } -+ -+ # a /* BEGIN CSTYLED */ comment starts a no-check block. -+ if (/\/\* *BEGIN *CSTYLED *\*\//) { -+ $nocheck = 1; -+ } -+ -+ # a /*CSTYLED*/ comment indicates that the next line is ok. -+ if (/\/\* *CSTYLED.*\*\//) { -+ /^.*\/\* *CSTYLED *(.*) *\*\/.*$/; -+ $okmsg = $1; -+ $nextok = 1; -+ } -+ if (/\/\/ *CSTYLED/) { -+ /^.*\/\/ *CSTYLED *(.*)$/; -+ $okmsg = $1; -+ $nextok = 1; -+ } -+ -+ # universal checks; apply to everything -+ if (/\t +\t/) { -+ err("spaces between tabs"); -+ } -+ if (/ \t+ /) { -+ err("tabs between spaces"); -+ } -+ if (/\s$/) { -+ err("space or tab at end of line"); -+ } -+ if (/[^ \t(]\/\*/ && !/\w\(\/\*.*\*\/\);/) { -+ err("comment preceded by non-blank"); -+ } -+ -+ # is this the beginning or ending of a function? -+ # (not if "struct foo\n{\n") -+ if (/^{$/ && $prev =~ /\)\s*(const\s*)?(\/\*.*\*\/\s*)?\\?$/) { -+ $in_function = 1; -+ $in_declaration = 1; -+ $in_function_header = 0; -+ $prev = $line; -+ next line; -+ } -+ if (/^}\s*(\/\*.*\*\/\s*)*$/) { -+ if ($prev =~ /^\s*return\s*;/) { -+ err_prev("unneeded return at end of function"); -+ } -+ $in_function = 0; -+ reset_indent(); # we don't check between functions -+ $prev = $line; -+ next line; -+ } -+ if (/^\w*\($/) { -+ $in_function_header = 1; -+ } -+ -+ if ($in_warlock_comment && /\*\//) { -+ $in_warlock_comment = 0; -+ $prev = $line; -+ next line; -+ } -+ -+ # a blank line terminates the declarations within a function. -+ # XXX - but still a problem in sub-blocks. -+ if ($in_declaration && /^$/) { -+ $in_declaration = 0; -+ } -+ -+ if ($comment_done) { -+ $in_comment = 0; -+ $in_header_comment = 0; -+ $comment_done = 0; -+ } -+ # does this looks like the start of a block comment? -+ if (/$hdr_comment_start/) { -+ if (!/^\t*\/\*/) { -+ err("block comment not indented by tabs"); -+ } -+ $in_comment = 1; -+ /^(\s*)\//; -+ $comment_prefix = $1; -+ if ($comment_prefix eq "") { -+ $in_header_comment = 1; -+ } -+ $prev = $line; -+ next line; -+ } -+ # are we still in the block comment? 
-+ if ($in_comment) { -+ if (/^$comment_prefix \*\/$/) { -+ $comment_done = 1; -+ } elsif (/\*\//) { -+ $comment_done = 1; -+ err("improper block comment close") -+ unless ($ignore_hdr_comment && $in_header_comment); -+ } elsif (!/^$comment_prefix \*[ \t]/ && -+ !/^$comment_prefix \*$/) { -+ err("improper block comment") -+ unless ($ignore_hdr_comment && $in_header_comment); -+ } -+ } -+ -+ if ($in_header_comment && $ignore_hdr_comment) { -+ $prev = $line; -+ next line; -+ } -+ -+ # check for errors that might occur in comments and in code. -+ -+ # allow spaces to be used to draw pictures in all comments. -+ if (/[^ ] / && !/".* .*"/ && !$in_comment) { -+ err("spaces instead of tabs"); -+ } -+ if (/^ / && !/^ \*[ \t\/]/ && !/^ \*$/ && -+ (!/^ \w/ || $in_function != 0)) { -+ err("indent by spaces instead of tabs"); -+ } -+ if (/^\t+ [^ \t\*]/ || /^\t+ \S/ || /^\t+ \S/) { -+ err("continuation line not indented by 4 spaces"); -+ } -+ if (/$warlock_re/ && !/\*\//) { -+ $in_warlock_comment = 1; -+ $prev = $line; -+ next line; -+ } -+ if (/^\s*\/\*./ && !/^\s*\/\*.*\*\// && !/$hdr_comment_start/) { -+ err("improper first line of block comment"); -+ } -+ -+ if ($in_comment) { # still in comment, don't do further checks -+ $prev = $line; -+ next line; -+ } -+ -+ if ((/[^(]\/\*\S/ || /^\/\*\S/) && -+ !(/$lint_re/ || ($splint_comments && /$splint_re/))) { -+ err("missing blank after open comment"); -+ } -+ if (/\S\*\/[^)]|\S\*\/$/ && -+ !(/$lint_re/ || ($splint_comments && /$splint_re/))) { -+ err("missing blank before close comment"); -+ } -+ if (/\/\/\S/) { # C++ comments -+ err("missing blank after start comment"); -+ } -+ # check for unterminated single line comments, but allow them when -+ # they are used to comment out the argument list of a function -+ # declaration. -+ if (/\S.*\/\*/ && !/\S.*\/\*.*\*\// && !/\(\/\*/) { -+ err("unterminated single line comment"); -+ } -+ -+ if (/^(#else|#endif|#include)(.*)$/) { -+ $prev = $line; -+ if ($picky) { -+ my $directive = $1; -+ my $clause = $2; -+ # Enforce ANSI rules for #else and #endif: no noncomment -+ # identifiers are allowed after #endif or #else. Allow -+ # C++ comments since they seem to be a fact of life. -+ if ((($1 eq "#endif") || ($1 eq "#else")) && -+ ($clause ne "") && -+ (!($clause =~ /^\s+\/\*.*\*\/$/)) && -+ (!($clause =~ /^\s+\/\/.*$/))) { -+ err("non-comment text following " . -+ "$directive (or malformed $directive " . -+ "directive)"); -+ } -+ } -+ next line; -+ } -+ -+ # -+ # delete any comments and check everything else. Note that -+ # ".*?" is a non-greedy match, so that we don't get confused by -+ # multiple comments on the same line. -+ # -+ s/\/\*.*?\*\///g; -+ s/\/\/.*$//; # C++ comments -+ -+ # delete any trailing whitespace; we have already checked for that. -+ s/\s*$//; -+ -+ # following checks do not apply to text in comments. 
-+ -+ if (/[^<>\s][!<>=]=/ || /[^<>][!<>=]=[^\s,]/ || -+ (/[^->]>[^,=>\s]/ && !/[^->]>$/) || -+ (/[^<]<[^,=<\s]/ && !/[^<]<$/) || -+ /[^<\s]<[^<]/ || /[^->\s]>[^>]/) { -+ err("missing space around relational operator"); -+ } -+ if (/\S>>=/ || /\S<<=/ || />>=\S/ || /<<=\S/ || /\S[-+*\/&|^%]=/ || -+ (/[^-+*\/&|^%!<>=\s]=[^=]/ && !/[^-+*\/&|^%!<>=\s]=$/) || -+ (/[^!<>=]=[^=\s]/ && !/[^!<>=]=$/)) { -+ # XXX - should only check this for C++ code -+ # XXX - there are probably other forms that should be allowed -+ if (!/\soperator=/) { -+ err("missing space around assignment operator"); -+ } -+ } -+ if (/[,;]\S/ && !/\bfor \(;;\)/) { -+ err("comma or semicolon followed by non-blank"); -+ } -+ # allow "for" statements to have empty "while" clauses -+ if (/\s[,;]/ && !/^[\t]+;$/ && !/^\s*for \([^;]*; ;[^;]*\)/) { -+ err("comma or semicolon preceded by blank"); -+ } -+ if (/^\s*(&&|\|\|)/) { -+ err("improper boolean continuation"); -+ } -+ if (/\S *(&&|\|\|)/ || /(&&|\|\|) *\S/) { -+ err("more than one space around boolean operator"); -+ } -+ if (/\b(for|if|while|switch|sizeof|return|case)\(/) { -+ err("missing space between keyword and paren"); -+ } -+ if (/(\b(for|if|while|switch|return)\b.*){2,}/ && !/^#define/) { -+ # multiple "case" and "sizeof" allowed -+ err("more than one keyword on line"); -+ } -+ if (/\b(for|if|while|switch|sizeof|return|case)\s\s+\(/ && -+ !/^#if\s+\(/) { -+ err("extra space between keyword and paren"); -+ } -+ # try to detect "func (x)" but not "if (x)" or -+ # "#define foo (x)" or "int (*func)();" -+ if (/\w\s\(/) { -+ my $s = $_; -+ # strip off all keywords on the line -+ s/\b(for|if|while|switch|return|case|sizeof)\s\(/XXX(/g; -+ s/#elif\s\(/XXX(/g; -+ s/^#define\s+\w+\s+\(/XXX(/; -+ # do not match things like "void (*f)();" -+ # or "typedef void (func_t)();" -+ s/\w\s\(+\*/XXX(*/g; -+ s/\b($typename|void)\s+\(+/XXX(/og; -+ if (/\w\s\(/) { -+ err("extra space between function name and left paren"); -+ } -+ $_ = $s; -+ } -+ # try to detect "int foo(x)", but not "extern int foo(x);" -+ # XXX - this still trips over too many legitimate things, -+ # like "int foo(x,\n\ty);" -+# if (/^(\w+(\s|\*)+)+\w+\(/ && !/\)[;,](\s|)*$/ && -+# !/^(extern|static)\b/) { -+# err("return type of function not on separate line"); -+# } -+ # this is a close approximation -+ if (/^(\w+(\s|\*)+)+\w+\(.*\)(\s|)*$/ && -+ !/^(extern|static)\b/) { -+ err("return type of function not on separate line"); -+ } -+ if (/^#define /) { -+ err("#define followed by space instead of tab"); -+ } -+ if (/^\s*return\W[^;]*;/ && !/^\s*return\s*\(.*\);/) { -+ err("unparenthesized return expression"); -+ } -+ if (/\bsizeof\b/ && !/\bsizeof\s*\(.*\)/) { -+ err("unparenthesized sizeof expression"); -+ } -+ if (/\(\s/) { -+ err("whitespace after left paren"); -+ } -+ # allow "for" statements to have empty "continue" clauses -+ if (/\s\)/ && !/^\s*for \([^;]*;[^;]*; \)/) { -+ err("whitespace before right paren"); -+ } -+ if (/^\s*\(void\)[^ ]/) { -+ err("missing space after (void) cast"); -+ } -+ if (/\S{/ && !/{{/) { -+ err("missing space before left brace"); -+ } -+ if ($in_function && /^\s+{/ && -+ ($prev =~ /\)\s*$/ || $prev =~ /\bstruct\s+\w+$/)) { -+ err("left brace starting a line"); -+ } -+ if (/}(else|while)/) { -+ err("missing space after right brace"); -+ } -+ if (/}\s\s+(else|while)/) { -+ err("extra space after right brace"); -+ } -+ if (/\b_VOID\b|\bVOID\b|\bSTATIC\b/) { -+ err("obsolete use of VOID or STATIC"); -+ } -+ if (/\b$typename\*/o) { -+ err("missing space between type name and *"); -+ } -+ if 
(/^\s+#/) { -+ err("preprocessor statement not in column 1"); -+ } -+ if (/^#\s/) { -+ err("blank after preprocessor #"); -+ } -+ if (/!\s*(strcmp|strncmp|bcmp)\s*\(/) { -+ err("don't use boolean ! with comparison functions"); -+ } -+ -+ # -+ # We completely ignore, for purposes of indentation: -+ # * lines outside of functions -+ # * preprocessor lines -+ # -+ if ($check_continuation && $in_function && !$in_cpp) { -+ process_indent($_); -+ } -+ if ($picky) { -+ # try to detect spaces after casts, but allow (e.g.) -+ # "sizeof (int) + 1", "void (*funcptr)(int) = foo;", and -+ # "int foo(int) __NORETURN;" -+ if ((/^\($typename( \*+)?\)\s/o || -+ /\W\($typename( \*+)?\)\s/o) && -+ !/sizeof\s*\($typename( \*)?\)\s/o && -+ !/\($typename( \*+)?\)\s+=[^=]/o) { -+ err("space after cast"); -+ } -+ if (/\b$typename\s*\*\s/o && -+ !/\b$typename\s*\*\s+const\b/o) { -+ err("unary * followed by space"); -+ } -+ } -+ if ($check_posix_types) { -+ # try to detect old non-POSIX types. -+ # POSIX requires all non-standard typedefs to end in _t, -+ # but historically these have been used. -+ if (/\b(unchar|ushort|uint|ulong|u_int|u_short|u_long|u_char|quad)\b/) { -+ err("non-POSIX typedef $1 used: use $old2posix{$1} instead"); -+ } -+ } -+ if ($heuristic) { -+ # cannot check this everywhere due to "struct {\n...\n} foo;" -+ if ($in_function && !$in_declaration && -+ /}./ && !/}\s+=/ && !/{.*}[;,]$/ && !/}(\s|)*$/ && -+ !/} (else|while)/ && !/}}/) { -+ err("possible bad text following right brace"); -+ } -+ # cannot check this because sub-blocks in -+ # the middle of code are ok -+ if ($in_function && /^\s+{/) { -+ err("possible left brace starting a line"); -+ } -+ } -+ if (/^\s*else\W/) { -+ if ($prev =~ /^\s*}$/) { -+ err_prefix($prev, -+ "else and right brace should be on same line"); -+ } -+ } -+ $prev = $line; -+} -+ -+if ($prev eq "") { -+ err("last line in file is blank"); -+} -+ -+} -+ -+# -+# Continuation-line checking -+# -+# The rest of this file contains the code for the continuation checking -+# engine. It's a pretty simple state machine which tracks the expression -+# depth (unmatched '('s and '['s). -+# -+# Keep in mind that the argument to process_indent() has already been heavily -+# processed; all comments have been replaced by control-A, and the contents of -+# strings and character constants have been elided. -+# -+ -+my $cont_in; # currently inside of a continuation -+my $cont_off; # skipping an initializer or definition -+my $cont_noerr; # suppress cascading errors -+my $cont_start; # the line being continued -+my $cont_base; # the base indentation -+my $cont_first; # this is the first line of a statement -+my $cont_multiseg; # this continuation has multiple segments -+ -+my $cont_special; # this is a C statement (if, for, etc.) -+my $cont_macro; # this is a macro -+my $cont_case; # this is a multi-line case -+ -+my @cont_paren; # the stack of unmatched ( and [s we've seen -+ -+sub -+reset_indent() -+{ -+ $cont_in = 0; -+ $cont_off = 0; -+} -+ -+sub -+delabel($) -+{ -+ # -+ # replace labels with tabs. Note that there may be multiple -+ # labels on a line. 
-+ # -+ local $_ = $_[0]; -+ -+ while (/^(\t*)( *(?:(?:\w+\s*)|(?:case\b[^:]*)): *)(.*)$/) { -+ my ($pre_tabs, $label, $rest) = ($1, $2, $3); -+ $_ = $pre_tabs; -+ while ($label =~ s/^([^\t]*)(\t+)//) { -+ $_ .= "\t" x (length($2) + length($1) / 8); -+ } -+ $_ .= ("\t" x (length($label) / 8)).$rest; -+ } -+ -+ return ($_); -+} -+ -+sub -+process_indent($) -+{ -+ require strict; -+ local $_ = $_[0]; # preserve the global $_ -+ -+ s///g; # No comments -+ s/\s+$//; # Strip trailing whitespace -+ -+ return if (/^$/); # skip empty lines -+ -+ # regexps used below; keywords taking (), macros, and continued cases -+ my $special = '(?:(?:\}\s*)?else\s+)?(?:if|for|while|switch)\b'; -+ my $macro = '[A-Z_][A-Z_0-9]*\('; -+ my $case = 'case\b[^:]*$'; -+ -+ # skip over enumerations, array definitions, initializers, etc. -+ if ($cont_off <= 0 && !/^\s*$special/ && -+ (/(?:(?:\b(?:enum|struct|union)\s*[^\{]*)|(?:\s+=\s*)){/ || -+ (/^\s*{/ && $prev =~ /=\s*(?:\/\*.*\*\/\s*)*$/))) { -+ $cont_in = 0; -+ $cont_off = tr/{/{/ - tr/}/}/; -+ return; -+ } -+ if ($cont_off) { -+ $cont_off += tr/{/{/ - tr/}/}/; -+ return; -+ } -+ -+ if (!$cont_in) { -+ $cont_start = $line; -+ -+ if (/^\t* /) { -+ err("non-continuation indented 4 spaces"); -+ $cont_noerr = 1; # stop reporting -+ } -+ $_ = delabel($_); # replace labels with tabs -+ -+ # check if the statement is complete -+ return if (/^\s*\}?$/); -+ return if (/^\s*\}?\s*else\s*\{?$/); -+ return if (/^\s*do\s*\{?$/); -+ return if (/{$/); -+ return if (/}[,;]?$/); -+ -+ # Allow macros on their own lines -+ return if (/^\s*[A-Z_][A-Z_0-9]*$/); -+ -+ # cases we don't deal with, generally non-kosher -+ if (/{/) { -+ err("stuff after {"); -+ return; -+ } -+ -+ # Get the base line, and set up the state machine -+ /^(\t*)/; -+ $cont_base = $1; -+ $cont_in = 1; -+ @cont_paren = (); -+ $cont_first = 1; -+ $cont_multiseg = 0; -+ -+ # certain things need special processing -+ $cont_special = /^\s*$special/? 1 : 0; -+ $cont_macro = /^\s*$macro/? 1 : 0; -+ $cont_case = /^\s*$case/? 1 : 0; -+ } else { -+ $cont_first = 0; -+ -+ # Strings may be pulled back to an earlier (half-)tabstop -+ unless ($cont_noerr || /^$cont_base / || -+ (/^\t*(?: )?(?:gettext\()?\"/ && !/^$cont_base\t/)) { -+ err_prefix($cont_start, -+ "continuation should be indented 4 spaces"); -+ } -+ } -+ -+ my $rest = $_; # keeps the remainder of the line -+ -+ # -+ # The split matches 0 characters, so that each 'special' character -+ # is processed separately. Parens and brackets are pushed and -+ # popped off the @cont_paren stack. For normal processing, we wait -+ # until a ; or { terminates the statement. "special" processing -+ # (if/for/while/switch) is allowed to stop when the stack empties, -+ # as is macro processing. Case statements are terminated with a : -+ # and an empty paren stack. -+ # -+ foreach $_ (split /[^\(\)\[\]\{\}\;\:]*/) { -+ next if (length($_) == 0); -+ -+ # rest contains the remainder of the line -+ my $rxp = "[^\Q$_\E]*\Q$_\E"; -+ $rest =~ s/^$rxp//; -+ -+ if (/\(/ || /\[/) { -+ push @cont_paren, $_; -+ } elsif (/\)/ || /\]/) { -+ my $cur = $_; -+ tr/\)\]/\(\[/; -+ -+ my $old = (pop @cont_paren); -+ if (!defined($old)) { -+ err("unexpected '$cur'"); -+ $cont_in = 0; -+ last; -+ } elsif ($old ne $_) { -+ err("'$cur' mismatched with '$old'"); -+ $cont_in = 0; -+ last; -+ } -+ -+ # -+ # If the stack is now empty, do special processing -+ # for if/for/while/switch and macro statements. 
-+ # -+ next if (@cont_paren != 0); -+ if ($cont_special) { -+ if ($rest =~ /^\s*{?$/) { -+ $cont_in = 0; -+ last; -+ } -+ if ($rest =~ /^\s*;$/) { -+ err("empty if/for/while body ". -+ "not on its own line"); -+ $cont_in = 0; -+ last; -+ } -+ if (!$cont_first && $cont_multiseg == 1) { -+ err_prefix($cont_start, -+ "multiple statements continued ". -+ "over multiple lines"); -+ $cont_multiseg = 2; -+ } elsif ($cont_multiseg == 0) { -+ $cont_multiseg = 1; -+ } -+ # We've finished this section, start -+ # processing the next. -+ goto section_ended; -+ } -+ if ($cont_macro) { -+ if ($rest =~ /^$/) { -+ $cont_in = 0; -+ last; -+ } -+ } -+ } elsif (/\;/) { -+ if ($cont_case) { -+ err("unexpected ;"); -+ } elsif (!$cont_special) { -+ err("unexpected ;") if (@cont_paren != 0); -+ if (!$cont_first && $cont_multiseg == 1) { -+ err_prefix($cont_start, -+ "multiple statements continued ". -+ "over multiple lines"); -+ $cont_multiseg = 2; -+ } elsif ($cont_multiseg == 0) { -+ $cont_multiseg = 1; -+ } -+ if ($rest =~ /^$/) { -+ $cont_in = 0; -+ last; -+ } -+ if ($rest =~ /^\s*special/) { -+ err("if/for/while/switch not started ". -+ "on its own line"); -+ } -+ goto section_ended; -+ } -+ } elsif (/\{/) { -+ err("{ while in parens/brackets") if (@cont_paren != 0); -+ err("stuff after {") if ($rest =~ /[^\s}]/); -+ $cont_in = 0; -+ last; -+ } elsif (/\}/) { -+ err("} while in parens/brackets") if (@cont_paren != 0); -+ if (!$cont_special && $rest !~ /^\s*(while|else)\b/) { -+ if ($rest =~ /^$/) { -+ err("unexpected }"); -+ } else { -+ err("stuff after }"); -+ } -+ $cont_in = 0; -+ last; -+ } -+ } elsif (/\:/ && $cont_case && @cont_paren == 0) { -+ err("stuff after multi-line case") if ($rest !~ /$^/); -+ $cont_in = 0; -+ last; -+ } -+ next; -+section_ended: -+ # End of a statement or if/while/for loop. Reset -+ # cont_special and cont_macro based on the rest of the -+ # line. -+ $cont_special = ($rest =~ /^\s*$special/)? 1 : 0; -+ $cont_macro = ($rest =~ /^\s*$macro/)? 1 : 0; -+ $cont_case = 0; -+ next; -+ } -+ $cont_noerr = 0 if (!$cont_in); -+} -diff --git a/scripts/zconfig.sh b/scripts/zconfig.sh -index 281166c..d6695be 100755 ---- a/scripts/zconfig.sh -+++ b/scripts/zconfig.sh -@@ -405,10 +405,12 @@ test_7() { - -- # Mount the ext2 filesystem and copy some data to it. -- mkdir -p /tmp/${ZVOL_NAME}-part1 || fail 6 -- mount /dev/zvol/${FULL_ZVOL_NAME}-part1 /tmp/${ZVOL_NAME}-part1 \ -- || fail 7 -+ # Snapshot the pristine ext2 filesystem. -+ ${ZFS} snapshot ${FULL_SNAP_NAME} || fail 6 -+ wait_udev /dev/zvol/${FULL_SNAP_NAME}-part1 30 || fail 7 - -- # Snapshot the pristine ext2 filesystem and mount it read-only. -- ${ZFS} snapshot ${FULL_SNAP_NAME} || fail 8 -- wait_udev /dev/zvol/${FULL_SNAP_NAME}-part1 30 || fail 8 -+ # Mount the ext2 filesystem so some data can be copied to it. -+ mkdir -p /tmp/${ZVOL_NAME}-part1 || fail 7 -+ mount /dev/zvol/${FULL_ZVOL_NAME}-part1 \ -+ /tmp/${ZVOL_NAME}-part1 || fail 8 -+ -+ # Mount the pristine ext2 snapshot. - mkdir -p /tmp/${SNAP_NAME}-part1 || fail 9 -@@ -498,7 +500,10 @@ test_8() { - cp -RL ${SRC_DIR} /tmp/${FULL_ZVOL_NAME1}-part1 || fail 8 -- sync || fail 9 - -- # Snapshot the ext2 filesystem so it may be sent. -- ${ZFS} snapshot ${FULL_SNAP_NAME1} || fail 11 -- wait_udev /dev/zvol/${FULL_SNAP_NAME1} 30 || fail 11 -+ # Unmount, snapshot, mount the ext2 filesystem so it may be sent. -+ # We only unmount to ensure the ext2 filesystem is clean. 
-+ umount /tmp/${FULL_ZVOL_NAME1}-part1 || fail 9
-+ ${ZFS} snapshot ${FULL_SNAP_NAME1} || fail 10
-+ wait_udev /dev/zvol/${FULL_SNAP_NAME1} 30 || fail 10
-+ mount /dev/zvol/${FULL_ZVOL_NAME1}-part1 \
-+ /tmp/${FULL_ZVOL_NAME1}-part1 || fail 11
-
-@@ -551,2 +556,3 @@ test_9() {
- ${ZFS} create -V 300M ${FULL_NAME} || fail 3
-+ udev_trigger
-
-diff --git a/scripts/zfs-images b/scripts/zfs-images
-new file mode 160000
-index 0000000..3331601
---- /dev/null
-+++ b/scripts/zfs-images
-@@ -0,0 +1 @@
-+Subproject commit 3331601f6dc50ef2c9779c1656218701b48b276c
-diff --git a/scripts/zfs.sh b/scripts/zfs.sh
-index f44053e..b97a057 100755
---- a/scripts/zfs.sh
-+++ b/scripts/zfs.sh
-@@ -67,2 +67,3 @@ fi
- if [ ${UNLOAD} ]; then
-+ kill_zed
- umount -t zfs -a
-@@ -73,4 +74,4 @@ else
- check_modules || die "${ERROR}"
-- load_modules "$@"
-- wait_udev /dev/zfs 30
-+ load_modules "$@" || die "Failed to load modules"
-+ wait_udev /dev/zfs 30 || die "'/dev/zfs' was not created"
- fi
-diff --git a/scripts/zfs2zol-patch.sed b/scripts/zfs2zol-patch.sed
-new file mode 100755
-index 0000000..3a7280f
---- /dev/null
-+++ b/scripts/zfs2zol-patch.sed
-@@ -0,0 +1,15 @@
-+#!/bin/sed -f
-+
-+s:usr/src/uts/common/fs/zfs/sys:include/sys:g
-+s:usr/src/uts/common/fs/zfs:module/zfs:g
-+s:usr/src/lib/libzpool:lib/libzpool:g
-+s:usr/src/cmd:cmd:g
-+s:usr/src/common/nvpair:module/nvpair:g
-+s:usr/src/lib/libzfs/common/libzfs.h:include/libzfs.h:g
-+s:usr/src/man/man1m/zfs.1m:man/man8/zfs.8:g
-+s:usr/src/uts/common/sys:include/sys:g
-+s:usr/src/lib/libzfs_core/common/libzfs_core.h:include/libzfs_core.h:g
-+s:usr/src/lib/libzfs/common:lib/libzfs:g
-+s:usr/src/lib/libzfs_core/common:lib/libzfs_core:g
-+s:lib/libzpool/common/sys:include/sys:g
-+s:lib/libzpool/common:lib/libzpool:g
-diff --git a/scripts/zimport.sh b/scripts/zimport.sh
-new file mode 100755
-index 0000000..8a6cdf0
---- /dev/null
-+++ b/scripts/zimport.sh
-@@ -0,0 +1,495 @@
-+#!/bin/bash
-+#
-+# Verify that an assortment of known good reference pools can be imported
-+# using different versions of the ZoL code.
-+#
-+# By default reference pools for the major ZFS implementations will be
-+# checked against the most recent ZoL tags and the master development branch.
-+# Alternate tags or branches may be verified with the '-s' option.
-+# Passing the keyword "installed" will instruct the script to test whatever
-+# version is installed.
-+#
-+# Preferentially a reference pool is used for all tests. However, if one
-+# does not exist and the pool-tag matches one of the src-tags then a new
-+# reference pool will be created using binaries from that source build.
-+# This is particularly useful when you need to test your changes before
-+# opening a pull request. The keyword 'all' can be used as shorthand to
-+# refer to all available reference pools.
-+#
-+# New reference pools may be added by placing a bzip2 compressed tarball
-+# of the pool in the scripts/zfs-images directory and then passing
-+# the -p option. To increase the test coverage, reference pools
-+# should be collected for all the major ZFS implementations. Having these
-+# pools easily available is also helpful to the developers.
-+#
-+# Care should be taken to run these tests with a kernel supported by all
-+# the listed tags. Otherwise build failures will cause false positives.
-+#
-+#
-+# EXAMPLES:
-+#
-+# The following example will verify the zfs-0.6.2 tag, the master branch,
-+# and the installed zfs version can correctly import the listed pools.
-+# Note there is no reference pool available for master and installed but -+# because binaries are available one is automatically constructed. The -+# working directory is also preserved between runs (-k) preventing the -+# need to rebuild from source for multiple runs. -+# -+# zimport.sh -k -f /var/tmp/zimport \ -+# -s "zfs-0.6.2 master installed" \ -+# -p "zevo-1.1.1 zol-0.6.2 zol-0.6.2-173 master installed" -+# -+# --------------------- ZFS on Linux Source Versions -------------- -+# zfs-0.6.2 master 0.6.2-175_g36eb554 -+# ----------------------------------------------------------------- -+# Clone SPL Local Local Skip -+# Clone ZFS Local Local Skip -+# Build SPL Pass Pass Skip -+# Build ZFS Pass Pass Skip -+# ----------------------------------------------------------------- -+# zevo-1.1.1 Pass Pass Pass -+# zol-0.6.2 Pass Pass Pass -+# zol-0.6.2-173 Fail Pass Pass -+# master Pass Pass Pass -+# installed Pass Pass Pass -+# -+basedir="$(dirname $0)" -+ -+SCRIPT_COMMON=common.sh -+if [ -f "${basedir}/${SCRIPT_COMMON}" ]; then -+. "${basedir}/${SCRIPT_COMMON}" -+else -+echo "Missing helper script ${SCRIPT_COMMON}" && exit 1 -+fi -+ -+PROG=zimport.sh -+ -+SRC_TAGS="zfs-0.6.1 zfs-0.6.2 master" -+POOL_TAGS="all master" -+TEST_DIR=`mktemp -u -d -p /var/tmp zimport.XXXXXXXX` -+KEEP=0 -+VERBOSE=0 -+COLOR=1 -+REPO="https://github.com/zfsonlinux" -+IMAGES_DIR="$SCRIPTDIR/zfs-images/" -+IMAGES_TAR="https://github.com/zfsonlinux/zfs-images/tarball/master" -+CPUS=`grep -c ^processor /proc/cpuinfo` -+ERROR=0 -+ -+usage() { -+cat << EOF -+USAGE: -+zimport.sh [hvl] [-r repo] [-s src-tag] [-i pool-dir] [-p pool-tag] [-f path] -+ -+DESCRIPTION: -+ ZPOOL import verification tests -+ -+OPTIONS: -+ -h Show this message -+ -v Verbose -+ -c No color -+ -k Keep temporary directory -+ -r Source repository ($REPO) -+ -s ... Verify ZoL versions with the listed tags -+ -i Pool image directory -+ -p ... Verify pools created with the listed tags -+ -f Temporary directory to use -+ -+EOF -+} -+ -+while getopts 'hvckr:s:i:p:f:?' OPTION; do -+ case $OPTION in -+ h) -+ usage -+ exit 1 -+ ;; -+ v) -+ VERBOSE=1 -+ ;; -+ c) -+ COLOR=0 -+ ;; -+ k) -+ KEEP=1 -+ ;; -+ r) -+ REPO="$OPTARG" -+ ;; -+ s) -+ SRC_TAGS="$OPTARG" -+ ;; -+ i) -+ IMAGES_DIR="$OPTARG" -+ ;; -+ p) -+ POOL_TAGS="$OPTARG" -+ ;; -+ f) -+ TEST_DIR="$OPTARG" -+ ;; -+ ?) -+ usage -+ exit -+ ;; -+ esac -+done -+ -+# Initialize the test suite -+init -+check_modules || die "ZFS modules must be unloaded" -+ -+SRC_DIR="$TEST_DIR/src" -+SRC_DIR_SPL="$SRC_DIR/spl" -+SRC_DIR_ZFS="$SRC_DIR/zfs" -+ -+if [ $COLOR -eq 0 ]; then -+ COLOR_GREEN="" -+ COLOR_BROWN="" -+ COLOR_RED="" -+ COLOR_RESET="" -+fi -+ -+pass_nonewline() { -+ echo -n -e "${COLOR_GREEN}Pass${COLOR_RESET}\t\t" -+} -+ -+skip_nonewline() { -+ echo -n -e "${COLOR_BROWN}Skip${COLOR_RESET}\t\t" -+} -+ -+fail_nonewline() { -+ echo -n -e "${COLOR_RED}Fail${COLOR_RESET}\t\t" -+} -+ -+# -+# Set several helper variables which are derived from a source tag. -+# -+# SPL_TAG - The tag zfs-x.y.z is translated to spl-x.y.z. -+# SPL_DIR - The spl directory name. -+# SPL_URL - The spl github URL to fetch the tarball. 
-+# ZFS_TAG - The passed zfs-x.y.z tag
-+# ZFS_DIR - The zfs directory name
-+# ZFS_URL - The zfs github URL to fetch the tarball
-+#
-+src_set_vars() {
-+ local TAG=$1
-+
-+ SPL_TAG=`echo $TAG | sed -e 's/zfs/spl/'`
-+ SPL_DIR=$SRC_DIR_SPL/$SPL_TAG
-+ SPL_URL=$REPO/spl/tarball/$SPL_TAG
-+
-+ ZFS_TAG=$TAG
-+ ZFS_DIR=$SRC_DIR_ZFS/$ZFS_TAG
-+ ZFS_URL=$REPO/zfs/tarball/$ZFS_TAG
-+
-+ if [ "$TAG" = "installed" ]; then
-+ ZPOOL_CMD=`which zpool`
-+ ZFS_CMD=`which zfs`
-+ ZFS_SH="/usr/share/zfs/zfs.sh"
-+ ZPOOL_CREATE="/usr/share/zfs/zpool-create.sh"
-+ else
-+ ZPOOL_CMD="./cmd/zpool/zpool"
-+ ZFS_CMD="./cmd/zfs/zfs"
-+ ZFS_SH="./scripts/zfs.sh"
-+ ZPOOL_CREATE="./scripts/zpool-create.sh"
-+ fi
-+}
-+
-+#
-+# Set several helper variables which are derived from a pool name such
-+# as zol-0.6.x, zevo-1.1.1, etc. These refer to example pools from various
-+# ZFS implementations which are used to verify compatibility.
-+#
-+# POOL_TAG - The example pool's name in scripts/zfs-images/.
-+# POOL_BZIP - The full path to the example bzip2 compressed pool.
-+# POOL_DIR - The top level test path for this pool.
-+# POOL_DIR_PRISTINE - The directory containing a pristine version of the pool.
-+# POOL_DIR_COPY - The directory containing a working copy of the pool.
-+# POOL_DIR_SRC - Location of a source build if it exists for this pool.
-+#
-+pool_set_vars() {
-+ local TAG=$1
-+
-+ POOL_TAG=$TAG
-+ POOL_BZIP=$IMAGES_DIR/$POOL_TAG.tar.bz2
-+ POOL_DIR=$TEST_DIR/pools/$POOL_TAG
-+ POOL_DIR_PRISTINE=$POOL_DIR/pristine
-+ POOL_DIR_COPY=$POOL_DIR/copy
-+ POOL_DIR_SRC=`echo -n "$SRC_DIR_ZFS/"; \
-+ echo "$POOL_TAG" | sed -e 's/zol/zfs/'`
-+}
-+
-+#
-+# Construct a non-trivial pool given a specific version of the source. More
-+# interesting pools provide better test coverage so this function should
-+# be extended as needed to create more realistic pools.
-+#
-+pool_create() {
-+ pool_set_vars $1
-+ src_set_vars $1
-+
-+ if [ "$POOL_TAG" != "installed" ]; then
-+ cd $POOL_DIR_SRC
-+ fi
-+
-+ $ZFS_SH zfs="spa_config_path=$POOL_DIR_PRISTINE" || fail
-+
-+ # Create a file vdev RAIDZ pool.
-+ FILEDIR="$POOL_DIR_PRISTINE" $ZPOOL_CREATE \
-+ -c file-raidz -p $POOL_TAG -v >/dev/null || fail
-+
-+ # Create a pool/fs filesystem with some random contents.
-+ $ZFS_CMD create $POOL_TAG/fs || fail
-+ populate /$POOL_TAG/fs/ 10 100
-+
-+ # Snapshot that filesystem, clone it, remove the files/dirs,
-+ # replace them with new files/dirs.
-+ $ZFS_CMD snap $POOL_TAG/fs@snap || fail
-+ $ZFS_CMD clone $POOL_TAG/fs@snap $POOL_TAG/clone || fail
-+ rm -Rf /$POOL_TAG/clone/* || fail
-+ populate /$POOL_TAG/clone/ 10 100
-+
-+ # Scrub the pool, delay slightly, then export it. It is now
-+ # somewhat interesting for testing purposes.
-+ $ZPOOL_CMD scrub $POOL_TAG || fail
-+ sleep 10
-+ $ZPOOL_CMD export $POOL_TAG || fail
-+
-+ $ZFS_SH -u || fail
-+}
-+
-+# If the zfs-images directory doesn't exist, fetch a copy from Github, then
-+# cache it in the $TEST_DIR and update $IMAGES_DIR.
-+if [ ! -d $IMAGES_DIR ]; then
-+ IMAGES_DIR="$TEST_DIR/zfs-images"
-+ mkdir -p $IMAGES_DIR
-+ curl -sL $IMAGES_TAR | \
-+ tar -xz -C $IMAGES_DIR --strip-components=1 || fail
-+fi
-+
-+# Given the available images in the zfs-images directory, substitute the
-+# list of available images for the reserved keyword 'all'.
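-+# For example, with zol-0.6.2.tar.bz2 and zevo-1.1.1.tar.bz2 present in that
-+# directory, the keyword 'all' expands to "zol-0.6.2 zevo-1.1.1".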
-+for TAG in $POOL_TAGS; do -+ -+ if [ "$TAG" = "all" ]; then -+ ALL_TAGS=`ls $IMAGES_DIR | grep "tar.bz2" | \ -+ sed 's/.tar.bz2//' | tr '\n' ' '` -+ NEW_TAGS="$NEW_TAGS $ALL_TAGS" -+ else -+ NEW_TAGS="$NEW_TAGS $TAG" -+ fi -+done -+POOL_TAGS="$NEW_TAGS" -+ -+if [ $VERBOSE -ne 0 ]; then -+ echo "---------------------------- Options ----------------------------" -+ echo "VERBOSE=$VERBOSE" -+ echo "KEEP=$KEEP" -+ echo "REPO=$REPO" -+ echo "SRC_TAGS="$SRC_TAGS"" -+ echo "POOL_TAGS="$POOL_TAGS"" -+ echo "PATH=$TEST_DIR" -+ echo -+fi -+ -+if [ ! -d $TEST_DIR ]; then -+ mkdir -p $TEST_DIR -+fi -+ -+# Print a header for all tags which are being tested. -+echo "--------------------- ZFS on Linux Source Versions --------------" -+printf "%-16s" " " -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ "$TAG" = "installed" ]; then -+ ZFS_VERSION=`modinfo zfs | awk '/version:/ { print $2; exit }'` -+ if [ -n "$ZFS_VERSION" ]; then -+ printf "%-16s" $ZFS_VERSION -+ else -+ echo "ZFS is not installed\n" -+ fail -+ fi -+ else -+ printf "%-16s" $TAG -+ fi -+done -+echo -e "\n-----------------------------------------------------------------" -+ -+# -+# Attempt to generate the tarball from your local git repository, if that -+# fails then attempt to download the tarball from Github. -+# -+printf "%-16s" "Clone SPL" -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ -d $SPL_DIR ]; then -+ skip_nonewline -+ elif [ "$SPL_TAG" = "installed" ]; then -+ skip_nonewline -+ else -+ cd $SPLSRC -+ -+ if [ ! -d $SRC_DIR_SPL ]; then -+ mkdir -p $SRC_DIR_SPL -+ fi -+ -+ git archive --format=tar --prefix=$SPL_TAG/ $SPL_TAG \ -+ -o $SRC_DIR_SPL/$SPL_TAG.tar &>/dev/nul || \ -+ rm $SRC_DIR_SPL/$SPL_TAG.tar -+ if [ -s $SRC_DIR_SPL/$SPL_TAG.tar ]; then -+ tar -xf $SRC_DIR_SPL/$SPL_TAG.tar -C $SRC_DIR_SPL -+ rm $SRC_DIR_SPL/$SPL_TAG.tar -+ echo -n -e "${COLOR_GREEN}Local${COLOR_RESET}\t\t" -+ else -+ mkdir -p $SPL_DIR || fail -+ curl -sL $SPL_URL | tar -xz -C $SPL_DIR \ -+ --strip-components=1 || fail -+ echo -n -e "${COLOR_GREEN}Remote${COLOR_RESET}\t\t" -+ fi -+ fi -+done -+printf "\n" -+ -+# -+# Attempt to generate the tarball from your local git repository, if that -+# fails then attempt to download the tarball from Github. -+# -+printf "%-16s" "Clone ZFS" -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ -d $ZFS_DIR ]; then -+ skip_nonewline -+ elif [ "$ZFS_TAG" = "installed" ]; then -+ skip_nonewline -+ else -+ cd $SRCDIR -+ -+ if [ ! 
-d $SRC_DIR_ZFS ]; then -+ mkdir -p $SRC_DIR_ZFS -+ fi -+ -+ git archive --format=tar --prefix=$ZFS_TAG/ $ZFS_TAG \ -+ -o $SRC_DIR_ZFS/$ZFS_TAG.tar &>/dev/nul || \ -+ rm $SRC_DIR_ZFS/$ZFS_TAG.tar -+ if [ -s $SRC_DIR_ZFS/$ZFS_TAG.tar ]; then -+ tar -xf $SRC_DIR_ZFS/$ZFS_TAG.tar -C $SRC_DIR_ZFS -+ rm $SRC_DIR_ZFS/$ZFS_TAG.tar -+ echo -n -e "${COLOR_GREEN}Local${COLOR_RESET}\t\t" -+ else -+ mkdir -p $ZFS_DIR || fail -+ curl -sL $ZFS_URL | tar -xz -C $ZFS_DIR \ -+ --strip-components=1 || fail -+ echo -n -e "${COLOR_GREEN}Remote${COLOR_RESET}\t\t" -+ fi -+ fi -+done -+printf "\n" -+ -+# Build the listed tags -+printf "%-16s" "Build SPL" -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ -f $SPL_DIR/module/spl/spl.ko ]; then -+ skip_nonewline -+ elif [ "$SPL_TAG" = "installed" ]; then -+ skip_nonewline -+ else -+ cd $SPL_DIR -+ make distclean &>/dev/null -+ sh ./autogen.sh &>/dev/null || fail -+ ./configure &>/dev/null || fail -+ make -s -j$CPUS &>/dev/null || fail -+ pass_nonewline -+ fi -+done -+printf "\n" -+ -+# Build the listed tags -+printf "%-16s" "Build ZFS" -+for TAG in $SRC_TAGS; do -+ src_set_vars $TAG -+ -+ if [ -f $ZFS_DIR/module/zfs/zfs.ko ]; then -+ skip_nonewline -+ elif [ "$ZFS_TAG" = "installed" ]; then -+ skip_nonewline -+ else -+ cd $ZFS_DIR -+ make distclean &>/dev/null -+ sh ./autogen.sh &>/dev/null || fail -+ ./configure --with-spl=$SPL_DIR &>/dev/null || fail -+ make -s -j$CPUS &>/dev/null || fail -+ pass_nonewline -+ fi -+done -+printf "\n" -+echo "-----------------------------------------------------------------" -+ -+# Either create a new pool using 'zpool create', or alternately restore an -+# existing pool from another ZFS implementation for compatibility testing. -+for TAG in $POOL_TAGS; do -+ pool_set_vars $TAG -+ SKIP=0 -+ -+ printf "%-16s" $POOL_TAG -+ rm -Rf $POOL_DIR -+ mkdir -p $POOL_DIR_PRISTINE -+ -+ # Use the existing compressed image if available. -+ if [ -f $POOL_BZIP ]; then -+ tar -xjf $POOL_BZIP -C $POOL_DIR_PRISTINE \ -+ --strip-components=1 || fail -+ # Use the installed version to create the pool. -+ elif [ "$TAG" = "installed" ]; then -+ pool_create $TAG -+ # A source build is available to create the pool. -+ elif [ -d $POOL_DIR_SRC ]; then -+ pool_create $TAG -+ else -+ SKIP=1 -+ fi -+ -+ # Verify 'zpool import' works for all listed source versions. -+ for TAG in $SRC_TAGS; do -+ -+ if [ $SKIP -eq 1 ]; then -+ skip_nonewline -+ continue -+ fi -+ -+ src_set_vars $TAG -+ if [ "$TAG" != "installed" ]; then -+ cd $ZFS_DIR -+ fi -+ $ZFS_SH zfs="spa_config_path=$POOL_DIR_COPY" -+ -+ cp -a --sparse=always $POOL_DIR_PRISTINE $POOL_DIR_COPY || fail -+ POOL_NAME=`$ZPOOL_CMD import -d $POOL_DIR_COPY | \ -+ awk '/pool:/ { print $2; exit 0 }'` -+ -+ $ZPOOL_CMD import -N -d $POOL_DIR_COPY $POOL_NAME &>/dev/null -+ if [ $? -ne 0 ]; then -+ fail_nonewline -+ ERROR=1 -+ else -+ $ZPOOL_CMD export $POOL_NAME || fail -+ pass_nonewline -+ fi -+ -+ rm -Rf $POOL_DIR_COPY -+ -+ $ZFS_SH -u || fail -+ done -+ printf "\n" -+done -+ -+if [ ! 
$KEEP ]; then -+ rm -Rf $TEST_DIR -+fi -+ -+exit $ERROR -diff --git a/scripts/zpool-config/file-raid0.sh b/scripts/zpool-config/file-raid0.sh -index 5ec80b0..ff11836 100644 ---- a/scripts/zpool-config/file-raid0.sh -+++ b/scripts/zpool-config/file-raid0.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - -diff --git a/scripts/zpool-config/file-raid10.sh b/scripts/zpool-config/file-raid10.sh -index ae7f0ae..fa297b4 100644 ---- a/scripts/zpool-config/file-raid10.sh -+++ b/scripts/zpool-config/file-raid10.sh -@@ -5,6 +5,5 @@ - --FILES_M1="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1" --FILES_M2="/tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES_M1=${FILES_M1:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1"} -+FILES_M2=${FILES_M2:-"$FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - FILES="${FILES_M1} ${FILES_M2}" -diff --git a/scripts/zpool-config/file-raidz.sh b/scripts/zpool-config/file-raidz.sh -index 5b6c3ea..768e3de 100644 ---- a/scripts/zpool-config/file-raidz.sh -+++ b/scripts/zpool-config/file-raidz.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - -diff --git a/scripts/zpool-config/file-raidz2.sh b/scripts/zpool-config/file-raidz2.sh -index bc0e5ec..b1c18f4 100644 ---- a/scripts/zpool-config/file-raidz2.sh -+++ b/scripts/zpool-config/file-raidz2.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - -diff --git a/scripts/zpool-config/lo-faulty-raid0.sh b/scripts/zpool-config/lo-faulty-raid0.sh -index 10b8f88..bf057bb 100644 ---- a/scripts/zpool-config/lo-faulty-raid0.sh -+++ b/scripts/zpool-config/lo-faulty-raid0.sh -@@ -11,6 +11,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - LODEVICES="" -diff --git a/scripts/zpool-config/lo-faulty-raid10.sh b/scripts/zpool-config/lo-faulty-raid10.sh -index ef81abb..0a3720a 100644 ---- a/scripts/zpool-config/lo-faulty-raid10.sh -+++ b/scripts/zpool-config/lo-faulty-raid10.sh -@@ -11,6 +11,6 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES_M1=${FILES_M1:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1"} -+FILES_M2=${FILES_M2:-"$FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} -+FILES="${FILES_M1} ${FILES_M2}" - LODEVICES="" -diff --git a/scripts/zpool-config/lo-faulty-raidz.sh b/scripts/zpool-config/lo-faulty-raidz.sh -index 2f1f08a..07fd145 100644 ---- a/scripts/zpool-config/lo-faulty-raidz.sh -+++ b/scripts/zpool-config/lo-faulty-raidz.sh -@@ -11,6 +11,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - LODEVICES="" -diff --git a/scripts/zpool-config/lo-faulty-raidz2.sh 
b/scripts/zpool-config/lo-faulty-raidz2.sh -index 2522fa7..4456a56 100644 ---- a/scripts/zpool-config/lo-faulty-raidz2.sh -+++ b/scripts/zpool-config/lo-faulty-raidz2.sh -@@ -11,6 +11,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - LODEVICES="" -diff --git a/scripts/zpool-config/lo-raid0.sh b/scripts/zpool-config/lo-raid0.sh -index f24050f..1f23fe1 100644 ---- a/scripts/zpool-config/lo-raid0.sh -+++ b/scripts/zpool-config/lo-raid0.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - DEVICES="" -diff --git a/scripts/zpool-config/lo-raid10.sh b/scripts/zpool-config/lo-raid10.sh -index f9fe3c0..18c1dcb 100644 ---- a/scripts/zpool-config/lo-raid10.sh -+++ b/scripts/zpool-config/lo-raid10.sh -@@ -5,6 +5,5 @@ - --FILES_M1="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1" --FILES_M2="/tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES_M1=${FILES_M1:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1"} -+FILES_M2=${FILES_M2:-"$FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - FILES="${FILES_M1} ${FILES_M2}" -diff --git a/scripts/zpool-config/lo-raidz.sh b/scripts/zpool-config/lo-raidz.sh -index db5de7c..483baf7 100644 ---- a/scripts/zpool-config/lo-raidz.sh -+++ b/scripts/zpool-config/lo-raidz.sh -@@ -4,7 +4,5 @@ - # -- --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - DEVICES="" -diff --git a/scripts/zpool-config/lo-raidz2.sh b/scripts/zpool-config/lo-raidz2.sh -index 53a032e..ea52236 100644 ---- a/scripts/zpool-config/lo-raidz2.sh -+++ b/scripts/zpool-config/lo-raidz2.sh -@@ -5,6 +5,5 @@ - --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2 \ -- /tmp/zpool-vdev3" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 \ -+ $FILEDIR/file-vdev2 $FILEDIR/file-vdev3"} - DEVICES="" -diff --git a/scripts/zpool-config/scsi_debug-raid0.sh b/scripts/zpool-config/scsi_debug-raid0.sh -index 797ea80..fc09798 100644 ---- a/scripts/zpool-config/scsi_debug-raid0.sh -+++ b/scripts/zpool-config/scsi_debug-raid0.sh -@@ -11,5 +11,4 @@ SDLUNS=${SDLUNS:-1} - LDMOD=/sbin/modprobe --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} - DEVICES="" -diff --git a/scripts/zpool-config/scsi_debug-raid10.sh b/scripts/zpool-config/scsi_debug-raid10.sh -index 4ec205b..3c1f733 100644 ---- a/scripts/zpool-config/scsi_debug-raid10.sh -+++ b/scripts/zpool-config/scsi_debug-raid10.sh -@@ -11,5 +11,4 @@ SDLUNS=${SDLUNS:-1} - LDMOD=/sbin/modprobe --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} - DEVICES_M1="" -diff --git a/scripts/zpool-config/scsi_debug-raidz.sh b/scripts/zpool-config/scsi_debug-raidz.sh -index c811a01..54a4565 100644 ---- a/scripts/zpool-config/scsi_debug-raidz.sh -+++ b/scripts/zpool-config/scsi_debug-raidz.sh -@@ -11,5 +11,4 @@ 
SDLUNS=${SDLUNS:-1} - LDMOD=/sbin/modprobe --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} - DEVICES="" -diff --git a/scripts/zpool-config/scsi_debug-raidz2.sh b/scripts/zpool-config/scsi_debug-raidz2.sh -index 429a841..fa6e77a 100644 ---- a/scripts/zpool-config/scsi_debug-raidz2.sh -+++ b/scripts/zpool-config/scsi_debug-raidz2.sh -@@ -11,5 +11,4 @@ SDLUNS=${SDLUNS:-1} - LDMOD=/sbin/modprobe --FILES="/tmp/zpool-vdev0 \ -- /tmp/zpool-vdev1 \ -- /tmp/zpool-vdev2" -+FILEDIR=${FILEDIR:-/var/tmp} -+FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} - DEVICES="" -diff --git a/zfs-script-config.sh.in b/zfs-script-config.sh.in -index ba676c8..10d24f0 100644 ---- a/zfs-script-config.sh.in -+++ b/zfs-script-config.sh.in -@@ -38,2 +38,4 @@ LDMOD=/sbin/insmod - -+ZED_PIDFILE=@runstatedir@/zed.pid -+ - KERNEL_MODULES=( \ --- -1.9.2 -
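The zpool-config fragments above now take their file vdevs from FILEDIR, which defaults to /var/tmp rather than hard-coded /tmp paths, and they leave FILES overridable. A minimal sketch of how a caller relocates the backing files, modelled on pool_create() in zimport.sh; the scratch directory and the pool name 'tank' below are illustrative only:

    # Build a file-backed raidz pool whose vdevs live under a chosen directory,
    # relying on the FILEDIR=${FILEDIR:-/var/tmp} default added above.
    WORKDIR=/var/tmp/zpool-test            # illustrative scratch directory
    mkdir -p "$WORKDIR"
    FILEDIR="$WORKDIR" ./scripts/zpool-create.sh -c file-raidz -p tank -v

    # Alternatively, the vdev list itself may be overridden outright.
    FILES="$WORKDIR/file-vdev0 $WORKDIR/file-vdev1 $WORKDIR/file-vdev2 $WORKDIR/file-vdev3" \
        ./scripts/zpool-create.sh -c file-raidz -p tank -v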
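For each pool image, the compatibility check in zimport.sh reduces to an import/export round trip against a directory of file vdevs. A stand-alone sketch of that round trip, assuming a pool image has already been unpacked into the copy directory and the in-tree zpool binary has been built; the path below is illustrative:

    POOL_DIR_COPY=/var/tmp/zimport/pools/zol-0.6.2/copy   # illustrative path
    ZPOOL_CMD=./cmd/zpool/zpool

    # Discover which pool the image contains, as zimport.sh does.
    POOL_NAME=$($ZPOOL_CMD import -d "$POOL_DIR_COPY" |
        awk '/pool:/ { print $2; exit 0 }')

    # Import without mounting datasets (-N); success means this code base
    # can read the on-disk format written by the other implementation.
    if $ZPOOL_CMD import -N -d "$POOL_DIR_COPY" "$POOL_NAME"; then
        $ZPOOL_CMD export "$POOL_NAME"
        echo "Pass"
    else
        echo "Fail"
    fi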