From ab50973958be81ff3a90c310f9ace45954fb34cb Mon Sep 17 00:00:00 2001
From: Paul Dagnelie
Date: Thu, 24 Apr 2025 13:01:00 -0700
Subject: [PATCH] Add allocation profile export and zleak utility for import

When attempting to debug performance problems on large systems, one of
the major factors that affect performance is free space fragmentation.
This heavily affects the allocation process, which is an area of active
development in ZFS. Unfortunately, fragmenting a large pool for testing
purposes is time consuming; it usually involves filling the pool and
then repeatedly overwriting data until the free space becomes
fragmented, which can take many hours. And even if the time is
available, artificial workloads rarely generate the same fragmentation
patterns as the natural workloads they're attempting to mimic.

This patch has two parts. First, in zdb, we add the ability to export
the full allocation map of the pool. It iterates over the metaslabs of
each vdev and prints every allocated segment, i.e. every gap between the
free segments recorded in the metaslab's ms_allocatable range tree. This
can be done while the pool is online, though in that case the allocation
map may actually be from several different TXGs, because metaslabs are
loaded on demand.

The second is a new utility called zleak (and its supporting library and
kernel changes). This is a small Python program that invokes a new ioctl
(via libzfs_core): zfs_ioc_raw_alloc. This ioctl takes in an nvlist of
allocations to perform, and then allocates them. It does not currently
store those allocations anywhere to make them reversible, and there is
no corresponding raw_free ioctl (which would be extremely dangerous);
this is an irreversible process, only intended for performance testing.
The only way to reclaim the space afterwards is to destroy the pool or
roll back to a checkpoint.

Signed-off-by: Paul Dagnelie
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
--- cmd/Makefile.am | 8 +- cmd/zdb/zdb.c | 45 ++++++++-- cmd/zdb/zdb.h | 2 +- cmd/zdb/zdb_il.c | 2 +- cmd/zleak | 85 +++++++++++++++++++ contrib/debian/openzfs-zfsutils.install | 1 + contrib/debian/rules.in | 1 + contrib/pyzfs/libzfs_core/__init__.py | 2 + .../pyzfs/libzfs_core/_error_translation.py | 6 ++ contrib/pyzfs/libzfs_core/_libzfs_core.py | 32 +++++++ .../pyzfs/libzfs_core/bindings/libzfs_core.py | 3 + contrib/pyzfs/libzfs_core/exceptions.py | 4 + include/libzfs_core.h | 2 + include/sys/fs/zfs.h | 3 +- include/sys/metaslab.h | 3 + include/sys/vdev.h | 3 + lib/libzfs_core/libzfs_core.abi | 31 +++++-- lib/libzfs_core/libzfs_core.c | 21 +++++ module/zfs/metaslab.c | 28 ++++++ module/zfs/vdev.c | 41 +++++++++ module/zfs/zfs_ioctl.c | 77 +++++++++++++++++ rpm/generic/zfs.spec.in | 3 +- scripts/spdxcheck.pl | 1 + 23 files changed, 386 insertions(+), 18 deletions(-) create mode 100755 cmd/zleak diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 96040976e53e..8ad66dde4618 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -98,15 +98,17 @@ endif if USING_PYTHON -bin_SCRIPTS += arc_summary arcstat dbufstat zilstat -CLEANFILES += arc_summary arcstat dbufstat zilstat -dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in +bin_SCRIPTS += arc_summary arcstat dbufstat zilstat zleak +CLEANFILES += arc_summary arcstat dbufstat zilstat zleak +dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in %D%/zleak $(call SUBST,arcstat,%D%/) $(call SUBST,dbufstat,%D%/) $(call SUBST,zilstat,%D%/) arc_summary: %D%/arc_summary $(AM_V_at)cp $< $@ +zleak: %D%/zleak + $(AM_V_at)cp $< $@ endif diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 75b54ab4ea56..be6adcd083cf 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -107,7 +107,9 @@ extern uint_t zfs_reconstruct_indirect_combinations_max; extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; -uint8_t dump_opt[256]; +uint8_t dump_opt[512]; + +#define 
ALLOCATABLE_OPT 256
 
 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
@@ -1650,6 +1652,22 @@ dump_metaslab_stats(metaslab_t *msp)
 	dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
+/*
+ * zfs_range_tree_walk() callback for the allocatable-map dump.  arg points
+ * at a running offset.  The walked tree holds allocatable (free) segments,
+ * so the space between the running offset and the start of this segment is
+ * allocated: print it (if non-empty) and move the offset past the segment.
+ */
+static void
+dump_allocated(void *arg, uint64_t start, uint64_t size)
+{
+	uint64_t *off = arg;
+	if (*off != start)
+		(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", *off,
+		    start - *off);
+	*off = start + size;
+}
+
 static void
 dump_metaslab(metaslab_t *msp)
 {
@@ -1666,13 +1684,26 @@ dump_metaslab(metaslab_t *msp)
 	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
 	    (u_longlong_t)space_map_object(sm), freebuf);
 
-	if (dump_opt['m'] > 2 && !dump_opt['L']) {
+	if (dump_opt[ALLOCATABLE_OPT] ||
+	    (dump_opt['m'] > 2 && !dump_opt['L'])) {
 		mutex_enter(&msp->ms_lock);
 		VERIFY0(metaslab_load(msp));
+	}
+
+	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		zfs_range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
-		metaslab_unload(msp);
-		mutex_exit(&msp->ms_lock);
+	}
+
+	if (dump_opt[ALLOCATABLE_OPT]) {
+		uint64_t off = msp->ms_start;
+		zfs_range_tree_walk(msp->ms_allocatable, dump_allocated,
+		    &off);
+		/* off is absolute, so measure the tail from the ms end. */
+		uint64_t ms_end = msp->ms_start + msp->ms_size;
+		if (off != ms_end)
+			(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off,
+			    ms_end - off);
 	}
 
 	if (dump_opt['m'] > 1 && sm != NULL &&
@@ -1687,6 +1718,12 @@ dump_metaslab(metaslab_t *msp)
 		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 	}
 
+	if (dump_opt[ALLOCATABLE_OPT] ||
+	    (dump_opt['m'] > 2 && !dump_opt['L'])) {
+		metaslab_unload(msp);
+		mutex_exit(&msp->ms_lock);
+	}
+
 	if (vd->vdev_ops == &vdev_draid_ops)
 		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
 	else
@@ -1723,8 +1760,9 @@ print_vdev_metaslab_header(vdev_t *vd)
 		}
 	}
 
-	(void) printf("\tvdev %10llu %s",
-	    (u_longlong_t)vd->vdev_id, bias_str);
+	(void) printf("\tvdev %10llu\t%s metaslab shift %4llu",
+	    (u_longlong_t)vd->vdev_id, bias_str,
+	    (u_longlong_t)vd->vdev_ms_shift);
 
 	if (ms_flush_data_obj != 0) {
 		(void) printf(" ms_unflushed_phys object %llu",
@@ -9315,6
+9345,8 @@ main(int argc, char **argv)
 		{"all-reconstruction",	no_argument,		NULL, 'Y'},
 		{"livelist",		no_argument,		NULL, 'y'},
 		{"zstd-headers",	no_argument,		NULL, 'Z'},
+		{"allocatable-map",	no_argument,		NULL,
+		    ALLOCATABLE_OPT},
 		{0, 0, 0, 0}
 	};
 
@@ -9345,6 +9377,7 @@ main(int argc, char **argv)
 		case 'u':
 		case 'y':
 		case 'Z':
+		case ALLOCATABLE_OPT:
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
diff --git a/cmd/zdb/zdb.h b/cmd/zdb/zdb.h
index 6b6c9169816b..48b561eb202c 100644
--- a/cmd/zdb/zdb.h
+++ b/cmd/zdb/zdb.h
@@ -29,6 +29,6 @@
 #define _ZDB_H
 
 void dump_intent_log(zilog_t *);
-extern uint8_t dump_opt[256];
+extern uint8_t dump_opt[512];
 
 #endif /* _ZDB_H */
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
index 6b90b08ca1b1..ab63e8bd2b4a 100644
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@@ -48,7 +48,7 @@
 
 #include "zdb.h"
 
-extern uint8_t dump_opt[256];
+extern uint8_t dump_opt[512];
 
 static char tab_prefix[4] = "\t\t\t";
 
diff --git a/cmd/zleak b/cmd/zleak
new file mode 100755
index 000000000000..7112ece6fe97
--- /dev/null
+++ b/cmd/zleak
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: CDDL-1.0
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara, Inc.
+#
+
+import argparse, fileinput, libzfs_core, sys, errno
+
+def perform_raw_alloc(pool, ms_shift, ms_count, vdev_id, allocs, force,
+                      verbose):
+    """
+    Submit one batch of allocations for a single top-level vdev.
+
+    allocs is a non-empty list of (offset, size) pairs. ms_shift and ms_count
+    come from the dump's vdev and metaslabs header lines and are passed to
+    lzc_raw_alloc() together with force. Exits with status 1 if the call
+    fails with EINVAL and with status 0 if it fails with E2BIG while force
+    is set; any other error is re-raised.
+    """
+    if verbose == 1:
+        print(f"Raw alloc: vdev {vdev_id}, {len(allocs)} starting with "
+              f"offset {allocs[0][0]}")
+    if verbose >= 2:
+        print(f"Raw alloc: {pool} {ms_shift} {ms_count} {vdev_id} "
+              f"{len(allocs)}")
+    try:
+        libzfs_core.lzc_raw_alloc(pool, 1 << ms_shift, ms_count, vdev_id,
+                                  allocs, force)
+    except libzfs_core.exceptions.ZFSGenericError as e:
+        if e.errno == errno.EINVAL:
+            print("Invalid map for provided pool")
+            sys.exit(1)
+        # Tested explicitly instead of with assert: "python -O" strips
+        # asserts, which would turn every other failure into exit status 0.
+        if e.errno == errno.E2BIG and force:
+            sys.exit(0)
+        raise
+
+allocs = []
+# Header values of the vdev being read; None until a "\tvdev" line is seen.
+vdev_id = ms_shift = ms_count = None
+
+parser = argparse.ArgumentParser(
+    prog='zleak',
+    description='facility to replicate free space fragmentation in ZFS'
+)
+parser.add_argument('poolname')
+parser.add_argument('-v', '--verbose', action='count', default=0)
+parser.add_argument('-f', '--force', action='store_true', default=False)
+args = parser.parse_args()
+
+pool = args.poolname.encode('utf-8')
+
+for line in fileinput.input('-'):
+    line = line.rstrip()
+    if not line.startswith(("ALLOC: ", "\tvdev ", "\tmetaslabs ")):
+        continue
+
+    tokens = line.split()
+    if line.startswith("\tvdev "):
+        # A new vdev begins: submit what was collected for the previous one
+        # before its header values are replaced.
+        if allocs:
+            perform_raw_alloc(pool, ms_shift, ms_count, vdev_id, allocs,
+                              args.force, args.verbose)
+            allocs = []
+        vdev_id = int(tokens[1])
+        # zdb prints a bias string (possibly empty) ahead of "metaslab
+        # shift", so find the value by keyword rather than by position.
+        ms_shift = int(tokens[tokens.index("shift") + 1])
+        ms_count = 0
+    elif line.startswith("\tmetaslabs "):
+        ms_count = int(tokens[1])
+    else:
+        if vdev_id is None:
+            sys.exit("ALLOC record found before any vdev header")
+        allocs.append((int(tokens[1]), int(tokens[2])))
+        # Submit in batches of 1000000 allocations.
+        if len(allocs) == 1000000:
+            perform_raw_alloc(pool, ms_shift, ms_count, vdev_id, allocs,
+                              args.force, args.verbose)
+            allocs = []
+
+if allocs:
+    perform_raw_alloc(pool, ms_shift, ms_count, vdev_id, allocs,
+                      args.force, args.verbose)
diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install
index
37284a78ad18..abb2481fd3d4 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -40,6 +40,7 @@ usr/sbin/arc_summary usr/sbin/arcstat usr/sbin/dbufstat usr/sbin/zilstat +usr/sbin/zleak usr/share/zfs/compatibility.d/ usr/share/bash-completion/completions usr/share/man/man1/arcstat.1 diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 3226d604546c..fc6503fd6f6a 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -85,6 +85,7 @@ override_dh_auto_install: mv '$(CURDIR)/debian/tmp/usr/bin/arcstat' '$(CURDIR)/debian/tmp/usr/sbin/arcstat' mv '$(CURDIR)/debian/tmp/usr/bin/dbufstat' '$(CURDIR)/debian/tmp/usr/sbin/dbufstat' mv '$(CURDIR)/debian/tmp/usr/bin/zilstat' '$(CURDIR)/debian/tmp/usr/sbin/zilstat' + mv '$(CURDIR)/debian/tmp/usr/bin/zleak' '$(CURDIR)/debian/tmp/usr/sbin/zleak' @# Zed has dependencies outside of the system root. mv '$(CURDIR)/debian/tmp/sbin/zed' '$(CURDIR)/debian/tmp/usr/sbin/zed' diff --git a/contrib/pyzfs/libzfs_core/__init__.py b/contrib/pyzfs/libzfs_core/__init__.py index 13b50ca4329f..9d8071e6fe57 100644 --- a/contrib/pyzfs/libzfs_core/__init__.py +++ b/contrib/pyzfs/libzfs_core/__init__.py @@ -95,6 +95,7 @@ lzc_set_props, lzc_list_children, lzc_list_snaps, + lzc_raw_alloc, receive_header, ) @@ -151,6 +152,7 @@ 'lzc_set_props', 'lzc_list_children', 'lzc_list_snaps', + 'lzc_raw_alloc', 'receive_header', ] diff --git a/contrib/pyzfs/libzfs_core/_error_translation.py b/contrib/pyzfs/libzfs_core/_error_translation.py index d5491a3245cd..46085c5d589e 100644 --- a/contrib/pyzfs/libzfs_core/_error_translation.py +++ b/contrib/pyzfs/libzfs_core/_error_translation.py @@ -696,6 +696,12 @@ def lzc_list_translate_error(ret, name, opts): raise _generic_exception(ret, name, "Error obtaining a list") +def lzc_raw_alloc_translate_errors(ret, name): + if ret == 0: + return + raise _generic_exception(ret, name, "Error performing raw allocations") + + def _handle_err_list(ret, errlist, 
names, exception, mapper):
     '''
     Convert one or more errors from an operation into the requested exception.
 
diff --git a/contrib/pyzfs/libzfs_core/_libzfs_core.py b/contrib/pyzfs/libzfs_core/_libzfs_core.py
index 0ebf99be67c2..b908f06ea0be 100644
--- a/contrib/pyzfs/libzfs_core/_libzfs_core.py
+++ b/contrib/pyzfs/libzfs_core/_libzfs_core.py
@@ -2056,6 +2056,38 @@ def lzc_list_snaps(name):
     return iter(snaps)
 
 
+def lzc_raw_alloc(poolname, metaslab_size, metaslab_count, vdev_id,
+                  allocations, force):
+    '''
+    Allocate regions of the provided vdev directly; useful primarily for
+    performance analysis of fragmented pools. Results in space leakage that it
+    is not currently possible to reclaim.
+
+    :param bytes poolname: the name of the pool to allocate in
+    :param int metaslab_size: the size of a metaslab in this pool (for
+        validation)
+    :param int metaslab_count: the number of metaslabs in this top level
+        vdev (for validation)
+    :param int vdev_id: the id of the top-level vdev to perform allocations
+        from
+    :param allocations: pairs of offset and size to allocate
+    :type allocations: list of (int, int)
+    :param bool force: if true, skip the validation of metaslab_count
+
+    :raises TooManyArguments: if too many allocations are passed in
+    '''
+    if len(allocations) > 1000000:
+        raise exceptions.TooManyArguments()
+    allocs = _ffi.new(f"uint64_t[{2 * len(allocations)}]")
+    for i, (offset, size) in enumerate(allocations):
+        allocs[2 * i] = offset
+        allocs[2 * i + 1] = size
+    ret = _lib.lzc_raw_alloc(poolname, uint64_t(metaslab_size),
+                             uint64_t(metaslab_count), uint64_t(vdev_id),
+                             allocs, 2 * len(allocations), force)
+    errors.lzc_raw_alloc_translate_errors(ret, poolname)
+
+
 # TODO: a better way to init and uninit the library
 def _initialize():
     class LazyInit(object):
diff --git a/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py
index ca752f65413d..542deba2c92f 100644
--- a/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py
+++ b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py
@@ -140,6
+140,9 @@ int lzc_inherit(const char *fsname, const char *name, nvlist_t *); int lzc_set_props(const char *, nvlist_t *, nvlist_t *, nvlist_t *); int lzc_list (const char *, nvlist_t *); + + int lzc_raw_alloc(const char *, uint64_t, uint64_t, uint64_t, + uint64_t *, uint_t, boolean_t); """ SOURCE = """ diff --git a/contrib/pyzfs/libzfs_core/exceptions.py b/contrib/pyzfs/libzfs_core/exceptions.py index b26a37f5de10..fe610feb0b68 100644 --- a/contrib/pyzfs/libzfs_core/exceptions.py +++ b/contrib/pyzfs/libzfs_core/exceptions.py @@ -605,4 +605,8 @@ class RaidzExpansionRunning(ZFSError): message = "A raidz device is currently expanding" +class TooManyArguments(ZFSError): + error = errno.EOVERFLOW + message = "Too many arguments provided" + # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 231beaa69290..009fd8f2f534 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -164,6 +164,8 @@ _LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t, uint64_t); +_LIBZFS_CORE_H int lzc_raw_alloc(const char *, uint64_t, uint64_t, uint64_t, + uint64_t *, uint_t, boolean_t); #ifdef __cplusplus } diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c8deb5be419e..5875659fa0cc 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1464,7 +1464,7 @@ typedef enum { */ typedef enum zfs_ioc { /* - * Core features - 89/128 numbers reserved. + * Core features - 90/128 numbers reserved. */ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, @@ -1562,6 +1562,7 @@ typedef enum zfs_ioc { ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ ZFS_IOC_DDT_PRUNE, /* 0x5a59 */ + ZFS_IOC_RAW_ALLOC, /* 0x5a5a */ /* * Per-platform (Optional) - 8/128 numbers reserved. 
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 36cbe06bacce..6824b9c8051f 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -147,6 +147,9 @@ extern int metaslab_debug_load;
 zfs_range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev,
     metaslab_t *msp, uint64_t *start, uint64_t *shift);
 
+void metaslab_force_alloc(metaslab_t *msp, uint64_t start, uint64_t size,
+    dmu_tx_t *tx);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 7f5a9aaef1b4..24038fe656a0 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -230,6 +230,9 @@ extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
 extern int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl);
 extern int vdev_prop_get(vdev_t *vd, nvlist_t *nvprops, nvlist_t *outnvl);
 
+extern int vdev_raw_alloc(vdev_t *vd, uint64_t *allocations,
+    uint_t alloc_count);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi
index 2af20894853d..fb72bc75c57b 100644
--- a/lib/libzfs_core/libzfs_core.abi
+++ b/lib/libzfs_core/libzfs_core.abi
@@ -182,6 +182,7 @@ + @@ -1426,6 +1427,11 @@ + + + + + @@ -1437,11 +1443,6 @@ - - - - - @@ -1774,6 +1775,7 @@ + @@ -2739,6 +2741,8 @@ + + @@ -2874,6 +2878,13 @@ + + + + + + + @@ -3391,6 +3402,16 @@ + + + + + + + + + +
diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c
index 9347aa7c6a28..04bd28e27281 100644
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -1994,3 +1994,31 @@ lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount)
 
 	return (error);
 }
+
+/*
+ * Ask the kernel to allocate raw regions of top-level vdev vdev_id in pool.
+ * allocations holds alloc_count values laid out as (offset, size) pairs.
+ * metaslab_size, metaslab_count and force are passed through in the request
+ * nvlist unchanged.  Returns the result of the ZFS_IOC_RAW_ALLOC ioctl; the
+ * request nvlist is freed before returning.
+ */
+int
+lzc_raw_alloc(const char *pool, uint64_t metaslab_size,
+    uint64_t metaslab_count, uint64_t vdev_id,
+    uint64_t *allocations, uint_t alloc_count, boolean_t force)
+{
+	int error;
+	nvlist_t *args = fnvlist_alloc();
+
+	fnvlist_add_uint64(args, "metaslab_size", metaslab_size);
+	fnvlist_add_uint64(args, "metaslab_count", metaslab_count);
+	fnvlist_add_uint64(args, "vdev_id", vdev_id);
+	fnvlist_add_uint64_array(args, "allocations", allocations, alloc_count);
+	fnvlist_add_boolean_value(args, "force", force);
+
+	error = lzc_ioctl(ZFS_IOC_RAW_ALLOC, pool, args, NULL);
+
+	fnvlist_free(args);
+
+	return (error);
+}
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 082d379cded5..945b3e4167c5 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -4756,6 +4756,41 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
  * ==========================================================================
  */
 
+/*
+ * Move the parts of [start, start + size) that zfs_range_tree_find_in()
+ * reports as present in msp->ms_allocatable into this txg's ms_allocating
+ * tree, dirtying the vdev when that tree is still empty.  The walk stops at
+ * the first point after which nothing more is found.  The caller must have
+ * disabled the metaslab and must hold ms_lock.
+ */
+void
+metaslab_force_alloc(metaslab_t *msp, uint64_t start, uint64_t size,
+    dmu_tx_t *tx)
+{
+	ASSERT(msp->ms_disabled);
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	uint64_t txg = dmu_tx_get_txg(tx);
+
+	for (uint64_t off = start; off < start + size; ) {
+		uint64_t ostart, osize;
+		boolean_t found = zfs_range_tree_find_in(msp->ms_allocatable,
+		    off, start + size - off, &ostart, &osize);
+		if (!found)
+			break;
+		zfs_range_tree_remove(msp->ms_allocatable, ostart,
+		    osize);
+
+		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
+			vdev_dirty(msp->ms_group->mg_vd, VDD_METASLAB, msp,
+			    txg);
+
+		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], ostart,
+		    osize);
+		msp->ms_allocating_total += osize;
+		off = ostart + osize;
+	}
+}
+
 static void
 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator,
     int flags, uint64_t psize, const void *tag)
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 01758b0c54c0..b4211a8e9194 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -6584,6 +6584,76 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 	return (0);
 }
 
+/*
+ * Force-allocate regions of top-level vdev vd.  allocations holds
+ * alloc_count values laid out as (offset, length) pairs; each pair is
+ * applied to the metaslab containing its offset via metaslab_force_alloc().
+ * Consecutive pairs in the same metaslab share one disable/lock/load cycle
+ * and one transaction.
+ *
+ * Returns 0 on success, E2BIG if an offset lies beyond the last metaslab,
+ * or the error from metaslab_load() or dmu_tx_assign().  Pairs processed
+ * before a failure are not undone.
+ */
+int
+vdev_raw_alloc(vdev_t *vd, uint64_t *allocations, uint_t alloc_count)
+{
+	int error = 0;
+	metaslab_t *prev = NULL;
+	dmu_tx_t *tx = NULL;
+
+	/* "i + 1 <" keeps an odd alloc_count from reading past the array. */
+	for (uint_t i = 0; i + 1 < alloc_count; i += 2) {
+		uint64_t offset = allocations[i];
+		uint64_t length = allocations[i + 1];
+		if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) {
+			error = E2BIG;
+			break;
+		}
+
+		metaslab_t *cur = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+		if (prev != cur) {
+			if (prev != NULL) {
+				dmu_tx_commit(tx);
+				mutex_exit(&prev->ms_lock);
+				metaslab_enable(prev, B_FALSE, B_FALSE);
+				prev = NULL;
+			}
+			ASSERT(cur);
+			metaslab_disable(cur);
+			mutex_enter(&cur->ms_lock);
+			error = metaslab_load(cur);
+			if (error == 0) {
+				tx = dmu_tx_create_dd(
+				    spa_get_dsl(vd->vdev_spa)->dp_root_dir);
+				error = dmu_tx_assign(tx, DMU_TX_WAIT);
+				if (error != 0)
+					dmu_tx_abort(tx);
+			}
+			if (error != 0) {
+				mutex_exit(&cur->ms_lock);
+				metaslab_enable(cur, B_FALSE, B_FALSE);
+				break;
+			}
+			prev = cur;
+		}
+
+		metaslab_force_alloc(cur, offset, length, tx);
+	}
+
+	/*
+	 * prev is NULL when nothing is held: no pair was processed, or the
+	 * loop bailed out before taking the next metaslab.
+	 */
+	if (prev != NULL) {
+		dmu_tx_commit(tx);
+		mutex_exit(&prev->ms_lock);
+		metaslab_enable(prev, B_FALSE, B_FALSE);
+	}
+
+	return (error);
+}
+
 EXPORT_SYMBOL(vdev_fault);
 EXPORT_SYMBOL(vdev_degrade);
 EXPORT_SYMBOL(vdev_online);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 3a413f4a7bdb..fed26931bc84 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -7306,6 +7306,78 @@ zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
 	return (ret);
 }
 
+static const zfs_ioc_key_t zfs_keys_raw_alloc[] = {
+	{"metaslab_size",	DATA_TYPE_UINT64,	0},
+	{"metaslab_count",	DATA_TYPE_UINT64,	0},
+	{"vdev_id",		DATA_TYPE_UINT64,	0},
+	{"allocations",		DATA_TYPE_UINT64_ARRAY,	0},
+	{"force",		DATA_TYPE_BOOLEAN_VALUE,	0},
+};
+
+static int
+zfs_ioc_raw_alloc(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	(void) outnvl;
+	int error = 0;
+	spa_t *spa;
+
+	if ((error = spa_open(pool, &spa, FTAG)) != 0)
+		return (error);
+
+	uint64_t ms_size, ms_count, vdev_id;
+	uint_t alloc_count;
+	uint64_t *allocations;
+	boolean_t force;
+
+	if ((error = nvlist_lookup_uint64(innvl, "metaslab_size",
+	    &ms_size)) != 0) {
+		spa_close(spa, FTAG);
+		return (error);
+	}
+	if ((error = nvlist_lookup_uint64(innvl, "metaslab_count",
+	    &ms_count)) != 0) {
+		spa_close(spa, FTAG);
+		return (error);
+	}
+	if ((error = nvlist_lookup_uint64(innvl, "vdev_id", &vdev_id)) != 0) {
+		spa_close(spa, FTAG);
+		return (error);
+	}
+	if
((error = nvlist_lookup_boolean_value(innvl, "force", &force)) != + 0) { + spa_close(spa, FTAG); + return (error); + } + spa_config_enter(spa, SCL_VDEV | SCL_ALLOC, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (vd == NULL) { + error = ENOENT; + goto out; + } + + if ((1ULL << vd->vdev_ms_shift) != ms_size || + (!force && vd->vdev_ms_count != ms_count)) { + error = EINVAL; + goto out; + } + + if ((error = nvlist_lookup_uint64_array(innvl, "allocations", + &allocations, &alloc_count)) != 0) { + goto out; + } + + if (alloc_count == 0 || alloc_count % 2 != 0) { + error = EINVAL; + goto out; + } + + error = vdev_raw_alloc(vd, allocations, alloc_count); +out: + spa_config_exit(spa, SCL_VDEV | SCL_ALLOC, FTAG); + spa_close(spa, FTAG); + return (error); +} + static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void @@ -7612,6 +7684,11 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune)); + zfs_ioctl_register("raw_alloc", ZFS_IOC_RAW_ALLOC, + zfs_ioc_raw_alloc, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_FALSE, + zfs_keys_raw_alloc, ARRAY_SIZE(zfs_keys_raw_alloc)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index dddc0a6c8f02..e01239995f5d 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -434,7 +434,7 @@ find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; %if 0%{!?__brp_mangle_shebangs:1} find %{?buildroot}%{_bindir} \ \( -name arc_summary -or -name arcstat -or -name dbufstat \ - -or -name zilstat \) \ + -or -name zilstat -or -name zleak \) \ -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; find %{?buildroot}%{_datadir} \ \( -name test-runner.py -or -name zts-report.py \) \ @@ -513,6 +513,7 @@ systemctl --system daemon-reload >/dev/null || 
true %{_bindir}/arcstat %{_bindir}/dbufstat %{_bindir}/zilstat +%{_bindir}/zleak # Man pages %{_mandir}/man1/* %{_mandir}/man4/* diff --git a/scripts/spdxcheck.pl b/scripts/spdxcheck.pl index 88f5a235d70c..bd01a88e71f2 100755 --- a/scripts/spdxcheck.pl +++ b/scripts/spdxcheck.pl @@ -87,6 +87,7 @@ cmd/arc_summary cmd/dbufstat.in cmd/zilstat.in + cmd/zleak cmd/zpool/zpool.d/* etc/init.d/zfs-import.in etc/init.d/zfs-load-key.in