Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 80 additions & 10 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ mandir = join_paths(prefixdir, get_option('mandir'))
sbindir = join_paths(prefixdir, get_option('sbindir'))
sysconfdir = join_paths(prefixdir, get_option('sysconfdir'))

udevrulesdir = join_paths(prefixdir, get_option('udevrulesdir'))
dracutrulesdir = join_paths(prefixdir, get_option('dracutrulesdir'))
systemddir = join_paths(prefixdir, get_option('systemddir'))
rundir = join_paths(prefixdir, get_option('rundir'))
udevrulesdir = join_paths(prefixdir, get_option('udevrulesdir'))
dracutrulesdir = join_paths(prefixdir, get_option('dracutrulesdir'))
dracutmodulesdir = join_paths(prefixdir, get_option('dracutmodulesdir'))
systemddir = join_paths(prefixdir, get_option('systemddir'))
rundir = join_paths(prefixdir, get_option('rundir'))
networkmanagerdir = join_paths(prefixdir, get_option('networkmanagerdir'))

###############################################################################
conf = configuration_data()
Expand Down Expand Up @@ -199,14 +201,16 @@ substs.set('NAME', meson.project_name())
substs.set('VERSION', meson.project_version())
substs.set('LICENSE', meson.project_license()[0])
substs.set('UDEVRULESDIR', udevrulesdir)
substs.set('DRACUTRILESDIR', dracutrulesdir)
substs.set('DRACUTRULESDIR', dracutrulesdir)
substs.set('DRACUTMODULESDIR', dracutmodulesdir)
substs.set('REQUIRES', requires)
substs.set('DATADIR', datadir)
substs.set('MANDIR', mandir)
substs.set('RUNDIR', rundir)
substs.set('SBINDIR', sbindir)
substs.set('SYSCONFDIR', sysconfdir)
substs.set('SYSTEMDDIR', systemddir)
substs.set('NETWORKMANAGERDIR', networkmanagerdir)
substs.set('SYSTEMCTL', get_option('systemctl'))

configure_file(
Expand All @@ -221,11 +225,11 @@ disc = configure_file(
configuration: substs,
)

dracut_files = [
dracut_conf_files = [
'70-nvmf-autoconnect.conf',
]

foreach file : dracut_files
foreach file : dracut_conf_files
configure_file(
input: 'nvmf-autoconnect/dracut-conf/' + file + '.in',
output: file,
Expand All @@ -241,6 +245,52 @@ systemd_files = [
'[email protected]',
]

want_dracut_module = get_option('dracut-module')
if want_dracut_module
systemd_files += [
'nbft-boot-pre.service',
'nbft-boot-connect.service'
]

dracut_nbft_files = [
'module-setup.sh',
'nbft-boot-pre-dracut.conf',
'nbft-boot-connect-dracut.conf'
]

foreach file : dracut_nbft_files
configure_file(
input: 'nvmf-autoconnect/dracut-95nbft/' + file + '.in',
output: file,
configuration: substs,
)
endforeach

networkmanager_conf_files = [
'95-nvme-nbft-no-ignore-carrier.conf'
]

foreach file : networkmanager_conf_files
configure_file(
input: 'nvmf-autoconnect/NetworkManager/' + file + '.in',
output: file,
configuration: substs,
)
endforeach

networkmanager_dispatcher_files = [
'99-nvme-nbft-connect.sh'
]

foreach file : networkmanager_dispatcher_files
configure_file(
input: 'nvmf-autoconnect/NetworkManager/' + file + '.in',
output: file,
configuration: substs,
)
endforeach
endif

foreach file : systemd_files
configure_file(
input: 'nvmf-autoconnect/systemd/' + file + '.in',
Expand Down Expand Up @@ -315,11 +365,28 @@ install_data('completions/bash-nvme-completion.sh',
install_data('completions/_nvme',
install_dir: datadir + '/zsh/site-functions')

foreach file : dracut_files
foreach file : dracut_conf_files
install_data(meson.current_build_dir() + '/' + file,
install_dir: dracutrulesdir)
endforeach

if want_dracut_module
foreach file : dracut_nbft_files
install_data(meson.current_build_dir() + '/' + file,
install_dir: dracutmodulesdir + '95nbft/')
endforeach

foreach file : networkmanager_conf_files
install_data(meson.current_build_dir() + '/' + file,
install_dir: networkmanagerdir + 'conf.d/')
endforeach

foreach file : networkmanager_dispatcher_files
install_data(meson.current_build_dir() + '/' + file,
install_dir: networkmanagerdir + 'dispatcher.d/')
endforeach
endif

foreach file : systemd_files
install_data(meson.current_build_dir() + '/' + file,
install_dir: systemddir)
Expand All @@ -343,8 +410,10 @@ if meson.version().version_compare('>=0.53.0')
'mandir': mandir,
'udevrulesdir': udevrulesdir,
'dracutrulesdir': dracutrulesdir,
'dracutmodulesdir': dracutmodulesdir,
'rundir': rundir,
'systemddir': systemddir,
'networkmanagerdir': networkmanagerdir,
'build location': meson.current_build_dir(),
}
summary(path_dict, section: 'Paths')
Expand All @@ -353,8 +422,9 @@ if meson.version().version_compare('>=0.53.0')
}
summary(dep_dict, section: 'Dependencies')
conf_dict = {
'git version': conf.get('GIT_VERSION'),
'pdc enabled': get_option('pdc-enabled'),
'git version': conf.get('GIT_VERSION'),
'pdc enabled': get_option('pdc-enabled'),
'dracut module enabled': want_dracut_module
}
summary(conf_dict, section: 'Configuration')
endif
18 changes: 18 additions & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ option(
value : 'lib/dracut/dracut.conf.d/',
description : 'directory for dracut rules files'
)
option(
'dracutmodulesdir',
type : 'string',
value : 'lib/dracut/modules.d/',
description : 'dracut modules directory'
)
option(
'htmldir',
type : 'string',
Expand Down Expand Up @@ -70,3 +76,15 @@ option(
type : 'string',
description : 'override the git version string'
)
option(
'dracut-module',
type : 'boolean',
value : false,
description : 'Enable the 95nbft dracut module'
)
option(
'networkmanagerdir',
type : 'string',
value : 'lib/NetworkManager/',
description : 'NetworkManager lib directory'
)
2 changes: 1 addition & 1 deletion nvme.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ touch %{buildroot}@SYSCONFDIR@/nvme/hostid
@UDEVRULESDIR@/70-nvmf-autoconnect.rules
@UDEVRULESDIR@/70-nvmf-keys.rules
@UDEVRULESDIR@/71-nvmf-netapp.rules
@DRACUTRILESDIR@/70-nvmf-autoconnect.conf
@DRACUTRULESDIR@/70-nvmf-autoconnect.conf
@SYSTEMDDIR@/[email protected]
@SYSTEMDDIR@/nvmefc-boot-connections.service
@SYSTEMDDIR@/nvmf-connect-nbft.service
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Boot from NVMe over TCP (NBFT)
#
# For NVMe/TCP connections that provide namespaces containing rootfs
# it is crucial to react on carrier events and reconnect any missing
# NVMe/TCP connections as defined in the ACPI NBFT table. A custom
# @NETWORKMANAGERDIR@dispatcher.d/99-nvme-nbft-connect.sh hook
# will respawn nvmf-connect-nbft.service on such occasions.
#
# This file acts as an override to any config directives that disable
# carrier events globally.
#
# See https://networkmanager.dev/docs/admins/#server-like-behavior

[device-nbft-no-ignore-carrier]

# only affects nbft0, nbft1, ... interfaces
match-device=interface-name:nbft*

# react on link up/down events
ignore-carrier=no
5 changes: 5 additions & 0 deletions nvmf-autoconnect/NetworkManager/99-nvme-nbft-connect.sh.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

Check failure on line 1 in nvmf-autoconnect/NetworkManager/99-nvme-nbft-connect.sh.in

View workflow job for this annotation

GitHub Actions / checkpatch review

ERROR: do not set execute permissions for source files

Check failure on line 2 in nvmf-autoconnect/NetworkManager/99-nvme-nbft-connect.sh.in

View workflow job for this annotation

GitHub Actions / checkpatch review

WARNING: Missing or malformed SPDX-License-Identifier tag in line 2
if [[ "$1" == nbft* ]] && [[ "$2" == "up" ]]; then
systemctl start nvmf-connect-nbft.service
fi
141 changes: 141 additions & 0 deletions nvmf-autoconnect/dracut-95nbft/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# The NBFT initramfs module

Focused solely on providing the Boot from NVMe over TCP functionality, intended
to replace parts of the existing `95nvmf` dracut module. At the moment this all
depends on the recently added NetworkManager NBFT support, though the desire is
to support more network management frameworks in the future.

While this module is currently built around dracut, the amount of dracut
involvement in this module is kept to a required minimum with the intention
of supporting more initramfs frameworks (like `mkosi`) in the future.

This is achieved by splitting the framework-specific directives into systemd
unit dropins while keeping the main unit files generic.

Related nvme-cli meson configure options:
* `-Ddracut-module` (default=false) - enables the 95nbft dracut module
* `-Ddracutmodulesdir` (default=`$prefix/lib/dracut/modules.d/`)
* `-Dnetworkmanagerdir` (default=`$prefix/lib/NetworkManager/`)


# The design

(see [dracut.bootup(7)](https://man7.org/linux/man-pages/man7/dracut.bootup.7.html)
for the overall boot process flow)

There are two primary tasks this initramfs module performs:
* early network configuration preparation steps
* the actual NVMe/TCP connection attempts

The actual network interface setup is often distribution-specific and requires
NBFT parser support in each network management framework.

With dracut and NetworkManager the boot process looks roughly as follows:
* `nbft-boot-pre.service` is run, creates udev network link files and tells
dracut to activate networking
* dracut runs `nm-initrd-generator` (the `35network-manager` module) and starts
the NetworkManager daemon
* `systemd-udev-trigger.service` renames the network interfaces
* `nm-wait-online-initrd.service` finishes, indicating networking is up and ready.
This typically satisfies reaching the `network-online.target` point.
* `nbft-boot-connect.service` initiates actual NVMe connections
* the dracut initqueue is waiting for specific block devices (rootfs) to appear

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this, and the starting of NM above, are the only 2 things that dracut needs to perform, and therefore it's relatively easy to plug this scheme into mkosi or another initramfs generator, as long as that generator is based on systemd. Nice.

Am I understanding correctly?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Exactly. The idea is to have this (somewhat) easily replaceable with something else, like the mkosi counterparts. I.e. keeping the dracut involvement to a required minimum.

It's mostly a matter of the right systemd unit dependencies. Particular boot phases may need better abstraction. During my experiments I first tried to use generic systemd targets and units but found it unreliable and unpredictable. Thus the unit file dependencies are mostly dracut-specific, though I wish to find a better solution in the future.

Anyway, clarified the two-unit boot flow in the docs.

Two major packages are responsible for this: this nvme-cli dracut module and
the added NBFT support in NetworkManager.

## The dracut 95nbft module

The dracut `module-setup.sh` only installs two systemd unit files sandwiched
between specific dracut phases, nothing else. By default the module is always
included in the initramfs unless _hostonly_ is requested, in which case the system
is tested for the presence of ACPI NBFT tables and the module is only included
if they are found.

The systemd unit files are only run when the ACPI NBFT tables are present and
no `rd.nvmf.nonbft` kernel commandline argument was provided, which would otherwise
instruct the boot process to skip the NBFT machinery.

## nbft-boot-pre.service

Calls the nvme-cli nbft plugin to generate network link files for each interface
found in all NBFT tables. The interface naming in the form of `nbftXhY` consists
of an ACPI NBFT table index (defaults to 0) and the specified HFI index.
In a typical scenario only `nbft0h1`, `nbft0h2`, `nbft1h1`, ... interfaces are
present, however it's up to the pre-OS driver to supply arbitrary indexes,
possibly leading to interface names skipping the order to something like
`nbft0h100` and `nbft99h123`. Compared to the old `95nvmf` dracut module
ordering, this naming scheme is geared towards (semi-)stable predictable
network interface names. Keep in mind that the contents of the NBFT tables
are generated from scratch upon every system start and are not always persistent
between reboots.

The network link files are then picked up by udev on trigger via
`systemd-udev-trigger.service` to apply the new interface names.

For simplicity and for the time being this systemd unit replaces the `95nvmf`
dracut cmdline hook and adds the `rd.neednet=1` `cmdline.d` argument.

## nm-initrd-generator NBFT support

https://gitlab.freedesktop.org/NetworkManager/NetworkManager/-/merge_requests/2077

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be the most important difference wrt 95nvmf. You create a tool that directly transforms libnvme NBFT data structures into the preferred format of your Network configuration tool (NM), whereas 95nvmf converts NBFT to JSON first, then JSON to dracut-style command line arguments, which would then be further processed by the conventional nm-initrd-generator.

Your approach is of course much more efficient, but at the cost of being compatible only with (a future version of) NM.

A similar approach could be taken by wicked or other network management tools.

But I wonder if there might be some middle ground, perhaps we can provide the HFI data in some format that any network management tool can easily convert?

The "dracut command line" format is obviously very clumsy and simplistic.
I'd love to see dracut develop a more capable generic format for describing networking parameters. But I doubt that's going to happen. After all, dracut's format was designed with the kernel command line in mind.

So we could keep 95nvmf for those network management tools that have no native NBFT support, and take your approach for others. I'd need to discuss with the wicked people if it makes sense for us to write a similar plugin.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original intention was to keep nm-initrd-generator stateless, needing no input or prior configuration as it would parse the NBFT table itself. Then the whole dracut network stack can be eventually replaced with a simple systemd unit file, executing ExecStartPre=nm-initrd-generator + ExecStart=/usr/sbin/NetworkManager. This was planned as a next step later in the dracut minimization effort once other modules depending on network would take the same road. Obviously this was all originally planned as a downstream change...

But I wonder if there might be some middle ground, perhaps we can provide the HFI data in some format that any network management tool can easily convert?

This would require a new code somewhere in the middle. I don't see much benefit compared to the libnvme nbft parser, except of the required dependency on libnvme. The HFI structure itself is rather simple.

To be fair, there is currently one place where this could happen, however I was hoping to get rid of that one in the near term: interface renaming. We haven't agreed with the NetworkManager developers what component should be responsible for that. Renaming needs to be done through udev rules or through udev network link files. Udev then needs to be triggered for the rename to actually happen, that's currently ensured through unit dependencies.

A natural place for this that was suggested to me was udev's builtin-net_id.c (see the note my the original PR post). Then everything could be even more simple.

So yes, the proposed call to nvme nbft gen-udev-link-files can be used for an intermediate NBFT data representation, however that would mean a new data format, a generator at one place and separate new parsers in each network management tools. That sounds like an overkill in this little initramfs module.

So we could keep 95nvmf for those network management tools that have no native NBFT support, and take your approach for others. I'd need to discuss with the wicked people if it makes sense for us to write a similar plugin.

Yes, it all falls down to the requirement of implementing the NBFT HFI parser elsewhere.

Executed before the NetworkManager daemon starts, the added NBFT support parses
the available ACPI NBFT tables and generates system connections. These are only
referenced by MAC addresses, relying on udev to perform the actual interface
renaming.

The `nm-initrd-generator` doesn't link to `libnvme.so.1` but opens it through
`dlopen()` at runtime. This allows for smaller hostonly initramfs images in case
the NBFT tables are not present in the system. The library is being pulled in
indirectly through the dracut module's requirement of nvme-cli. The
`rd.nvmf.nonbft` kernel commandline argument is respected as well.

## nbft-boot-connect.service

Modprobes required modules (`nvme-fabrics`) first.

Performs actual NVMe connections by calling `nvme connect-all --nbft`. The
nvme-cli code has been modified to return a non-zero return code in case one
or more SSNS records fail to connect (except those marked as _'unavailable'_
by the pre-OS driver), resulting in a service startup failure with a defined
respawn interval of 10 seconds (TBD). This ensures multiple connection attempts while
NetworkManager reacts to link events in the background and the dracut initqueue
eagerly waits for new block devices to appear, to be scanned and mounted. Once
the required block device appears, the wait cycle is ended and the system
continues booting, stopping any queued `nbft-boot-connect.service` respawns
seamlessly.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see below that nbft-boot-connect.service contains After=network-online.target. Thus no NBFT target will be connected before this target is reached. Maybe I misunderstand when you consider "online" state to be reached. Is it already reached when just one iBFT interface is configured?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is unfortunately a way too vague definition of "network online" state as various network management frameworks tend to treat such state differently. The NetworkManager-wait-online.service docs describes various conditions, there are corresponding config options for a connection (that roughly equals to an interface setup) that can be tweaked by the NM NBFT plugin.

This After=network-online.target dependency is again (not yet) set in stone, I'm curious to see how e.g. wicked works with this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See my remarks above. IMO this can't be handled with a generic target like network-online. It makes much more sense to react on online events of specific interfaces.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wonder if we could have something like [email protected] or [email protected]_24.target (IOW, translate network events into systemd unit state changes)...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh that would be lovely, such kind of fine-grained instantiated network targets.

FWIW, neither the NetworkManager's dispatcher service is available in the initramfs, it's just too heavy, while the main daemon is present.

The difference from the old dracut `95nvmf` module is that the nvme connection
attempts are not driven by network link up events but have a fixed respawn
interval. This may potentially help the cases where the NIC is slow to
initialize, reports link up yet it takes another 5+ seconds before it's fully
able to send/receive packets. We've seen this issue with some 25Gb NICs.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point.

Ideally we wouldn't react on "link up" events but on events that indicate an L3 connection. But I'm not sure if such events exist... (see below)

Note that 95nvmf also has a timeout action. But repeating this in regular intervals is actually a nice idea.



# The post-switchroot boot flow

## nvmf-connect-nbft.service

This unit is supposed to run once the `network-online.target` has been reached
and calls `nvme connect-all --nbft` again. This ensures an additional connection
attempt for records that failed to connect in the initramfs phase. As long as
this call matches existing connections and skips SSNS records that have been
already connected, in an ideal case this would result in a no-op. This is
mostly a one-shot service run in NetworkManager based distros since the target
typically stays reached until reboot.

## NetworkManager dispatcher hooks

The nvme-cli package installs a custom NetworkManager dispatcher service hook
(`99-nvme-nbft-connect.sh`) that just restarts `nvmf-connect-nbft.service` on
_link up_ events on `nbft*` interfaces. At the time the hook runs the interface
in question has been fully configured by NetworkManager. This ensures further
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm. This basically describes the "L3 up" events I just thought we didn't have ... can't we just do this in the initrd as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, almost. We are seeing some NICs that are not yet fully initialized at the time of this OS "L3 up" event, not having any of the Tx/Rx paths fully up. Then from OS perspective it's nearly impossible to distinguish such scenario from "destination unreachable".

So this is a best effort for the moment. The nvme-cli NBFT connect code could use addition of retry count or a custom timeout for each connection attempt. Currently at the end of my TODO list...

Might be a driver issue though, we haven't investigated that fully. Fortunately most NICs are fine. While this issue should be dealt with separately, I wanted to have an infrastructure in place in case it's needed.

reconnection attempts in multipath scenarios where a network interface just came
alive. This is designed as a secondary measure with the kernel nvme host driver
connection recovery being the primary mechanism.

In order to make link events work properly the `nbft*` interfaces need to be set
not to ignore carrier events. This is done through a custom override snippet
(`95-nvme-nbft-no-ignore-carrier.conf`) as some distributions may opt to follow
legacy server networking behaviour (see the `NetworkManager-config-server` package).
Loading
Loading