-
Notifications
You must be signed in to change notification settings - Fork 696
New NBFT initramfs module #2620
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5015bc1
ca4c903
bf3dcb4
344f237
06c0ee5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,10 +24,12 @@ mandir = join_paths(prefixdir, get_option('mandir')) | |
| sbindir = join_paths(prefixdir, get_option('sbindir')) | ||
| sysconfdir = join_paths(prefixdir, get_option('sysconfdir')) | ||
|
|
||
| udevrulesdir = join_paths(prefixdir, get_option('udevrulesdir')) | ||
| dracutrulesdir = join_paths(prefixdir, get_option('dracutrulesdir')) | ||
| systemddir = join_paths(prefixdir, get_option('systemddir')) | ||
| rundir = join_paths(prefixdir, get_option('rundir')) | ||
| udevrulesdir = join_paths(prefixdir, get_option('udevrulesdir')) | ||
| dracutrulesdir = join_paths(prefixdir, get_option('dracutrulesdir')) | ||
| dracutmodulesdir = join_paths(prefixdir, get_option('dracutmodulesdir')) | ||
| systemddir = join_paths(prefixdir, get_option('systemddir')) | ||
| rundir = join_paths(prefixdir, get_option('rundir')) | ||
| networkmanagerdir = join_paths(prefixdir, get_option('networkmanagerdir')) | ||
|
|
||
| ############################################################################### | ||
| conf = configuration_data() | ||
|
|
@@ -199,14 +201,16 @@ substs.set('NAME', meson.project_name()) | |
| substs.set('VERSION', meson.project_version()) | ||
| substs.set('LICENSE', meson.project_license()[0]) | ||
| substs.set('UDEVRULESDIR', udevrulesdir) | ||
| substs.set('DRACUTRILESDIR', dracutrulesdir) | ||
| substs.set('DRACUTRULESDIR', dracutrulesdir) | ||
| substs.set('DRACUTMODULESDIR', dracutmodulesdir) | ||
| substs.set('REQUIRES', requires) | ||
| substs.set('DATADIR', datadir) | ||
| substs.set('MANDIR', mandir) | ||
| substs.set('RUNDIR', rundir) | ||
| substs.set('SBINDIR', sbindir) | ||
| substs.set('SYSCONFDIR', sysconfdir) | ||
| substs.set('SYSTEMDDIR', systemddir) | ||
| substs.set('NETWORKMANAGERDIR', networkmanagerdir) | ||
| substs.set('SYSTEMCTL', get_option('systemctl')) | ||
|
|
||
| configure_file( | ||
|
|
@@ -221,11 +225,11 @@ disc = configure_file( | |
| configuration: substs, | ||
| ) | ||
|
|
||
| dracut_files = [ | ||
| dracut_conf_files = [ | ||
| '70-nvmf-autoconnect.conf', | ||
| ] | ||
|
|
||
| foreach file : dracut_files | ||
| foreach file : dracut_conf_files | ||
| configure_file( | ||
| input: 'nvmf-autoconnect/dracut-conf/' + file + '.in', | ||
| output: file, | ||
|
|
@@ -241,6 +245,52 @@ systemd_files = [ | |
| 'nvmf-connect@.service', | ||
| ] | ||
|
|
||
| want_dracut_module = get_option('dracut-module') | ||
| if want_dracut_module | ||
| systemd_files += [ | ||
| 'nbft-boot-pre.service', | ||
| 'nbft-boot-connect.service' | ||
| ] | ||
|
|
||
| dracut_nbft_files = [ | ||
| 'module-setup.sh', | ||
| 'nbft-boot-pre-dracut.conf', | ||
| 'nbft-boot-connect-dracut.conf' | ||
| ] | ||
|
|
||
| foreach file : dracut_nbft_files | ||
| configure_file( | ||
| input: 'nvmf-autoconnect/dracut-95nbft/' + file + '.in', | ||
| output: file, | ||
| configuration: substs, | ||
| ) | ||
| endforeach | ||
|
|
||
| networkmanager_conf_files = [ | ||
| '95-nvme-nbft-no-ignore-carrier.conf' | ||
| ] | ||
|
|
||
| foreach file : networkmanager_conf_files | ||
| configure_file( | ||
| input: 'nvmf-autoconnect/NetworkManager/' + file + '.in', | ||
| output: file, | ||
| configuration: substs, | ||
| ) | ||
| endforeach | ||
|
|
||
| networkmanager_dispatcher_files = [ | ||
| '99-nvme-nbft-connect.sh' | ||
| ] | ||
|
|
||
| foreach file : networkmanager_dispatcher_files | ||
| configure_file( | ||
| input: 'nvmf-autoconnect/NetworkManager/' + file + '.in', | ||
| output: file, | ||
| configuration: substs, | ||
| ) | ||
| endforeach | ||
| endif | ||
|
|
||
| foreach file : systemd_files | ||
| configure_file( | ||
| input: 'nvmf-autoconnect/systemd/' + file + '.in', | ||
|
|
@@ -315,11 +365,28 @@ install_data('completions/bash-nvme-completion.sh', | |
| install_data('completions/_nvme', | ||
| install_dir: datadir + '/zsh/site-functions') | ||
|
|
||
| foreach file : dracut_files | ||
| foreach file : dracut_conf_files | ||
| install_data(meson.current_build_dir() + '/' + file, | ||
| install_dir: dracutrulesdir) | ||
| endforeach | ||
|
|
||
| if want_dracut_module | ||
| foreach file : dracut_nbft_files | ||
| install_data(meson.current_build_dir() + '/' + file, | ||
| install_dir: dracutmodulesdir + '95nbft/') | ||
| endforeach | ||
|
|
||
| foreach file : networkmanager_conf_files | ||
| install_data(meson.current_build_dir() + '/' + file, | ||
| install_dir: networkmanagerdir + 'conf.d/') | ||
| endforeach | ||
|
|
||
| foreach file : networkmanager_dispatcher_files | ||
| install_data(meson.current_build_dir() + '/' + file, | ||
| install_dir: networkmanagerdir + 'dispatcher.d/') | ||
| endforeach | ||
| endif | ||
|
|
||
| foreach file : systemd_files | ||
| install_data(meson.current_build_dir() + '/' + file, | ||
| install_dir: systemddir) | ||
|
|
@@ -343,8 +410,10 @@ if meson.version().version_compare('>=0.53.0') | |
| 'mandir': mandir, | ||
| 'udevrulesdir': udevrulesdir, | ||
| 'dracutrulesdir': dracutrulesdir, | ||
| 'dracutmodulesdir': dracutmodulesdir, | ||
| 'rundir': rundir, | ||
| 'systemddir': systemddir, | ||
| 'networkmanagerdir': networkmanagerdir, | ||
| 'build location': meson.current_build_dir(), | ||
| } | ||
| summary(path_dict, section: 'Paths') | ||
|
|
@@ -353,8 +422,9 @@ if meson.version().version_compare('>=0.53.0') | |
| } | ||
| summary(dep_dict, section: 'Dependencies') | ||
| conf_dict = { | ||
| 'git version': conf.get('GIT_VERSION'), | ||
| 'pdc enabled': get_option('pdc-enabled'), | ||
| 'git version': conf.get('GIT_VERSION'), | ||
| 'pdc enabled': get_option('pdc-enabled'), | ||
| 'dracut module enabled': want_dracut_module | ||
| } | ||
| summary(conf_dict, section: 'Configuration') | ||
| endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,7 +34,7 @@ touch %{buildroot}@SYSCONFDIR@/nvme/hostid | |
| @UDEVRULESDIR@/70-nvmf-autoconnect.rules | ||
| @UDEVRULESDIR@/70-nvmf-keys.rules | ||
| @UDEVRULESDIR@/71-nvmf-netapp.rules | ||
| @DRACUTRILESDIR@/70-nvmf-autoconnect.conf | ||
| @DRACUTRULESDIR@/70-nvmf-autoconnect.conf | ||
| @SYSTEMDDIR@/nvmf-connect@.service | ||
| @SYSTEMDDIR@/nvmefc-boot-connections.service | ||
| @SYSTEMDDIR@/nvmf-connect-nbft.service | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| # Boot from NVMe over TCP (NBFT) | ||
| # | ||
| # For NVMe/TCP connections that provide namespaces containing rootfs | ||
| # it is crucial to react to carrier events and reconnect any missing | ||
| # NVMe/TCP connections as defined in the ACPI NBFT table. A custom | ||
| # @NETWORKMANAGERDIR@dispatcher.d/99-nvme-nbft-connect.sh hook | ||
| # will respawn nvmf-connect-nbft.service on such occasions. | ||
| # | ||
| # This file acts as an override to any config directives that disable | ||
| # carrier events globally. | ||
| # | ||
| # See https://networkmanager.dev/docs/admins/#server-like-behavior | ||
|
|
||
| [device-nbft-no-ignore-carrier] | ||
|
|
||
| # only affects nbft0, nbft1, ... interfaces | ||
| match-device=interface-name:nbft* | ||
|
|
||
| # react on link up/down events | ||
| ignore-carrier=no |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| #!/bin/bash | ||
|
|
||
| if [[ "$1" == nbft* ]] && [[ "$2" == "up" ]]; then | ||
| systemctl start nvmf-connect-nbft.service | ||
| fi | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,141 @@ | ||
| # The NBFT initramfs module | ||
|
|
||
| Focused solely on providing the Boot from NVMe over TCP functionality, intended | ||
| to replace parts of the existing `95nvmf` dracut module. At the moment this all | ||
| depends on the recently added NetworkManager NBFT support, though the desire is | ||
| to support more network management frameworks in the future. | ||
|
|
||
| While this module is currently built around dracut, the amount of dracut | ||
| involvement in this module is kept to a required minimum with the intention | ||
| of supporting more initramfs frameworks (like `mkosi`) in the future. | ||
|
|
||
| This is achieved by splitting the framework-specific directives into systemd | ||
| unit dropins while keeping the main unit files generic. | ||
|
|
||
| Related nvme-cli meson configure options: | ||
| * `-Ddracut-module` (default=false) - enables the 95nbft dracut module | ||
| * `-Ddracutmodulesdir` (default=`$prefix/lib/dracut/modules.d/`) | ||
| * `-Dnetworkmanagerdir` (default=`$prefix/lib/NetworkManager/`) | ||
|
|
||
|
|
||
| # The design | ||
|
|
||
| (see [dracut.bootup(7)](https://man7.org/linux/man-pages/man7/dracut.bootup.7.html) | ||
| for the overall boot process flow) | ||
|
|
||
| There are two primary tasks this initramfs module performs: | ||
| * early network configuration preparation steps | ||
| * the actual NVMe/TCP connection attempts | ||
|
|
||
| The actual network interface setup is often distribution-specific and requires | ||
| NBFT parser support in each network management framework. | ||
|
|
||
| With dracut and NetworkManager the boot process looks roughly as follows: | ||
| * `nbft-boot-pre.service` is run, creates udev network link files and tells | ||
| dracut to activate networking | ||
| * dracut runs `nm-initrd-generator` (the `35network-manager` module) and starts | ||
| the NetworkManager daemon | ||
| * `systemd-udev-trigger.service` renames the network interfaces | ||
| * `nm-wait-online-initrd.service` finishes, indicating networking is up and ready. | ||
| This typically satisfies reaching the `network-online.target` point. | ||
| * `nbft-boot-connect.service` initiates actual NVMe connections | ||
| * the dracut initqueue is waiting for specific block devices (rootfs) to appear | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this, and the starting of NM above, are the only 2 things that dracut needs to perform, and therefore it's relatively easy to plug this scheme into mkosi or another initramfs generator, as long as that generator is based on systemd. Nice. Am I understanding correctly?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exactly. The idea is to have this (somewhat) easily replaceable with something else, like the It's mostly a matter of the right systemd unit dependencies. Particular boot phases may need better abstraction. During my experiments I first tried to use generic systemd targets and units but found it unreliable and unpredictable. Thus the unit file dependencies are mostly dracut-specific, though I wish to find a better solution in the future. Anyway, clarified the two-unit boot flow in the docs. |
||
| Two major packages are responsible for this: this nvme-cli dracut module and | ||
| the added NBFT support in NetworkManager. | ||
|
|
||
| ## The dracut 95nbft module | ||
|
|
||
| The dracut `module-setup.sh` only installs two systemd unit files sandwiched | ||
| between specific dracut phases, nothing else. By default the module is always | ||
| included in the initramfs unless _hostonly_ is requested in which case the system | ||
| is tested for ACPI NBFT tables presence and the module is only included in such | ||
| a case. | ||
|
|
||
| The systemd unit files are only run when the ACPI NBFT tables are present and | ||
| no `rd.nvmf.nonbft` kernel commandline argument was provided that otherwise | ||
| instructs the boot process to skip the NBFT machinery. | ||
|
|
||
| ## nbft-boot-pre.service | ||
|
|
||
| Calls the nvme-cli nbft plugin to generate network link files for each interface | ||
| found in all NBFT tables. The interface naming in the form of `nbftXhY` consists | ||
| of an ACPI NBFT table index (defaults to 0) and the specified HFI index. | ||
| In a typical scenario only `nbft0h1`, `nbft0h2`, `nbft1h1`, ... interfaces are | ||
| present, however it's up to the pre-OS driver to supply arbitrary indexes, | ||
| possibly leading to interface names skipping the order to something like | ||
| `nbft0h100` and `nbft99h123`. Compared to the old `95nvmf` dracut module | ||
| ordering, this naming scheme is geared towards (semi-)stable predictable | ||
| network interface names. Keep in mind that the contents of the NBFT tables | ||
| are generated from scratch upon every system start and are not always persistent | ||
| between reboots. | ||
|
|
||
| The network link files are then picked up by udev on trigger via | ||
| `systemd-udev-trigger.service` to apply the new interface names. | ||
|
|
||
| For simplicity and for the time being this systemd unit replaces the `95nvmf` | ||
| dracut cmdline hook and adds the `rd.neednet=1` `cmdline.d` argument. | ||
|
|
||
tbzatek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ## nm-initrd-generator NBFT support | ||
|
|
||
| https://gitlab.freedesktop.org/NetworkManager/NetworkManager/-/merge_requests/2077 | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems to be the most important difference wrt Your approach is of course much more efficient, but at the cost of being compatible only with (a future version of) NM. A similar approach could be taken by wicked or other network management tools. But I wonder if there might be some middle ground, perhaps we can provide the HFI data in some format that any network management tool can easily convert? The "dracut command line" format is obviously very clumsy and simplistic. So we could keep
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The original intention was to keep
This would require a new code somewhere in the middle. I don't see much benefit compared to the libnvme nbft parser, except of the required dependency on libnvme. The HFI structure itself is rather simple. To be fair, there is currently one place where this could happen, however I was hoping to get rid of that one in the near term: interface renaming. We haven't agreed with the NetworkManager developers what component should be responsible for that. Renaming needs to be done through udev rules or through udev network link files. Udev then needs to be trigerred for the rename to actually happen, that's currently ensured through unit dependencies. A natural place for this that was suggested to me was udev's builtin-net_id.c (see the note my the original PR post). Then everything could be even more simple. So yes, the proposed call to
Yes, it all falls down to the requirement of implementing the NBFT HFI parser elsewhere. |
||
| Executed before the NetworkManager daemon starts, the added NBFT support parses | ||
| the available ACPI NBFT tables and generates system connections. These reference | ||
| interfaces only by their MAC addresses, relying on udev to perform the actual | ||
| interface renaming. | ||
|
|
||
| The `nm-initrd-generator` doesn't link to `libnvme.so.1` but opens it through | ||
| `dlopen()` at runtime. This allows for smaller hostonly initramfs images in case | ||
| the NBFT tables are not present in the system. The library is being pulled in | ||
| indirectly through the dracut module's requirement of nvme-cli. The | ||
| `rd.nvmf.nonbft` kernel commandline argument is respected as well. | ||
|
|
||
| ## nbft-boot-connect.service | ||
|
|
||
| Modprobes required modules (`nvme-fabrics`) first. | ||
|
|
||
| Performs actual NVMe connections by calling `nvme connect-all --nbft`. The | ||
| nvme-cli code has been modified to return a non-zero return code in case one | ||
| or more SSNS records fail to connect (except those marked as _'unavailable'_ | ||
| by the pre-OS driver), resulting in the service startup failure with defined | ||
| respawn of 10 seconds (TBD). This ensures multiple connection attempts while | ||
| NetworkManager reacts on link events in the background and the dracut initqueue | ||
| eagerly waits for new block devices to appear, to be scanned and mounted. Once | ||
| the required block device appears, the wait cycle is ended and the system | ||
| continues booting, stopping any queued `nbft-boot-connect.service` respawns | ||
| seamlessly. | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see below that
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is unfortunately a way too vague definition of "network online" state as various network management frameworks tend to treat such state differently. The NetworkManager-wait-online.service docs describes various conditions, there are corresponding config options for a connection (that roughly equals to an interface setup) that can be tweaked by the NM NBFT plugin. This
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See my remarks above. IMO this can't be handled with a generic target like
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wonder if we could have something like [email protected] or [email protected]_24.target (IOW, translate network events into systemd unit state changes)...
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh that would be lovely, such kind of fine-grained instantiated network targets. FWIW, neither the NetworkManager's dispatcher service is available in the initramfs, it's just too heavy, while the main daemon is present. |
||
| The difference from the old dracut `95nvmf` module is that the nvme connection | ||
| attempts are not driven by network link up events but have a fixed respawn | ||
| interval. This may potentially help the cases where the NIC is slow to | ||
| initialize, reports link up yet it takes another 5+ seconds before it's fully | ||
| able to send/receive packets. We've seen this issue with some 25Gb NICs. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. Ideally we wouldn't react on "link up" events but on events that indicate an L3 connection. But I'm not sure if such events exist... (see below) Note that |
||
|
|
||
|
|
||
| # The post-switchroot boot flow | ||
|
|
||
| ## nvmf-connect-nbft.service | ||
|
|
||
| This unit is supposed to run once the `network-online.target` has been reached | ||
| and calls `nvme connect-all --nbft` again. This ensures an additional connection | ||
| attempt for records that failed to connect in the initramfs phase. As long as | ||
| this call matches existing connections and skips SSNS records that have been | ||
| already connected, in an ideal case this would result in a no-op. This is | ||
| mostly a one-shot service run in NetworkManager based distros since the target | ||
| typically stays reached until reboot. | ||
|
|
||
| ## NetworkManager dispatcher hooks | ||
|
|
||
| The nvme-cli package installs a custom NetworkManager dispatcher service hook | ||
| (`99-nvme-nbft-connect.sh`) that just restarts `nvmf-connect-nbft.service` on | ||
| _link up_ events on `nbft*` interfaces. At the time the hook runs the interface | ||
| in question has been fully configured by NetworkManager. This ensures further | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm. This basically describes the "L3 up" events I just thought we didn't have ... can't we just do this in the initrd as well?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, almost. We are seeing some NICs that are not yet fully initialized at the time of this OS "L3 up" event, not having any of the Tx/Rx paths fully up. Then from OS perspective it's nearly impossible to distinguish such scenario from "destination unreachable". So this is a best effort for the moment. The Might be a driver issue though, we haven't investigated that fully. Fortunately most NICs are fine. While this issue should be dealt with separately, I wanted to have an infrastructure in place in case it's needed. |
||
| reconnection attempts in multipath scenarios where a network interface just came | ||
| alive. This is designed as a secondary measure with the kernel nvme host driver | ||
| connection recovery being the primary mechanism. | ||
|
|
||
| In order to make link events work properly the `nbft*` interfaces need to be set | ||
| not to ignore carrier events. This is done through a custom override snippet | ||
| (`95-nvme-nbft-no-ignore-carrier.conf`) as some distributions may opt to follow | ||
| legacy server networking behaviour (see the `NetworkManager-config-server` package). | ||
Uh oh!
There was an error while loading. Please reload this page.