Skip to content

Commit 1800086

Browse files
jmarrero authored and cgwalters committed
kola: Add soft-reboot support for external tests
Implements soft-reboot capabilities for Kola, enabling tests to use systemd's soft-reboot functionality. The implementation follows the same pattern as regular reboots but uses `systemctl soft-reboot`, and it tracks systemd boot timestamps rather than kernel boot IDs for state detection. Signed-off-by: Colin Walters <[email protected]>
1 parent fbca096 commit 1800086

File tree

15 files changed

+359
-18
lines changed

15 files changed

+359
-18
lines changed

docs/kola/external-tests.md

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,36 @@ echo "ok autopkgtest rebooting"
102102

103103
This will trigger the monitoring `kola` process to invoke a reboot.
104104

105+
## Support for soft-rebooting
106+
107+
Kola also supports soft-rebooting using systemd's `systemctl soft-reboot` command.
108+
Soft-reboot restarts the userspace while keeping the kernel and hardware state intact.
109+
This is useful for testing userspace updates without a full system reboot.
110+
111+
The soft-reboot API is similar to the regular reboot API:
112+
113+
```
114+
#!/bin/bash
115+
# Example of soft-reboot test
116+
set -xeuo pipefail
117+
case "${AUTOPKGTEST_REBOOT_MARK:-}" in
118+
"") echo "test beginning"; /tmp/autopkgtest-soft-reboot mark1 ;;
119+
mark1) echo "test in mark1"; /tmp/autopkgtest-soft-reboot mark2 ;;
120+
mark2) echo "test in mark2" ;;
121+
*) echo "unexpected mark: ${AUTOPKGTEST_REBOOT_MARK}"; exit 1;;
122+
esac
123+
echo "ok autopkgtest soft-rebooting"
124+
```
125+
126+
Key differences with soft-reboot:
127+
- The kernel boot ID (`/proc/sys/kernel/random/boot_id`) remains the same
128+
- Hardware state and kernel memory are preserved
129+
- Only userspace is restarted
130+
- Uses `systemctl soft-reboot` instead of `reboot`
131+
132+
Both `/tmp/autopkgtest-soft-reboot` and `/tmp/autopkgtest-soft-reboot-prepare` scripts are available,
133+
analogous to their regular reboot counterparts.
134+
105135
The rationale for this is that it helps kola to know when a reboot is happening
106136
so that it can correctly follow the state of the systemd journal, etc. A future
107137
enhancement will support directly invoking `reboot` and having kola just figure

mantle/cmd/kola/devshell.go

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -239,6 +239,8 @@ func runDevShellSSH(ctx context.Context, builder *platform.QemuBuilder, conf *co
239239
_ = inst.Kill()
240240
case guestStateInReboot:
241241
statusMsg = "QEMU guest initiated reboot"
242+
case guestStateInSoftReboot:
243+
statusMsg = "QEMU guest initiated soft-reboot"
242244
case guestStateOpenSshStopped:
243245
statusMsg = "QEMU openssh is not listening"
244246
case guestStateSshDisconnected:
@@ -285,6 +287,8 @@ const (
285287
guestStateInShutdown
286288
// guestStateInReboot indicates that the guest has started a reboot
287289
guestStateInReboot
290+
// guestStateInSoftReboot indicates that the guest has started a soft-reboot
291+
guestStateInSoftReboot
288292
// guestStateHalted indicates that the guest has halted or shutdown
289293
guestStateHalted
290294
// guestStateBooting indicates that the instance is in early boot
@@ -325,6 +329,9 @@ func checkWriteState(msg string, c chan<- guestState) {
325329
if strings.Contains(msg, "Starting Reboot...") {
326330
c <- guestStateInReboot
327331
}
332+
if strings.Contains(msg, "Reached target soft-reboot") {
333+
c <- guestStateInSoftReboot
334+
}
328335
}
329336

330337
type systemdEventMessage struct {
@@ -428,6 +435,11 @@ func watchJournal(builder *platform.QemuBuilder, conf *conf.Conf, stateChan chan
428435
messageID: "7d4958e842da4a758f6c1cdc7b36dcc5",
429436
guestState: guestStateInShutdown,
430437
},
438+
{
439+
unit: "systemd-soft-reboot.service",
440+
messageID: "7d4958e842da4a758f6c1cdc7b36dcc5",
441+
guestState: guestStateInSoftReboot,
442+
},
431443
}
432444

433445
r, err := builder.VirtioJournal(conf, "-o json --system")

mantle/cmd/kolet/kolet.go

Lines changed: 101 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -105,10 +105,25 @@ reboot
105105
autopkgtestRebootPrepareScript = `#!/bin/bash
106106
set -euo pipefail
107107
exec /usr/local/bin/kolet reboot-request "$1"
108+
`
109+
110+
// Soft-reboot support
111+
autopkgTestSoftRebootPath = "/tmp/autopkgtest-soft-reboot"
112+
autopkgtestSoftRebootScript = `#!/bin/bash
113+
set -xeuo pipefail
114+
/usr/local/bin/kolet soft-reboot-request "$1"
115+
systemctl soft-reboot
116+
`
117+
autopkgTestSoftRebootPreparePath = "/tmp/autopkgtest-soft-reboot-prepare"
118+
119+
autopkgtestSoftRebootPrepareScript = `#!/bin/bash
120+
set -euo pipefail
121+
exec /usr/local/bin/kolet soft-reboot-request "$1"
108122
`
109123

110124
// File used to communicate between the script and the kolet runner internally
111-
rebootRequestFifo = "/run/kolet-reboot"
125+
rebootRequestFifo = "/run/kolet-reboot"
126+
softRebootRequestFifo = "/run/kolet-soft-reboot"
112127
)
113128

114129
var (
@@ -140,6 +155,13 @@ var (
140155
SilenceUsage: true,
141156
}
142157

158+
cmdSoftReboot = &cobra.Command{
159+
Use: "soft-reboot-request MARK",
160+
Short: "Request a soft reboot",
161+
RunE: runSoftReboot,
162+
SilenceUsage: true,
163+
}
164+
143165
cmdHttpd = &cobra.Command{
144166
Use: "httpd",
145167
Short: "Start an HTTP server to serve the contents of the file system",
@@ -259,7 +281,11 @@ func initiateReboot(mark string) error {
259281
return nil
260282
}
261283

284+
// Create a FIFO in an idempotent fashion
262285
func mkfifo(path string) error {
286+
if _, err := os.Stat(path); err == nil {
287+
return nil
288+
}
263289
c := exec.Command("mkfifo", path)
264290
c.Stderr = os.Stderr
265291
err := c.Run()
@@ -269,6 +295,20 @@ func mkfifo(path string) error {
269295
return nil
270296
}
271297

298+
func initiateSoftReboot(mark string) error {
299+
systemdjournal.Print(systemdjournal.PriInfo, "Processing soft-reboot request")
300+
res := kola.KoletResult{
301+
SoftReboot: string(mark),
302+
}
303+
buf, err := json.Marshal(&res)
304+
if err != nil {
305+
return errors.Wrapf(err, "serializing KoletResult")
306+
}
307+
fmt.Println(string(buf))
308+
systemdjournal.Print(systemdjournal.PriInfo, "Acknowledged soft-reboot request with mark: %s", buf)
309+
return nil
310+
}
311+
272312
func runExtUnit(cmd *cobra.Command, args []string) error {
273313
rebootOff, _ := cmd.Flags().GetBool("deny-reboots")
274314
// Write the autopkgtest wrappers
@@ -278,10 +318,18 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
278318
if err := os.WriteFile(autopkgTestRebootPreparePath, []byte(autopkgtestRebootPrepareScript), 0755); err != nil {
279319
return err
280320
}
321+
// Write the soft-reboot autopkgtest wrappers
322+
if err := os.WriteFile(autopkgTestSoftRebootPath, []byte(autopkgtestSoftRebootScript), 0755); err != nil {
323+
return err
324+
}
325+
if err := os.WriteFile(autopkgTestSoftRebootPreparePath, []byte(autopkgtestSoftRebootPrepareScript), 0755); err != nil {
326+
return err
327+
}
281328

282329
// Create the reboot cmdline -> login FIFO for the reboot mark and
283330
// proxy it into a channel
284331
rebootChan := make(chan string)
332+
softRebootChan := make(chan string)
285333
errChan := make(chan error)
286334

287335
// We want to prevent certain tests (like non-exclusive tests) from rebooting
@@ -303,6 +351,25 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
303351
}
304352
rebootChan <- string(buf)
305353
}()
354+
355+
// Create soft-reboot FIFO and channel
356+
err = mkfifo(softRebootRequestFifo)
357+
if err != nil {
358+
return err
359+
}
360+
go func() {
361+
softRebootReader, err := os.Open(softRebootRequestFifo)
362+
if err != nil {
363+
errChan <- err
364+
return
365+
}
366+
defer softRebootReader.Close()
367+
buf, err := io.ReadAll(softRebootReader)
368+
if err != nil {
369+
errChan <- err
370+
}
371+
softRebootChan <- string(buf)
372+
}()
306373
}
307374

308375
ctx := context.Background()
@@ -344,6 +411,8 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
344411
return err
345412
case reboot := <-rebootChan:
346413
return initiateReboot(reboot)
414+
case softReboot := <-softRebootChan:
415+
return initiateSoftReboot(softReboot)
347416
case m := <-unitevents:
348417
for n := range m {
349418
if n == unitname {
@@ -397,6 +466,35 @@ func runReboot(cmd *cobra.Command, args []string) error {
397466
return nil
398467
}
399468

469+
// runSoftReboot handles soft-reboot requests similar to runReboot but for systemctl soft-reboot
470+
func runSoftReboot(cmd *cobra.Command, args []string) error {
471+
if _, err := os.Stat(softRebootRequestFifo); os.IsNotExist(err) {
472+
return errors.New("Soft-reboots are not supported for this test, softRebootRequestFifo does not exist.")
473+
}
474+
475+
mark := args[0]
476+
systemdjournal.Print(systemdjournal.PriInfo, "Requesting soft-reboot with mark: %s", mark)
477+
err := mkfifo(kola.KoletRebootAckFifo)
478+
if err != nil {
479+
return err
480+
}
481+
err = os.WriteFile(softRebootRequestFifo, []byte(mark), 0644)
482+
if err != nil {
483+
return err
484+
}
485+
f, err := os.Open(kola.KoletRebootAckFifo)
486+
if err != nil {
487+
return err
488+
}
489+
buf := make([]byte, 1)
490+
_, err = f.Read(buf)
491+
if err != nil {
492+
return err
493+
}
494+
systemdjournal.Print(systemdjournal.PriInfo, "Soft-reboot request acknowledged")
495+
return nil
496+
}
497+
400498
func runHttpd(cmd *cobra.Command, args []string) error {
401499
port, _ := cmd.Flags().GetString("port")
402500
path, _ := cmd.Flags().GetString("path")
@@ -413,6 +511,8 @@ func main() {
413511
root.AddCommand(cmdRunExtUnit)
414512
cmdReboot.Args = cobra.ExactArgs(1)
415513
root.AddCommand(cmdReboot)
514+
cmdSoftReboot.Args = cobra.ExactArgs(1)
515+
root.AddCommand(cmdSoftReboot)
416516
cmdHttpd.Flags().StringP("port", "", "80", "port")
417517
cmdHttpd.Flags().StringP("path", "", "./", "path to filesystem contents to serve")
418518
cmdHttpd.Args = cobra.ExactArgs(0)

mantle/kola/harness.go

Lines changed: 42 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -255,7 +255,8 @@ const (
255255

256256
// KoletResult is serialized JSON passed from kolet to the harness
257257
type KoletResult struct {
258-
Reboot string
258+
Reboot string
259+
SoftReboot string
259260
}
260261

261262
const KoletExtTestUnit = "kola-runext"
@@ -1105,6 +1106,10 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine, testNum int)
11051106
if err != nil {
11061107
return errors.Wrapf(err, "getting boot id")
11071108
}
1109+
userspaceTimestamp, err := platform.GetMachineBootCount(mach)
1110+
if err != nil {
1111+
return errors.Wrapf(err, "getting userspace timestamp")
1112+
}
11081113
plog.Debug("Starting kolet run-test-unit")
11091114
if previousRebootState != "" {
11101115
// quote around the value for systemd
@@ -1137,27 +1142,47 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine, testNum int)
11371142
return errors.Wrapf(err, "parsing kolet json %s", string(stdout))
11381143
}
11391144
}
1140-
// If no reboot is requested, we're done
1141-
if koletRes.Reboot == "" {
1145+
// If no reboot or soft-reboot is requested, we're done
1146+
if koletRes.Reboot == "" && koletRes.SoftReboot == "" {
11421147
return nil
11431148
}
11441149

1145-
// A reboot is requested
1146-
previousRebootState = koletRes.Reboot
1147-
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
1148-
// This signals to the subject that we have saved the mark, and the subject
1149-
// can proceed with rebooting. We stop sshd to ensure that the wait below
1150-
// doesn't log in while ssh is shutting down.
1151-
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && echo > %s'", KoletRebootAckFifo))
1152-
if err != nil {
1153-
return errors.Wrapf(err, "failed to acknowledge reboot")
1150+
// Handle regular reboot
1151+
if koletRes.Reboot != "" {
1152+
previousRebootState = koletRes.Reboot
1153+
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
1154+
// This signals to the subject that we have saved the mark, and the subject
1155+
// can proceed with rebooting. We stop sshd to ensure that the wait below
1156+
// doesn't log in while ssh is shutting down.
1157+
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && echo > %s'", KoletRebootAckFifo))
1158+
if err != nil {
1159+
return errors.Wrapf(err, "failed to acknowledge reboot")
1160+
}
1161+
plog.Debug("Waiting for reboot")
1162+
err = mach.WaitForReboot(120*time.Second, bootID)
1163+
if err != nil {
1164+
return errors.Wrapf(err, "Waiting for reboot")
1165+
}
1166+
plog.Debug("Reboot complete")
11541167
}
1155-
plog.Debug("Waiting for reboot")
1156-
err = mach.WaitForReboot(120*time.Second, bootID)
1157-
if err != nil {
1158-
return errors.Wrapf(err, "Waiting for reboot")
1168+
1169+
// Handle soft-reboot
1170+
if koletRes.SoftReboot != "" {
1171+
previousRebootState = koletRes.SoftReboot
1172+
plog.Debugf("Soft-reboot request with mark='%s'", previousRebootState)
1173+
// Use the userspace timestamp we collected at the beginning of this loop iteration
1174+
// Acknowledge the soft-reboot request
1175+
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'echo > %s'", KoletRebootAckFifo))
1176+
if err != nil {
1177+
return errors.Wrapf(err, "failed to acknowledge soft-reboot")
1178+
}
1179+
plog.Debug("Waiting for soft-reboot")
1180+
err = mach.WaitForSoftReboot(120*time.Second, userspaceTimestamp)
1181+
if err != nil {
1182+
return errors.Wrapf(err, "Waiting for soft-reboot")
1183+
}
1184+
plog.Debug("Soft-reboot complete")
11591185
}
1160-
plog.Debug("Reboot complete")
11611186
}
11621187
}
11631188

mantle/platform/machine/aws/machine.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,10 @@ func (am *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
8080
return platform.WaitForMachineReboot(am, am.journal, timeout, oldBootId)
8181
}
8282

83+
func (am *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
84+
return platform.WaitForMachineSoftReboot(am, am.journal, timeout, oldUserspaceTimestamp)
85+
}
86+
8387
func (am *machine) Destroy() {
8488
origConsole, err := am.cluster.flight.api.GetConsoleOutput(am.ID())
8589
if err != nil {

mantle/platform/machine/azure/machine.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,15 @@ func (am *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
108108
return am.refetchIPs()
109109
}
110110

111+
func (am *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
112+
err := platform.WaitForMachineSoftReboot(am, am.journal, timeout, oldUserspaceTimestamp)
113+
if err != nil {
114+
return err
115+
}
116+
// For soft-reboot, IP addresses should not change, but let's refetch to be safe
117+
return am.refetchIPs()
118+
}
119+
111120
func (am *machine) Destroy() {
112121
if err := am.saveConsole(); err != nil {
113122
// log error, but do not fail to terminate instance

mantle/platform/machine/do/machine.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,10 @@ func (dm *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
7777
return platform.WaitForMachineReboot(dm, dm.journal, timeout, oldBootId)
7878
}
7979

80+
func (dm *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
81+
return platform.WaitForMachineSoftReboot(dm, dm.journal, timeout, oldUserspaceTimestamp)
82+
}
83+
8084
func (dm *machine) Destroy() {
8185
if err := dm.cluster.flight.api.DeleteDroplet(context.TODO(), dm.droplet.ID); err != nil {
8286
plog.Errorf("Error deleting droplet %v: %v", dm.droplet.ID, err)

mantle/platform/machine/esx/machine.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,10 @@ func (em *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
7878
return platform.WaitForMachineReboot(em, em.journal, timeout, oldBootId)
7979
}
8080

81+
func (em *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
82+
return platform.WaitForMachineSoftReboot(em, em.journal, timeout, oldUserspaceTimestamp)
83+
}
84+
8185
func (em *machine) Destroy() {
8286
if err := em.cluster.flight.api.TerminateDevice(em.ID()); err != nil {
8387
plog.Errorf("Error terminating device %v: %v", em.ID(), err)

mantle/platform/machine/gcloud/machine.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,10 @@ func (gm *machine) WaitForReboot(timeout time.Duration, oldBootId string) error
7878
return platform.WaitForMachineReboot(gm, gm.journal, timeout, oldBootId)
7979
}
8080

81+
func (gm *machine) WaitForSoftReboot(timeout time.Duration, oldUserspaceTimestamp string) error {
82+
return platform.WaitForMachineSoftReboot(gm, gm.journal, timeout, oldUserspaceTimestamp)
83+
}
84+
8185
func (gm *machine) Destroy() {
8286
if err := gm.saveConsole(); err != nil {
8387
plog.Errorf("Error saving console for instance %v: %v", gm.ID(), err)

0 commit comments

Comments
 (0)