Skip to content

Commit cf3f810

Browse files
committed
fix(agent): Hotfix for interrupts in agent v6.3.0, and updated agent python base image
1 parent fb0126b commit cf3f810

File tree

3 files changed: +39 -6 lines changed

agent/skyhook-agent/src/skyhook_agent/controller.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -465,10 +465,21 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str:
465465
print(f"Skipping interrupt {interrupt_id} because it was already run for {SKYHOOK_RESOURCE_ID}")
466466
continue
467467

468+
print(f"Writing interrupt flag {interrupt_flag}")
468469
with open(interrupt_flag, 'w') as f:
469470
f.write(str(time.time()))
471+
f.flush()
472+
os.fsync(f.fileno())
473+
print(f"Wrote interrupt flag {interrupt_flag}")
470474

475+
# If there is a reboot we need to wait for the flag file to be written
476+
# before rebooting.
477+
if interrupt.type == interrupts.NodeRestart._type():
478+
time.sleep(2)
479+
480+
print(f"Running interrupt {interrupt_id}")
471481
return_code = _run(
482+
root_mount,
472483
cmd,
473484
get_log_file(f"interrupts/{interrupt_id}", copy_dir, config_data, root_mount),
474485
write_cmds=True,
@@ -500,7 +511,9 @@ def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, a
500511
logger.warning(f"This version of the Agent doesn't support the {mode} mode. Options are: {','.join(map(str, Mode))}.")
501512
return False
502513

514+
print(f"Running mode {mode}")
503515
if mode == Mode.INTERRUPT:
516+
print(f"Running interrupt {interrupt_data}")
504517
return do_interrupt(interrupt_data, root_mount, copy_dir)
505518

506519
_, SKYHOOK_DATA_DIR, _, _ = _get_env_config()

agent/skyhook-agent/tests/test_controller.py

Lines changed: 24 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1068,8 +1068,8 @@ def test_interrupt_applies_all_commands(self, run_mock, datetime_mock):
10681068
"package_version": "version"
10691069
}
10701070
run_mock.assert_has_calls([
1071-
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True),
1072-
mock.call(["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True)
1071+
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True),
1072+
mock.call(root_dir, ["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True)
10731073
])
10741074

10751075
@mock.patch("skyhook_agent.controller._run")
@@ -1112,6 +1112,26 @@ def test_interrupt_failures_remove_flag(self, run_mock):
11121112
self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete"))
11131113
self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete"))
11141114

1115+
@mock.patch("skyhook_agent.controller._run")
1116+
def test_interrupt_calls_run_with_correct_parameters(self, run_mock):
1117+
run_mock.return_value = 0
1118+
SKYHOOK_RESOURCE_ID = "scr-id-1_package_version"
1119+
1120+
with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
1121+
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
1122+
1123+
interrupt = interrupts.ServiceRestart(["foo", "bar"])
1124+
result = controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)
1125+
1126+
self.assertEqual(result, False)
1127+
expected_calls = [
1128+
mock.call(root_dir, ["systemctl", "daemon-reload"], mock.ANY, write_cmds=True, no_chmod=True),
1129+
mock.call(root_dir, ["systemctl", "restart", "foo"], mock.ANY, write_cmds=True, no_chmod=True),
1130+
mock.call(root_dir, ["systemctl", "restart", "bar"], mock.ANY, write_cmds=True, no_chmod=True)
1131+
]
1132+
run_mock.assert_has_calls(expected_calls)
1133+
self.assertEqual(run_mock.call_count, 3)
1134+
11151135
@mock.patch("skyhook_agent.controller.datetime")
11161136
@mock.patch("skyhook_agent.controller._run")
11171137
def test_interrupt_failure_fails_controller(self, run_mock, datetime_mock):
@@ -1140,7 +1160,7 @@ def test_interrupt_failure_fails_controller(self, run_mock, datetime_mock):
11401160
"package_version": "version"
11411161
}
11421162
run_mock.assert_has_calls([
1143-
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
1163+
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
11441164
])
11451165

11461166
self.assertEqual(result, True)
@@ -1173,7 +1193,7 @@ def test_interrupt_makes_config_from_skyhook_resource_id(self, run_mock, datetim
11731193
"package_version": "version"
11741194
}
11751195
run_mock.assert_has_calls([
1176-
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
1196+
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
11771197
])
11781198

11791199
def test_interrupt_noop_makes_the_flag_file(self):

containers/agent.Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,13 +36,13 @@ RUN make build build_version=${AGENT_VERSION}
3636
# Install the wheel in the builder stage
3737
RUN python3 -m venv venv && ./venv/bin/pip install /code/skyhook-agent/dist/skyhook_agent*.whl
3838

39-
FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.13
39+
FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.15
4040

4141
ARG AGENT_VERSION
4242
ARG GIT_SHA
4343

4444
## https://github.com/opencontainers/image-spec/blob/main/annotations.md
45-
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.13" \
45+
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.15" \
4646
org.opencontainers.image.licenses="Apache-2.0" \
4747
org.opencontainers.image.title="skyhook-agent" \
4848
org.opencontainers.image.version="${AGENT_VERSION}" \

0 commit comments

Comments
 (0)