Skip to content

Commit 236a707

Browse files
authored
fix(agent): hotfix for interrupts in agent v6.3.0 (#86)
1 parent 5fb44cf commit 236a707

File tree

3 files changed: 51 additions (+51) and 13 deletions (-13).

agent/skyhook-agent/src/skyhook_agent/controller.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -442,10 +442,10 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str:
442442
return f"{interrupt_dir}/{interrupt_id}.complete"
443443

444444
SKYHOOK_RESOURCE_ID, _, _, _ = _get_env_config()
445-
446445
config_data = make_config_data_from_resource_id()
447446

448447
interrupt = interrupts.inflate(interrupt_data)
448+
449449
# Check if the interrupt has already been run for this particular skyhook resource
450450
interrupt_dir = f"{get_skyhook_directory(root_mount)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}"
451451
os.makedirs(interrupt_dir, exist_ok=True)
@@ -469,17 +469,23 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str:
469469
f.write(str(time.time()))
470470

471471
return_code = _run(
472+
root_mount,
472473
cmd,
473474
get_log_file(f"interrupts/{interrupt_id}", copy_dir, config_data, root_mount),
474475
write_cmds=True,
475476
no_chmod=True
476477
)
477478

478479
if return_code != 0:
479-
print(f"INTERRUPT FAILED: {cmd} return_code: {return_code}")
480-
# If this is not removed then we will skip all failing interrupts and it will look
481-
# like the interrupt was successful when it was not.
482-
os.remove(interrupt_flag)
480+
# Special case: preserve flags only for reboot with a return code of -15
481+
# (SIGTERM signal sent to the process by OS because of reboot)
482+
if not (interrupt.type == interrupts.NodeRestart._type() and return_code == -15):
483+
print(f"INTERRUPT FAILED: {cmd} return_code: {return_code}")
484+
485+
# If this is not removed then we will skip all failing interrupts and it will look
486+
# like the interrupt was successful when it was not.
487+
os.remove(interrupt_flag)
488+
483489
return True
484490

485491
return False

agent/skyhook-agent/tests/test_controller.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,8 +1068,8 @@ def test_interrupt_applies_all_commands(self, run_mock, datetime_mock):
10681068
"package_version": "version"
10691069
}
10701070
run_mock.assert_has_calls([
1071-
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True),
1072-
mock.call(["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True)
1071+
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True),
1072+
mock.call(root_dir, ["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True)
10731073
])
10741074

10751075
@mock.patch("skyhook_agent.controller._run")
@@ -1090,7 +1090,7 @@ def test_interrupt_create_flags_per_cmd(self, run_mock):
10901090
run_mock.return_value = 0
10911091
SKYHOOK_RESOURCE_ID="scr-id-1_package_version"
10921092
with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
1093-
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
1093+
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
10941094
interrupt_dir = f"{controller.get_skyhook_directory(root_dir)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}"
10951095
interrupt = interrupts.ServiceRestart(["foo", "bar"])
10961096
controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)
@@ -1103,14 +1103,46 @@ def test_interrupt_failures_remove_flag(self, run_mock):
11031103
run_mock.side_effect = [0,1,0]
11041104
SKYHOOK_RESOURCE_ID="scr-id-1_package_version"
11051105
with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
1106-
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
1106+
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
11071107
interrupt_dir = f"{controller.get_skyhook_directory(root_dir)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}"
11081108
interrupt = interrupts.ServiceRestart(["foo", "bar"])
11091109
controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)
11101110

11111111
self.assertTrue(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_0.complete"))
11121112
self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete"))
11131113
self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete"))
1114+
1115+
@mock.patch("skyhook_agent.controller._run")
1116+
def test_interrupt_reboot_SIGTERM_preserves_flag(self, run_mock):
1117+
run_mock.side_effect = [0, -15, 0]
1118+
SKYHOOK_RESOURCE_ID="scr-id-1_package_version"
1119+
with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
1120+
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
1121+
interrupt_dir = f"{controller.get_skyhook_directory(root_dir)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}"
1122+
interrupt = interrupts.NodeRestart()
1123+
controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)
1124+
1125+
self.assertTrue(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_0.complete"))
1126+
1127+
@mock.patch("skyhook_agent.controller._run")
1128+
def test_interrupt_calls_run_with_correct_parameters(self, run_mock):
1129+
run_mock.return_value = 0
1130+
SKYHOOK_RESOURCE_ID = "scr-id-1_package_version"
1131+
1132+
with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
1133+
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
1134+
1135+
interrupt = interrupts.ServiceRestart(["foo", "bar"])
1136+
result = controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)
1137+
1138+
self.assertEqual(result, False)
1139+
expected_calls = [
1140+
mock.call(root_dir, ["systemctl", "daemon-reload"], mock.ANY, write_cmds=True, no_chmod=True),
1141+
mock.call(root_dir, ["systemctl", "restart", "foo"], mock.ANY, write_cmds=True, no_chmod=True),
1142+
mock.call(root_dir, ["systemctl", "restart", "bar"], mock.ANY, write_cmds=True, no_chmod=True)
1143+
]
1144+
run_mock.assert_has_calls(expected_calls)
1145+
self.assertEqual(run_mock.call_count, 3)
11141146

11151147
@mock.patch("skyhook_agent.controller.datetime")
11161148
@mock.patch("skyhook_agent.controller._run")
@@ -1140,7 +1172,7 @@ def test_interrupt_failure_fails_controller(self, run_mock, datetime_mock):
11401172
"package_version": "version"
11411173
}
11421174
run_mock.assert_has_calls([
1143-
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
1175+
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
11441176
])
11451177

11461178
self.assertEqual(result, True)
@@ -1173,7 +1205,7 @@ def test_interrupt_makes_config_from_skyhook_resource_id(self, run_mock, datetim
11731205
"package_version": "version"
11741206
}
11751207
run_mock.assert_has_calls([
1176-
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
1208+
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
11771209
])
11781210

11791211
def test_interrupt_noop_makes_the_flag_file(self):

containers/agent.Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,13 @@ RUN make build build_version=${AGENT_VERSION}
3636
# Install the wheel in the builder stage
3737
RUN python3 -m venv venv && ./venv/bin/pip install /code/skyhook-agent/dist/skyhook_agent*.whl
3838

39-
FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.13
39+
FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.15
4040

4141
ARG AGENT_VERSION
4242
ARG GIT_SHA
4343

4444
## https://github.com/opencontainers/image-spec/blob/main/annotations.md
45-
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.13" \
45+
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.15" \
4646
org.opencontainers.image.licenses="Apache-2.0" \
4747
org.opencontainers.image.title="skyhook-agent" \
4848
org.opencontainers.image.version="${AGENT_VERSION}" \

0 commit comments

Comments
 (0)