Skip to content

Commit 1044458

Browse files
committed
fix(agent): Hotfix for interrupts in agent v6.3.0, and updated agent python base image
1 parent fb0126b commit 1044458

File tree

3 files changed: +37 -11 lines changed

3 files changed: +37 -11 lines changed

agent/skyhook-agent/src/skyhook_agent/controller.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -442,10 +442,10 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str:
442442
return f"{interrupt_dir}/{interrupt_id}.complete"
443443

444444
SKYHOOK_RESOURCE_ID, _, _, _ = _get_env_config()
445-
446445
config_data = make_config_data_from_resource_id()
447446

448447
interrupt = interrupts.inflate(interrupt_data)
448+
449449
# Check if the interrupt has already been run for this particular skyhook resource
450450
interrupt_dir = f"{get_skyhook_directory(root_mount)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}"
451451
os.makedirs(interrupt_dir, exist_ok=True)
@@ -469,17 +469,23 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str:
469469
f.write(str(time.time()))
470470

471471
return_code = _run(
472+
root_mount,
472473
cmd,
473474
get_log_file(f"interrupts/{interrupt_id}", copy_dir, config_data, root_mount),
474475
write_cmds=True,
475476
no_chmod=True
476477
)
477478

478479
if return_code != 0:
479-
print(f"INTERRUPT FAILED: {cmd} return_code: {return_code}")
480-
# If this is not removed then we will skip all failing interrupts and it will look
481-
# like the interrupt was successful when it was not.
482-
os.remove(interrupt_flag)
480+
# Special case: preserve flags only for reboot with a return code of 15
481+
# (SIGTERM signal sent to the process by OS because of reboot)
482+
if not (interrupt.type == interrupts.NodeRestart._type() and return_code == 15):
483+
print(f"INTERRUPT FAILED: {cmd} return_code: {return_code}")
484+
485+
# If this is not removed then we will skip all failing interrupts and it will look
486+
# like the interrupt was successful when it was not.
487+
os.remove(interrupt_flag)
488+
483489
return True
484490

485491
return False

agent/skyhook-agent/tests/test_controller.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,8 +1068,8 @@ def test_interrupt_applies_all_commands(self, run_mock, datetime_mock):
10681068
"package_version": "version"
10691069
}
10701070
run_mock.assert_has_calls([
1071-
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True),
1072-
mock.call(["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True)
1071+
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True),
1072+
mock.call(root_dir, ["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True)
10731073
])
10741074

10751075
@mock.patch("skyhook_agent.controller._run")
@@ -1112,6 +1112,26 @@ def test_interrupt_failures_remove_flag(self, run_mock):
11121112
self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete"))
11131113
self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete"))
11141114

1115+
@mock.patch("skyhook_agent.controller._run")
1116+
def test_interrupt_calls_run_with_correct_parameters(self, run_mock):
1117+
run_mock.return_value = 0
1118+
SKYHOOK_RESOURCE_ID = "scr-id-1_package_version"
1119+
1120+
with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
1121+
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
1122+
1123+
interrupt = interrupts.ServiceRestart(["foo", "bar"])
1124+
result = controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)
1125+
1126+
self.assertEqual(result, False)
1127+
expected_calls = [
1128+
mock.call(root_dir, ["systemctl", "daemon-reload"], mock.ANY, write_cmds=True, no_chmod=True),
1129+
mock.call(root_dir, ["systemctl", "restart", "foo"], mock.ANY, write_cmds=True, no_chmod=True),
1130+
mock.call(root_dir, ["systemctl", "restart", "bar"], mock.ANY, write_cmds=True, no_chmod=True)
1131+
]
1132+
run_mock.assert_has_calls(expected_calls)
1133+
self.assertEqual(run_mock.call_count, 3)
1134+
11151135
@mock.patch("skyhook_agent.controller.datetime")
11161136
@mock.patch("skyhook_agent.controller._run")
11171137
def test_interrupt_failure_fails_controller(self, run_mock, datetime_mock):
@@ -1140,7 +1160,7 @@ def test_interrupt_failure_fails_controller(self, run_mock, datetime_mock):
11401160
"package_version": "version"
11411161
}
11421162
run_mock.assert_has_calls([
1143-
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
1163+
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
11441164
])
11451165

11461166
self.assertEqual(result, True)
@@ -1173,7 +1193,7 @@ def test_interrupt_makes_config_from_skyhook_resource_id(self, run_mock, datetim
11731193
"package_version": "version"
11741194
}
11751195
run_mock.assert_has_calls([
1176-
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
1196+
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
11771197
])
11781198

11791199
def test_interrupt_noop_makes_the_flag_file(self):

containers/agent.Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,13 @@ RUN make build build_version=${AGENT_VERSION}
3636
# Install the wheel in the builder stage
3737
RUN python3 -m venv venv && ./venv/bin/pip install /code/skyhook-agent/dist/skyhook_agent*.whl
3838

39-
FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.13
39+
FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.15
4040

4141
ARG AGENT_VERSION
4242
ARG GIT_SHA
4343

4444
## https://github.com/opencontainers/image-spec/blob/main/annotations.md
45-
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.13" \
45+
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.15" \
4646
org.opencontainers.image.licenses="Apache-2.0" \
4747
org.opencontainers.image.title="skyhook-agent" \
4848
org.opencontainers.image.version="${AGENT_VERSION}" \

0 commit comments

Comments
 (0)