3333import os
3434import shutil
3535import glob
36+ import signal
3637
3738from skyhook_agent .step import Step , UpgradeStep , Idempotence , Mode , CHECK_TO_APPLY
3839from skyhook_agent import interrupts , config
3940from typing import List
4041
4142import logging as logger
4243
# Global flag to track if we received SIGTERM. It is only ever flipped to True
# by sigterm_handler and is polled by the agent between steps.
received_sigterm = False


def sigterm_handler(signum, frame):
    """Handle SIGTERM by setting a global flag and announcing the event.

    The handler deliberately does as little as possible: it records the
    request and returns, leaving the actual shutdown to the code that polls
    ``received_sigterm``.

    Args:
        signum: Signal number passed in by the interpreter (unused).
        frame: Stack frame that was interrupted (unused).
    """
    global received_sigterm
    received_sigterm = True
    # Do not go through the logging module here: the logging documentation
    # warns that it may not be usable from signal handlers. A single raw
    # write to stderr (fd 2) avoids the logging and buffered-stream machinery.
    try:
        os.write(2, b"Received SIGTERM signal - initiating graceful shutdown\n")
    except OSError:
        # stderr may be closed or redirected to something unwritable; the
        # flag is already set, so the notice is best-effort only.
        pass


# Register the SIGTERM handler
signal.signal(signal.SIGTERM, sigterm_handler)
55+
class SkyhookValidationError(Exception):
    """Raised when the agent's inputs fail validation.

    For example, it is raised when an expected config file is not found in
    the configmaps directory.
    """
4558
@@ -414,7 +427,11 @@ def remove_flags(step_data: dict[Mode, list[Step|UpgradeStep]], config_data: dic
414427 if os .path .exists (flag_file ): # Check if the file exists before trying to remove it
415428 os .remove (flag_file )
416429
417- def main (mode : Mode , root_mount : str , copy_dir : str , interrupt_data : None | str , always_run_step = False ):
430+ def main (mode : Mode , root_mount : str , copy_dir : str , interrupt_data : None | str , always_run_step = False ) -> bool :
431+ '''
432+ returns True if there is a failure in the steps, otherwise returns False
433+ '''
434+
418435 if mode not in set (map (str , Mode )):
419436 logger .warning (f"This version of the Agent doesn't support the { mode } mode. Options are: { ',' .join (map (str , Mode ))} ." )
420437 return False
@@ -448,9 +465,19 @@ def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, a
448465 if not os .path .exists (f"{ root_mount } /{ copy_dir } /configmaps/{ f } " ):
449466 raise SkyhookValidationError (f"Expected config file { f } not found in configmaps directory." )
450467
451- return agent_main (mode , root_mount , copy_dir , config_data , interrupt_data , always_run_step )
468+ try :
469+ return agent_main (mode , root_mount , copy_dir , config_data , interrupt_data , always_run_step )
470+ except Exception as e :
471+ if received_sigterm :
472+ logger .info ("Gracefully shutting down due to SIGTERM" )
473+ # Perform any cleanup if needed
474+ return True
475+ raise
452476
453- def agent_main (mode : Mode , root_mount : str , copy_dir : str , config_data : dict , interrupt_data : None | str , always_run_step = False ):
477+ def agent_main (mode : Mode , root_mount : str , copy_dir : str , config_data : dict , interrupt_data : None | str , always_run_step = False ) -> bool :
478+ '''
479+ returns True if there is a failure in the steps, otherwise returns False
480+ '''
454481
455482 # Pull out step_data so it matches with existing code
456483 step_data = config_data ["modes" ]
@@ -464,6 +491,11 @@ def agent_main(mode: Mode, root_mount: str, copy_dir: str, config_data: dict, in
464491 logger .warning (f" There are no { mode } steps defined. This will be ran as a no-op." )
465492
466493 for step in step_data .get (mode , []):
494+ # Check for SIGTERM
495+ if received_sigterm :
496+ logger .info ("SIGTERM received, stopping step execution" )
497+ return True
498+
467499 # Make the flag file without the host path argument (first one). This is because in operator world
468500 # the host path is going to change every time the Skyhook Custom Resource changes so it would
469501 # look like a step hasn't been run when it fact it had.
0 commit comments