@@ -170,10 +170,11 @@ def _parse_mem_limit(txt):
170170
def _make_cluster_name(job_name, image, machine_type, unique_name):
    """Return the cluster name to use for a job.

    Args:
        job_name: Name of the job being submitted.
        image: Docker image the job runs in.
        machine_type: Machine type requested for the worker nodes.
        unique_name: If truthy, ignore the other arguments and return a
            freshly generated random name.

    Returns:
        ``'l-'`` followed by a 20-character random string when
        ``unique_name`` is truthy.  Otherwise ``'c-'`` followed by the first
        20 hex digits of the MD5 digest of the job name, image, machine type
        and sparklespray version joined with hyphens, so identical inputs map
        to the same name.  The user's login is not part of the hashed key.
    """
    import hashlib

    if unique_name:
        return 'l-' + random_string(20)

    # MD5 is used only to derive a short, stable identifier from the inputs;
    # it is not relied on for any security property here.
    key = f"{job_name}-{image}-{machine_type}-{sparklespray.__version__}"
    return "c-" + hashlib.md5(key.encode("utf8")).hexdigest()[:20]
177178
178179
179180def submit (jq : JobQueue , io : IO , cluster : Cluster , job_id : str , spec : dict , config : SubmitConfig , metadata : dict = {},
@@ -253,8 +254,12 @@ def submit(jq: JobQueue, io: IO, cluster: Cluster, job_id: str, spec: dict, conf
253254 machine_specs = machine_specs ,
254255 monitor_port = monitor_port )
255256
257+ max_preemptable_attempts = 0
258+ if preemptible :
259+ max_preemptable_attempts = config .target_node_count * 2
260+
256261 jq .submit (job_id , list (zip (task_spec_urls , command_result_urls , log_urls )),
257- pipeline_spec , metadata , cluster_name , target_node_count )
262+ pipeline_spec , metadata , cluster_name , config . target_node_count , max_preemptable_attempts )
258263
259264
260265def new_job_id ():
@@ -378,6 +383,9 @@ def add_submit_cmd(subparser):
378383 "--nodes" , help = "Max number of VMs to start up to run these tasks" , type = int , default = 1 )
379384 parser .add_argument ("--cd" , help = "The directory to change to before executing the command" , default = "." ,
380385 dest = "working_dir" )
386+ parser .add_argument (
387+ "--skipifexists" , help = "If the job with this name already exists, do not submit a new one" ,
388+ action = "store_true" )
381389 parser .add_argument (
382390 "--symlinks" ,
383391 help = "When localizing files, use symlinks instead of copying files into location. This should only be used when the uploaded files will not be modified by the job." ,
@@ -386,6 +394,8 @@ def add_submit_cmd(subparser):
386394 "--local" , help = "Run the tasks inside of docker on the local machine" , action = "store_true" )
387395 parser .add_argument (
388396 "--rerun" , help = "If set, will download all of the files from previous execution of this job to worker before running" , action = "store_true" )
397+ parser .add_argument ("--preemptible" , action = "store_true" ,
398+ help = "If set, will try to turn on nodes initally as preemptible nodes" )
389399 parser .add_argument ("command" , nargs = argparse .REMAINDER )
390400 parser .add_argument ("--gpu_count" , type = int ,
391401 help = "Number of gpus on your VM" , default = 0 )
@@ -406,10 +416,15 @@ def submit_cmd(jq, io, cluster, args, config):
406416 else :
407417 image = config ['default_image' ]
408418
409- preemptible_flag = config .get ("preemptible" , "n" ).lower ()
410- if preemptible_flag not in ['y' , 'n' ]:
411- raise Exception (
412- "setting 'preemptable' in config must either by 'y' or 'n' but was: {}" .format (preemptible_flag ))
419+ if args .preemptible :
420+ preemptible = True
421+ else :
422+ preemptible_flag = config .get ("preemptible" , "n" ).lower ()
423+ if preemptible_flag not in ['y' , 'n' ]:
424+ raise Exception (
425+ "setting 'preemptible' in config must either by 'y' or 'n' but was: {}" .format (preemptible_flag ))
426+ preemptible = preemptible_flag == 'y'
427+
413428 bootDiskSizeGb = _get_bootDiskSizeGb (config )
414429 default_url_prefix = config .get ("default_url_prefix" , "" )
415430 work_dir = config .get ("local_work_dir" , os .path .expanduser (
@@ -418,7 +433,14 @@ def submit_cmd(jq, io, cluster, args, config):
418433 job_id = args .name
419434 if job_id is None :
420435 job_id = new_job_id ()
421-
436+ elif args .skipifexists :
437+ job = jq .get_job (job_id , must = False )
438+ if job is not None :
439+ txtui .user_print (
440+ f"Found existing job { job_id } and submitted job with --skipifexists so aborting" )
441+ return 0
442+
443+ target_node_count = args .nodes
422444 machine_type = config ['machine_type' ]
423445 if args .machine_type :
424446 machine_type = args .machine_type
@@ -498,7 +520,7 @@ def submit_cmd(jq, io, cluster, args, config):
498520
499521 log .debug ("spec: %s" , json .dumps (spec , indent = 2 ))
500522
501- submit_config = SubmitConfig (preemptible = preemptible_flag == 'y' ,
523+ submit_config = SubmitConfig (preemptible = preemptible ,
502524 bootDiskSizeGb = bootDiskSizeGb ,
503525 default_url_prefix = default_url_prefix ,
504526 machine_type = machine_type ,
@@ -510,7 +532,7 @@ def submit_cmd(jq, io, cluster, args, config):
510532 mount_point = config .get ("mount" , "/mnt/" ),
511533 kubequeconsume_url = kubequeconsume_exe_url ,
512534 gpu_count = gpu_count ,
513- target_node_count = args . nodes
535+ target_node_count = target_node_count
514536 )
515537
516538 cluster_name = None
@@ -539,7 +561,7 @@ def submit_cmd(jq, io, cluster, args, config):
539561 if not (args .dryrun or args .skip_kube_submit ) and args .wait_for_completion :
540562 log .info ("Waiting for job to terminate" )
541563 successful_execution = watch (
542- io , jq , job_id , cluster , target_nodes = 1 , loglive = True )
564+ io , jq , job_id , cluster , target_nodes = target_node_count , loglive = True )
543565 finished = True
544566
545567 if finished :
0 commit comments