
Commit a76e147

review updates
1 parent bf8882e commit a76e147

File tree: 4 files changed, +41 -26 lines changed

README.md

Lines changed: 0 additions & 8 deletions
@@ -15,8 +15,6 @@ Array jobs are the easiest slurm method for bundling multiple related jobs. Typi
 - python-dateutil
 - slurm
 
-NOTE: Theoretically specific versions do not matter much since the scripts are relatively simple. Thus specifically tested versions have been omitted (xxx).
-
 NOTE: None of the toolchain has been tested using any other shell besides bash. Some things likely will break.
 
 ## Installation
@@ -175,9 +173,3 @@ Utility tools (not part of the submission hierarchy):
 - freen
 - batchlim
 - tar_swarms
-
-### TODOS
-
-xxx - short term, consider doing away with bash scripts and move to python?
-
-xxx - long term, rewrite swarm in python (major task)

bin/aswarm

Lines changed: 3 additions & 0 deletions
@@ -137,9 +137,12 @@ if [[ -n "$invalid_partition" ]]; then
     echo "USAGE: aswarm -f xxx"
     echo " --partition ${p_cpu} --sbatch ' --exclude=$SLURM_EXCLUDES_CPU $dep_str ' -p xx"
     echo " --partition ${p_gpu} --sbatch ' --exclude=$SLURM_EXCLUDES_GPU --gres=gpu:$ngpu $dep_str ' -p xx"
+    # xxx - make the partition variables into arrays
+    echo " --partition CPU-72c --sbatch ' --exclude=$SLURM_EXCLUDES_CPU $dep_str ' -p xx"
     echo " use --no-run to test without submitting"
     echo " use -pt instead of -p to let slurm do the packing"
     echo " use -b instead of -p to run this number of jobs serially per node (swarm bundle)"
+    echo " use --exclusive (swarm switch, not sbatch) to request full nodes"
     exit
 fi
 
bin/create_swarm

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ else:
     valid_iterate_range = (iterate_ranges[0] > -1).all()
     if valid_iterate_range and (iterate_ranges[:,1] - iterate_ranges[:,0] < 1).any():
         with open(fn, "w") as outfile:
-            outfile.write("echo noop\n")
+            outfile.write("echo noop, Twas brillig, and the slithy toves\n")
         print("At least one range empty, writing noop swarm file")
         sys.exit()
     iterate_ranges = [iterate_ranges.copy() for x in range(nwafer_ids if nwafer_ids > 0 else 1)]
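
Note: when any iterate range is empty, create_swarm writes a one-line noop swarm so the downstream submission step still has something harmless to run; after this change the generated file contains

    echo noop, Twas brillig, and the slithy toves

(the Jabberwocky tail is presumably a distinctive marker that makes noop jobs easy to spot in slurm logs; that reading is an inference, not stated in the commit).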

bin/pipeline

Lines changed: 37 additions & 17 deletions
@@ -13,10 +13,11 @@ import argparse
 from datetime import date
 
 import os
-#import glob
+import sys
 import shlex
 import subprocess
 import socket
+import shutil
 
 # https://stackoverflow.com/questions/3503719/emulating-bash-source-in-python
 def source(fn):
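
Note: this hunk only shows the signature of source(); the approach in the linked stackoverflow answer is to run the file in a bash subshell and copy the resulting environment back into the Python process. A minimal sketch of that pattern (an illustration, not the repo's actual implementation):

    import os
    import subprocess

    def source(fn):
        # run fn in a bash subshell and NUL-separate the resulting environment,
        # then merge it back into this process (emulates bash `source`)
        out = subprocess.check_output(['bash', '-c', 'source "{}" && env -0'.format(fn)])
        for entry in out.split(b'\0'):
            if b'=' in entry:
                key, _, val = entry.partition(b'=')
                os.environ[key.decode()] = val.decode()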
@@ -71,6 +72,8 @@ parser.add_argument('--force', dest='force', action='store_true',
     help='send force flag to rolling_submit')
 parser.add_argument('--validate', dest='validate', action='store_true',
     help='check a previous run to validate true completion (special message)')
+parser.add_argument('--job-hist', dest='job_hist', action='store_true',
+    help='run jobhist on each step (saves jobhist.txt)')
 parser.add_argument('--cancel', dest='cancel', action='store_true',
     help='cancel jobs started by pipeline (bounded by start-at / stop-at)')
 parser.add_argument('--use-partition', nargs=1, type=str, default=[''],
@@ -92,6 +95,7 @@ date_str = args['date_str'][0]
 no_run = args['no_run']
 force = args['force']
 validate = args['validate']
+job_hist = args['job_hist']
 cancel = args['cancel']
 mock_run = args['mock_run']
 use_partition = args['use_partition'][0]
@@ -185,13 +189,6 @@ with open(workflow_file, 'r') as f:
         # ipack == -1, jobs are never packed (slurm packing)
         nopack = (not arg_pack) if ipack == 0 else (ipack < 0)
 
-        if nopack:
-            # have to use packing in order to use gpus.
-            # i.e., slurm can not allow jobs to share gpus (without cuda MPS management enabled).
-            assert( all([x == 0 for x in ngpus.values()])) # xxx - not allowed
-            # without packing, all the wall times have to match
-            assert( all([x == times[cuse_partition] for x in times.values()]) )
-
         # mrolling mode (chunk_size > 0) always uses all partitions specified
         use_mroll = (chunk_size > 0)
         wait_str = '-W' if use_mroll else ''
@@ -203,6 +200,13 @@ with open(workflow_file, 'r') as f:
             job_ids[name] = -1
             continue
 
+        if nopack:
+            # have to use packing in order to use gpus.
+            # i.e., slurm can not allow jobs to share gpus (without cuda MPS management enabled).
+            assert( all([x == 0 for x in ngpus.values()])) # xxx - not allowed
+            # without packing, all the wall times have to match
+            assert( all([x == times[cuse_partition] for x in times.values()]) )
+
         # get the dependencies
         deps = [x for x in deps if x]
         deps_ids = [job_ids[x] for x in deps if job_ids[x] > 0] + base_deps
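
Note: moving the nopack assertions inside the per-step loop means they are now checked against each step's own settings rather than once up front. A toy illustration of what the two assertions enforce (all values hypothetical):

    # slurm cannot share gpus between jobs without CUDA MPS, so gpus require packing
    ngpus = {'cpu_part': 0, 'cpu_72c': 0}
    assert all(x == 0 for x in ngpus.values())

    # without packing, every partition's wall time must match the current one
    times = {'cpu_part': '08:00:00', 'cpu_72c': '08:00:00'}
    cuse_partition = 'cpu_part'
    assert all(x == times[cuse_partition] for x in times.values())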
@@ -251,15 +255,17 @@ with open(workflow_file, 'r') as f:
             cmd = "mrolling_submit --wait --swarm {}".format(os.path.join('..', swarm_file))
             cmd += " --sleep_time {} --swarm_chunksize {}".format(sleep_time, chunk_size)
             if nopack or mroll_all:
-                njob_cutoff = max_array - 2*chunk_size - 1
-                # because we default to using all partitions to slurm with nopack,
-                # then we do not also need to specify the partitions to mrolling.
-                njob_cutoff_str = "--njob_cutoffs {}".format(njob_cutoff)
-                mspartition = "--partitions {}".format(partition)
                 if not nopack:
                     assert( all([x == nproc for x in nprocs.values()])) # need equal packing
                     gpu_str = ''
                     pack_str = '-p {}'.format(nproc)
+                    njob_cutoff = max_array - 2*chunk_size - 1
+                else:
+                    njob_cutoff = (max_array - 1)//npartitions - 2*chunk_size - 1
+                # because we default to using all partitions to slurm with nopack,
+                # then we do not also need to specify the partitions to mrolling.
+                njob_cutoff_str = "--njob_cutoffs {}".format(njob_cutoff)
+                mspartition = "--partitions {}".format(partition)
                 swarm_opts = (" --swarm_opts " + _swarm_opts())
             else:
                 njob_cutoff_str = "--njob_cutoffs \""
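
Note: the new else branch splits the job-array budget across partitions before subtracting the mrolling margin, where previously the packed and unpacked cases shared one cutoff. With illustrative numbers (hypothetical, not from the repo):

    max_array = 1000    # slurm job-array size limit
    chunk_size = 10     # mrolling swarm chunksize
    npartitions = 4     # partitions in use when nopack

    packed_cutoff = max_array - 2*chunk_size - 1                       # 979
    nopack_cutoff = (max_array - 1)//npartitions - 2*chunk_size - 1    # 228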
@@ -287,8 +293,9 @@ with open(workflow_file, 'r') as f:
 
         if use_mroll:
             top_swarm = 'top_' + swarm_file
-            with open(top_swarm, 'w') as fh:
-                fh.write(cmd + '\n')
+            if not (validate or cancel or job_hist):
+                with open(top_swarm, 'w') as fh:
+                    fh.write(cmd + '\n')
 
             cmd = "rolling_submit --swarms {}".format(top_swarm)
             time = mroll_time
@@ -305,13 +312,17 @@ with open(workflow_file, 'r') as f:
         print('Processing line:')
         print(' '.join(sline))
         job_id = 0
-        if not (validate or cancel):
+        if not (validate or cancel or job_hist):
             assert(os.path.isfile(swarm_file)) # swarm file missing
             job_id = echo_and_run(cmd, mock_run, cmd_no if no_run else -1)
             print(); print(); print()
         else:
             subdir = '_' + (top_swarm if top_swarm else swarm_file)
-            with open(os.path.join(subdir,'job_id.txt'), 'r') as jf:
+            fn = os.path.join(subdir, 'job_id.txt')
+            if not os.path.isfile(fn) and job_hist:
+                job_ids[name] = -1
+                continue
+            with open(fn, 'r') as jf:
                 for jline in jf:
                     cjline = jline.strip()
                     if cjline:
@@ -352,6 +363,15 @@ with open(workflow_file, 'r') as f:
                 print('\t' + cmd)
                 args = shlex.split(cmd)
                 scancel = subprocess.Popen(args); scancel.wait()
+            elif job_hist:
+                jobhist_txt = os.path.join(subdir, 'jobhist.txt')
+                jobhist_script = os.path.join(os.path.dirname(__file__), 'jobhist')
+                _job_id = os.path.join(subdir, '_' + swarm_file) if top_swarm else job_id
+                args = shlex.split("{} {} {}".format(sys.executable, jobhist_script, _job_id))
+                with open(jobhist_txt, 'w') as jh:
+                    jobhist = subprocess.Popen(args, stdout=jh, stderr=jh)
+                    jobhist.wait()
+                shutil.copyfile(jobhist_txt, subdir + '-jobhist.txt')
             else:
                 assert(False) # you should not be here
         # else - if not (validate or cancel):
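
Note: the new elif job_hist branch reduces to a common pattern: redirect a child process's stdout and stderr into a per-step log file, then keep a flat copy next to the swarm subdirectory. The same pattern in isolation (command and paths are placeholders):

    import shlex
    import shutil
    import subprocess
    import sys

    def capture_output(cmd, out_txt, flat_copy):
        # send both stdout and stderr of the child into one log file
        with open(out_txt, 'w') as fh:
            proc = subprocess.Popen(shlex.split(cmd), stdout=fh, stderr=fh)
            proc.wait()
        # duplicate the log outside the subdirectory for quick browsing
        shutil.copyfile(out_txt, flat_copy)
        return proc.returncode

    # hypothetical usage mirroring the diff:
    # capture_output("{} bin/jobhist 12345".format(sys.executable),
    #                "_myswarm/jobhist.txt", "_myswarm-jobhist.txt")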
