From dee180e42eb3533138ae42cf23a27504a676c41a Mon Sep 17 00:00:00 2001
From: Tony Kew
Date: Tue, 19 Aug 2025 16:53:07 -0400
Subject: [PATCH 01/16] Initial OpenFold container commit

Use numpy 1.x
Add libaio
x86_64 and ARM64 build working
Ran Inference tests on x86_64 and ARM64
Ran Training on 8 GPU DGX (x86_64) node:
  Completed epoch 0 training
  Restarted training from epoch 0 checkpoint AOK
Slurm examples for x86_64 and ARM64

NOTE: The following files have "raw" github URLs that will have to be fixed
for production:
  BUILD-ARM64.md
  README.md

Tony
---
 .../OpenFold/BUILD-ARM64.md                   | 214 ++++
 .../Download_OpenFold_PDB_training_set.md     | 200 ++++
 .../OpenFold/Download_model_parameters.md     | 168 +++
 .../OpenFold/EXAMPLES.md                      | 963 ++++++++++++++++++
 .../OpenFold/OpenFold-aarch64.def             | 133 +++
 .../OpenFold/OpenFold.def                     |  97 ++
 .../2_ApplicationSpecific/OpenFold/README.md  | 267 +++++
 .../OpenFold/environment-aarch64.yml          |  36 +
 .../OpenFold/environment.yml                  |  44 +
 .../slurm_GH200_OpenFold_example.bash         |  98 ++
 .../OpenFold/slurm_OpenFold_example.bash      |  95 ++
 11 files changed, 2315 insertions(+)
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/Download_OpenFold_PDB_training_set.md
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/Download_model_parameters.md
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/OpenFold.def
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/README.md
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/environment.yml
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash
 create mode 100644 containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash

diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
new file mode 100644
index 0000000..4f2b501
--- /dev/null
+++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
@@ -0,0 +1,214 @@
# Build the OpenFold container on ARM64

## Build the ARM64 container image

Start an interactive job on an ARM64 node

```
tmp_file="$(mktemp)"
salloc --partition=arm64 --qos=arm64 --constraint=ARM64 --no-shell \
  --exclusive --time=3:30:00 2>&1 | tee "${tmp_file}"
SLURM_JOB_ID="$(head -1 "${tmp_file}" | awk '{print $NF}')"
rm "${tmp_file}"
srun --jobid="${SLURM_JOB_ID}" --export=HOME,TERM,SHELL --pty /bin/bash --login
```

sample output:

> ```
> salloc: Pending job allocation 20812210
> salloc: job 20812210 queued and waiting for resources
> salloc: job 20812210 has been allocated resources
> salloc: Granted job allocation 20812210
> salloc: Waiting for resource configuration
> salloc: Nodes cpn-v14-19 are ready for job
> CCRusername@cpn-v14-19:~$
> ```

Change to your OpenFold directory

```
cd /projects/academic/[YourGroupName]/OpenFold
```

Then set the apptainer cache dir:

```
export APPTAINER_CACHEDIR=${SLURMTMPDIR}
```

Download the OpenFold ARM64 build files, OpenFold-aarch64.def and
environment-aarch64.yml, to this directory

```
curl -L -o OpenFold-aarch64.def https://raw.githubusercontent.com/tonykew/ccr-examples/refs/heads/OpenFold/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def
curl -L \
-o environment-aarch64.yml https://raw.githubusercontent.com/tonykew/ccr-examples/refs/heads/OpenFold/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml +``` + +Sample output: + +> ``` +> % Total % Received % Xferd Average Speed Time Time Time Current +> Dload Upload Total Spent Left Speed +> 100 4627 100 4627 0 0 27459 0 --:--:-- --:--:-- --:--:-- 27541 +> % Total % Received % Xferd Average Speed Time Time Time Current +> Dload Upload Total Spent Left Speed +> 100 574 100 574 0 0 3128 0 --:--:-- --:--:-- --:--:-- 3136 +> ``` + +Build your container + +Note: Building the OpenFold container takes about three hours + +``` +apptainer build OpenFold-$(arch).sif OpenFold-aarch64.def +``` + +sample truncated output: + +> ``` +> [....] +> INFO: Adding environment to container +> INFO: Creating SIF file... +> INFO: Build complete: OpenFold-aarch64.sif +> ``` + +Exit the Slurm interactive session + +``` +exit +``` + +sample output: + +> ``` +> CCRusername@login1$ +> ``` + +End the Slurm job + +``` +scancel "${SLURM_JOB_ID}" +unset SLURM_JOB_ID +``` + +## Running the container + +Start an interactive job on a node with a Grace Hopper GPU e.g. + +``` +tmp_file="$(mktemp)" +salloc --partition=arm64 --qos=arm64 --constraint=ARM64 --no-shell \ + --time=1:00:00 --nodes=1 --tasks-per-node=1 --cpus-per-task=4 \ + --gpus-per-node=1 --constraint="GH200" --mem=90G 2>&1 | tee "${tmp_file}" +SLURM_JOB_ID="$(head -1 "${tmp_file}" | awk '{print $NF}')" +rm "${tmp_file}" +srun --jobid="${SLURM_JOB_ID}" --export=HOME,TERM,SHELL --pty /bin/bash --login +``` + +sample outout: + +> ``` +> salloc: Pending job allocation 20815431 +> salloc: job 20815431 queued and waiting for resources +> salloc: job 20815431 has been allocated resources +> salloc: Granted job allocation 20815431 +> salloc: Waiting for resource configuration +> salloc: Nodes cpn-v14-19 are ready for job +> ``` + +Change to your OpenFold` directory + +``` +cd /projects/academic/[YourGroupName]/OpenFold +``` + +Create the output base directory + +``` +mkdir -p ./output +``` + +...then start the OpenFold container instance + +``` +apptainer shell \ + -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ + -B /util/software/data/OpenFold:/data \ + -B /util/software/data/alphafold:/database \ + -B /util/software/data/OpenFold/openfold_params:/opt/openfold/openfold/resources/openfold_params \ + -B /util/software/data/alphafold/params:/opt/openfold/openfold/resources/params \ + -B $(pwd)/output:/output \ + --nv \ + OpenFold-$(arch).sif +``` + +expected output: + +> ``` +> Apptainer> +> ``` + +All the following commands are run from the "Apptainer> " prompt + +Verify OpenFold is installed: + +``` +export TRITON_CACHE_DIR="${SLURMTMPDIR}" +python3 "${OF_DIR}/train_openfold.py" --help +``` + +Note: There may be no output for over half a minute + +Abridged sample output: + +> ``` +> [2025-08-22 11:47:24,610] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> [WARNING] async_io requires the dev libaio .so object and headers but these were not found. 
+> [WARNING] async_io: please install the libaio-dev package with apt +> [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. +> /opt/conda/lib/python3.10/site-packages/deepspeed-0.14.5+unknown-py3.10.egg/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed-0.14.5+unknown-py3.10.egg/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> usage: train_openfold.py [-h] [--train_mmcif_data_cache_path TRAIN_MMCIF_DATA_CACHE_PATH] [--use_single_seq_mode USE_SINGLE_SEQ_MODE] +> [--distillation_data_dir DISTILLATION_DATA_DIR] [--distillation_alignment_dir DISTILLATION_ALIGNMENT_DIR] [--val_data_dir VAL_DATA_DIR] +> [--val_alignment_dir VAL_ALIGNMENT_DIR] [--val_mmcif_data_cache_path VAL_MMCIF_DATA_CACHE_PATH] [--kalign_binary_path KALIGN_BINARY_PATH] +> [--train_filter_path TRAIN_FILTER_PATH] [--distillation_filter_path DISTILLATION_FILTER_PATH] +> [--obsolete_pdbs_file_path OBSOLETE_PDBS_FILE_PATH] [--template_release_dates_cache_path TEMPLATE_RELEASE_DATES_CACHE_PATH] +> [--use_small_bfd USE_SMALL_BFD] [--seed SEED] [--deepspeed_config_path DEEPSPEED_CONFIG_PATH] [--checkpoint_every_epoch] +> [--early_stopping EARLY_STOPPING] [--min_delta MIN_DELTA] [--patience PATIENCE] [--resume_from_ckpt RESUME_FROM_CKPT] +> [--resume_model_weights_only RESUME_MODEL_WEIGHTS_ONLY] [--resume_from_jax_params RESUME_FROM_JAX_PARAMS] +> [--log_performance LOG_PERFORMANCE] [--wandb] [--experiment_name EXPERIMENT_NAME] [--wandb_id WANDB_ID] [--wandb_project WANDB_PROJECT] +> [--wandb_entity WANDB_ENTITY] [--script_modules SCRIPT_MODULES] [--train_chain_data_cache_path TRAIN_CHAIN_DATA_CACHE_PATH] +> [--distillation_chain_data_cache_path DISTILLATION_CHAIN_DATA_CACHE_PATH] [--train_epoch_len TRAIN_EPOCH_LEN] [--log_lr] +> [--config_preset CONFIG_PRESET] [--_distillation_structure_index_path _DISTILLATION_STRUCTURE_INDEX_PATH] +> [--alignment_index_path ALIGNMENT_INDEX_PATH] [--distillation_alignment_index_path DISTILLATION_ALIGNMENT_INDEX_PATH] +> [--experiment_config_json EXPERIMENT_CONFIG_JSON] [--gpus GPUS] [--mpi_plugin] [--num_nodes NUM_NODES] [--precision PRECISION] +> [--max_epochs MAX_EPOCHS] [--log_every_n_steps LOG_EVERY_N_STEPS] [--flush_logs_every_n_steps FLUSH_LOGS_EVERY_N_STEPS] +> [--num_sanity_val_steps NUM_SANITY_VAL_STEPS] [--reload_dataloaders_every_n_epochs RELOAD_DATALOADERS_EVERY_N_EPOCHS] +> [--accumulate_grad_batches ACCUMULATE_GRAD_BATCHES] +> train_data_dir train_alignment_dir template_mmcif_dir output_dir max_template_date +> [...] 
+> ``` + +Exit the Apptainer container instance + +``` +exit +``` + +sample outout: + +> ``` +> CCRusername@cpn-v14-19$ +> ``` + +End the Slurm job + +``` +scancel "${SLURM_JOB_ID}" +unset SLURM_JOB_ID +``` + diff --git a/containers/2_ApplicationSpecific/OpenFold/Download_OpenFold_PDB_training_set.md b/containers/2_ApplicationSpecific/OpenFold/Download_OpenFold_PDB_training_set.md new file mode 100644 index 0000000..026bcd1 --- /dev/null +++ b/containers/2_ApplicationSpecific/OpenFold/Download_OpenFold_PDB_training_set.md @@ -0,0 +1,200 @@ +# OpenFold OpenFold PDB training set data + +NOTE: DO NOT do this at CCR unless the copy of the processed files in +/util/software/data/OpenFold/ do not satisfy your needs. + +The following instructions take about two days to complete and you will need +about 2TB of storage space for the downloads, though this reduces to about +1.5TB once some pre-processed files are removed. + + +## Download OpenFold PDB training set from RODA + +Change to your OpenFold directory + +``` +cd /projects/academic/[YourGroupName]/OpenFold +``` + +Start the container + +``` +apptainer shell \ + -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ + -B /util/software/data/OpenFold:/data \ + -B /util/software/data/alphafold:/database \ + -B /util/software/data/OpenFold/openfold_params:/opt/openfold/openfold/resources/openfold_params \ + -B /util/software/data/alphafold/params:/opt/openfold/openfold/resources/params \ + -B $(pwd)/output:/output \ + --nv \ + OpenFold-$(arch).sif +``` + +expected output: + +> ``` +> Apptainer> +> ``` + +All the following commands are run from the "Apptainer>" prompt. + +Following the download example [here](https://openfold.readthedocs.io/en/latest/OpenFold_Training_Setup.html) + +Download alignments corresponding to the original PDB training set of OpenFold +and their mmCIF 3D structures. + +``` +mkdir -p alignment_data/alignment_dir_roda +aws s3 cp s3://openfold/pdb/ alignment_data/alignment_dir_roda/ --recursive --no-sign-request +mkdir -p pdb_data +aws s3 cp s3://openfold/pdb_mmcif.zip pdb_data/ --no-sign-request +aws s3 cp s3://openfold/duplicate_pdb_chains.txt pdb_data/ --no-sign-request +unzip pdb_data/pdb_mmcif.zip -d pdb_data +${OF_DIR}/scripts/flatten_roda.sh alignment_data/alignment_dir_roda alignment_data/ && \ + rm -r alignment_data/alignment_dir_roda +`` + + +Highly truncated output: + +> ``` +> [...] +> inflating: pdb_data/mmcif_files/3n25.cif +> inflating: pdb_data/mmcif_files/5bpe.cif +> inflating: pdb_data/obsolete.dat +> ``` + + +## Creating alignment DBs + +``` +python ${OF_DIR}/scripts/alignment_db_scripts/create_alignment_db_sharded.py \ + alignment_data/alignments \ + alignment_data/alignment_dbs \ + alignment_db \ + --n_shards 10 \ + --duplicate_chains_file pdb_data/duplicate_pdb_chains.txt +``` + +sample output: + +> ``` +> Getting chain directories... +> 131487it [00:01, 93532.58it/s] +> Creating 10 alignment-db files... +> +> Created all shards. +> Extending super index with duplicate chains... +> Added 502947 duplicate chains to index. +> +> Writing super index... +> Done. +> ``` + +Verify the alighnemt DBs + +``` +grep "files" alignment_data/alignment_dbs/alignment_db.index | wc -l +``` + +Expected output: + +> ``` +> 634434 +> ``` + + +## Generating cluster-files + +Generate a .fasta file of all sequences in the training set. 
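
Once the command below finishes, an optional sanity check is to count the
FASTA records it wrote. This check is a suggestion, not part of the original
workflow; it only assumes the output path used by the command below:

```
# Each chain contributes one ">" header line, so this counts sequences
grep -c "^>" alignment_data/all-seqs.fasta
```

The count should roughly match the number of chains reported for the
alignment-db index in the earlier verification step.
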
+ +``` +python ${OF_DIR}/scripts/alignment_data_to_fasta.py \ + alignment_data/all-seqs.fasta \ + --alignment_db_index alignment_data/alignment_dbs/alignment_db.index +``` + +Sample output: + +> ``` +> Creating FASTA from alignment dbs... +> 100%|█████████████████████████████████| 634434/634434 [40:03<00:00, 263.97it/s] +> FASTA file written to alignment_data/all-seqs.fasta. +> ``` + +Generate a cluster file at 40% sequence identity, which will contain all +chains in a particular cluster on the same line. + +``` +python ${OF_DIR}/scripts/fasta_to_clusterfile.py \ + alignment_data/all-seqs.fasta \ + alignment_data/all-seqs_clusters-40.txt \ + /opt/conda/bin/mmseqs \ + --seq-id 0.4 +``` + +Sample truncated output: + +> ``` +> [...] +> rmdb _mmseqs_out_temp/585534219710102476/clu -v 3 +> +> Time for processing: 0h 0m 0s 82ms +> Reformatting output file... +> Cleaning up mmseqs2 output... +> Done! +> ``` + + +## Generating Cache files + +OpenFold requires “cache” files with metadata information for each chain. + +Download the data caches for OpenProteinSetfrom RODA + +``` +aws s3 cp s3://openfold/data_caches/ pdb_data/ --recursive --no-sign-request +``` + +Sample output: + +> ``` +> download: s3://openfold/data_caches/mmcif_cache.json to pdb_data/mmcif_cache.json +> download: s3://openfold/data_caches/chain_data_cache.json to pdb_data/chain_data_cache.json +> ``` + + +Create data caches for your own datasets. + +``` +mkdir pdb_data/data_caches +python ${OF_DIR}/scripts/generate_mmcif_cache.py \ + pdb_data/mmcif_files \ + pdb_data/data_caches/mmcif_cache.json \ + --no_workers $(nproc) +``` + +samoke output: + +> ``` +> 100%|██████████████████████████████| 185158/185158 [1:04:15<00:00, 48.03it/s] + +> ``` + +Generate chain-data-cache for filtering training samples and adjusting +per-chain sampling probabilities + +``` +python ${OF_DIR}/scripts/generate_chain_data_cache.py \ + pdb_data/mmcif_files \ + pdb_data/data_caches/chain_data_cache.json \ + --cluster_file alignment_data/all-seqs_clusters-40.txt \ + --no_workers $(nproc) +``` + +Sample output: + +> ``` +> 100%|██████████████████████████████| 185158/185158 [1:15:58<00:00, 40.62it/s] +> ``` + diff --git a/containers/2_ApplicationSpecific/OpenFold/Download_model_parameters.md b/containers/2_ApplicationSpecific/OpenFold/Download_model_parameters.md new file mode 100644 index 0000000..d2c9bc8 --- /dev/null +++ b/containers/2_ApplicationSpecific/OpenFold/Download_model_parameters.md @@ -0,0 +1,168 @@ +# OpenFold and AlphaFold 2 model parameters + +NOTE: DO NOT do this at CCR unless the copy of the files in +/util/software/data/OpenFold/ and /util/software/data/AlphaFold/ +does not satisfy your needs. 
+ +## Download the OpenFold and AlphaFold 2 model parameters + +Change to your OpenFold directory + +``` +cd /projects/academic/[YourGroupName]/OpenFold +``` + +...and make a direcory for the model parameters + +``` +mkdir -p ./resources +``` + +Start the container +``` +apptainer shell \ + -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ + -B /util/software/data/OpenFold:/data \ + -B /util/software/data/alphafold:/database \ + -B /util/software/data/OpenFold/openfold_params:/opt/openfold/openfold/resources/openfold_params \ + -B /util/software/data/alphafold/params:/opt/openfold/openfold/resources/params \ + -B $(pwd)/output:/output \ + --nv \ + OpenFold-$(arch).sif +``` + +expected output: + +> ``` +> Apptainer> +> ``` + +Create a directory for the model parameters + +``` +mkdir -p ./resources/ +``` + +## Download the OpenFold trained parameters + +``` +bash ${OF_DIR}/scripts/download_openfold_params.sh ./resources/ +``` + +Sample output: + +> ``` +> download: s3://openfold/openfold_params/LICENSE to resources/openfold_params/LICENSE +> download: s3://openfold/openfold_params/README.txt to resources/openfold_params/README.txt +> download: s3://openfold/openfold_params/finetuning_no_templ_1.pt to resources/openfold_params/finetuning_no_templ_1.pt +> download: s3://openfold/openfold_params/finetuning_5.pt to resources/openfold_params/finetuning_5.pt +> download: s3://openfold/openfold_params/finetuning_3.pt to resources/openfold_params/finetuning_3.pt +> download: s3://openfold/openfold_params/finetuning_2.pt to resources/openfold_params/finetuning_2.pt +> download: s3://openfold/openfold_params/finetuning_4.pt to resources/openfold_params/finetuning_4.pt +> download: s3://openfold/openfold_params/finetuning_ptm_2.pt to resources/openfold_params/finetuning_ptm_2.pt +> download: s3://openfold/openfold_params/finetuning_no_templ_2.pt to resources/openfold_params/finetuning_no_templ_2.pt +> download: s3://openfold/openfold_params/finetuning_ptm_1.pt to resources/openfold_params/finetuning_ptm_1.pt +> download: s3://openfold/openfold_params/finetuning_no_templ_ptm_1.pt to resources/openfold_params/finetuning_no_templ_ptm_1.pt +> download: s3://openfold/openfold_params/initial_training.pt to resources/openfold_params/initial_training.pt +> ``` + +This has downloaded the OpenFold model params to ./resources/openfold_params/ + +``` +ls -l ./resources/openfold_params/ +``` + +Sample output: + +> ``` +> total 3654208 +> -rw-rw-r-- 1 [CCRusername] nogroup 374586533 Jul 19 2022 finetuning_2.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 374586533 Jul 19 2022 finetuning_3.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 374586533 Jul 19 2022 finetuning_4.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 374586533 Jul 19 2022 finetuning_5.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 373226022 Jul 19 2022 finetuning_no_templ_1.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 373226022 Jul 19 2022 finetuning_no_templ_2.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 373259620 Jul 19 2022 finetuning_no_templ_ptm_1.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 374620131 Jul 19 2022 finetuning_ptm_1.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 374620131 Jul 19 2022 finetuning_ptm_2.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 374586533 Jul 19 2022 initial_training.pt +> -rw-rw-r-- 1 [CCRusername] nogroup 18657 Jul 19 2022 LICENSE +> -rw-rw-r-- 1 [CCRusername] nogroup 2217 Jul 19 2022 README.txt +> ``` + + +## Download the AlphaFold Deepmind model parameters + +``` +bash ${OF_DIR}/scripts/download_alphafold_params.sh ./resources/ +``` + 
+Sample output: + +> ``` +> +> 08/21 11:38:26 [NOTICE] Downloading 1 item(s) +> +> 08/21 11:38:26 [NOTICE] Allocating disk space. Use --file-allocation=none to disable it. See --file-allocation option in man page for more details. +> *** Download Progress Summary as of Thu Aug 21 11:39:29 2025 *** +> =============================================================================== +> [#5fb42b 4.5GiB/5.2GiB(86%) CN:1 DL:60MiB ETA:11s] +> FILE: ./resources//params/alphafold_params_2022-12-06.tar +> ------------------------------------------------------------------------------- +> +> [#5fb42b 5.1GiB/5.2GiB(98%) CN:1 DL:95MiB] +> 08/21 11:39:37 [NOTICE] Download complete: ./resources//params/alphafold_params_2022-12-06.tar +> +> Download Results: +> gid |stat|avg speed |path/URI +> ======+====+===========+======================================================= +> 5fb42b|OK | 78MiB/s|./resources//params/alphafold_params_2022-12-06.tar +> +> Status Legend: +> (OK):download completed. +> params_model_1.npz +> params_model_2.npz +> params_model_3.npz +> params_model_4.npz +> params_model_5.npz +> params_model_1_ptm.npz +> params_model_2_ptm.npz +> params_model_3_ptm.npz +> params_model_4_ptm.npz +> params_model_5_ptm.npz +> params_model_1_multimer_v3.npz +> params_model_2_multimer_v3.npz +> params_model_3_multimer_v3.npz +> params_model_4_multimer_v3.npz +> params_model_5_multimer_v3.npz +> LICENSE +> ``` + +This has downloaded the AlphaFold Deepmind momdel parameters to ./resources/params/ + +``` +ls -l ./resources/params/ +``` + +Sample output: + +> ``` +> total 5456991 +> -rw-rw-r-- 1 [CCRusername] nogroup 18657 Mar 23 2020 LICENSE +> -rw-rw-r-- 1 [CCRusername] nogroup 373043148 Nov 22 2022 params_model_1_multimer_v3.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 373069562 Jul 19 2021 params_model_1.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 373103340 Jul 19 2021 params_model_1_ptm.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 373043148 Nov 22 2022 params_model_2_multimer_v3.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 373069562 Jul 19 2021 params_model_2.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 373103340 Jul 19 2021 params_model_2_ptm.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 373043148 Nov 22 2022 params_model_3_multimer_v3.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 371712506 Jul 19 2021 params_model_3.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 371746284 Jul 19 2021 params_model_3_ptm.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 373043148 Nov 22 2022 params_model_4_multimer_v3.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 371712506 Jul 19 2021 params_model_4.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 371746284 Jul 19 2021 params_model_4_ptm.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 373043148 Nov 22 2022 params_model_5_multimer_v3.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 371712506 Jul 19 2021 params_model_5.npz +> -rw-rw-r-- 1 [CCRusername] nogroup 371746284 Jul 19 2021 params_model_5_ptm.npz +> ``` + diff --git a/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md b/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md new file mode 100644 index 0000000..6503bb1 --- /dev/null +++ b/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md @@ -0,0 +1,963 @@ +# OpenFold Examples + +## OpenFold example from the GitHub sources + +The following example is from the [OpenFold Inference docs](https://openfold.readthedocs.io/en/latest/Inference.html#running-alphafold-model-inference) + +Start an interactive job with a GPU e.g. 
+NOTE: OpenFold Inference only uses one GPU + +``` +salloc --cluster=ub-hpc --partition=general-compute --qos=general-compute \ + --account="[SlurmAccountName]" --mem=128GB --nodes=1 --cpus-per-task=1 \ + --tasks-per-node=12 --gpus-per-node=1 --time=02:00:00 +``` + +Change to your OpenFold directory + +``` +cd /projects/academic/[YourGroupName]/OpenFold +``` + +Create a top level output directory + +``` +mkdir -p ./output +``` + +Start the container, with the "./output" directory a the top level output + directory "/output" inside the contianer. + +``` +apptainer shell \ + -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ + -B /util/software/data/OpenFold:/data \ + -B /util/software/data/alphafold:/database \ + -B /util/software/data/OpenFold/openfold_params:/opt/openfold/openfold/resources/openfold_params \ + -B /util/software/data/alphafold/params:/opt/openfold/openfold/resources/params \ + -B $(pwd)/output:/output \ + --nv \ + OpenFold-$(arch).sif +``` + +expected output: + +> ``` +> Apptainer> +> ``` + +All the following commands are run from the "Apptainer>" prompt. + +The following example uses the [OpenFold model params](Download_model_parameters.md), which have +already been downloaded at CCR and are avaiable in the directory: +/util/software/data/OpenFold/openfold_params +This directory is mounted on /util/software/data/OpenFold/openfold_params +inside the contaner when using the "apptainer" command given above + +# Get the example from the OpenFold GitHub repo + +``` +git clone https://github.com/aqlaboratory/openfold.git +mv openfold//examples/ ./examples/ +rm -rf openfold +``` + +You should now have the monomer example in the ./examples/monomer/ directory + +``` +ls -l ./examples/monomer/ +``` + +Sample output: + +> ``` +> total 1 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 21 15:45 alignments +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 21 15:45 fasta_dir +> -rwxrwxr-x 1 [CCRusername] nogroup 530 Aug 21 15:45 inference.sh +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 21 15:45 sample_predictions +> ``` + +## Run model inference + +### Model inference with pre-computed alignments + +Note: this example uses "/output/PDB_6KWC/pre-computed_alignments" as the +output directory; outside the container this is the directory: +"./output/PDB_6KWC/pre-computed_alignments" + + +``` +export TRITON_CACHE_DIR="${SLURMTMPDIR}" +mkdir -p /output/PDB_6KWC/pre-computed_alignments +python3 "${OF_DIR}/run_pretrained_openfold.py" \ + --hhblits_binary_path "/opt/conda/bin/hhblits" \ + --hmmsearch_binary_path "/opt/conda/bin/hhsearch" \ + --hmmbuild_binary_path "/opt/conda/bin/hmmbuild" \ + --kalign_binary_path "/opt/conda/bin/kalign" \ + --model_device cuda \ + --data_random_seed $(((RANDOM<<15)|(RANDOM + 1))) \ + --use_precomputed_alignments "./examples/monomer/alignments" \ + --output_dir "/output/PDB_6KWC/pre-computed_alignments" \ + --config_preset model_1_ptm \ + --jax_param_path "${OF_DIR}/openfold/resources/params/params_model_1_ptm.npz" \ + "./examples/monomer/fasta_dir" \ + "/data/pdb_data/mmcif_files" +``` + +Sample output: + +> ``` +> [2025-08-25 14:49:35,606] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. 
While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> INFO:/opt/openfold/openfold/utils/script_utils.py:Successfully loaded JAX parameters at /opt/openfold/openfold/resources/params/params_model_1_ptm.npz... +> INFO:/opt/openfold/run_pretrained_openfold.py:Using precomputed alignments for 6KWC_1 at ./examples/monomer/alignments... +> INFO:/opt/openfold/openfold/utils/script_utils.py:Running inference for 6KWC_1... +> INFO:/opt/openfold/openfold/utils/script_utils.py:Inference time: 56.51587692601606 +> INFO:/opt/openfold/run_pretrained_openfold.py:Output written to /output/PDB_6KWC/pre-computed_alignments/predictions/6KWC_1_model_1_ptm_unrelaxed.pdb... +> INFO:/opt/openfold/run_pretrained_openfold.py:Running relaxation on /output/PDB_6KWC/pre-computed_alignments/predictions/6KWC_1_model_1_ptm_unrelaxed.pdb... +> INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxation time: 10.501414217054844 +> INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxed output written to /output/PDB_6KWC/pre-computed_alignments/predictions/6KWC_1_model_1_ptm_relaxed.pdb... +> ``` + +The output for the run is in the PDB_6KWC/pre-computed_alignments directory tree + +``` +ls -laR /output/PDB_6KWC/pre-computed_alignments +``` + +Sample output: + +> ``` +> /output/PDB_6KWC/pre-computed_alignments: +> total 1 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:50 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:49 .. +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:51 predictions +> -rw-rw-r-- 1 [CCRusername] nogroup 44 Aug 25 14:50 timings.json +> +> /output/PDB_6KWC/pre-computed_alignments/predictions: +> total 344 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:51 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:50 .. 
+> -rw-rw-r-- 1 [CCRusername] nogroup 230149 Aug 25 14:51 6KWC_1_model_1_ptm_relaxed.pdb +> -rw-rw-r-- 1 [CCRusername] nogroup 120528 Aug 25 14:50 6KWC_1_model_1_ptm_unrelaxed.pdb +> -rw-rw-r-- 1 [CCRusername] nogroup 34 Aug 25 14:51 timings.json +> ``` + + +### Model inference without pre-computed alignments + +Note: jackhmmer and nhmmer don't scale beyond 8 cores, henec the "--cpu" option +is set to 8 rather than $(nproc) + +``` +export TRITON_CACHE_DIR="${SLURMTMPDIR}" +mkdir -p /output/PDB_6KWC/without_pre-computed_alignments +python3 "${OF_DIR}/run_pretrained_openfold.py" \ + --hhblits_binary_path "/opt/conda/bin/hhblits" \ + --hmmsearch_binary_path "/opt/conda/bin/hhsearch" \ + --hmmbuild_binary_path "/opt/conda/bin/hmmbuild" \ + --kalign_binary_path "/opt/conda/bin/kalign" \ + --uniref90_database_path "/database/uniref90/uniref90.fasta" \ + --mgnify_database_path "/database/mgnify/mgy_clusters_2022_05.fa" \ + --pdb70_database_path "/database/pdb70/pdb70" \ + --uniclust30_database_path "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \ + --bfd_database_path "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \ + --cpus 8 \ + --model_device cuda \ + --data_random_seed $(((RANDOM<<15)|(RANDOM + 1))) \ + --output_dir "/output/PDB_6KWC/without_pre-computed_alignments" \ + --config_preset model_1_ptm \ + --jax_param_path "${OF_DIR}/openfold/resources/params/params_model_1_ptm.npz" \ + "./examples/monomer/fasta_dir" \ + "/data/pdb_data/mmcif_files" +``` + +Sample output: + +> ``` +> [2025-08-26 09:33:00,593] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> INFO:/opt/openfold/openfold/utils/script_utils.py:Successfully loaded JAX parameters at /opt/openfold/openfold/resources/params/params_model_1_ptm.npz... +> INFO:/opt/openfold/run_pretrained_openfold.py:Generating alignments for 6KWC_1... +> INFO:/opt/openfold/openfold/utils/script_utils.py:Running inference for 6KWC_1... +> INFO:/opt/openfold/openfold/utils/script_utils.py:Inference time: 58.297142078052275 +> INFO:/opt/openfold/run_pretrained_openfold.py:Output written to /output/PDB_6KWC/without_pre-computed_alignments/predictions/6KWC_1_model_1_ptm_unrelaxed.pdb... +> INFO:/opt/openfold/run_pretrained_openfold.py:Running relaxation on /output/PDB_6KWC/without_pre-computed_alignments/predictions/6KWC_1_model_1_ptm_unrelaxed.pdb... +> INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxation time: 10.48616563109681 +> INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxed output written to /output/PDB_6KWC/without_pre-computed_alignments/predictions/6KWC_1_model_1_ptm_relaxed.pdb... 
+> ``` + +The output for the run is in the PDB_6KWC/without_pre-computed_alignments directory tree + +``` +ls -laR /output/PDB_6KWC/without_pre-computed_alignments +``` + +Sample output: + +> ``` +> /output/PDB_6KWC/without_pre-computed_alignments: +> total 1 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:52 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:35 .. +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:33 alignments +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:53 predictions +> -rw-rw-r-- 1 [CCRusername] nogroup 45 Aug 26 09:52 timings.json +> +> /output/PDB_6KWC/without_pre-computed_alignments/alignments: +> total 0 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:33 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:52 .. +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:51 6KWC_1 +> +> /output/PDB_6KWC/without_pre-computed_alignments/alignments/6KWC_1: +> total 7028 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:51 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:33 .. +> -rw-rw-r-- 1 [CCRusername] nogroup 397302 Aug 26 09:51 bfd_uniclust_hits.a3m +> -rw-rw-r-- 1 [CCRusername] nogroup 136021 Aug 26 09:38 hhsearch_output.hhr +> -rw-rw-r-- 1 [CCRusername] nogroup 1972569 Aug 26 09:48 mgnify_hits.sto +> -rw-rw-r-- 1 [CCRusername] nogroup 4689644 Aug 26 09:38 uniref90_hits.sto +> +> /output/PDB_6KWC/without_pre-computed_alignments/predictions: +> total 344 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:53 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:52 .. +> -rw-rw-r-- 1 [CCRusername] nogroup 230149 Aug 26 09:53 6KWC_1_model_1_ptm_relaxed.pdb +> -rw-rw-r-- 1 [CCRusername] nogroup 120528 Aug 26 09:52 6KWC_1_model_1_ptm_unrelaxed.pdb +> -rw-rw-r-- 1 [CCRusername] nogroup 33 Aug 26 09:53 timings.json +> ``` + + +Note: Other possible options for "run_pretrained_openfold.py" + +> ``` +> --pdb_seqres_database_path "/database/pdb_seqres/pdb_seqres.txt" \ +> --uniref30_database_path "/database/uniref30/UniRef30_2021_03" \ +> --uniprot_database_path "/database/uniprot/uniprot.fasta" \ +> --max_template_date MAX_TEMPLATE_DATE \ +> --obsolete_pdbs_path OBSOLETE_PDBS_PATH \ +> --model_device MODEL_DEVICE \ +> --config_preset CONFIG_PRESET \ +> --openfold_checkpoint_path OPENFOLD_CHECKPOINT_PATH \ +> --save_outputs \ +> --preset {reduced_dbs,full_dbs} \ +> --output_postfix OUTPUT_POSTFIX \ +> --skip_relaxation \ +> --multimer_ri_gap MULTIMER_RI_GAP \ +> --trace_model \ +> --subtract_plddt \ +> --long_sequence_inference \ +> --cif_output \ +> --experiment_config_json EXPERIMENT_CONFIG_JSON \ +> --use_deepspeed_evoformer_attention \ +> --release_dates_path RELEASE_DATES_PATH \ +> ``` + + +# Multi GPU example using the OpenFold PDB training set from RODA + +Start an interactive job with more than one GPU e.g. 
+ +``` +salloc --cluster=ub-hpc --account="[SlurmAccountName]" \ + --partition=industry-dgx --qos=industry-dgx --mem=128GB --nodes=1 \ + --gpus-per-node=8 --mem=0 --exclusive --time=3-00:00:00 +``` + +sample outout: + +> ``` +> salloc: Pending job allocation 21070582 +> salloc: job 21070582 queued and waiting for resources +> salloc: job 21070582 has been allocated resources +> salloc: Granted job allocation 21070582 +> salloc: Nodes cpn-i09-04 are ready for job +> CCRusername@cpn-i09-04:~$ +> ``` + +In this case the node allocated has eight H100 GPUs with 80GB RAM each + +``` +nvidia-smi -L +``` + +output: + +> ```` +> GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-e5f404f3-cc2a-cf0c-219f-dcf1a4e223f2) +> GPU 1: NVIDIA H100 80GB HBM3 (UUID: GPU-96601a91-e977-7a71-a188-8df4aff2fbcc) +> GPU 2: NVIDIA H100 80GB HBM3 (UUID: GPU-c4a62918-26ce-f10c-a009-dd3b2e069ac2) +> GPU 3: NVIDIA H100 80GB HBM3 (UUID: GPU-7b286e42-7f9d-a8e8-501c-14b0663b8440) +> GPU 4: NVIDIA H100 80GB HBM3 (UUID: GPU-a9038bc9-da63-7f95-edb6-9857e428acbc) +> GPU 5: NVIDIA H100 80GB HBM3 (UUID: GPU-347ee5de-5ad5-fdea-3c1b-41ba332b066e) +> GPU 6: NVIDIA H100 80GB HBM3 (UUID: GPU-558a69d7-ed47-fd4c-be72-4308fefe6876) +> GPU 7: NVIDIA H100 80GB HBM3 (UUID: GPU-f65a6ec2-ce6f-ba6c-d4c2-8ca414d6e709) +> ```` + +Change to your OpenFold directory + +``` +cd /projects/academic/[YourGroupName]/OpenFold +``` + +Create an output directory + +``` +mkdir -p ./output +``` + +Start the container, with the "./output" directory as the output directory. +Note: You can change the /output mount: "-B $(pwd)/output:/output" to use an +alternate output directory + +``` +apptainer shell \ + -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ + -B /util/software/data/OpenFold:/data \ + -B /util/software/data/alphafold:/database \ + -B /util/software/data/OpenFold/openfold_params:/opt/openfold/openfold/resources/openfold_params \ + -B /util/software/data/alphafold/params:/opt/openfold/openfold/resources/params \ + -B $(pwd)/output:/output \ + --nv \ + OpenFold-$(arch).sif +``` + +expected output: + +> ``` +> Apptainer> +> ``` + +All the following commands are run from the "Apptainer>" prompt. + +The following example uses the OpenFold PDB training set from RODA, which was +downloaded and processed for use already, and is available at CCR in the +directory: /util/software/data/OpenFold/ +This directory is about 1.5TB in size + +The process to download & process this data is documented in in the +[Download_OpenFold_PDB_training_set.md](Download_OpenFold_PDB_training_set.md) file. +This process takes several days to complete. Do NOT follow the instrctions +therein unless the CCR copy does not work for your use case, and you have +sufficient storage space for the files. + + +NOTE: The "--seed" option below uses a random number utilizing the ${RANDOM} +bash variable to generate an integer in the 1 to 2^32 range. You should +expect to generate different loss values for the same parameters and data, +with multiple runs. If you use the same seed for multiple runs you should +generate the same loss values (this can be used for reproducibility.) 
+ +``` +export TRITON_CACHE_DIR="${SLURMTMPDIR}" +mkdir -p /output/PDB/2021-10-10/ +python3 "${OF_DIR}/train_openfold.py" \ + --train_chain_data_cache_path "/data/pdb_data/data_caches/chain_data_cache.json" \ + --template_release_dates_cache_path "/data/pdb_data/data_caches/mmcif_cache.json" \ + --obsolete_pdbs_file_path "/data/pdb_data/obsolete.dat" \ + --config_preset initial_training \ + --seed $(((RANDOM<<15)|(RANDOM + 1))) \ + --num_nodes ${SLURM_NNODES} \ + --gpus $(expr ${SLURM_GPUS_ON_NODE} \* ${SLURM_NNODES}) \ + --max_epochs 1000 \ + --checkpoint_every_epoch \ + "/data/pdb_data/mmcif_files" \ + "/data/alignment_data/alignments" \ + "/data/pdb_data/mmcif_files" \ + "/output/PDB/2021-10-10" \ + "2021-10-10" +``` + +Sample output: + +> ``` +> [2025-08-26 11:53:29,143] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> [rank: 0] Seed set to 504588624 +> /opt/conda/lib/python3.10/site-packages/lightning_fabric/connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead! +> Using bfloat16 Automatic Mixed Precision (AMP) +> GPU available: True (cuda), used: True +> TPU available: False, using: 0 TPU cores +> HPU available: False, using: 0 HPUs +> You are using a CUDA device ('NVIDIA H100 80GB HBM3') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. 
For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision +> Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/8 +> [2025-08-26 11:54:03,699] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 11:54:03,705] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 11:54:03,705] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 11:54:03,706] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 11:54:03,710] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 11:54:03,710] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 11:54:03,715] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. 
+> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> [rank: 1] Seed set to 504588624 +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. 
+> def backward(ctx, grad_output): +> [rank: 2] Seed set to 504588624 +> [rank: 6] Seed set to 504588624 +> [rank: 4] Seed set to 504588624 +> [rank: 7] Seed set to 504588624 +> [rank: 5] Seed set to 504588624 +> [rank: 3] Seed set to 504588624 +> Initializing distributed: GLOBAL_RANK: 4, MEMBER: 5/8 +> Initializing distributed: GLOBAL_RANK: 6, MEMBER: 7/8 +> Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/8 +> Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/8 +> Initializing distributed: GLOBAL_RANK: 5, MEMBER: 6/8 +> Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/8 +> Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/8 +> ---------------------------------------------------------------------------------------------------- +> distributed_backend=nccl +> All distributed processes registered. Starting with 8 processes +> ---------------------------------------------------------------------------------------------------- +> +> LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision bf16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead. 
+> +> | Name | Type | Params | Mode +> ------------------------------------------------ +> 0 | model | AlphaFold | 93.2 M | train +> 1 | loss | AlphaFoldLoss | 0 | train +> ------------------------------------------------ +> 93.2 M Trainable params +> 0 Non-trainable params +> 93.2 M Total params +> 372.916 Total estimated model params size (MB) +> 4451 Modules in train mode +> 0 Modules in eval mode +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:106: Total length of `list` across ranks is zero. Please make sure this was your intention. +> Epoch 0: 0%| | 0/1250 [00:00 with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. 
Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/distogram', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/distogram_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 1. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`. +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/experimentally_resolved', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/experimentally_resolved_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/fape', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/fape_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/plddt_loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/plddt_loss_epoch', ..., logger=True)` but have no logger configured. 
You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/masked_msa', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/masked_msa_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/supervised_chi', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/supervised_chi_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/violation', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/violation_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/unscaled_loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/unscaled_loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/lddt_ca', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/drmsd_ca', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+> with torch.cuda.amp.autocast(enabled=False): +> WARNING:root:The exact sequence TTADIVVFDEISMATNYDLSVVNARLRAKHYVYIGDPAQLPAPRTLLTKGTLEPEYFNSVCRLMKTIGPDMFLGTCRRCPAEIVDTVSALVYDNKLKAHKDKSAQCFKMFYKGVITHDVSSAINRPQIGVVREFLTRNPAWRKAVFISPYNSQNAVASKILGLPTQTVDSSQGSEYDYVIFTQTTETAHSCNVNRFNVAITRAKVGILCIMSDR was not found in 7o7y_BK. Realigning the template to the actual sequence. +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> Epoch 0: 1%|█▎ | 17/1250 [01:45<2:07:09, 0.16it/s, train/loss=90.00] +> [...] +> Epoch 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1250/1250 [1:31:49<00:00, 0.23it/s, train/loss=55.40] +> ``` + +Note: This example will fail with an odd "strategy=None" error if run on a +node with only one GPU + +In the above example, I stopped the training after Ephoch 0 which created the +following checkpoint file: + +``` +ls -l /output/PDB/2021-10-10/checkpoints +``` + +> ``` +> total 1464717 +> -rw-rw-r-- 1 tkewtest nogroup 1499869690 Aug 26 13:26 /output/PDB/2021-10-10/checkpoints/0-1250.ckpt +> ``` + +Restarted the training from the checkpoint: + +``` +export TRITON_CACHE_DIR="${SLURMTMPDIR}" +python3 "${OF_DIR}/train_openfold.py" \ + --train_chain_data_cache_path "/data/pdb_data/data_caches/chain_data_cache.json" \ + --template_release_dates_cache_path "/data/pdb_data/data_caches/mmcif_cache.json" \ + --obsolete_pdbs_file_path "/data/pdb_data/obsolete.dat" \ + --config_preset initial_training \ + --seed $(((RANDOM<<15)|(RANDOM + 1))) \ + --num_nodes ${SLURM_NNODES} \ + --gpus $(expr ${SLURM_GPUS_ON_NODE} \* ${SLURM_NNODES}) \ + --max_epochs 1000 \ + --checkpoint_every_epoch \ + --resume_from_ckpt /output/PDB/2021-10-10/checkpoints/0-1250.ckpt \ + "/data/pdb_data/mmcif_files" \ + "/data/alignment_data/alignments" \ + "/data/pdb_data/mmcif_files" \ + "/output/PDB/2021-10-10" \ + "2021-10-10" +``` + +Sample output: + +``` +> [2025-08-26 15:33:33,529] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> [rank: 0] Seed set to 741092476 +> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. +> sd = torch.load(args.resume_from_ckpt) +> /opt/conda/lib/python3.10/site-packages/lightning_fabric/connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead! +> Using bfloat16 Automatic Mixed Precision (AMP) +> GPU available: True (cuda), used: True +> TPU available: False, using: 0 TPU cores +> HPU available: False, using: 0 HPUs +> You are using a CUDA device ('NVIDIA H100 80GB HBM3') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision +> Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/8 +> [2025-08-26 15:33:57,402] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 15:33:57,418] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 15:33:57,426] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 15:33:57,431] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 15:33:57,443] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 15:33:57,445] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-08-26 15:33:57,448] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. 
While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> [rank: 2] Seed set to 741092476 +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. 
+> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. +> def backward(ctx, grad_output): +> [rank: 3] Seed set to 741092476 +> [rank: 1] Seed set to 741092476 +> [rank: 5] Seed set to 741092476 +> [rank: 4] Seed set to 741092476 +> [rank: 6] Seed set to 741092476 +> [rank: 7] Seed set to 741092476 +> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. +> sd = torch.load(args.resume_from_ckpt) +> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. 
+> sd = torch.load(args.resume_from_ckpt) +> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. +> sd = torch.load(args.resume_from_ckpt) +> Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/8 +> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. +> sd = torch.load(args.resume_from_ckpt) +> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. +> sd = torch.load(args.resume_from_ckpt) +> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. +> sd = torch.load(args.resume_from_ckpt) +> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. +> sd = torch.load(args.resume_from_ckpt) +> Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/8 +> Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/8 +> Initializing distributed: GLOBAL_RANK: 4, MEMBER: 5/8 +> Initializing distributed: GLOBAL_RANK: 6, MEMBER: 7/8 +> Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/8 +> Initializing distributed: GLOBAL_RANK: 5, MEMBER: 6/8 +> ---------------------------------------------------------------------------------------------------- +> distributed_backend=nccl +> All distributed processes registered. Starting with 8 processes +> ---------------------------------------------------------------------------------------------------- +> +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:701: Checkpoint directory /output/PDB/2021-10-10/checkpoints exists and is not empty. +> Restoring states from the checkpoint path at /output/PDB/2021-10-10/checkpoints/0-1250.ckpt +> LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. 
+> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision bf16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead. +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> warnings.warn( +> +> | Name | Type | Params | Mode +> ------------------------------------------------ +> 0 | model | AlphaFold | 93.2 M | train +> 1 | loss | AlphaFoldLoss | 0 | train +> ------------------------------------------------ +> 93.2 M Trainable params +> 0 Non-trainable params +> 93.2 M Total params +> 372.916 Total estimated model params size (MB) +> 4451 Modules in train mode +> 0 Modules in eval mode +> Restored all states from the checkpoint at /output/PDB/2021-10-10/checkpoints/0-1250.ckpt +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:106: Total length of `list` across ranks is zero. Please make sure this was your intention. +> Training: | | 0/? [00:00 WARNING:root:The exact sequence ATLREQSFDEAWLFHRGDIAEGEKQSLDDSQWRQINLPHDWSIEDIPGTNSPFTADAATEVAGGFTVGGTGWYRKHFYIDAAEKGKAIAVSFDGIYMNADIWVNDRHVANHVYGYTAFELDITDYVRFGAENLIAVRVKNEGMNCRWYTGSGIYRHTFLKITNPLHFETWGTFVTTPVATADKAEVHVQSVLANTEKVTGKVILETRIVDKNNHTVARKEQLVTLDNKEKTEVGHALEVLAPQLWSIDNPYLYQVVNRLLQDDKVIDEEYISIGIRNIAFSAENGFQLNGKSMKLKGGCIHHDNGLLGAKAFDRAEERKIELLKAAGFNALRLSHNPPSIALLNACDRLGMLVIDEAFDMWRYGHYQYDYAQYFDKLWKEDLHSMVARDRNHPSVIMWSIGNEIKNKETAEIVDICRELTGFVKTLDTTRPVTAGVNSIVDATDDFLAPLDVCGYNYALNRYESDAKRHPDRIIYASESYASQAYDYWKGVEDHSWVIGDFIWTAFDYIGEASIGWCGYPLDKRIFPWNHANCGDLNLSGERRPQSYLRETLWSDAPVSHIVVTPPVPSFPLNPDKADWSVWDFPDVVDHWNFPGYEGKKMTVSVYSNCEQVELFLNGESLGKQENTADKKNTLVWEVPYAHGILKAVSYNKGGEVGTATLESAGKVEKIRLSADRTEIVADGNDLSYITLELVDSKGIRNQLAEELVAFSIEGDATIEGVGNANPMSIESFVANSRKTWRGSNLLVVRSGKSSGRIIVTAKVKALPVASITIT was not found in 6b6l_B. Realigning the template to the actual sequence. +> WARNING:root:The exact sequence VRKRVLIGLKDAPNFVMRLFTVEPGGLIDRASHPWEHEIFVLKGKLTVLKEQGEETVEEGFYIFVEPNEIHGFRNDTDSEVEFLA was not found in 6l2e_B. Realigning the template to the actual sequence. +> WARNING:root:The exact sequence MVILEVANPQEAARVLNENLLVGYFLPCKLVVYQENGTTKIGMPK was not found in 1q9u_B. Realigning the template to the actual sequence. +> WARNING:root:The exact sequence GRLGVTRNKIMTAQYECYQKIMQDPIQQAEGVYCQRTWDGWLCWNDVAAGTESMQLCPDYFQDFDPSEKVTKICDQDGNWFRHPASQRTWTNYTQCNVNT was not found in 6zho_A. Realigning the template to the actual sequence. 
+> WARNING:root:The exact sequence SSVPMTQNRNILWIMCDQLRFDYLSCYGHERLNTPNIDKLAKRGVRFTNAYVQATVXGPSRMSAYTGRYVRSHGSTQNGIPLRVGEPTLGDHLRDVGMRNVLIGKTHMRPDLDGMKRLGIDPDSEIGARVGEGGFDAFDRDDGVHPTGYRKKEPAYNDYLRHAGFQAENPWEFWANSAEGKGGENQSGWLLTHADKPARVPEEHSETAYMTRRAMEFMEAAEKDGRPWCAHLSYIKPHWPYIVPAPYHDMFGPDDVKPAVRSDEELKAAHPLFKAMTEEVYSRNFARDEVREKVIPAYMGLIKQIDDQLGQLFAFMQERGLDENTMIVFTADHGDYLGDHWMGEKYLFYEAAAKVPLIIYDPSDKADATRGTVSDALVEMIDLAPTFVDYAGGVPPMHILEGKSLLPLLHDDDSSWDRQYVFSELDYSNLPARLKLGRDIQDCRATMVFDGRYKLVEVMGFAPILFDLEVDPDELKDLGRDPSAEEVRQRLTSALDAWHRNTRQR was not found in 4upi_A. Realigning the template to the actual sequence. +> WARNING:root:The exact sequence PRGSHMASIKKPNVLILLFDDMRFDTFSYRNGPVSTPNIDALANEGTRFDQAMTSTGLASPSRAAMFTGRWGHKTGLDDNVGLYHSRLSELSLSEGSVIKRATSIGYDVSYVGKWHLGAQGPALRGANFMWGHDKDEERNGRPFTPYQTQKNVARMNAGERDKNGEKHDYYKTLPGTYADTVTAKEVNEGKLMLQNAAKSDKPFFGIVSFEQPHPPYRVPEPYASMYDYKDIKLPKNFGIKRKHKPMAQDDIWWPWHDVSHMSETDWRKAHSFYYGAIAMIDHAVGELINTAKEEGLYDDLHIILVGDQGSMLGEHNLYDKGPYAYDELMRMPLIIRDPSLEPKIINRQVSMLDIAPTLRQWMTLPLDGDEDGRSLLPLMKQGDSADAGKDDISLYAYEWYNGGWFGIRAIRTPEMKFVWNPGDSRDELYDLKNDPYEITNQIDNPKYKKQLTDLVHKMAGELNRIDDPSLTKF was not found in 6pt4_B. Realigning the template to the actual sequence. +> Epoch 1: 0%| | 0/1250 [00:00 with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. 
+> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/distogram', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/distogram_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 1. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`. +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/experimentally_resolved', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/experimentally_resolved_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/fape', ..., logger=True)` but have no logger configured. 
You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/fape_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/plddt_loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/plddt_loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/masked_msa', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/masked_msa_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/supervised_chi', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/supervised_chi_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/violation', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/violation_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/unscaled_loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/unscaled_loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/lddt_ca', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/drmsd_ca', ..., logger=True)` but have no logger configured. 
You can enable one by doing `Trainer(logger=ALogger(...))` +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. +> return fn(*args, **kwargs) +> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. +> with torch.cuda.amp.autocast(enabled=False): +> Epoch 1: 2%|█▌ | 21/1250 [01:56<1:53:48, 0.18it/s, train/loss=50.10] +> [...] +``` + +You can monitor the GPU utilization which running the training as following, +using the Slurm job id: + +e.g. from vortex: + +``` +srun --jobid="21070582" --export=HOME,TERM,SHELL --pty /bin/bash --login +``` + +Sample output: + +> ``` +> CCRusername@cpn-i09-04:~$ +> ``` + +Show the GPUs available in the Slurm job: + +``` +nvidia-smi -L +``` + +Sample output: + +> ``` +> GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-e5f404f3-cc2a-cf0c-219f-dcf1a4e223f2) +> GPU 1: NVIDIA H100 80GB HBM3 (UUID: GPU-96601a91-e977-7a71-a188-8df4aff2fbcc) +> GPU 2: NVIDIA H100 80GB HBM3 (UUID: GPU-c4a62918-26ce-f10c-a009-dd3b2e069ac2) +> GPU 3: NVIDIA H100 80GB HBM3 (UUID: GPU-7b286e42-7f9d-a8e8-501c-14b0663b8440) +> GPU 4: NVIDIA H100 80GB HBM3 (UUID: GPU-a9038bc9-da63-7f95-edb6-9857e428acbc) +> GPU 5: NVIDIA H100 80GB HBM3 (UUID: GPU-347ee5de-5ad5-fdea-3c1b-41ba332b066e) +> GPU 6: NVIDIA H100 80GB HBM3 (UUID: GPU-558a69d7-ed47-fd4c-be72-4308fefe6876) +> GPU 7: NVIDIA H100 80GB HBM3 (UUID: GPU-f65a6ec2-ce6f-ba6c-d4c2-8ca414d6e709) +> ``` + +Monitor the GPU activity: + +``` +nvidia-smi -l +``` + +Sample output: + +> ``` +> Tue Aug 26 15:49:52 2025 +> +-----------------------------------------------------------------------------------------+ +> | NVIDIA-SMI 570.133.20 Driver Version: 570.133.20 CUDA Version: 12.8 | +> |-----------------------------------------+------------------------+----------------------+ +> | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +> | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +> | | | MIG M. 
| +> |=========================================+========================+======================| +> | 0 NVIDIA H100 80GB HBM3 On | 00000000:19:00.0 Off | 0 | +> | N/A 41C P0 362W / 700W | 26980MiB / 81559MiB | 100% Default | +> | | | Disabled | +> +-----------------------------------------+------------------------+----------------------+ +> | 1 NVIDIA H100 80GB HBM3 On | 00000000:3B:00.0 Off | 0 | +> | N/A 37C P0 346W / 700W | 11751MiB / 81559MiB | 100% Default | +> | | | Disabled | +> +-----------------------------------------+------------------------+----------------------+ +> | 2 NVIDIA H100 80GB HBM3 On | 00000000:4C:00.0 Off | 0 | +> | N/A 36C P0 330W / 700W | 11751MiB / 81559MiB | 100% Default | +> | | | Disabled | +> +-----------------------------------------+------------------------+----------------------+ +> | 3 NVIDIA H100 80GB HBM3 On | 00000000:5D:00.0 Off | 0 | +> | N/A 38C P0 349W / 700W | 11751MiB / 81559MiB | 100% Default | +> | | | Disabled | +> +-----------------------------------------+------------------------+----------------------+ +> | 4 NVIDIA H100 80GB HBM3 On | 00000000:9B:00.0 Off | 0 | +> | N/A 39C P0 334W / 700W | 11751MiB / 81559MiB | 95% Default | +> | | | Disabled | +> +-----------------------------------------+------------------------+----------------------+ +> | 5 NVIDIA H100 80GB HBM3 On | 00000000:BB:00.0 Off | 0 | +> | N/A 36C P0 335W / 700W | 11751MiB / 81559MiB | 73% Default | +> | | | Disabled | +> +-----------------------------------------+------------------------+----------------------+ +> | 6 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | +> | N/A 86C P0 226W / 700W | 11751MiB / 81559MiB | 100% Default | +> | | | Disabled | +> +-----------------------------------------+------------------------+----------------------+ +> | 7 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | +> | N/A 37C P0 353W / 700W | 11511MiB / 81559MiB | 89% Default | +> | | | Disabled | +> +-----------------------------------------+------------------------+----------------------+ +> +> +-----------------------------------------------------------------------------------------+ +> | Processes: | +> | GPU GI CI PID Type Process name GPU Memory | +> | ID ID Usage | +> |=========================================================================================| +> | 0 N/A N/A 2085037 C python3 13266MiB | +> | 0 N/A N/A 2085212 C /opt/conda/bin/python3 1952MiB | +> | 0 N/A N/A 2085213 C /opt/conda/bin/python3 1952MiB | +> | 0 N/A N/A 2085214 C /opt/conda/bin/python3 1952MiB | +> | 0 N/A N/A 2085215 C /opt/conda/bin/python3 1952MiB | +> | 0 N/A N/A 2085216 C /opt/conda/bin/python3 1952MiB | +> | 0 N/A N/A 2085217 C /opt/conda/bin/python3 1952MiB | +> | 0 N/A N/A 2085218 C /opt/conda/bin/python3 1952MiB | +> | 1 N/A N/A 2085212 C /opt/conda/bin/python3 11742MiB | +> | 2 N/A N/A 2085213 C /opt/conda/bin/python3 11742MiB | +> | 3 N/A N/A 2085214 C /opt/conda/bin/python3 11742MiB | +> | 4 N/A N/A 2085215 C /opt/conda/bin/python3 11742MiB | +> | 5 N/A N/A 2085216 C /opt/conda/bin/python3 11742MiB | +> | 6 N/A N/A 2085217 C /opt/conda/bin/python3 11742MiB | +> | 7 N/A N/A 2085218 C /opt/conda/bin/python3 11502MiB | +> +-----------------------------------------------------------------------------------------+ +> ``` + diff --git a/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def b/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def new file mode 100644 index 0000000..5ee795a --- /dev/null +++ b/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def 
@@ -0,0 +1,133 @@ +Bootstrap: docker +From: nvcr.io/nvidia/pytorch:25.06-py3 + +%labels + org.opencontainers.image.authors OpenFold Team + org.opencontainers.image.source https://github.com/aqlaboratory/openfold + org.opencontainers.image.licenses Apache License 2.0 + org.opencontainers.image.base.name docker.io/nvidia/cuda:12.6.3-base-ubuntu24.04 + +%files + environment-$(arch).yml /opt/openfold/environment.yml + +%post + # Set the timezone, if unset + test -h /etc/localtime || ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime + + cp /etc/apt/sources.list /etc/apt/sources.list~ + sed -E -i 's/^# deb-src /deb-src /' /etc/apt/sources.list + apt-get -y update + + # Install man & man pages - this section can be removed if not needed + # NOTE: Do this before installing anything else so their man pages are installed + sed -e '\|/usr/share/man|s|^#*|#|g' -i /etc/dpkg/dpkg.cfg.d/excludes + DEBIAN_FRONTEND=noninteractive apt-get -y install apt-utils groff dialog man-db manpages manpages-posix manpages-dev + rm -f /usr/bin/man + dpkg-divert --quiet --remove --rename /usr/bin/man + + # O/S package updates: + DEBIAN_FRONTEND=noninteractive apt-get -y upgrade + + DEBIAN_FRONTEND=noninteractive apt-get -y install \ + tzdata \ + locales \ + unzip \ + wget \ + git \ + curl \ + jq \ + nano \ + vim \ + apt-file + + # NOTE: apt-file is generally not needed to run, but can be useful during development + apt-file update + + # These steps are necessary to configure Perl and can cause issues with Python if omitted + sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen + dpkg-reconfigure --frontend=noninteractive locales + update-locale LANG=en_US.UTF-8 + + # use all the available cores for cmake parallel builds + export CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" + + # Following build based on the Dockerfile + + # set up Miniforge + miniforge_version="23.3.1-1" + #miniforge_version="25.3.0-3" + wget -P /tmp \ + "https://github.com/conda-forge/miniforge/releases/download/${miniforge_version}/Miniforge3-$(uname)-$(uname -m).sh" + bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/conda + rm /tmp/Miniforge3-$(uname)-$(uname -m).sh + + export CONDA_PREFIX="/opt/conda" + export PATH=/opt/conda/bin:$PATH + cd /opt/conda + + mamba env update -n base --file /opt/openfold/environment.yml + mamba clean --all + + # Install deepspeed + cd /root + curl -L -o DeepSpeed-0.14.5.tar.gz https://github.com/deepspeedai/DeepSpeed/archive/refs/tags/v0.14.5.tar.gz + gzip -dc DeepSpeed-0.14.5.tar.gz | tar xf - + rm DeepSpeed-0.14.5.tar.gz + mv DeepSpeed-0.14.5/ DeepSpeed + + cd DeepSpeed + sed -i -e 's/ninja.*$/ninja==1.11.1/' \ + -e 's/nvidia-ml-py.*$/nvidia-ml-py==12.575.51/' \ + -e 's/py-cpuinfo.*$/py-cpuinfo==8.0.0/' \ + environment.yml requirements/requirements.txt + export LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" + python3 setup.py install + cd .. + rm -rf DeepSpeed + /opt/conda/bin/pip install dm-tree + + # install flash-attn + cd /root + git clone https://github.com/Dao-AILab/flash-attention.git + cd flash-attention + python3 setup.py install + cd .. + rm -rf flash-attention + + # Install dllogger + cd /root + git clone --filter=blob:none --quiet https://github.com/NVIDIA/dllogger + cd dllogger + mkdir -p requirements + /opt/conda/bin/pip freeze > requirements/requirements.txt + python3 setup.py install + cd .. 
+ rm -rf dllogger + cd /root + git clone https://github.com/aqlaboratory/openfold.git + cd openfold + cp -r openfold /opt/openfold/openfold + cp -r scripts /opt/openfold/scripts + # Note: Copying the "tests" dir is only required to run the test script + # bash /opt/openfold/scripts/run_unit_tests.sh -v tests.test_model + # This can be omitted if not running this test: + cp -r tests /opt/openfold/tests + cp run_pretrained_openfold.py /opt/openfold/run_pretrained_openfold.py + cp train_openfold.py /opt/openfold/train_openfold.py + cp setup.py /opt/openfold/setup.py + cd .. + rm -rf openfold + cd /opt/openfold + wget -q -P /opt/openfold/openfold/resources \ + https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt + python3 setup.py install + +%environment + export LANG=en_US.UTF-8 + export LD_LIBRARY_PATH="/opt/conda/lib:/usr/local/lib:${LD_LIBRARY_PATH}" + export PYTHONPATH="/opt/conda/lib/python3.10/site-packages:/opt/openfold:${PYTHONPATH}" + export PATH="/opt/conda/bin:/opt/openfold:${PATH}" + export CONDA_PREFIX="/opt/conda" + export OF_DIR="/opt/openfold" + export TRITON_CACHE_DIR="${SLURMTMPDIR:-$(test -d /scratch && echo "/scratch" || echo "/var/tmp")}" + diff --git a/containers/2_ApplicationSpecific/OpenFold/OpenFold.def b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def new file mode 100644 index 0000000..7568a6a --- /dev/null +++ b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def @@ -0,0 +1,97 @@ +Bootstrap: docker +From: nvidia/cuda:12.6.3-base-ubuntu24.04 + +%labels + org.opencontainers.image.authors OpenFold Team + org.opencontainers.image.source https://github.com/aqlaboratory/openfold + org.opencontainers.image.licenses Apache License 2.0 + org.opencontainers.image.base.name docker.io/nvidia/cuda:12.6.3-base-ubuntu24.04 + +%files + environment.yml /opt/openfold/environment.yml + +%post + # Set the timezone, if unset + test -h /etc/localtime || ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime + + cp /etc/apt/sources.list /etc/apt/sources.list~ + sed -E -i 's/^# deb-src /deb-src /' /etc/apt/sources.list + apt-get -y update + + # Install man & man pages - this section can be removed if not needed + # NOTE: Do this before installing anything else so their man pages are installed + sed -e '\|/usr/share/man|s|^#*|#|g' -i /etc/dpkg/dpkg.cfg.d/excludes + DEBIAN_FRONTEND=noninteractive apt-get -y install apt-utils groff dialog man-db manpages manpages-posix manpages-dev + rm -f /usr/bin/man + dpkg-divert --quiet --remove --rename /usr/bin/man + + # O/S package updates: + DEBIAN_FRONTEND=noninteractive apt-get -y upgrade + + DEBIAN_FRONTEND=noninteractive apt-get -y install \ + tzdata \ + locales \ + unzip \ + wget \ + git \ + curl \ + jq \ + nano \ + vim \ + apt-file + + # NOTE: apt-file is generally not needed to run, but can be useful during development + apt-file update + + # These steps are necessary to configure Perl and can cause issues with Python if omitted + sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen + dpkg-reconfigure --frontend=noninteractive locales + update-locale LANG=en_US.UTF-8 + + # use all the available cores for cmake parallel builds + export CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" + + # Following build based on the Dockerfile + + # set up Miniforge + miniforge_version="23.3.1-1" + #miniforge_version="25.3.0-3" + wget -P /tmp \ + 
"https://github.com/conda-forge/miniforge/releases/download/${miniforge_version}/Miniforge3-$(uname)-$(uname -m).sh" + bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/conda + rm /tmp/Miniforge3-$(uname)-$(uname -m).sh + + export CONDA_PREFIX="/opt/conda" + export PATH=/opt/conda/bin:$PATH + cd /opt/conda + mamba env update -n base --file /opt/openfold/environment.yml + mamba clean --all + export LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" + cd /root + git clone https://github.com/aqlaboratory/openfold.git + cd openfold + cp -r openfold /opt/openfold/openfold + cp -r scripts /opt/openfold/scripts + # Note: Copying the "tests" dir is only required to run the test script + # bash /opt/openfold/scripts/run_unit_tests.sh -v tests.test_model + # This can be omitted if not running this test: + cp -r tests /opt/openfold/tests + cp run_pretrained_openfold.py /opt/openfold/run_pretrained_openfold.py + cp train_openfold.py /opt/openfold/train_openfold.py + cp setup.py /opt/openfold/setup.py + cd .. + rm -rf openfold + cd /opt/openfold + wget -q -P /opt/openfold/openfold/resources \ + https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt + python3 setup.py install + +%environment + export LANG=en_US.UTF-8 + export LD_LIBRARY_PATH="/opt/conda/lib:/usr/local/lib:${LD_LIBRARY_PATH}" + export PYTHONPATH="/opt/conda/lib/python3.10/site-packages:/opt/openfold:${PYTHONPATH}" + export PATH="/opt/conda/bin:/opt/openfold:${PATH}" + export CONDA_PREFIX="/opt/conda" + export OF_DIR="/opt/openfold" + export TRITON_CACHE_DIR="${SLURMTMPDIR:-$(test -d /scratch && echo "/scratch" || echo "/var/tmp")}" + diff --git a/containers/2_ApplicationSpecific/OpenFold/README.md b/containers/2_ApplicationSpecific/OpenFold/README.md new file mode 100644 index 0000000..4d308b9 --- /dev/null +++ b/containers/2_ApplicationSpecific/OpenFold/README.md @@ -0,0 +1,267 @@ +# Example OpenFold container + +## Building the container + +A brief guide to building the OpenFold container follows:
+Please refer to CCR's [container documentation](https://docs.ccr.buffalo.edu/en/latest/howto/containerization/) for more detailed information on building and using Apptainer. + +NOTE: for building on the ARM64 platform see [BUILD-ARM64.md](./BUILD-ARM64.md) + +1. Start an interactive job + +Apptainer is not available on the CCR login nodes and the compile nodes may not provide enough resources for you to build a container. We recommend requesting an interactive job on a compute node to conduct this build process.
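+
+As a quick sanity check you can confirm that you are on a compute node and that
+Apptainer is available there (a hypothetical spot check, not part of the OpenFold
+build itself):
+
+```
+hostname -s             # should report a compute node (e.g. cpn-...), not a login node
+command -v apptainer    # prints the apptainer path if the command is available on this node
+```
+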
+Note: a GPU is NOT needed to build the OpenFold container
+See CCR docs for more info on [running jobs](https://docs.ccr.buffalo.edu/en/latest/hpc/jobs/#interactive-job-submission)
+
+```
+salloc --cluster=ub-hpc --partition=debug --qos=debug --account="[SlurmAccountName]" \
+  --mem=0 --exclusive --time=01:00:00
+```
+
+Sample output:
+
+> ```
+> salloc: Pending job allocation 19781052
+> salloc: job 19781052 queued and waiting for resources
+> salloc: job 19781052 has been allocated resources
+> salloc: Granted job allocation 19781052
+> salloc: Nodes cpn-i14-39 are ready for job
+> CCRusername@cpn-i14-39:~$
+> ```
+
+2. Navigate to your build directory and use the Slurm job local temporary directory for cache
+
+You should now be on the compute node allocated to you. In this example we're using our project directory for our build directory. Ensure you've placed your `OpenFold.def` file in your build directory.
+
+Change to your OpenFold directory
+
+```
+cd /projects/academic/[YourGroupName]/OpenFold
+```
+
+Download the OpenFold build files, OpenFold.def and environment.yml, to this directory
+
+```
+curl -L -o OpenFold.def https://raw.githubusercontent.com/tonykew/ccr-examples/refs/heads/OpenFold/containers/2_ApplicationSpecific/OpenFold/OpenFold.def
+curl -L -o environment.yml https://raw.githubusercontent.com/tonykew/ccr-examples/refs/heads/OpenFold/containers/2_ApplicationSpecific/OpenFold/environment.yml
+```
+
+Sample output:
+
+> ```
+>   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
+>                                  Dload  Upload   Total   Spent    Left  Speed
+> 100  3534  100  3534    0     0  63992      0 --:--:-- --:--:-- --:--:-- 64254
+>   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
+>                                  Dload  Upload   Total   Spent    Left  Speed
+> 100   767  100   767    0     0  11406      0 --:--:-- --:--:-- --:--:-- 11447
+> ```
+
+3. Build your container
+
+Set the apptainer cache dir:
+
+```
+export APPTAINER_CACHEDIR=${SLURMTMPDIR}
+```
+
+Building the OpenFold container takes about half an hour...
+
+```
+apptainer build OpenFold-$(arch).sif OpenFold.def
+```
+
+Sample truncated output:
+
+> ```
+> [....]
+> INFO:    Adding environment to container
+> INFO:    Creating SIF file...
+> INFO:    Build complete: OpenFold-x86_64.sif
+> ```
+
+## Running the container
+
+Start an interactive job with a single GPU e.g.
+NOTE: OpenFold Inference only uses one GPU + +``` +salloc --cluster=ub-hpc --partition=general-compute --qos=general-compute \ + --account="[SlurmAccountName]" --mem=128GB --nodes=1 --cpus-per-task=1 \ + --tasks-per-node=12 --gpus-per-node=1 --time=05:00:00 +``` + +Change to your OpenFold directory + +``` +cd /projects/academic/[YourGroupName]/OpenFold +``` + +Create an output directory + +``` +mkdir -p ./output +``` + +...then start the OpenFold container instance + +``` +apptainer shell \ + -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ + -B /util/software/data/OpenFold:/data \ + -B /util/software/data/alphafold:/database \ + -B /util/software/data/OpenFold/openfold_params:/opt/openfold/openfold/resources/openfold_params \ + -B /util/software/data/alphafold/params:/opt/openfold/openfold/resources/params \ + -B $(pwd)/output:/output \ + --nv \ + OpenFold-$(arch).sif +``` + +All the following commands are run from the "Apptainer> " prompt + +Verify OpenFold is installed: + +``` +export TRITON_CACHE_DIR="${SLURMTMPDIR}" +python3 "${OF_DIR}/train_openfold.py" --help +``` + +Sample output: + +> ``` +> [2025-08-18 17:02:46,110] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [WARNING] async_io requires the dev libaio .so object and headers but these were not found. +> [WARNING] async_io: please install the libaio-dev package with apt +> [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. +> def forward(ctx, input, weight, bias=None): +> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. 
+> def backward(ctx, grad_output): +> usage: train_openfold.py [-h] [--train_mmcif_data_cache_path TRAIN_MMCIF_DATA_CACHE_PATH] [--use_single_seq_mode USE_SINGLE_SEQ_MODE] +> [--distillation_data_dir DISTILLATION_DATA_DIR] [--distillation_alignment_dir DISTILLATION_ALIGNMENT_DIR] [--val_data_dir VAL_DATA_DIR] +> [--val_alignment_dir VAL_ALIGNMENT_DIR] [--val_mmcif_data_cache_path VAL_MMCIF_DATA_CACHE_PATH] [--kalign_binary_path KALIGN_BINARY_PATH] +> [--train_filter_path TRAIN_FILTER_PATH] [--distillation_filter_path DISTILLATION_FILTER_PATH] +> [--obsolete_pdbs_file_path OBSOLETE_PDBS_FILE_PATH] [--template_release_dates_cache_path TEMPLATE_RELEASE_DATES_CACHE_PATH] +> [--use_small_bfd USE_SMALL_BFD] [--seed SEED] [--deepspeed_config_path DEEPSPEED_CONFIG_PATH] [--checkpoint_every_epoch] +> [--early_stopping EARLY_STOPPING] [--min_delta MIN_DELTA] [--patience PATIENCE] [--resume_from_ckpt RESUME_FROM_CKPT] +> [--resume_model_weights_only RESUME_MODEL_WEIGHTS_ONLY] [--resume_from_jax_params RESUME_FROM_JAX_PARAMS] +> [--log_performance LOG_PERFORMANCE] [--wandb] [--experiment_name EXPERIMENT_NAME] [--wandb_id WANDB_ID] [--wandb_project WANDB_PROJECT] +> [--wandb_entity WANDB_ENTITY] [--script_modules SCRIPT_MODULES] [--train_chain_data_cache_path TRAIN_CHAIN_DATA_CACHE_PATH] +> [--distillation_chain_data_cache_path DISTILLATION_CHAIN_DATA_CACHE_PATH] [--train_epoch_len TRAIN_EPOCH_LEN] [--log_lr] +> [--config_preset CONFIG_PRESET] [--_distillation_structure_index_path _DISTILLATION_STRUCTURE_INDEX_PATH] +> [--alignment_index_path ALIGNMENT_INDEX_PATH] [--distillation_alignment_index_path DISTILLATION_ALIGNMENT_INDEX_PATH] +> [--experiment_config_json EXPERIMENT_CONFIG_JSON] [--gpus GPUS] [--mpi_plugin] [--num_nodes NUM_NODES] [--precision PRECISION] +> [--max_epochs MAX_EPOCHS] [--log_every_n_steps LOG_EVERY_N_STEPS] [--flush_logs_every_n_steps FLUSH_LOGS_EVERY_N_STEPS] +> [--num_sanity_val_steps NUM_SANITY_VAL_STEPS] [--reload_dataloaders_every_n_epochs RELOAD_DATALOADERS_EVERY_N_EPOCHS] +> [--accumulate_grad_batches ACCUMULATE_GRAD_BATCHES] +> train_data_dir train_alignment_dir template_mmcif_dir output_dir max_template_date +> +> positional arguments: +> train_data_dir Directory containing training mmCIF files +> train_alignment_dir Directory containing precomputed training alignments +> template_mmcif_dir Directory containing mmCIF files to search for templates +> output_dir Directory in which to output checkpoints, logs, etc. Ignored if not on rank 0 +> max_template_date Cutoff for all templates. In training mode, templates are also filtered by the release date of the target +> +> options: +> -h, --help show this help message and exit +> --train_mmcif_data_cache_path TRAIN_MMCIF_DATA_CACHE_PATH +> Path to the json file which records all the information of mmcif structures used during training +> --use_single_seq_mode USE_SINGLE_SEQ_MODE +> Use single sequence embeddings instead of MSAs. 
+> --distillation_data_dir DISTILLATION_DATA_DIR +> Directory containing training PDB files +> --distillation_alignment_dir DISTILLATION_ALIGNMENT_DIR +> Directory containing precomputed distillation alignments +> --val_data_dir VAL_DATA_DIR +> Directory containing validation mmCIF files +> --val_alignment_dir VAL_ALIGNMENT_DIR +> Directory containing precomputed validation alignments +> --val_mmcif_data_cache_path VAL_MMCIF_DATA_CACHE_PATH +> path to the json file which records all the information of mmcif structures used during validation +> --kalign_binary_path KALIGN_BINARY_PATH +> Path to the kalign binary +> --train_filter_path TRAIN_FILTER_PATH +> Optional path to a text file containing names of training examples to include, one per line. Used to filter the training set +> --distillation_filter_path DISTILLATION_FILTER_PATH +> See --train_filter_path +> --obsolete_pdbs_file_path OBSOLETE_PDBS_FILE_PATH +> Path to obsolete.dat file containing list of obsolete PDBs and their replacements. +> --template_release_dates_cache_path TEMPLATE_RELEASE_DATES_CACHE_PATH +> Output of scripts/generate_mmcif_cache.py run on template mmCIF files. +> --use_small_bfd USE_SMALL_BFD +> Whether to use a reduced version of the BFD database +> --seed SEED Random seed +> --deepspeed_config_path DEEPSPEED_CONFIG_PATH +> Path to DeepSpeed config. If not provided, DeepSpeed is disabled +> --checkpoint_every_epoch +> Whether to checkpoint at the end of every training epoch +> --early_stopping EARLY_STOPPING +> Whether to stop training when validation loss fails to decrease +> --min_delta MIN_DELTA +> The smallest decrease in validation loss that counts as an improvement for the purposes of early stopping +> --patience PATIENCE Early stopping patience +> --resume_from_ckpt RESUME_FROM_CKPT +> Path to a model checkpoint from which to restore training state +> --resume_model_weights_only RESUME_MODEL_WEIGHTS_ONLY +> Whether to load just model weights as opposed to training state +> --resume_from_jax_params RESUME_FROM_JAX_PARAMS +> Path to an .npz JAX parameter file with which to initialize the model +> --log_performance LOG_PERFORMANCE +> Measure performance +> --wandb Whether to log metrics to Weights & Biases +> --experiment_name EXPERIMENT_NAME +> Name of the current experiment. Used for wandb logging +> --wandb_id WANDB_ID ID of a previous run to be resumed +> --wandb_project WANDB_PROJECT +> Name of the wandb project to which this run will belong +> --wandb_entity WANDB_ENTITY +> wandb username or team name to which runs are attributed +> --script_modules SCRIPT_MODULES +> Whether to TorchScript eligible components of them model +> --train_chain_data_cache_path TRAIN_CHAIN_DATA_CACHE_PATH +> --distillation_chain_data_cache_path DISTILLATION_CHAIN_DATA_CACHE_PATH +> --train_epoch_len TRAIN_EPOCH_LEN +> The virtual length of each training epoch. Stochastic filtering of training data means that training datasets have no well-defined length. +> This virtual length affects frequency of validation & checkpointing (by default, one of each per epoch). +> --log_lr Whether to log the actual learning rate +> --config_preset CONFIG_PRESET +> Config setting. Choose e.g. "initial_training", "finetuning", "model_1", etc. By default, the actual values in the config are used. +> --_distillation_structure_index_path _DISTILLATION_STRUCTURE_INDEX_PATH +> --alignment_index_path ALIGNMENT_INDEX_PATH +> Training alignment index. See the README for instructions. 
+> --distillation_alignment_index_path DISTILLATION_ALIGNMENT_INDEX_PATH
+> Distillation alignment index. See the README for instructions.
+> --experiment_config_json EXPERIMENT_CONFIG_JSON
+> Path to a json file with custom config values to overwrite config setting
+> --gpus GPUS For determining optimal strategy and effective batch size.
+> --mpi_plugin Whether to use MPI for parallele processing
+>
+> Arguments to pass to PyTorch Lightning Trainer:
+> --num_nodes NUM_NODES
+> --precision PRECISION
+> Sets precision, lower precision improves runtime performance.
+> --max_epochs MAX_EPOCHS
+> --log_every_n_steps LOG_EVERY_N_STEPS
+> --flush_logs_every_n_steps FLUSH_LOGS_EVERY_N_STEPS
+> --num_sanity_val_steps NUM_SANITY_VAL_STEPS
+> --reload_dataloaders_every_n_epochs RELOAD_DATALOADERS_EVERY_N_EPOCHS
+> --accumulate_grad_batches ACCUMULATE_GRAD_BATCHES
+> Accumulate gradients over k batches before next optimizer step.
+> ```
+
+See the [EXAMPLES file](./EXAMPLES.md) for more info.
+
+## Sample Slurm scripts
+
+### x86_64 example
+[OpenFold Slurm example script](https://raw.githubusercontent.com/tonykew/ccr-examples/refs/heads/OpenFold/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash)
+
+### Grace Hopper (GH200) GPU - ARM64 example
+[OpenFold Grace Hopper (GH200) GPU - ARM64 Slurm example script](https://raw.githubusercontent.com/tonykew/ccr-examples/refs/heads/OpenFold/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash)
+
+## Documentation Resources
+
+For more information on OpenFold see the [OpenFold Documentation](https://openfold.readthedocs.io/en/latest) and the [OpenFold GitHub page](https://github.com/aqlaboratory/openfold)
+
+
diff --git a/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml b/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml
new file mode 100644
index 0000000..4fe1293
--- /dev/null
+++ b/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml
@@ -0,0 +1,36 @@
+name: openfold-env
+channels:
+  - conda-forge
+  - bioconda
+  - pytorch
+  - nvidia
+dependencies:
+  - cuda
+  - gcc=12.4
+  - python<3.11
+  - setuptools=59.5.0
+  - pip
+  - openmm
+  - pdbfixer
+  - pytorch-lightning
+  - biopython
+  - numpy=1.26.4
+  - pandas
+  - PyYAML
+  - requests
+  - scipy
+  - tqdm
+  - typing-extensions
+  - wandb
+  - modelcif==0.7
+  - awscli
+  - ml-collections
+  - aria2
+  - git
+  - bioconda::hmmer
+  # install libopenblas as a fix for https://github.com/bioconda/bioconda-recipes/issues/56856
+  - libopenblas
+  - libaio
+  - bioconda::hhsuite
+  - bioconda::kalign2
+  - mmseqs2
diff --git a/containers/2_ApplicationSpecific/OpenFold/environment.yml b/containers/2_ApplicationSpecific/OpenFold/environment.yml
new file mode 100644
index 0000000..3847f5d
--- /dev/null
+++ b/containers/2_ApplicationSpecific/OpenFold/environment.yml
@@ -0,0 +1,44 @@
+name: openfold-env
+channels:
+  - conda-forge
+  - bioconda
+  - pytorch
+  - nvidia
+dependencies:
+  - cuda
+  - gcc=12.4
+  - python<3.11
+  - setuptools=59.5.0
+  - pip
+  - openmm
+  - pdbfixer
+  - pytorch-lightning
+  - biopython
+  - numpy=1.26.4
+  - pandas
+  - PyYAML
+  - requests
+  - scipy
+  - tqdm
+  - typing-extensions
+  - wandb
+  - modelcif==0.7
+  - awscli
+  - ml-collections
+  - aria2
+  - mkl
+  - git
+  - bioconda::hmmer
+  # install libopenblas as a fix for https://github.com/bioconda/bioconda-recipes/issues/56856
+  - libopenblas
+  - libaio
+  - bioconda::hhsuite
+  - bioconda::kalign2
+  - mmseqs2
+  - pytorch::pytorch=2.5
+  - pytorch::pytorch-cuda=12.4
+  - pip:
+    - deepspeed==0.14.5
+
- dm-tree==0.1.6 + - git+https://github.com/NVIDIA/dllogger.git + - flash-attn diff --git a/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash new file mode 100644 index 0000000..a123195 --- /dev/null +++ b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash @@ -0,0 +1,98 @@ +#!/bin/bash -l + +## This file is intended to serve as a template to be downloaded and modified for your use case. +## For more information, refer to the following resources whenever referenced in the script- +## README- https://github.com/ubccr/ccr-examples/tree/main/slurm/README.md +## DOCUMENTATION- https://docs.ccr.buffalo.edu/en/latest/hpc/jobs + +## NOTE: This Slurm script was tested with the ccrsoft/2024.04 software release + +#SBATCH --cluster="ub-hpc" +#SBATCH --partition="arm64" +#SBATCH --qos="arm64" +## Grace Hopper GH200 GPU +#SBATCH --constraint="GH200" + +## Select the account that is appropriate for your use case +## Available options and more details are provided in CCR's documentation: +## https://docs.ccr.buffalo.edu/en/latest/hpc/jobs/#slurm-directives-partitions-qos +#SBATCH --account="[SlurmAccountName]" + +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=1 +## jackhmmer and nhmmer don't scale beyond 8 cores, so no point requesting more CPU cores +#SBATCH --cpus-per-task=12 +## This example only uses one GPU +#SBATCH --gpus-per-node=1 +#SBATCH --mem=92GB + +## Job runtime limit, the job will be canceled once this limit is reached. Format- dd-hh:mm:ss +#SBATCH --time=00:30:00 + +## change to the OpenFold directory +cd /projects/academic/[YourGroupName]/OpenFold + +## Make sure the top output directory exist +mkdir -p ./output + +############################################################################### +# OpenFold container setup +############################################################################### +if [ "${APPTAINER_NAME}" = "" ] +then + # Launch the container with this script + exec apptainer exec \ + -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ + -B /util/software/data/OpenFold:/data \ + -B /util/software/data/alphafold:/database \ + -B /util/software/data/OpenFold/openfold_params:/opt/openfold/openfold/resources/openfold_params \ + -B /util/software/data/alphafold/params:/opt/openfold/openfold/resources/params \ + -B "$(pwd)/output":/output \ + --nv \ + OpenFold-$(arch).sif \ + bash "$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')" +fi +# Inside the container - OpenFold setup: +export TRITON_CACHE_DIR="${SLURMTMPDIR}" +############################################################################### + +# You can run the same OpenFold commands you would run from +# the "Apptainer> " prompt here: + +echo "Running OpenFold on compute node: $(hostname -s)" +echo "GPU info:" +nvidia-smi -L + +# Get the example from the OpenFold GitHub repo +pushd "${SLURMTMPDIR}" > /dev/null +git clone https://github.com/aqlaboratory/openfold.git +mv openfold/examples/ ./examples/ +rm -rf openfold +popd > /dev/null + +# make the output dir for this job +mkdir -p /output/PDB_6KWC/pre-computed_alignments + +## Run the OpenFold example from the GitHub sources +python3 "${OF_DIR}/run_pretrained_openfold.py" \ + --hhblits_binary_path "/opt/conda/bin/hhblits" \ + --hmmsearch_binary_path "/opt/conda/bin/hhsearch" \ + --hmmbuild_binary_path "/opt/conda/bin/hmmbuild" \ + --kalign_binary_path "/opt/conda/bin/kalign" \ + --model_device cuda \ 
+ --data_random_seed $(((RANDOM<<15)|(RANDOM + 1))) \ + --use_precomputed_alignments "${SLURMTMPDIR}/examples/monomer/alignments" \ + --output_dir /output/PDB_6KWC/pre-computed_alignments \ + --config_preset model_1_ptm \ + --jax_param_path "${OF_DIR}/openfold/resources/params/params_model_1_ptm.npz" \ + "${SLURMTMPDIR}/examples/monomer/fasta_dir" \ + "/data/pdb_data/mmcif_files" + +echo +if [ "$?" = "0" ] +then + echo "Model inference with pre-computed alignments completed" +else + echo "Model inference with pre-computed alignments FAILED!" >&2 +fi + diff --git a/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash b/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash new file mode 100644 index 0000000..4a9092e --- /dev/null +++ b/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash @@ -0,0 +1,95 @@ +#!/bin/bash -l + +## This file is intended to serve as a template to be downloaded and modified for your use case. +## For more information, refer to the following resources whenever referenced in the script- +## README- https://github.com/ubccr/ccr-examples/tree/main/slurm/README.md +## DOCUMENTATION- https://docs.ccr.buffalo.edu/en/latest/hpc/jobs + +## NOTE: This Slurm script was tested with the ccrsoft/2024.04 software release + +## Select a cluster, partition, qos and account that is appropriate for your use case +## Available options and more details are provided in CCR's documentation: +## https://docs.ccr.buffalo.edu/en/latest/hpc/jobs/#slurm-directives-partitions-qos +#SBATCH --cluster="[cluster]" +#SBATCH --partition="[partition]" +#SBATCH --qos="[qos]" +#SBATCH --account="[SlurmAccountName]" + +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=1 +## jackhmmer and nhmmer don't scale beyond 8 cores, so no point requesting more CPU cores +#SBATCH --cpus-per-task=12 +## This example only uses one GPU +#SBATCH --gpus-per-node=1 +#SBATCH --mem=92GB + +## Job runtime limit, the job will be canceled once this limit is reached. 
Format- dd-hh:mm:ss +#SBATCH --time=01:00:00 + +## change to the OpenFold directory +cd /projects/academic/[YourGroupName]/OpenFold + +## Make sure the top output directory exist +mkdir -p ./output + +############################################################################### +# OpenFold container setup +############################################################################### +if [ "${APPTAINER_NAME}" = "" ] +then + # Launch the container with this script + exec apptainer exec \ + -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ + -B /util/software/data/OpenFold:/data \ + -B /util/software/data/alphafold:/database \ + -B /util/software/data/OpenFold/openfold_params:/opt/openfold/openfold/resources/openfold_params \ + -B /util/software/data/alphafold/params:/opt/openfold/openfold/resources/params \ + -B "$(pwd)/output":/output \ + --nv \ + OpenFold-$(arch).sif \ + bash "$(scontrol show job $SLURM_JOB_ID | awk -F= '/Command=/{print $2}')" +fi +# Inside the container - OpenFold setup: +export TRITON_CACHE_DIR="${SLURMTMPDIR}" +############################################################################### + +# You can run the same OpenFold commands you would run from +# the "Apptainer> " prompt here: + +echo "Running OpenFold on compute node: $(hostname -s)" +echo "GPU info:" +nvidia-smi -L + +# Get the example from the OpenFold GitHub repo +pushd "${SLURMTMPDIR}" > /dev/null +git clone https://github.com/aqlaboratory/openfold.git +mv openfold/examples/ ./examples/ +rm -rf openfold +popd > /dev/null + +# make the output dir for this job +mkdir -p /output/PDB_6KWC/pre-computed_alignments + +## Run the OpenFold example from the GitHub sources +python3 "${OF_DIR}/run_pretrained_openfold.py" \ + --hhblits_binary_path "/opt/conda/bin/hhblits" \ + --hmmsearch_binary_path "/opt/conda/bin/hhsearch" \ + --hmmbuild_binary_path "/opt/conda/bin/hmmbuild" \ + --kalign_binary_path "/opt/conda/bin/kalign" \ + --model_device cuda \ + --data_random_seed $(((RANDOM<<15)|(RANDOM + 1))) \ + --use_precomputed_alignments "${SLURMTMPDIR}/examples/monomer/alignments" \ + --output_dir /output/PDB_6KWC/pre-computed_alignments \ + --config_preset model_1_ptm \ + --jax_param_path "${OF_DIR}/openfold/resources/params/params_model_1_ptm.npz" \ + "${SLURMTMPDIR}/examples/monomer/fasta_dir" \ + "/data/pdb_data/mmcif_files" + +echo +if [ "$?" = "0" ] +then + echo "Model inference with pre-computed alignments completed" +else + echo "Model inference with pre-computed alignments FAILED!" >&2 +fi + From be66b73ff6962761ad2f63a3c826c40e0359bfd1 Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Tue, 2 Sep 2025 11:15:23 -0400 Subject: [PATCH 02/16] Added missing Slurm directive to fix X86 bin issue. 
Prevent attempting to run X86 binaries on ARM64 Tony --- .../OpenFold/slurm_GH200_OpenFold_example.bash | 1 + 1 file changed, 1 insertion(+) diff --git a/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash index a123195..7cdf531 100644 --- a/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash +++ b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash @@ -10,6 +10,7 @@ #SBATCH --cluster="ub-hpc" #SBATCH --partition="arm64" #SBATCH --qos="arm64" +#SBATCH --export=HOME,TERM,SHELL ## Grace Hopper GH200 GPU #SBATCH --constraint="GH200" From 76c41a0c43318f0c5072ed6244e87958dc0d040b Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Thu, 11 Sep 2025 11:17:56 -0400 Subject: [PATCH 03/16] Fix QOS for the DGX partition Tony --- containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md b/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md index 6503bb1..9d25bba 100644 --- a/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md +++ b/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md @@ -271,7 +271,7 @@ Start an interactive job with more than one GPU e.g. ``` salloc --cluster=ub-hpc --account="[SlurmAccountName]" \ - --partition=industry-dgx --qos=industry-dgx --mem=128GB --nodes=1 \ + --partition=industry-dgx --qos=industry --mem=128GB --nodes=1 \ --gpus-per-node=8 --mem=0 --exclusive --time=3-00:00:00 ``` From d00cf6c3d34b3537f32249b85f64d83974a6a8ea Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Fri, 12 Sep 2025 12:06:55 -0400 Subject: [PATCH 04/16] Fixed logic for Slurm script end messages Tony --- .../OpenFold/slurm_GH200_OpenFold_example.bash | 3 ++- .../2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash index 7cdf531..fe5a027 100644 --- a/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash +++ b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash @@ -89,11 +89,12 @@ python3 "${OF_DIR}/run_pretrained_openfold.py" \ "${SLURMTMPDIR}/examples/monomer/fasta_dir" \ "/data/pdb_data/mmcif_files" -echo if [ "$?" = "0" ] then + echo echo "Model inference with pre-computed alignments completed" else + echo echo "Model inference with pre-computed alignments FAILED!" >&2 fi diff --git a/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash b/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash index 4a9092e..f0236af 100644 --- a/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash +++ b/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash @@ -85,11 +85,12 @@ python3 "${OF_DIR}/run_pretrained_openfold.py" \ "${SLURMTMPDIR}/examples/monomer/fasta_dir" \ "/data/pdb_data/mmcif_files" -echo if [ "$?" = "0" ] then + echo echo "Model inference with pre-computed alignments completed" else + echo echo "Model inference with pre-computed alignments FAILED!" 
>&2
 fi

From 377871eaa8daf3e9ce97d5b96543874d27323c82 Mon Sep 17 00:00:00 2001
From: Tony Kew
Date: Tue, 30 Sep 2025 16:14:22 -0400
Subject: [PATCH 05/16] Create an empty tuning dir for triton

Avoids a "df" error when running OpenFold

Tony
---
 containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md | 3 ++-
 containers/2_ApplicationSpecific/OpenFold/README.md | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
index 4f2b501..c1b89af 100644
--- a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
+++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
@@ -123,10 +123,11 @@ Change to your OpenFold` directory
 cd /projects/academic/[YourGroupName]/OpenFold
 ```
 
-Create the output base directory
+Create the output base directory, and an empty tuning directory for triton
 
 ```
 mkdir -p ./output
+mkdir -p ${HOME}/.triton/autotune
 ```
 
 ...then start the OpenFold container instance
diff --git a/containers/2_ApplicationSpecific/OpenFold/README.md b/containers/2_ApplicationSpecific/OpenFold/README.md
index 4d308b9..b177d1e 100644
--- a/containers/2_ApplicationSpecific/OpenFold/README.md
+++ b/containers/2_ApplicationSpecific/OpenFold/README.md
@@ -97,10 +97,11 @@ Change to your OpenFold directory
 cd /projects/academic/[YourGroupName]/OpenFold
 ```
 
-Create an output directory
+Create an output directory, and an empty tuning directory for triton
 
 ```
 mkdir -p ./output
+mkdir -p ${HOME}/.triton/autotune
 ```
 
 ...then start the OpenFold container instance

From 709cf3d029030639e3a3b84d54b9698c6f80f9ad Mon Sep 17 00:00:00 2001
From: Tony Kew
Date: Thu, 2 Oct 2025 17:47:05 -0400
Subject: [PATCH 06/16] Fix ARM64 build

A GPU has to be requested with the "salloc" or the nvidia pieces ("nvcc",
CUDA, etc.) don't get downloaded and installed.
Even though "--exclusive" is used, "nvidia-smi -L" sees no GPU

Tony
---
 .../OpenFold/BUILD-ARM64.md | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
index c1b89af..4814f98 100644
--- a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
+++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
@@ -2,12 +2,12 @@
 
 ## Buid the ARM64 container image
 
-Start an interactive job on an ARM64 node
+Start an interactive job on an ARM64 node with a GPU
 
 ```
 tmp_file="$(mktemp)"
 salloc --partition=arm64 --qos=arm64 --constraint=ARM64 --no-shell \
-    --exclusive --time=3:30:00 2>&1 | tee "${tmp_file}"
+    --gpus-per-node=1 --exclusive --time=3:30:00 2>&1 | tee "${tmp_file}"
 SLURM_JOB_ID="$(head -1 "${tmp_file}" | awk '{print $NF}')"
 rm "${tmp_file}"
 srun --jobid="${SLURM_JOB_ID}" --export=HOME,TERM,SHELL --pty /bin/bash --login
@@ -21,8 +21,21 @@ sample outout:
 > salloc: job 20812210 has been allocated resources
 > salloc: Granted job allocation 20812210
 > salloc: Waiting for resource configuration
-> salloc: Nodes cpn-v14-19 are ready for job
-> CCRusername@cpn-v14-19:~$
+> salloc: Nodes cpn-f06-36 are ready for job
+> CCRusername@cpn-f06-36:~$
+> ```
+
+Verify that a GPU has been allocated to the job (or the build will fail because
+the nvidia tools including "nvcc" will not be installed.)
+
+```
+nvidia-smi -L
+```
+
+sample output:
+
+> ```
+> GPU 0: NVIDIA GH200 480GB (UUID: GPU-3ec6f59a-0684-f162-69a0-8b7ebe27a8e3)
+> ```
+
 Change to your OpenFold directory
 
 ```
@@ -114,7 +127,7 @@ sample outout:
 > salloc: job 20815431 has been allocated resources
 > salloc: Granted job allocation 20815431
 > salloc: Waiting for resource configuration
-> salloc: Nodes cpn-v14-19 are ready for job
+> salloc: Nodes cpn-f06-36 are ready for job
 > ```
 
 Change to your OpenFold` directory
@@ -203,7 +216,7 @@ exit
 sample outout:
 
 > ```
-> CCRusername@cpn-v14-19$
+> CCRusername@cpn-f06-36$
 > ```
 
 End the Slurm job

From 050ba12dccc82e391ae3305902494168f907d456 Mon Sep 17 00:00:00 2001
From: Tony Kew
Date: Fri, 3 Oct 2025 14:10:16 -0400
Subject: [PATCH 07/16] Increased the salloc runtime to 5 hours

Build takes about 4 hours

Tony
---
 containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
index 4814f98..5381f1f 100644
--- a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
+++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
@@ -7,7 +7,7 @@ Start an interactive job on an ARM64 node with a GPU
 ```
 tmp_file="$(mktemp)"
 salloc --partition=arm64 --qos=arm64 --constraint=ARM64 --no-shell \
-    --gpus-per-node=1 --exclusive --time=3:30:00 2>&1 | tee "${tmp_file}"
+    --gpus-per-node=1 --exclusive --time=5:00:00 2>&1 | tee "${tmp_file}"
 SLURM_JOB_ID="$(head -1 "${tmp_file}" | awk '{print $NF}')"
 rm "${tmp_file}"
 srun --jobid="${SLURM_JOB_ID}" --export=HOME,TERM,SHELL --pty /bin/bash --login

From 8c04410ddb12d8de2187e2b401882c851c7bb89a Mon Sep 17 00:00:00 2001
From: Tony Kew
Date: Fri, 3 Oct 2025 14:42:48 -0400
Subject: [PATCH 08/16] Added missing "--account" option to salloc

Tony
---
 containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
index 5381f1f..0b7aacb 100644
--- a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
+++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md
@@ -7,7 +7,8 @@ Start an interactive job on an ARM64 node with a GPU
 ```
 tmp_file="$(mktemp)"
 salloc --partition=arm64 --qos=arm64 --constraint=ARM64 --no-shell \
-    --gpus-per-node=1 --exclusive --time=5:00:00 2>&1 | tee "${tmp_file}"
+    --account="[SlurmAccountName]" --gpus-per-node=1 --exclusive \
+    --time=5:00:00 2>&1 | tee "${tmp_file}"
 SLURM_JOB_ID="$(head -1 "${tmp_file}" | awk '{print $NF}')"
 rm "${tmp_file}"
 srun --jobid="${SLURM_JOB_ID}" --export=HOME,TERM,SHELL --pty /bin/bash --login
@@ -112,8 +113,9 @@ Start an interactive job on a node with a Grace Hopper GPU e.g.
``` tmp_file="$(mktemp)" salloc --partition=arm64 --qos=arm64 --constraint=ARM64 --no-shell \ - --time=1:00:00 --nodes=1 --tasks-per-node=1 --cpus-per-task=4 \ - --gpus-per-node=1 --constraint="GH200" --mem=90G 2>&1 | tee "${tmp_file}" + --account="[SlurmAccountName]" --time=01:00:00 --nodes=1 --tasks-per-node=1 \ + --cpus-per-task=4 --gpus-per-node=1 --constraint="GH200" \ + --mem=90G 2>&1 | tee "${tmp_file}" SLURM_JOB_ID="$(head -1 "${tmp_file}" | awk '{print $NF}')" rm "${tmp_file}" srun --jobid="${SLURM_JOB_ID}" --export=HOME,TERM,SHELL --pty /bin/bash --login From 8913877f1b4466f60d466ced62f9c66cc04f19be Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Fri, 3 Oct 2025 14:46:23 -0400 Subject: [PATCH 09/16] Docs missed an "exit" Tony --- .../2_ApplicationSpecific/OpenFold/BUILD-ARM64.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md index 0b7aacb..3f195e3 100644 --- a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md +++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md @@ -221,6 +221,18 @@ sample outout: > CCRusername@cpn-f06-36$ > ``` +Exit the Slurm interactive session + +``` +exit +``` + +sample output: + +> ``` +> CCRusername@login1$ +> ``` + End the Slurm job ``` From 0c1b059795be61c95f979e9e256076003f3b20e1 Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Tue, 21 Oct 2025 10:03:36 -0400 Subject: [PATCH 10/16] Added OpenFold to the "menu" README.md Tony --- containers/2_ApplicationSpecific/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/containers/2_ApplicationSpecific/README.md b/containers/2_ApplicationSpecific/README.md index 25c4ef3..2fd2d3e 100644 --- a/containers/2_ApplicationSpecific/README.md +++ b/containers/2_ApplicationSpecific/README.md @@ -16,6 +16,7 @@ Please refer to CCR's [container documentation](https://docs.ccr.buffalo.edu/en/ | [Micro-C](./Micro-C) | Micro-C Pipeline container with steps for building and running via Apptainer | | [OpenFF-Toolkit](./Open_Force_Field_toolkit) | Open Force Field toolkit container with steps for building and running via Apptainer | | [OpenFOAM](./OpenFOAM) | OpenFOAM container with steps for building and running via Apptainer and Slurm | +| [OpenFold](./OpenFold) | OpenFold container with steps for building and running Inference (one GPU) & Training (minimum two GPUs) via Apptainer and Slurm | | [OpenSees](./OpenSees) | OpenSees container with steps for building and running via Apptainer | | [SAS](./sas) | Guide for running SAS using Apptainer via Slurm batch script, command line, and GUI access | | [Seurat](./seurat) | Seurat container with example scRNA analysis | From 6b2d0b39fa5739d510d0e8066fc402377fa79b98 Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Wed, 26 Nov 2025 19:32:39 -0500 Subject: [PATCH 11/16] Fix x86_64 build Note: ARM64 build still broken Tony --- .../OpenFold/BUILD-ARM64.md | 10 +++-- .../OpenFold/OpenFold.def | 38 +++++++++++++++++-- .../2_ApplicationSpecific/OpenFold/README.md | 3 +- .../OpenFold/environment.yml | 4 +- .../slurm_GH200_OpenFold_example.bash | 2 +- .../OpenFold/slurm_OpenFold_example.bash | 2 +- 6 files changed, 49 insertions(+), 10 deletions(-) diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md index 3f195e3..402809e 100644 --- a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md +++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md @@ -4,11 +4,14 @@ 
Start an interactive job on an ARM64 node with a GPU +``` +export SBATCH_ACCOUNT="[SlurmAccountName]" +``` + ``` tmp_file="$(mktemp)" salloc --partition=arm64 --qos=arm64 --constraint=ARM64 --no-shell \ - --account="[SlurmAccountName]" --gpus-per-node=1 --exclusive \ - --time=5:00:00 2>&1 | tee "${tmp_file}" + --gpus-per-node=1 --exclusive --time=5:00:00 2>&1 | tee "${tmp_file}" SLURM_JOB_ID="$(head -1 "${tmp_file}" | awk '{print $NF}')" rm "${tmp_file}" srun --jobid="${SLURM_JOB_ID}" --export=HOME,TERM,SHELL --pty /bin/bash --login @@ -75,7 +78,8 @@ Build your container Note: Building the OpenFold container takes about three hours ``` -apptainer build OpenFold-$(arch).sif OpenFold-aarch64.def +apptainer build --build-arg SLURMTMPDIR="${SLURMTMPDIR}" -B /scratch:/scratch \ + OpenFold-$(arch).sif OpenFold-aarch64.def ``` sample truncated output: diff --git a/containers/2_ApplicationSpecific/OpenFold/OpenFold.def b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def index 7568a6a..d0b1cfa 100644 --- a/containers/2_ApplicationSpecific/OpenFold/OpenFold.def +++ b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def @@ -1,5 +1,5 @@ Bootstrap: docker -From: nvidia/cuda:12.6.3-base-ubuntu24.04 +From: ubuntu:24.04 %labels org.opencontainers.image.authors OpenFold Team @@ -7,10 +7,19 @@ From: nvidia/cuda:12.6.3-base-ubuntu24.04 org.opencontainers.image.licenses Apache License 2.0 org.opencontainers.image.base.name docker.io/nvidia/cuda:12.6.3-base-ubuntu24.04 +%setup + # create mountpoint for /scratch during the build (for ${SLURMTMPDIR}) + mkdir ${APPTAINER_ROOTFS}/scratch + %files environment.yml /opt/openfold/environment.yml -%post +%arguments + SLURMTMPDIR="" + +%post -c /bin/bash + export SLURMTMPDIR="{{ SLURMTMPDIR }}" + # Set the timezone, if unset test -h /etc/localtime || ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime @@ -62,11 +71,21 @@ From: nvidia/cuda:12.6.3-base-ubuntu24.04 rm /tmp/Miniforge3-$(uname)-$(uname -m).sh export CONDA_PREFIX="/opt/conda" + export CUDA_HOME="${CONDA_PREFIX}" + uid="$(head -1 /proc/self/uid_map | awk '{print $2}')" + export TMPDIR="${SLURMTMPDIR:-/var/tmp}/tmp_${uid}" + mkdir -p "${TMPDIR}" + export CUDA_CACHE_PATH="${TMPDIR}/cache/cuda" + mkdir -p "${CUDA_CACHE_PATH}" + export PIP_CACHE_DIR="${TMPDIR}/cache/pip" + mkdir -p "${PIP_CACHE_DIR}" export PATH=/opt/conda/bin:$PATH cd /opt/conda mamba env update -n base --file /opt/openfold/environment.yml mamba clean --all export LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" + # manually install flash-attn with --no-build-isolation + pip install flash-attn --no-build-isolation cd /root git clone https://github.com/aqlaboratory/openfold.git cd openfold @@ -93,5 +112,18 @@ From: nvidia/cuda:12.6.3-base-ubuntu24.04 export PATH="/opt/conda/bin:/opt/openfold:${PATH}" export CONDA_PREFIX="/opt/conda" export OF_DIR="/opt/openfold" - export TRITON_CACHE_DIR="${SLURMTMPDIR:-$(test -d /scratch && echo "/scratch" || echo "/var/tmp")}" + +%runscript + #!/bin/bash + uid="$(head -1 /proc/self/uid_map | awk '{print $2}')" + export TMPDIR="${SLURMTMPDIR:-$(echo ",${APPTAINER_BIND}," | grep -q ",/scratch," && echo "/scratch" || echo "/var/tmp")}/tmp_${uid}" + mkdir -p "${TMPDIR}" + export CUDA_CACHE_PATH="${TMPDIR}/cache/cuda" + mkdir -p "${CUDA_CACHE_PATH}" + export PIP_CACHE_DIR="${TMPDIR}/cache/pip" + mkdir -p "${PIP_CACHE_DIR}" + export TRITON_CACHE_DIR="${TMPDIR}/cache/triton" + mkdir -p "${TRITON_CACHE_DIR}" + # Exec passed command (required for Modal ENTRYPOINT compatibility) + exec "$@" diff --git 
a/containers/2_ApplicationSpecific/OpenFold/README.md b/containers/2_ApplicationSpecific/OpenFold/README.md index b177d1e..e4da480 100644 --- a/containers/2_ApplicationSpecific/OpenFold/README.md +++ b/containers/2_ApplicationSpecific/OpenFold/README.md @@ -68,7 +68,8 @@ export APPTAINER_CACHEDIR=${SLURMTMPDIR} Building the OpenFold container takes about half an hour... ``` -apptainer build OpenFold-$(arch).sif OpenFold.def +apptainer build --build-arg SLURMTMPDIR="${SLURMTMPDIR}" -B /scratch:/scratch \ + OpenFold-$(arch).sif OpenFold.def ``` Sample truncated output: diff --git a/containers/2_ApplicationSpecific/OpenFold/environment.yml b/containers/2_ApplicationSpecific/OpenFold/environment.yml index 3847f5d..a41a5ba 100644 --- a/containers/2_ApplicationSpecific/OpenFold/environment.yml +++ b/containers/2_ApplicationSpecific/OpenFold/environment.yml @@ -41,4 +41,6 @@ dependencies: - deepspeed==0.14.5 - dm-tree==0.1.6 - git+https://github.com/NVIDIA/dllogger.git - - flash-attn + - einops +# Have to mainually install flash-attn +# - flash-attn diff --git a/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash index fe5a027..1d048d1 100644 --- a/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash +++ b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash @@ -42,7 +42,7 @@ mkdir -p ./output if [ "${APPTAINER_NAME}" = "" ] then # Launch the container with this script - exec apptainer exec \ + exec apptainer run \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ diff --git a/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash b/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash index f0236af..5f9d6f5 100644 --- a/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash +++ b/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash @@ -38,7 +38,7 @@ mkdir -p ./output if [ "${APPTAINER_NAME}" = "" ] then # Launch the container with this script - exec apptainer exec \ + exec apptainer run \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ From 7bc9b7f77310bfd9b0140a6e79c5b04bfe235386 Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Fri, 5 Dec 2025 14:04:59 -0500 Subject: [PATCH 12/16] Fix ARM64 build & updates for x86_64 Tony --- .../OpenFold/BUILD-ARM64.md | 23 ++-- .../OpenFold/OpenFold-aarch64.def | 126 +++++++++++------- .../OpenFold/OpenFold.def | 36 +++-- .../2_ApplicationSpecific/OpenFold/README.md | 2 +- .../OpenFold/environment-aarch64.yml | 29 ++-- .../OpenFold/environment.yml | 3 + 6 files changed, 129 insertions(+), 90 deletions(-) diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md index 402809e..530e739 100644 --- a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md +++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md @@ -11,7 +11,7 @@ export SBATCH_ACCOUNT="[SlurmAccountName]" ``` tmp_file="$(mktemp)" salloc --partition=arm64 --qos=arm64 --constraint=ARM64 --no-shell \ - --gpus-per-node=1 --exclusive --time=5:00:00 2>&1 | tee "${tmp_file}" + --gpus-per-node=1 --exclusive --time=1:00:00 2>&1 | tee "${tmp_file}" SLURM_JOB_ID="$(head -1 "${tmp_file}" | awk '{print $NF}')" 
rm "${tmp_file}" srun --jobid="${SLURM_JOB_ID}" --export=HOME,TERM,SHELL --pty /bin/bash --login @@ -48,12 +48,6 @@ Change to your OpenFold directory cd /projects/academic/[YourGroupName]/OpenFold ``` -Then set the apptainer cache dir: - -``` -export APPTAINER_CACHEDIR=${SLURMTMPDIR} -``` - Download the OpenFold ARM64 build files, OpenFold-aarch64.def and environment-aarch64.yml, to this directory @@ -73,6 +67,12 @@ Sample output: > 100 574 100 574 0 0 3128 0 --:--:-- --:--:-- --:--:-- 3136 > ``` +Set the apptainer cache dir: + +``` +export APPTAINER_CACHEDIR="${SLURMTMPDIR}" +``` + Build your container Note: Building the OpenFold container takes about three hours @@ -114,12 +114,15 @@ unset SLURM_JOB_ID Start an interactive job on a node with a Grace Hopper GPU e.g. +``` +export SBATCH_ACCOUNT="[SlurmAccountName]" +``` + ``` tmp_file="$(mktemp)" salloc --partition=arm64 --qos=arm64 --constraint=ARM64 --no-shell \ - --account="[SlurmAccountName]" --time=01:00:00 --nodes=1 --tasks-per-node=1 \ - --cpus-per-task=4 --gpus-per-node=1 --constraint="GH200" \ - --mem=90G 2>&1 | tee "${tmp_file}" + --time=01:00:00 --nodes=1 --tasks-per-node=1 --cpus-per-task=4 \ + --gpus-per-node=1 --constraint="GH200" --mem=90G 2>&1 | tee "${tmp_file}" SLURM_JOB_ID="$(head -1 "${tmp_file}" | awk '{print $NF}')" rm "${tmp_file}" srun --jobid="${SLURM_JOB_ID}" --export=HOME,TERM,SHELL --pty /bin/bash --login diff --git a/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def b/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def index 5ee795a..0743d72 100644 --- a/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def +++ b/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def @@ -1,16 +1,25 @@ Bootstrap: docker -From: nvcr.io/nvidia/pytorch:25.06-py3 +From: nvcr.io/nvidia/pytorch:25.11-py3 %labels org.opencontainers.image.authors OpenFold Team org.opencontainers.image.source https://github.com/aqlaboratory/openfold org.opencontainers.image.licenses Apache License 2.0 - org.opencontainers.image.base.name docker.io/nvidia/cuda:12.6.3-base-ubuntu24.04 + org.opencontainers.image.base.name nvcr.io/nvidia/pytorch:25.11-py3 + +%setup + # create mountpoint for /scratch during the build (for ${SLURMTMPDIR}) + mkdir "${APPTAINER_ROOTFS}/scratch" %files - environment-$(arch).yml /opt/openfold/environment.yml + environment-aarch64.yml /opt/openfold/environment.yml + +%arguments + SLURMTMPDIR="" + +%post -c /bin/bash + export SLURMTMPDIR="{{ SLURMTMPDIR }}" -%post # Set the timezone, if unset test -h /etc/localtime || ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime @@ -34,7 +43,6 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 unzip \ wget \ git \ - curl \ jq \ nano \ vim \ @@ -53,56 +61,58 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Following build based on the Dockerfile + export PYTHONPATH="${OF_DIR}:$(python3 -c 'import sys; print("/usr/local/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/dist-packages")'):${CONDA_PREFIX}$(python3 -c 'import sys; print("/lib/python" + str(sys.version_info.major) + "." 
+ str(sys.version_info.minor) + "/site-packages")'):${PYTHONPATH}" + + export CONDA_PREFIX="/opt/conda" + export CUDA_HOME="/usr/local/cuda" + export CUDATOOLKIT_HOME="${CUDA_HOME}" + export CUDNN_HOME="${CUDA_HOME}" + uid="$(head -1 /proc/self/uid_map | awk '{print $2}')" + export TMPDIR="${SLURMTMPDIR:-/var/tmp}/tmp_${uid}" + mkdir -p "${TMPDIR}" + export CUDA_CACHE_PATH="${TMPDIR}/cache/cuda" + mkdir -p "${CUDA_CACHE_PATH}" + export PIP_CACHE_DIR="${TMPDIR}/cache/pip" + mkdir -p "${PIP_CACHE_DIR}" + export CONDA_PKGS_DIRS="${TMPDIR}/cache/conda" + mkdir -p "${CONDA_PKGS_DIRS}" + export PATH="${PATH}:${CONDA_PREFIX}/bin" + export LD_LIBRARY_PATH="${CUDA_HOME}/lib:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + # set up Miniforge miniforge_version="23.3.1-1" #miniforge_version="25.3.0-3" wget -P /tmp \ "https://github.com/conda-forge/miniforge/releases/download/${miniforge_version}/Miniforge3-$(uname)-$(uname -m).sh" + echo "=============================================================================" + echo "Installing Miniforge" + echo "=============================================================================" bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/conda rm /tmp/Miniforge3-$(uname)-$(uname -m).sh - export CONDA_PREFIX="/opt/conda" - export PATH=/opt/conda/bin:$PATH cd /opt/conda - + echo "=============================================================================" + echo "Installing Python pachages from environment.yml" + echo "=============================================================================" mamba env update -n base --file /opt/openfold/environment.yml - mamba clean --all - - # Install deepspeed - cd /root - curl -L -o DeepSpeed-0.14.5.tar.gz https://github.com/deepspeedai/DeepSpeed/archive/refs/tags/v0.14.5.tar.gz - gzip -dc DeepSpeed-0.14.5.tar.gz | tar xf - - rm DeepSpeed-0.14.5.tar.gz - mv DeepSpeed-0.14.5/ DeepSpeed - - cd DeepSpeed - sed -i -e 's/ninja.*$/ninja==1.11.1/' \ - -e 's/nvidia-ml-py.*$/nvidia-ml-py==12.575.51/' \ - -e 's/py-cpuinfo.*$/py-cpuinfo==8.0.0/' \ - environment.yml requirements/requirements.txt - export LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" - python3 setup.py install - cd .. - rm -rf DeepSpeed - /opt/conda/bin/pip install dm-tree - - # install flash-attn - cd /root - git clone https://github.com/Dao-AILab/flash-attention.git - cd flash-attention - python3 setup.py install - cd .. - rm -rf flash-attention - - # Install dllogger - cd /root - git clone --filter=blob:none --quiet https://github.com/NVIDIA/dllogger - cd dllogger - mkdir -p requirements - /opt/conda/bin/pip freeze > requirements/requirements.txt - python3 setup.py install - cd .. - rm -rf dllogger + mamba clean --all -y + + # CUDA aware OpenMPI and UCX settings + export OMPI_MCA_opal_cuda_support="true" + export OMPI_MCA_pml="ucx" + export OMPI_MCA_osc="ucx" + export UCX_MEMTYPE_CACHE="n" + + # Install PyTorch Lightning and dependencies for OpenFold + pip install --prefix="/usr" \ + lightning \ + 'jsonargparse[signatures]' \ + deepspeed \ + dm-tree \ + flash-attn \ + nvdllogger + + # Install OpenFold 2 cd /root git clone https://github.com/aqlaboratory/openfold.git cd openfold @@ -118,16 +128,34 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 cd .. 
rm -rf openfold cd /opt/openfold + # dllogger was installed as nvdllogger (from pypi.org) so fix the package name in logger.py + sed -E -i '/(import|from)[[:space:]]+dllogger/s/dllogger/nvdllogger/' ./openfold/utils/logger.py wget -q -P /opt/openfold/openfold/resources \ https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt python3 setup.py install %environment export LANG=en_US.UTF-8 - export LD_LIBRARY_PATH="/opt/conda/lib:/usr/local/lib:${LD_LIBRARY_PATH}" - export PYTHONPATH="/opt/conda/lib/python3.10/site-packages:/opt/openfold:${PYTHONPATH}" - export PATH="/opt/conda/bin:/opt/openfold:${PATH}" + export CUDA_HOME="/usr/local/cuda" export CONDA_PREFIX="/opt/conda" export OF_DIR="/opt/openfold" - export TRITON_CACHE_DIR="${SLURMTMPDIR:-$(test -d /scratch && echo "/scratch" || echo "/var/tmp")}" + export LD_LIBRARY_PATH="${CUDA_HOME}/lib:${CONDA_PREFIX}/lib:/usr/local/lib:${LD_LIBRARY_PATH}" + export PYTHONPATH="${OF_DIR}:$(python3 -c 'import sys; print("/usr/local/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/dist-packages")'):${CONDA_PREFIX}$(python3 -c 'import sys; print("/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/site-packages")'):${PYTHONPATH}" + export PATH="${CUDA_HOME}/bin:${PATH}:${CONDA_PREFIX}/bin:${PATH}" + +%runscript + #!/bin/bash + uid="$(head -1 /proc/self/uid_map | awk '{print $2}')" + export TMPDIR="${SLURMTMPDIR:-$(echo ",${APPTAINER_BIND}," | grep -q ",/scratch," && echo "/scratch" || echo "/var/tmp")}/tmp_${uid}" + mkdir -p "${TMPDIR}" + export CUDA_CACHE_PATH="${TMPDIR}/cache/cuda" + mkdir -p "${CUDA_CACHE_PATH}" + export PIP_CACHE_DIR="${TMPDIR}/cache/pip" + mkdir -p "${PIP_CACHE_DIR}" + export CONDA_PKGS_DIRS="${TMPDIR}/cache/conda" + mkdir -p "${CONDA_PKGS_DIRS}" + export TRITON_CACHE_DIR="${TMPDIR}/cache/triton" + mkdir -p "${TRITON_CACHE_DIR}" + # Exec passed command (required for Modal ENTRYPOINT compatibility) + exec "$@" diff --git a/containers/2_ApplicationSpecific/OpenFold/OpenFold.def b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def index d0b1cfa..031fb7c 100644 --- a/containers/2_ApplicationSpecific/OpenFold/OpenFold.def +++ b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def @@ -9,7 +9,7 @@ From: ubuntu:24.04 %setup # create mountpoint for /scratch during the build (for ${SLURMTMPDIR}) - mkdir ${APPTAINER_ROOTFS}/scratch + mkdir "${APPTAINER_ROOTFS}/scratch" %files environment.yml /opt/openfold/environment.yml @@ -62,16 +62,10 @@ From: ubuntu:24.04 # Following build based on the Dockerfile - # set up Miniforge - miniforge_version="23.3.1-1" - #miniforge_version="25.3.0-3" - wget -P /tmp \ - "https://github.com/conda-forge/miniforge/releases/download/${miniforge_version}/Miniforge3-$(uname)-$(uname -m).sh" - bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/conda - rm /tmp/Miniforge3-$(uname)-$(uname -m).sh - export CONDA_PREFIX="/opt/conda" export CUDA_HOME="${CONDA_PREFIX}" + export CUDATOOLKIT_HOME="${CUDA_HOME}" + export CUDNN_HOME="${CUDA_HOME}" uid="$(head -1 /proc/self/uid_map | awk '{print $2}')" export TMPDIR="${SLURMTMPDIR:-/var/tmp}/tmp_${uid}" mkdir -p "${TMPDIR}" @@ -79,11 +73,22 @@ From: ubuntu:24.04 mkdir -p "${CUDA_CACHE_PATH}" export PIP_CACHE_DIR="${TMPDIR}/cache/pip" mkdir -p "${PIP_CACHE_DIR}" + export CONDA_PKGS_DIRS="${TMPDIR}/cache/conda" + mkdir -p "${CONDA_PKGS_DIRS}" export PATH=/opt/conda/bin:$PATH + export 
LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + + # set up Miniforge + miniforge_version="23.3.1-1" + #miniforge_version="25.3.0-3" + wget -P /tmp \ + "https://github.com/conda-forge/miniforge/releases/download/${miniforge_version}/Miniforge3-$(uname)-$(uname -m).sh" + bash /tmp/Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/conda + rm /tmp/Miniforge3-$(uname)-$(uname -m).sh + cd /opt/conda mamba env update -n base --file /opt/openfold/environment.yml - mamba clean --all - export LD_LIBRARY_PATH="/opt/conda/lib:${LD_LIBRARY_PATH}" + mamba clean --all -y # manually install flash-attn with --no-build-isolation pip install flash-attn --no-build-isolation cd /root @@ -107,11 +112,12 @@ From: ubuntu:24.04 %environment export LANG=en_US.UTF-8 - export LD_LIBRARY_PATH="/opt/conda/lib:/usr/local/lib:${LD_LIBRARY_PATH}" - export PYTHONPATH="/opt/conda/lib/python3.10/site-packages:/opt/openfold:${PYTHONPATH}" - export PATH="/opt/conda/bin:/opt/openfold:${PATH}" export CONDA_PREFIX="/opt/conda" + export CUDA_HOME="${CONDA_PREFIX}" export OF_DIR="/opt/openfold" + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:/usr/local/lib:${LD_LIBRARY_PATH}" + export PYTHONPATH="${OF_DIR}:/opt/conda/lib/python3.10/site-packages:${PYTHONPATH}" + export PATH="${CONDA_PREFIX}/bin:${PATH}" %runscript #!/bin/bash @@ -122,6 +128,8 @@ From: ubuntu:24.04 mkdir -p "${CUDA_CACHE_PATH}" export PIP_CACHE_DIR="${TMPDIR}/cache/pip" mkdir -p "${PIP_CACHE_DIR}" + export CONDA_PKGS_DIRS="${TMPDIR}/cache/conda" + mkdir -p "${CONDA_PKGS_DIRS}" export TRITON_CACHE_DIR="${TMPDIR}/cache/triton" mkdir -p "${TRITON_CACHE_DIR}" # Exec passed command (required for Modal ENTRYPOINT compatibility) diff --git a/containers/2_ApplicationSpecific/OpenFold/README.md b/containers/2_ApplicationSpecific/OpenFold/README.md index e4da480..411786b 100644 --- a/containers/2_ApplicationSpecific/OpenFold/README.md +++ b/containers/2_ApplicationSpecific/OpenFold/README.md @@ -62,7 +62,7 @@ Sample output: Set the apptainer cache dir: ``` -export APPTAINER_CACHEDIR=${SLURMTMPDIR} +export APPTAINER_CACHEDIR="${SLURMTMPDIR}" ``` Building the OpenFold container takes about half an hour... 
diff --git a/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml b/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml index 4fe1293..e7450e1 100644 --- a/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml +++ b/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml @@ -2,27 +2,14 @@ name: openfold-env channels: - conda-forge - bioconda - - pytorch - - nvidia dependencies: - - cuda - - gcc=12.4 - - python<3.11 - - setuptools=59.5.0 - - pip + - python=3.12 + - setuptools<80 - openmm - pdbfixer - - pytorch-lightning - biopython - - numpy=1.26.4 - - pandas - - PyYAML - - requests - - scipy - - tqdm - - typing-extensions - wandb - - modelcif==0.7 + - modelcif - awscli - ml-collections - aria2 @@ -31,6 +18,16 @@ dependencies: # install libopenblas as a fix for https://github.com/bioconda/bioconda-recipes/issues/56856 - libopenblas - libaio + - ucx - bioconda::hhsuite - bioconda::kalign2 - mmseqs2 + # pytorch-lightning dependencies + - cpython + - cusparselt + - gmpy2 + - mpc + - nccl + - nomkl + - sleef + - triton diff --git a/containers/2_ApplicationSpecific/OpenFold/environment.yml b/containers/2_ApplicationSpecific/OpenFold/environment.yml index a41a5ba..954c317 100644 --- a/containers/2_ApplicationSpecific/OpenFold/environment.yml +++ b/containers/2_ApplicationSpecific/OpenFold/environment.yml @@ -37,10 +37,13 @@ dependencies: - mmseqs2 - pytorch::pytorch=2.5 - pytorch::pytorch-cuda=12.4 + - lightning + - torchvision - pip: - deepspeed==0.14.5 - dm-tree==0.1.6 - git+https://github.com/NVIDIA/dllogger.git + - jsonargparse[signatures] - einops # Have to mainually install flash-attn # - flash-attn From c6b06156867452b7f4ec7dbd708d851175bb3f5f Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Mon, 8 Dec 2025 09:46:24 -0500 Subject: [PATCH 13/16] Fix occasional CUDA rm error on startup Occasionally there will be the following error on container startup: /usr/bin/rm: cannot remove '/usr/local/cuda/compat/lib': Read-only file system rm: cannot remove '/usr/local/cuda/compat/lib': Read-only file system The NVIDIA containers add "--writable-tmpfs" to their containers, but I can't find a way to do this in the .def file, so it is added at container startup. Tony --- containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md | 1 + 1 file changed, 1 insertion(+) diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md index 530e739..48f2189 100644 --- a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md +++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md @@ -156,6 +156,7 @@ mkdir -p ${HOME}/.triton/autotune ``` apptainer shell \ + --writable-tmpfs \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ From 30b7f8d923f485ce73f628bafd689690c85a8e29 Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Tue, 9 Dec 2025 22:13:55 -0500 Subject: [PATCH 14/16] Fixed the x86_64 build - OpenFold test runs.
Tony --- .../OpenFold/BUILD-ARM64.md | 3 ++- .../Download_OpenFold_PDB_training_set.md | 1 + .../OpenFold/Download_model_parameters.md | 1 + .../2_ApplicationSpecific/OpenFold/EXAMPLES.md | 2 ++ .../OpenFold/OpenFold.def | 18 ++++++++++++++++-- .../2_ApplicationSpecific/OpenFold/README.md | 4 +++- .../OpenFold/environment.yml | 15 ++++++++++----- .../OpenFold/slurm_GH200_OpenFold_example.bash | 1 + .../OpenFold/slurm_OpenFold_example.bash | 1 + 9 files changed, 37 insertions(+), 9 deletions(-) diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md index 48f2189..16dc493 100644 --- a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md +++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md @@ -78,7 +78,8 @@ Build your container Note: Building the OpenFold container takes about three hours ``` -apptainer build --build-arg SLURMTMPDIR="${SLURMTMPDIR}" -B /scratch:/scratch \ +apptainer build --build-arg SLURMTMPDIR="${SLURMTMPDIR}" \ + --build-arg SLURM_NPROCS="${SLURM_NPROCS}" -B /scratch:/scratch \ OpenFold-$(arch).sif OpenFold-aarch64.def ``` diff --git a/containers/2_ApplicationSpecific/OpenFold/Download_OpenFold_PDB_training_set.md b/containers/2_ApplicationSpecific/OpenFold/Download_OpenFold_PDB_training_set.md index 026bcd1..7441b21 100644 --- a/containers/2_ApplicationSpecific/OpenFold/Download_OpenFold_PDB_training_set.md +++ b/containers/2_ApplicationSpecific/OpenFold/Download_OpenFold_PDB_training_set.md @@ -20,6 +20,7 @@ Start the container ``` apptainer shell \ + --writable-tmpfs \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ diff --git a/containers/2_ApplicationSpecific/OpenFold/Download_model_parameters.md b/containers/2_ApplicationSpecific/OpenFold/Download_model_parameters.md index d2c9bc8..2d11d63 100644 --- a/containers/2_ApplicationSpecific/OpenFold/Download_model_parameters.md +++ b/containers/2_ApplicationSpecific/OpenFold/Download_model_parameters.md @@ -21,6 +21,7 @@ mkdir -p ./resources Start the container ``` apptainer shell \ + --writable-tmpfs \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ diff --git a/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md b/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md index 9d25bba..28b9e2d 100644 --- a/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md +++ b/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md @@ -30,6 +30,7 @@ Start the container, with the "./output" directory a the top level output ``` apptainer shell \ + --writable-tmpfs \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ @@ -323,6 +324,7 @@ alternate output directory ``` apptainer shell \ + --writable-tmpfs \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ diff --git a/containers/2_ApplicationSpecific/OpenFold/OpenFold.def b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def index 031fb7c..c293393 100644 --- a/containers/2_ApplicationSpecific/OpenFold/OpenFold.def +++ b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def @@ -1,5 +1,5 @@ Bootstrap: docker -From: ubuntu:24.04 +From: nvcr.io/nvidia/pytorch:24.10-py3 
%labels org.opencontainers.image.authors OpenFold Team @@ -16,9 +16,11 @@ From: ubuntu:24.04 %arguments SLURMTMPDIR="" + SLURM_NPROCS="" %post -c /bin/bash export SLURMTMPDIR="{{ SLURMTMPDIR }}" + export SLURM_NPROCS="{{ SLURM_NPROCS }}" # Set the timezone, if unset test -h /etc/localtime || ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime @@ -57,8 +59,15 @@ From: ubuntu:24.04 dpkg-reconfigure --frontend=noninteractive locales update-locale LANG=en_US.UTF-8 + # max parallelism for ninja + export MAX_JOBS="$(expr ${SLURM_NPROCS:-$(nproc)} - 3)" + if [ ${MAX_JOBS} -lt 1 ]xi + then + MAX_JOBS=1 + fi + # use all the available cores for cmake parallel builds - export CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" + export CMAKE_BUILD_PARALLEL_LEVEL="${MAX_JOBS}" # Following build based on the Dockerfile @@ -91,6 +100,8 @@ From: ubuntu:24.04 mamba clean --all -y # manually install flash-attn with --no-build-isolation pip install flash-attn --no-build-isolation + # install CUDA TensorFlow + pip install 'tensorflow[and-cuda]' tensorrt polygraphy cd /root git clone https://github.com/aqlaboratory/openfold.git cd openfold @@ -106,6 +117,9 @@ From: ubuntu:24.04 cd .. rm -rf openfold cd /opt/openfold + # cuda-bindings 13.0.0 introduced a breaking change - cuda.cudart was + # replaced by cuda.bindings.runtime + sed -E -i '/^[[:space:]]*import[[:space:]]+cuda\.cudart/s/cuda\.cudart/cuda.bindings.runtime/' ./openfold/utils/tensorrt_lazy_compiler.py wget -q -P /opt/openfold/openfold/resources \ https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt python3 setup.py install diff --git a/containers/2_ApplicationSpecific/OpenFold/README.md b/containers/2_ApplicationSpecific/OpenFold/README.md index 411786b..562b50d 100644 --- a/containers/2_ApplicationSpecific/OpenFold/README.md +++ b/containers/2_ApplicationSpecific/OpenFold/README.md @@ -68,7 +68,8 @@ export APPTAINER_CACHEDIR="${SLURMTMPDIR}" Building the OpenFold container takes about half an hour... 
``` -apptainer build --build-arg SLURMTMPDIR="${SLURMTMPDIR}" -B /scratch:/scratch \ +apptainer build --build-arg SLURMTMPDIR="${SLURMTMPDIR}" \ + --build-arg SLURM_NPROCS="${SLURM_NPROCS}" -B /scratch:/scratch \ OpenFold-$(arch).sif OpenFold.def ``` @@ -109,6 +110,7 @@ mkdir -p ${HOME}/.triton/autotune ``` apptainer shell \ + --writable-tmpfs \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ diff --git a/containers/2_ApplicationSpecific/OpenFold/environment.yml b/containers/2_ApplicationSpecific/OpenFold/environment.yml index 954c317..5fb49ed 100644 --- a/containers/2_ApplicationSpecific/OpenFold/environment.yml +++ b/containers/2_ApplicationSpecific/OpenFold/environment.yml @@ -1,27 +1,30 @@ -name: openfold-env +name: openfold-env channels: - conda-forge - bioconda - pytorch - - nvidia + - nvidia dependencies: - cuda - gcc=12.4 - - python<3.11 + - python=3.10 - setuptools=59.5.0 - pip - openmm - pdbfixer - pytorch-lightning - biopython - - numpy=1.26.4 + - numpy - pandas - PyYAML - requests - scipy - tqdm - typing-extensions - - wandb +# - wandb + # W&B Automations API is experimental and it is recommended to pin the package + # version to 0.23.1 to reduce the risk of disruption + - wandb==0.23.1 - modelcif==0.7 - awscli - ml-collections @@ -40,6 +43,8 @@ dependencies: - lightning - torchvision - pip: + - wandb[workspaces] + - cuda-python - deepspeed==0.14.5 - dm-tree==0.1.6 - git+https://github.com/NVIDIA/dllogger.git diff --git a/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash index 1d048d1..65e6bf3 100644 --- a/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash +++ b/containers/2_ApplicationSpecific/OpenFold/slurm_GH200_OpenFold_example.bash @@ -43,6 +43,7 @@ if [ "${APPTAINER_NAME}" = "" ] then # Launch the container with this script exec apptainer run \ + --writable-tmpfs \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ diff --git a/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash b/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash index 5f9d6f5..6636f41 100644 --- a/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash +++ b/containers/2_ApplicationSpecific/OpenFold/slurm_OpenFold_example.bash @@ -39,6 +39,7 @@ if [ "${APPTAINER_NAME}" = "" ] then # Launch the container with this script exec apptainer run \ + --writable-tmpfs \ -B /projects:/projects,/scratch:/scratch,/util:/util,/vscratch:/vscratch \ -B /util/software/data/OpenFold:/data \ -B /util/software/data/alphafold:/database \ From 678c47e8ddb943979587e9338abee7b929f04b0d Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Tue, 16 Dec 2025 18:21:56 -0500 Subject: [PATCH 15/16] Fix ARM64 build Unfortunately there are slews of deprecation and future warnings that cannot be easily suppressed - suppressing them needs code changes Tony --- .../OpenFold/OpenFold-aarch64.def | 29 ++++++++++++++----- .../OpenFold/OpenFold.def | 13 +++++---- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def b/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def index 0743d72..a765c37 100644 --- a/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def +++ 
b/containers/2_ApplicationSpecific/OpenFold/OpenFold-aarch64.def @@ -16,9 +16,11 @@ From: nvcr.io/nvidia/pytorch:25.11-py3 %arguments SLURMTMPDIR="" + SLURM_NPROCS="" %post -c /bin/bash export SLURMTMPDIR="{{ SLURMTMPDIR }}" + export SLURM_NPROCS="{{ SLURM_NPROCS }}" # Set the timezone, if unset test -h /etc/localtime || ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime @@ -56,12 +58,19 @@ From: nvcr.io/nvidia/pytorch:25.11-py3 dpkg-reconfigure --frontend=noninteractive locales update-locale LANG=en_US.UTF-8 - # use all the available cores for cmake parallel builds - export CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" + # max parallelism - leave cores free for apptainer & its active mounts + export MAX_JOBS="$(expr ${SLURM_NPROCS:-$(nproc)} - 3)" + if [ ${MAX_JOBS} -lt 1 ] + then + MAX_JOBS=1 + fi + + # use all free cores for cmake parallel builds + export CMAKE_BUILD_PARALLEL_LEVEL="${MAX_JOBS}" # Following build based on the Dockerfile - export PYTHONPATH="${OF_DIR}:$(python3 -c 'import sys; print("/usr/local/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/dist-packages")'):${CONDA_PREFIX}$(python3 -c 'import sys; print("/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/site-packages")'):${PYTHONPATH}" + export PYTHONPATH="${OF_DIR}:$(python3 -c 'import sys; print("/usr/local/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/dist-packages")'):${CONDA_PREFIX}$(python3 -c 'import sys; print("/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/site-packages")')${PYTHONPATH:+:${PYTHONPATH}}" export CONDA_PREFIX="/opt/conda" export CUDA_HOME="/usr/local/cuda" @@ -77,7 +86,7 @@ From: nvcr.io/nvidia/pytorch:25.11-py3 export CONDA_PKGS_DIRS="${TMPDIR}/cache/conda" mkdir -p "${CONDA_PKGS_DIRS}" export PATH="${PATH}:${CONDA_PREFIX}/bin" - export LD_LIBRARY_PATH="${CUDA_HOME}/lib:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + export LD_LIBRARY_PATH="${CUDA_HOME}/lib:/opt/conda/lib:${LD_LIBRARY_PATH}" # set up Miniforge miniforge_version="23.3.1-1" @@ -104,7 +113,9 @@ From: nvcr.io/nvidia/pytorch:25.11-py3 export UCX_MEMTYPE_CACHE="n" # Install PyTorch Lightning and dependencies for OpenFold + pip install --prefix="/usr" nvidia-nccl-cu13 nvidia-cudnn-cu13 pip install --prefix="/usr" \ + 'cuda-python[all]' \ lightning \ 'jsonargparse[signatures]' \ deepspeed \ @@ -130,6 +141,9 @@ From: nvcr.io/nvidia/pytorch:25.11-py3 cd /opt/openfold # dllogger was installed as nvdllogger (from pypi.org) so fix the package name in logger.py sed -E -i '/(import|from)[[:space:]]+dllogger/s/dllogger/nvdllogger/' ./openfold/utils/logger.py + # cuda-bindings 13.0.0 introduced a breaking change - cuda.cudart was + # replaced by cuda.bindings.runtime + sed -E -i '/^[[:space:]]*import[[:space:]]+cuda\.cudart/s/cuda\.cudart/cuda.bindings.runtime/' ./openfold/utils/tensorrt_lazy_compiler.py wget -q -P /opt/openfold/openfold/resources \ https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt python3 setup.py install @@ -139,9 +153,10 @@ From: nvcr.io/nvidia/pytorch:25.11-py3 export CUDA_HOME="/usr/local/cuda" export CONDA_PREFIX="/opt/conda" export OF_DIR="/opt/openfold" - export LD_LIBRARY_PATH="${CUDA_HOME}/lib:${CONDA_PREFIX}/lib:/usr/local/lib:${LD_LIBRARY_PATH}" - export PYTHONPATH="${OF_DIR}:$(python3 -c 'import sys; print("/usr/local/lib/python" + str(sys.version_info.major) + "." 
+ str(sys.version_info.minor) + "/dist-packages")'):${CONDA_PREFIX}$(python3 -c 'import sys; print("/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/site-packages")'):${PYTHONPATH}" - export PATH="${CUDA_HOME}/bin:${PATH}:${CONDA_PREFIX}/bin:${PATH}" + export LD_LIBRARY_PATH="${CUDA_HOME}/lib:${CONDA_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" + export PYTHONPATH="${OF_DIR}:$(python3 -c 'import sys; print("/usr/local/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/dist-packages")'):${CONDA_PREFIX}$(python3 -c 'import sys; print("/lib/python" + str(sys.version_info.major) + "." + str(sys.version_info.minor) + "/site-packages")')${PYTHONPATH:+:${PYTHONPATH}}" + export PATH="${OF_DIR}:${CUDA_HOME}/bin:${PATH}:${CONDA_PREFIX}/bin:${PATH}" + export PYTHONWARNINGS="ignore::DeprecationWarning,ignore::FutureWarning" %runscript #!/bin/bash diff --git a/containers/2_ApplicationSpecific/OpenFold/OpenFold.def b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def index c293393..1db747f 100644 --- a/containers/2_ApplicationSpecific/OpenFold/OpenFold.def +++ b/containers/2_ApplicationSpecific/OpenFold/OpenFold.def @@ -59,14 +59,14 @@ From: nvcr.io/nvidia/pytorch:24.10-py3 dpkg-reconfigure --frontend=noninteractive locales update-locale LANG=en_US.UTF-8 - # max parallelism for ninja + # max parallelism - leave cores free for apptainer & its active mounts export MAX_JOBS="$(expr ${SLURM_NPROCS:-$(nproc)} - 3)" - if [ ${MAX_JOBS} -lt 1 ]xi + if [ ${MAX_JOBS} -lt 1 ] then MAX_JOBS=1 fi - # use all the available cores for cmake parallel builds + # use all free cores for cmake parallel builds export CMAKE_BUILD_PARALLEL_LEVEL="${MAX_JOBS}" # Following build based on the Dockerfile @@ -85,7 +85,7 @@ From: nvcr.io/nvidia/pytorch:24.10-py3 export CONDA_PKGS_DIRS="${TMPDIR}/cache/conda" mkdir -p "${CONDA_PKGS_DIRS}" export PATH=/opt/conda/bin:$PATH - export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" # set up Miniforge miniforge_version="23.3.1-1" @@ -129,9 +129,10 @@ From: nvcr.io/nvidia/pytorch:24.10-py3 export CONDA_PREFIX="/opt/conda" export CUDA_HOME="${CONDA_PREFIX}" export OF_DIR="/opt/openfold" - export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:/usr/local/lib:${LD_LIBRARY_PATH}" - export PYTHONPATH="${OF_DIR}:/opt/conda/lib/python3.10/site-packages:${PYTHONPATH}" + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" + export PYTHONPATH="${OF_DIR}:/opt/conda/lib/python3.10/site-packages${PYTHONPATH:+:${PYTHONPATH}}" export PATH="${CONDA_PREFIX}/bin:${PATH}" + export PYTHONWARNINGS="ignore::DeprecationWarning,ignore::FutureWarning" %runscript #!/bin/bash From 744a636ecd2ca05b10ab6c461f7d64d50afff441 Mon Sep 17 00:00:00 2001 From: Tony Kew Date: Thu, 18 Dec 2025 15:30:16 -0500 Subject: [PATCH 16/16] Updated examples with the latest builds Tony --- .../OpenFold/BUILD-ARM64.md | 11 +- .../OpenFold/CUDA_notes.txt | 31 + .../OpenFold/EXAMPLES.md | 648 +++++------------- .../2_ApplicationSpecific/OpenFold/README.md | 19 +- .../OpenFold/environment-aarch64.yml | 4 +- .../OpenFold/environment.yml | 3 +- 6 files changed, 229 insertions(+), 487 deletions(-) create mode 100644 containers/2_ApplicationSpecific/OpenFold/CUDA_notes.txt diff --git a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md index 16dc493..5aa54ed 100644 --- 
a/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md +++ b/containers/2_ApplicationSpecific/OpenFold/BUILD-ARM64.md @@ -75,7 +75,7 @@ export APPTAINER_CACHEDIR="${SLURMTMPDIR}" Build your container -Note: Building the OpenFold container takes about three hours +Note: Building the OpenFold container takes about ten minutes ``` apptainer build --build-arg SLURMTMPDIR="${SLURMTMPDIR}" \ @@ -188,15 +188,6 @@ Note: There may be no output for over half a minute Abridged sample output: > ``` -> [2025-08-22 11:47:24,610] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> [WARNING] async_io requires the dev libaio .so object and headers but these were not found. -> [WARNING] async_io: please install the libaio-dev package with apt -> [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -> /opt/conda/lib/python3.10/site-packages/deepspeed-0.14.5+unknown-py3.10.egg/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed-0.14.5+unknown-py3.10.egg/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): > usage: train_openfold.py [-h] [--train_mmcif_data_cache_path TRAIN_MMCIF_DATA_CACHE_PATH] [--use_single_seq_mode USE_SINGLE_SEQ_MODE] > [--distillation_data_dir DISTILLATION_DATA_DIR] [--distillation_alignment_dir DISTILLATION_ALIGNMENT_DIR] [--val_data_dir VAL_DATA_DIR] > [--val_alignment_dir VAL_ALIGNMENT_DIR] [--val_mmcif_data_cache_path VAL_MMCIF_DATA_CACHE_PATH] [--kalign_binary_path KALIGN_BINARY_PATH] diff --git a/containers/2_ApplicationSpecific/OpenFold/CUDA_notes.txt b/containers/2_ApplicationSpecific/OpenFold/CUDA_notes.txt new file mode 100644 index 0000000..6721683 --- /dev/null +++ b/containers/2_ApplicationSpecific/OpenFold/CUDA_notes.txt @@ -0,0 +1,31 @@ +Open MPI is built with CUDA awareness but this support is disabled by default. +To enable it, please set the environment variable "OMPI_MCA_opal_cuda_support" +to "true" + + export OMPI_MCA_opal_cuda_support=true + +before launching your MPI processes. Equivalently, you can set the MCA +parameter in the command line: + + mpiexec --mca opal_cuda_support 1 ... + + +In addition, the UCX support is also built but disabled by default. +To enable it, first install UCX (conda install -c conda-forge ucx). Then, set +the environment variables OMPI_MCA_pml and OMPI_MCA_osc to "ucx" + + export OMPI_MCA_pml="ucx" + export OMPI_MCA_osc="ucx" + +before launching your MPI processes. Equivalently, you can set the MCA +parameters in the command line: + + mpiexec --mca pml ucx --mca osc ucx ... + +Note that you might also need to set the environment variable +"UCX_MEMTYPE_CACHE" to "n" for CUDA awareness via UCX. + + export UCX_MEMTYPE_CACHE="n" + +Please consult UCX's documentation for details. 
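The notes above boil down to a handful of environment variables set before launching MPI ranks inside the container; a minimal sketch (the final mpiexec command, rank count, and script name are placeholders, not part of the OpenFold examples):

```
#!/bin/bash
# Enable CUDA awareness and UCX in the container's Open MPI,
# following CUDA_notes.txt above. The last command is illustrative only.
export OMPI_MCA_opal_cuda_support=true
export OMPI_MCA_pml="ucx"
export OMPI_MCA_osc="ucx"
export UCX_MEMTYPE_CACHE="n"
mpiexec -n 4 python3 your_mpi_script.py
```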
+ diff --git a/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md b/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md index 28b9e2d..dabcdbb 100644 --- a/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md +++ b/containers/2_ApplicationSpecific/OpenFold/EXAMPLES.md @@ -7,10 +7,14 @@ The following example is from the [OpenFold Inference docs](https://openfold.rea Start an interactive job with a GPU e.g. NOTE: OpenFold Inference only uses one GPU +``` +export SBATCH_ACCOUNT="[SlurmAccountName]" +``` + ``` salloc --cluster=ub-hpc --partition=general-compute --qos=general-compute \ - --account="[SlurmAccountName]" --mem=128GB --nodes=1 --cpus-per-task=1 \ - --tasks-per-node=12 --gpus-per-node=1 --time=02:00:00 + --mem=128GB --nodes=1 --cpus-per-task=1 --tasks-per-node=12 \ + --gpus-per-node=1 --time=02:00:00 ``` Change to your OpenFold directory @@ -25,7 +29,7 @@ Create a top level output directory mkdir -p ./output ``` -Start the container, with the "./output" directory a the top level output +Start the container, with the "./output" directory as the top level output directory "/output" inside the contianer. ``` @@ -73,10 +77,10 @@ Sample output: > ``` > total 1 -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 21 15:45 alignments -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 21 15:45 fasta_dir -> -rwxrwxr-x 1 [CCRusername] nogroup 530 Aug 21 15:45 inference.sh -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 21 15:45 sample_predictions +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:34 alignments +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:34 fasta_dir +> -rwxrwxr-x 1 [CCRusername] nogroup 530 Dec 17 10:34 inference.sh +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:34 sample_predictions > ``` ## Run model inference @@ -109,19 +113,15 @@ python3 "${OF_DIR}/run_pretrained_openfold.py" \ Sample output: > ``` -> [2025-08-25 14:49:35,606] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): +> [2025-12-17 10:36:24,872] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/[CCRusername]/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. > INFO:/opt/openfold/openfold/utils/script_utils.py:Successfully loaded JAX parameters at /opt/openfold/openfold/resources/params/params_model_1_ptm.npz... > INFO:/opt/openfold/run_pretrained_openfold.py:Using precomputed alignments for 6KWC_1 at ./examples/monomer/alignments... 
> INFO:/opt/openfold/openfold/utils/script_utils.py:Running inference for 6KWC_1... -> INFO:/opt/openfold/openfold/utils/script_utils.py:Inference time: 56.51587692601606 +> INFO:/opt/openfold/openfold/utils/script_utils.py:Inference time: 19.050397651968524 > INFO:/opt/openfold/run_pretrained_openfold.py:Output written to /output/PDB_6KWC/pre-computed_alignments/predictions/6KWC_1_model_1_ptm_unrelaxed.pdb... > INFO:/opt/openfold/run_pretrained_openfold.py:Running relaxation on /output/PDB_6KWC/pre-computed_alignments/predictions/6KWC_1_model_1_ptm_unrelaxed.pdb... -> INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxation time: 10.501414217054844 +> INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxation time: 10.55438576999586 > INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxed output written to /output/PDB_6KWC/pre-computed_alignments/predictions/6KWC_1_model_1_ptm_relaxed.pdb... > ``` @@ -136,18 +136,18 @@ Sample output: > ``` > /output/PDB_6KWC/pre-computed_alignments: > total 1 -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:50 . -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:49 .. -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:51 predictions -> -rw-rw-r-- 1 [CCRusername] nogroup 44 Aug 25 14:50 timings.json +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:37 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:36 .. +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:37 predictions +> -rw-rw-r-- 1 [CCRusername] nogroup 45 Dec 17 10:37 timings.json > > /output/PDB_6KWC/pre-computed_alignments/predictions: -> total 344 -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:51 . -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 25 14:50 .. -> -rw-rw-r-- 1 [CCRusername] nogroup 230149 Aug 25 14:51 6KWC_1_model_1_ptm_relaxed.pdb -> -rw-rw-r-- 1 [CCRusername] nogroup 120528 Aug 25 14:50 6KWC_1_model_1_ptm_unrelaxed.pdb -> -rw-rw-r-- 1 [CCRusername] nogroup 34 Aug 25 14:51 timings.json +> total 341 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:37 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:37 .. +> -rw-rw-r-- 1 [CCRusername] nogroup 227310 Dec 17 10:37 6KWC_1_model_1_ptm_relaxed.pdb +> -rw-rw-r-- 1 [CCRusername] nogroup 120528 Dec 17 10:37 6KWC_1_model_1_ptm_unrelaxed.pdb +> -rw-rw-r-- 1 [CCRusername] nogroup 33 Dec 17 10:37 timings.json > ``` @@ -182,19 +182,15 @@ python3 "${OF_DIR}/run_pretrained_openfold.py" \ Sample output: > ``` -> [2025-08-26 09:33:00,593] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. 
-> def backward(ctx, grad_output): +> [2025-12-17 10:39:12,706] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/[CCRusername]/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. > INFO:/opt/openfold/openfold/utils/script_utils.py:Successfully loaded JAX parameters at /opt/openfold/openfold/resources/params/params_model_1_ptm.npz... > INFO:/opt/openfold/run_pretrained_openfold.py:Generating alignments for 6KWC_1... > INFO:/opt/openfold/openfold/utils/script_utils.py:Running inference for 6KWC_1... -> INFO:/opt/openfold/openfold/utils/script_utils.py:Inference time: 58.297142078052275 +> INFO:/opt/openfold/openfold/utils/script_utils.py:Inference time: 15.672921338002197 > INFO:/opt/openfold/run_pretrained_openfold.py:Output written to /output/PDB_6KWC/without_pre-computed_alignments/predictions/6KWC_1_model_1_ptm_unrelaxed.pdb... > INFO:/opt/openfold/run_pretrained_openfold.py:Running relaxation on /output/PDB_6KWC/without_pre-computed_alignments/predictions/6KWC_1_model_1_ptm_unrelaxed.pdb... -> INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxation time: 10.48616563109681 +> INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxation time: 6.940809735970106 > INFO:/opt/openfold/openfold/utils/script_utils.py:Relaxed output written to /output/PDB_6KWC/without_pre-computed_alignments/predictions/6KWC_1_model_1_ptm_relaxed.pdb... > ``` @@ -209,34 +205,34 @@ Sample output: > ``` > /output/PDB_6KWC/without_pre-computed_alignments: > total 1 -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:52 . -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:35 .. -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:33 alignments -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:53 predictions -> -rw-rw-r-- 1 [CCRusername] nogroup 45 Aug 26 09:52 timings.json +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 11:24 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:39 .. +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:39 alignments +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 11:24 predictions +> -rw-rw-r-- 1 [CCRusername] nogroup 45 Dec 17 11:24 timings.json > > /output/PDB_6KWC/without_pre-computed_alignments/alignments: > total 0 -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:33 . -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:52 .. -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:51 6KWC_1 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:39 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 11:24 .. +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 11:23 6KWC_1 > > /output/PDB_6KWC/without_pre-computed_alignments/alignments/6KWC_1: > total 7028 -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:51 . -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:33 .. -> -rw-rw-r-- 1 [CCRusername] nogroup 397302 Aug 26 09:51 bfd_uniclust_hits.a3m -> -rw-rw-r-- 1 [CCRusername] nogroup 136021 Aug 26 09:38 hhsearch_output.hhr -> -rw-rw-r-- 1 [CCRusername] nogroup 1972569 Aug 26 09:48 mgnify_hits.sto -> -rw-rw-r-- 1 [CCRusername] nogroup 4689644 Aug 26 09:38 uniref90_hits.sto +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 11:23 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 10:39 .. 
+> -rw-rw-r-- 1 [CCRusername] nogroup 397302 Dec 17 11:23 bfd_uniclust_hits.a3m +> -rw-rw-r-- 1 [CCRusername] nogroup 136025 Dec 17 10:55 hhsearch_output.hhr +> -rw-rw-r-- 1 [CCRusername] nogroup 1972569 Dec 17 11:19 mgnify_hits.sto +> -rw-rw-r-- 1 [CCRusername] nogroup 4689644 Dec 17 10:54 uniref90_hits.sto > > /output/PDB_6KWC/without_pre-computed_alignments/predictions: -> total 344 -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:53 . -> drwxrwsr-x 2 [CCRusername] nogroup 4096 Aug 26 09:52 .. -> -rw-rw-r-- 1 [CCRusername] nogroup 230149 Aug 26 09:53 6KWC_1_model_1_ptm_relaxed.pdb -> -rw-rw-r-- 1 [CCRusername] nogroup 120528 Aug 26 09:52 6KWC_1_model_1_ptm_unrelaxed.pdb -> -rw-rw-r-- 1 [CCRusername] nogroup 33 Aug 26 09:53 timings.json +> total 341 +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 11:24 . +> drwxrwsr-x 2 [CCRusername] nogroup 4096 Dec 17 11:24 .. +> -rw-rw-r-- 1 [CCRusername] nogroup 227310 Dec 17 11:24 6KWC_1_model_1_ptm_relaxed.pdb +> -rw-rw-r-- 1 [CCRusername] nogroup 120528 Dec 17 11:24 6KWC_1_model_1_ptm_unrelaxed.pdb +> -rw-rw-r-- 1 [CCRusername] nogroup 33 Dec 17 11:24 timings.json > ``` @@ -271,9 +267,12 @@ Note: Other possible options for "run_pretrained_openfold.py" Start an interactive job with more than one GPU e.g. ``` -salloc --cluster=ub-hpc --account="[SlurmAccountName]" \ - --partition=industry-dgx --qos=industry --mem=128GB --nodes=1 \ - --gpus-per-node=8 --mem=0 --exclusive --time=3-00:00:00 +export SBATCH_ACCOUNT="[SlurmAccountName]" +``` + +``` +salloc --cluster=ub-hpc --partition=industry-dgx --qos=industry --mem=128GB \ + --nodes=1 --gpus-per-node=8 --mem=0 --exclusive --time=3-00:00:00 ``` sample outout: @@ -381,218 +380,89 @@ python3 "${OF_DIR}/train_openfold.py" \ "2021-10-10" ``` -Sample output: +Sample abridged output: > ``` -> [2025-08-26 11:53:29,143] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-12-17 11:56:06,327] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) > Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> [rank: 0] Seed set to 504588624 +> [rank: 0] Seed set to 1054328241 > /opt/conda/lib/python3.10/site-packages/lightning_fabric/connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead! > Using bfloat16 Automatic Mixed Precision (AMP) > GPU available: True (cuda), used: True > TPU available: False, using: 0 TPU cores -> HPU available: False, using: 0 HPUs -> You are using a CUDA device ('NVIDIA H100 80GB HBM3') that has Tensor Cores. 
To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision > Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/8 -> [2025-08-26 11:54:03,699] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 11:54:03,705] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 11:54:03,705] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 11:54:03,706] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 11:54:03,710] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 11:54:03,710] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 11:54:03,715] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. 
+> [2025-12-17 11:56:37,498] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-12-17 11:56:37,528] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-12-17 11:56:37,538] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-12-17 11:56:37,541] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-12-17 11:56:37,542] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-12-17 11:56:37,544] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +> [2025-12-17 11:56:37,546] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) > Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +> [...] > Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> [rank: 1] Seed set to 504588624 -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. 
Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> [rank: 2] Seed set to 504588624 -> [rank: 6] Seed set to 504588624 -> [rank: 4] Seed set to 504588624 -> [rank: 7] Seed set to 504588624 -> [rank: 5] Seed set to 504588624 -> [rank: 3] Seed set to 504588624 +> [rank: 2] Seed set to 1054328241 +> [rank: 3] Seed set to 1054328241 +> [rank: 5] Seed set to 1054328241 +> [rank: 7] Seed set to 1054328241 +> [rank: 1] Seed set to 1054328241 +> [rank: 6] Seed set to 1054328241 +> [rank: 4] Seed set to 1054328241 +> Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/8 +> Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/8 > Initializing distributed: GLOBAL_RANK: 4, MEMBER: 5/8 > Initializing distributed: GLOBAL_RANK: 6, MEMBER: 7/8 -> Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/8 > Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/8 +> Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/8 > Initializing distributed: GLOBAL_RANK: 5, MEMBER: 6/8 -> Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/8 -> Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/8 > ---------------------------------------------------------------------------------------------------- > distributed_backend=nccl > All distributed processes registered. 
Starting with 8 processes > ---------------------------------------------------------------------------------------------------- > -> LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] > LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +> LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] > LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] > LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( > /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. > warnings.warn( > /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +> [...] > warnings.warn( -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision bf16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead. -> -> | Name | Type | Params | Mode -> ------------------------------------------------ -> 0 | model | AlphaFold | 93.2 M | train -> 1 | loss | AlphaFoldLoss | 0 | train -> ------------------------------------------------ -> 93.2 M Trainable params -> 0 Non-trainable params -> 93.2 M Total params -> 372.916 Total estimated model params size (MB) -> 4451 Modules in train mode -> 0 Modules in eval mode +> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision bf16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead. 
+> ┏━━━┳━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┓ +> ┃ ┃ Name ┃ Type ┃ Params ┃ Mode ┃ FLOPs ┃ +> ┡━━━╇━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━┩ +> │ 0 │ model │ AlphaFold │ 93.2 M │ train │ 0 │ +> │ 1 │ loss │ AlphaFoldLoss │ 0 │ train │ 0 │ +> └───┴───────┴───────────────┴────────┴───────┴───────┘ +> Trainable params: 93.2 M +> Non-trainable params: 0 +> Total params: 93.2 M +> Total estimated model params size (MB): 372 +> Modules in train mode: 4451 +> Modules in eval mode: 0 +> Total FLOPs: 0 > /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:106: Total length of `list` across ranks is zero. Please make sure this was your intention. -> Epoch 0: 0%| | 0/1250 [00:00 with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
-> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): +> Epoch 0/999 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0/1250 0:00:00 • -:--:-- 0.00it/s > /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. > return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
-> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/distogram', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/distogram_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 1. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`. -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/experimentally_resolved', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/experimentally_resolved_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/fape', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/fape_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/plddt_loss', ..., logger=True)` but have no logger configured. 
You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/plddt_loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/masked_msa', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/masked_msa_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/supervised_chi', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/supervised_chi_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/violation', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/violation_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/unscaled_loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/unscaled_loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/lddt_ca', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/drmsd_ca', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
-> with torch.cuda.amp.autocast(enabled=False):
-> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
-> with torch.cuda.amp.autocast(enabled=False):
-> WARNING:root:The exact sequence TTADIVVFDEISMATNYDLSVVNARLRAKHYVYIGDPAQLPAPRTLLTKGTLEPEYFNSVCRLMKTIGPDMFLGTCRRCPAEIVDTVSALVYDNKLKAHKDKSAQCFKMFYKGVITHDVSSAINRPQIGVVREFLTRNPAWRKAVFISPYNSQNAVASKILGLPTQTVDSSQGSEYDYVIFTQTTETAHSCNVNRFNVAITRAKVGILCIMSDR was not found in 7o7y_BK. Realigning the template to the actual sequence.
-> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
-> return fn(*args, **kwargs)
-> Epoch 0: 1%|█▎ | 17/1250 [01:45<2:07:09, 0.16it/s, train/loss=90.00]
 > [...]
-> Epoch 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1250/1250 [1:31:49<00:00, 0.23it/s, train/loss=55.40]
+> Epoch 0/999 ╸━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21/1250 0:02:02 • 1:18:06 0.26it/s train/loss: 143.231
+> [...]
+> Epoch 0/999 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╸━ 1215/1250 1:27:11 • 0:02:25 0.24it/s train/loss: 53.322
+> [....]
 > ```

 Note: This example will fail with an odd "strategy=None" error if run on a node with only one GPU

-In the above example, I stopped the training after Ephoch 0 which created the
-following checkpoint file:
+In the above example, I stopped the training with Ctrl-C after Epoch 0,
+which created the following checkpoint file:

 ```
 ls -l /output/PDB/2021-10-10/checkpoints
@@ -600,10 +470,11 @@ ls -l /output/PDB/2021-10-10/checkpoints

 > ```
 > total 1464717
-> -rw-rw-r-- 1 tkewtest nogroup 1499869690 Aug 26 13:26 /output/PDB/2021-10-10/checkpoints/0-1250.ckpt
+> -rw-rw-r-- 1 tkewtest nogroup 1499870010 Dec 17 13:27 0-1250.ckpt
 > ```

-Restarted the training from the checkpoint:
+Restarted the training from the checkpoint file
+"/output/PDB/2021-10-10/checkpoints/0-1250.ckpt":

 ```
 export TRITON_CACHE_DIR="${SLURMTMPDIR}"
@@ -625,235 +496,84 @@ python3 "${OF_DIR}/train_openfold.py" \
   "2021-10-10"
 ```

-Sample output:
-
-```
-> [2025-08-26 15:33:33,529] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
-> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-> def forward(ctx, input, weight, bias=None):
-> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-> def backward(ctx, grad_output): -> [rank: 0] Seed set to 741092476 -> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. -> sd = torch.load(args.resume_from_ckpt) -> /opt/conda/lib/python3.10/site-packages/lightning_fabric/connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead! -> Using bfloat16 Automatic Mixed Precision (AMP) -> GPU available: True (cuda), used: True -> TPU available: False, using: 0 TPU cores -> HPU available: False, using: 0 HPUs -> You are using a CUDA device ('NVIDIA H100 80GB HBM3') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision -> Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/8 -> [2025-08-26 15:33:57,402] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 15:33:57,418] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 15:33:57,426] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 15:33:57,431] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 15:33:57,443] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 15:33:57,445] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [2025-08-26 15:33:57,448] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. 
While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> [rank: 2] Seed set to 741092476 -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. 
-> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): -> [rank: 3] Seed set to 741092476 -> [rank: 1] Seed set to 741092476 -> [rank: 5] Seed set to 741092476 -> [rank: 4] Seed set to 741092476 -> [rank: 6] Seed set to 741092476 -> [rank: 7] Seed set to 741092476 -> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. -> sd = torch.load(args.resume_from_ckpt) -> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. -> sd = torch.load(args.resume_from_ckpt) -> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. -> sd = torch.load(args.resume_from_ckpt) -> Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/8 -> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. -> sd = torch.load(args.resume_from_ckpt) -> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. -> sd = torch.load(args.resume_from_ckpt) -> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). 
In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. -> sd = torch.load(args.resume_from_ckpt) -> /opt/openfold/train_openfold.py:328: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. -> sd = torch.load(args.resume_from_ckpt) -> Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/8 -> Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/8 -> Initializing distributed: GLOBAL_RANK: 4, MEMBER: 5/8 -> Initializing distributed: GLOBAL_RANK: 6, MEMBER: 7/8 -> Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/8 -> Initializing distributed: GLOBAL_RANK: 5, MEMBER: 6/8 -> ---------------------------------------------------------------------------------------------------- -> distributed_backend=nccl -> All distributed processes registered. Starting with 8 processes -> ---------------------------------------------------------------------------------------------------- -> -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:701: Checkpoint directory /output/PDB/2021-10-10/checkpoints exists and is not empty. -> Restoring states from the checkpoint path at /output/PDB/2021-10-10/checkpoints/0-1250.ckpt -> LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. 
-> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision bf16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead. -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> /opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. -> warnings.warn( -> -> | Name | Type | Params | Mode -> ------------------------------------------------ -> 0 | model | AlphaFold | 93.2 M | train -> 1 | loss | AlphaFoldLoss | 0 | train -> ------------------------------------------------ -> 93.2 M Trainable params -> 0 Non-trainable params -> 93.2 M Total params -> 372.916 Total estimated model params size (MB) -> 4451 Modules in train mode -> 0 Modules in eval mode -> Restored all states from the checkpoint at /output/PDB/2021-10-10/checkpoints/0-1250.ckpt -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:106: Total length of `list` across ranks is zero. Please make sure this was your intention. -> Training: | | 0/? [00:00 WARNING:root:The exact sequence ATLREQSFDEAWLFHRGDIAEGEKQSLDDSQWRQINLPHDWSIEDIPGTNSPFTADAATEVAGGFTVGGTGWYRKHFYIDAAEKGKAIAVSFDGIYMNADIWVNDRHVANHVYGYTAFELDITDYVRFGAENLIAVRVKNEGMNCRWYTGSGIYRHTFLKITNPLHFETWGTFVTTPVATADKAEVHVQSVLANTEKVTGKVILETRIVDKNNHTVARKEQLVTLDNKEKTEVGHALEVLAPQLWSIDNPYLYQVVNRLLQDDKVIDEEYISIGIRNIAFSAENGFQLNGKSMKLKGGCIHHDNGLLGAKAFDRAEERKIELLKAAGFNALRLSHNPPSIALLNACDRLGMLVIDEAFDMWRYGHYQYDYAQYFDKLWKEDLHSMVARDRNHPSVIMWSIGNEIKNKETAEIVDICRELTGFVKTLDTTRPVTAGVNSIVDATDDFLAPLDVCGYNYALNRYESDAKRHPDRIIYASESYASQAYDYWKGVEDHSWVIGDFIWTAFDYIGEASIGWCGYPLDKRIFPWNHANCGDLNLSGERRPQSYLRETLWSDAPVSHIVVTPPVPSFPLNPDKADWSVWDFPDVVDHWNFPGYEGKKMTVSVYSNCEQVELFLNGESLGKQENTADKKNTLVWEVPYAHGILKAVSYNKGGEVGTATLESAGKVEKIRLSADRTEIVADGNDLSYITLELVDSKGIRNQLAEELVAFSIEGDATIEGVGNANPMSIESFVANSRKTWRGSNLLVVRSGKSSGRIIVTAKVKALPVASITIT was not found in 6b6l_B. Realigning the template to the actual sequence. -> WARNING:root:The exact sequence VRKRVLIGLKDAPNFVMRLFTVEPGGLIDRASHPWEHEIFVLKGKLTVLKEQGEETVEEGFYIFVEPNEIHGFRNDTDSEVEFLA was not found in 6l2e_B. Realigning the template to the actual sequence. -> WARNING:root:The exact sequence MVILEVANPQEAARVLNENLLVGYFLPCKLVVYQENGTTKIGMPK was not found in 1q9u_B. Realigning the template to the actual sequence. -> WARNING:root:The exact sequence GRLGVTRNKIMTAQYECYQKIMQDPIQQAEGVYCQRTWDGWLCWNDVAAGTESMQLCPDYFQDFDPSEKVTKICDQDGNWFRHPASQRTWTNYTQCNVNT was not found in 6zho_A. Realigning the template to the actual sequence. 
-> WARNING:root:The exact sequence SSVPMTQNRNILWIMCDQLRFDYLSCYGHERLNTPNIDKLAKRGVRFTNAYVQATVXGPSRMSAYTGRYVRSHGSTQNGIPLRVGEPTLGDHLRDVGMRNVLIGKTHMRPDLDGMKRLGIDPDSEIGARVGEGGFDAFDRDDGVHPTGYRKKEPAYNDYLRHAGFQAENPWEFWANSAEGKGGENQSGWLLTHADKPARVPEEHSETAYMTRRAMEFMEAAEKDGRPWCAHLSYIKPHWPYIVPAPYHDMFGPDDVKPAVRSDEELKAAHPLFKAMTEEVYSRNFARDEVREKVIPAYMGLIKQIDDQLGQLFAFMQERGLDENTMIVFTADHGDYLGDHWMGEKYLFYEAAAKVPLIIYDPSDKADATRGTVSDALVEMIDLAPTFVDYAGGVPPMHILEGKSLLPLLHDDDSSWDRQYVFSELDYSNLPARLKLGRDIQDCRATMVFDGRYKLVEVMGFAPILFDLEVDPDELKDLGRDPSAEEVRQRLTSALDAWHRNTRQR was not found in 4upi_A. Realigning the template to the actual sequence. -> WARNING:root:The exact sequence PRGSHMASIKKPNVLILLFDDMRFDTFSYRNGPVSTPNIDALANEGTRFDQAMTSTGLASPSRAAMFTGRWGHKTGLDDNVGLYHSRLSELSLSEGSVIKRATSIGYDVSYVGKWHLGAQGPALRGANFMWGHDKDEERNGRPFTPYQTQKNVARMNAGERDKNGEKHDYYKTLPGTYADTVTAKEVNEGKLMLQNAAKSDKPFFGIVSFEQPHPPYRVPEPYASMYDYKDIKLPKNFGIKRKHKPMAQDDIWWPWHDVSHMSETDWRKAHSFYYGAIAMIDHAVGELINTAKEEGLYDDLHIILVGDQGSMLGEHNLYDKGPYAYDELMRMPLIIRDPSLEPKIINRQVSMLDIAPTLRQWMTLPLDGDEDGRSLLPLMKQGDSADAGKDDISLYAYEWYNGGWFGIRAIRTPEMKFVWNPGDSRDELYDLKNDPYEITNQIDNPKYKKQLTDLVHKMAGELNRIDDPSLTKF was not found in 6pt4_B. Realigning the template to the actual sequence. -> Epoch 1: 0%| | 0/1250 [00:00 with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. 
-> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/distogram', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/distogram_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 1. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`. -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/experimentally_resolved', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/experimentally_resolved_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/fape', ..., logger=True)` but have no logger configured. 
You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/fape_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/plddt_loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/plddt_loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/masked_msa', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/masked_msa_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/supervised_chi', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/supervised_chi_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/violation', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/violation_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/unscaled_loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/unscaled_loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/loss', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/loss_epoch', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/lddt_ca', ..., logger=True)` but have no logger configured. You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/conda/lib/python3.10/site-packages/pytorch_lightning/core/module.py:520: You called `self.log('train/drmsd_ca', ..., logger=True)` but have no logger configured. 
You can enable one by doing `Trainer(logger=ALogger(...))` -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
-> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:202: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/openfold/openfold/model/primitives.py:226: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> /opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants. -> return fn(*args, **kwargs) -> /opt/openfold/openfold/model/primitives.py:258: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. -> with torch.cuda.amp.autocast(enabled=False): -> Epoch 1: 2%|█▌ | 21/1250 [01:56<1:53:48, 0.18it/s, train/loss=50.10] -> [...] +Sample abridged output: + +``` +[2025-12-17 14:13:31,823] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +[rank: 0] Seed set to 743491233 +/opt/conda/lib/python3.10/site-packages/lightning_fabric/connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead! +Using bfloat16 Automatic Mixed Precision (AMP) +GPU available: True (cuda), used: True +TPU available: False, using: 0 TPU cores +Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/8 +[2025-12-17 14:14:00,153] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +[2025-12-17 14:14:00,563] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-12-17 14:14:00,570] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-12-17 14:14:00,572] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-12-17 14:14:00,578] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-12-17 14:14:00,588] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-12-17 14:14:00,591] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) +Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. 
While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +[...] +Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. +[rank: 4] Seed set to 743491233 +[rank: 5] Seed set to 743491233 +[rank: 7] Seed set to 743491233 +[rank: 3] Seed set to 743491233 +[rank: 6] Seed set to 743491233 +[rank: 1] Seed set to 743491233 +[rank: 2] Seed set to 743491233 +Initializing distributed: GLOBAL_RANK: 4, MEMBER: 5/8 +Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/8 +Initializing distributed: GLOBAL_RANK: 6, MEMBER: 7/8 +Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/8 +Initializing distributed: GLOBAL_RANK: 5, MEMBER: 6/8 +Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/8 +Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/8 +---------------------------------------------------------------------------------------------------- +distributed_backend=nccl +All distributed processes registered. Starting with 8 processes +---------------------------------------------------------------------------------------------------- + +/opt/conda/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:881: Checkpoint directory /output/PDB/2021-10-10/checkpoints exists and is not empty. +Restoring states from the checkpoint path at /output/PDB/2021-10-10/checkpoints/0-1250.ckpt +LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +/opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. + warnings.warn( +/opt/conda/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:62: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate. +[...] + warnings.warn( +/opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/model_summary/model_summary.py:242: Precision bf16-mixed is not supported by the model summary. Estimated model size in MB will not be accurate. Using 32 bits instead. +┏━━━┳━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┓ +┃ ┃ Name ┃ Type ┃ Params ┃ Mode ┃ FLOPs ┃ +┡━━━╇━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━┩ +│ 0 │ model │ AlphaFold │ 93.2 M │ train │ 0 │ +│ 1 │ loss │ AlphaFoldLoss │ 0 │ train │ 0 │ +└───┴───────┴───────────────┴────────┴───────┴───────┘ +Trainable params: 93.2 M +Non-trainable params: 0 +Total params: 93.2 M +Total estimated model params size (MB): 372 +Modules in train mode: 4451 +Modules in eval mode: 0 +Total FLOPs: 0 +Restored all states from the checkpoint at /output/PDB/2021-10-10/checkpoints/0-1250.ckpt +/opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:106: Total length of `list` across ranks is zero. Please make sure this was your intention. 
+WARNING:root:The exact sequence HPETTPTMLTAPIDSGFLKDPVITPEGFVYNKSSILKWLETKKEDPQSRKPLTAKDLQPFPELLIIVNRFVET was not found in 4wz0_A. Realigning the template to the actual sequence. +WARNING:root:The exact sequence LPYSLTSDNCEHFVNHLRY was not found in 4dpz_X. Realigning the template to the actual sequence. +Epoch 1/999 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0/1250 0:00:00 • -:--:-- 0.00it/s +[...] +Epoch 1/999 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7/1250 0:01:13 • 1:28:29 0.23it/s train/loss: 49.389 +[...] ``` You can monitor the GPU utilization which running the training as following, diff --git a/containers/2_ApplicationSpecific/OpenFold/README.md b/containers/2_ApplicationSpecific/OpenFold/README.md index 562b50d..4ae982a 100644 --- a/containers/2_ApplicationSpecific/OpenFold/README.md +++ b/containers/2_ApplicationSpecific/OpenFold/README.md @@ -14,8 +14,12 @@ Note: a GPU is NOT needed to build the OpenFold container
See CCR docs for more info on [running jobs](https://docs.ccr.buffalo.edu/en/latest/hpc/jobs/#interactive-job-submission) ``` -salloc --cluster=ub-hpc --partition=debug --qos=debug --account="[SlurmAccountName]" \ - --mem=0 --exclusive --time=01:00:00 +export SBATCH_ACCOUNT="[SlurmAccountName]" +``` + +``` +salloc --cluster=ub-hpc --partition=debug --qos=debug --mem=0 --exclusive \ + --time=01:00:00 ``` sample outout: @@ -133,14 +137,9 @@ python3 "${OF_DIR}/train_openfold.py" --help Sample output: > ``` -> [2025-08-18 17:02:46,110] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) -> [WARNING] async_io requires the dev libaio .so object and headers but these were not found. -> [WARNING] async_io: please install the libaio-dev package with apt -> [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. -> def forward(ctx, input, weight, bias=None): -> /opt/conda/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead. -> def backward(ctx, grad_output): +> [2025-12-17 10:25:31,032] [WARNING] [real_accelerator.py:162:get_accelerator] Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it. +> [2025-12-17 10:25:31,093] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect) +> Warning: The default cache directory for DeepSpeed Triton autotune, /user/tkewtest/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path. 
 > usage: train_openfold.py [-h] [--train_mmcif_data_cache_path TRAIN_MMCIF_DATA_CACHE_PATH] [--use_single_seq_mode USE_SINGLE_SEQ_MODE]
 >        [--distillation_data_dir DISTILLATION_DATA_DIR] [--distillation_alignment_dir DISTILLATION_ALIGNMENT_DIR] [--val_data_dir VAL_DATA_DIR]
 >        [--val_alignment_dir VAL_ALIGNMENT_DIR] [--val_mmcif_data_cache_path VAL_MMCIF_DATA_CACHE_PATH] [--kalign_binary_path KALIGN_BINARY_PATH]
diff --git a/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml b/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml
index e7450e1..48ab7d4 100644
--- a/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml
+++ b/containers/2_ApplicationSpecific/OpenFold/environment-aarch64.yml
@@ -8,7 +8,9 @@ dependencies:
   - openmm
   - pdbfixer
   - biopython
-  - wandb
+  # The W&B Automations API is experimental, so it is recommended to pin the
+  # package version to reduce the risk of disruption
+  - wandb==0.23.1
   - modelcif
   - awscli
   - ml-collections
diff --git a/containers/2_ApplicationSpecific/OpenFold/environment.yml b/containers/2_ApplicationSpecific/OpenFold/environment.yml
index 5fb49ed..3a72560 100644
--- a/containers/2_ApplicationSpecific/OpenFold/environment.yml
+++ b/containers/2_ApplicationSpecific/OpenFold/environment.yml
@@ -21,9 +21,8 @@ dependencies:
   - scipy
   - tqdm
   - typing-extensions
-# - wandb
   # W&B Automations API is experimental and it is recommend pinning the package
-  # version to 0.23.1 to reduce the risk of disruption
+  # version to reduce the risk of disruption
   - wandb==0.23.1
   - modelcif==0.7
   - awscli
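
As a quick sanity check after rebuilding the image with the pinned dependency, the installed wandb version can be confirmed from inside the container. This is a minimal sketch: the image name "openfold.sif" is an assumption, so substitute the path of your built image.

```
# Print the wandb version installed inside the container image.
# "openfold.sif" is an assumed image name -- replace it with your built image path.
apptainer exec openfold.sif python3 -c "import wandb; print(wandb.__version__)"
# If the pin in environment.yml took effect, this should print 0.23.1
```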