Changes from all commits (141 commits)
31aff98
added slurm folder to support running babel in a distributed fashion …
hyi Oct 11, 2025
f307765
Attempt to make a script to run Babel on Slurm.
gaurav Nov 14, 2025
71fa049
Added hashbang to sbatch script.
gaurav Nov 14, 2025
aa1d587
Increased memory on the leader node.
gaurav Nov 14, 2025
e5d3311
Tweaked some settings.
gaurav Nov 21, 2025
c7cf777
Tweaked some commands.
gaurav Nov 21, 2025
27d1b52
Added snakemake-executor-plugin-slurm.
gaurav Nov 21, 2025
df28656
Cleaned up Slurm Snakemake profile.
gaurav Nov 21, 2025
f904fd9
Cleaned up and documented the Slurm profile.
gaurav Nov 21, 2025
c42bb13
Added retries to OBO downloads.
gaurav Nov 21, 2025
eaea173
Added additional run-specific config file, which is initially empty.
gaurav Nov 21, 2025
1f1e693
That didn't work.
gaurav Nov 21, 2025
eca7a39
Made configfile a list.
gaurav Nov 21, 2025
560763a
Added more Slurm configuration options.
gaurav Nov 21, 2025
005a0d5
Added sleep between Ubergraph queries.
gaurav Nov 21, 2025
4d3082f
Removed old cluster config file that's no longer used.
gaurav Nov 22, 2025
710b4f6
Increased time for the Snakemake runner.
gaurav Nov 22, 2025
252425b
Upgraded packages.
gaurav Nov 22, 2025
0e7166e
Don't build anatomy, build whatever is on the command line.
gaurav Nov 22, 2025
720b18f
Removed anatomy, now build everything.
gaurav Nov 22, 2025
8b9156e
Removed unnecessary import.
gaurav Nov 22, 2025
924631e
Reduced requirements for each node.
gaurav Nov 22, 2025
341fd42
Added Hatteras partition information.
gaurav Nov 22, 2025
47209d5
Added a runtime of 4h for get_ensembl.
gaurav Nov 22, 2025
6b1cc81
Modified EFO to explicate OWL file dependency.
gaurav Nov 22, 2025
6421364
Added retries to get_protein_uniprotkb_ensembl_relationships.
gaurav Nov 22, 2025
ddca518
Added memory in GB to give a sense of what's available.
gaurav Nov 22, 2025
a188afb
Don't delete log files.
gaurav Nov 22, 2025
ec660d9
Reduced BIOMART_MAX_ATTRIBUTE_COUNT to 7.
gaurav Nov 22, 2025
2aaf73b
Increased runtime for generate_pubmed_concords.
gaurav Nov 22, 2025
cbc4e8c
Increased memory required by chembl_labels_and_smiles.
gaurav Nov 22, 2025
f49c41a
Reduced BIOMART_MAX_ATTRIBUTE_COUNT to 6.
gaurav Nov 22, 2025
498a724
Fixed bug in EFO OWL file reading.
gaurav Nov 22, 2025
f8db97f
Not sure why we're opening this file in binary mode.
gaurav Nov 22, 2025
18e063a
Increased runtime for generate_pubmed_concords.
gaurav Nov 23, 2025
9de118a
Increased timeout for generate_pubmed_concords to 12h.
gaurav Nov 23, 2025
3ff3b16
Increased memory for chemical_unichem_concordia.
gaurav Nov 23, 2025
7431f95
Added /usr/bin/time to Python code to monitor memory usage.
gaurav Nov 23, 2025
90688a0
Increased memory for untyped_chemical_compendia.
gaurav Nov 24, 2025
54955fb
Increased get_ensembl runtime to 6h.
gaurav Nov 24, 2025
271f59d
Increased untyped_chemical_compendia to 256G.
gaurav Nov 24, 2025
34713bb
Increased timeout on the initial node to 24h.
gaurav Nov 24, 2025
0c4d0ea
Increased memory for some chemical and protein tasks.
gaurav Nov 24, 2025
fcb5bdb
Increased timeout for generate_pubmed_concords to 24h.
gaurav Nov 25, 2025
3a9798f
Increased memory for generate_pubmed_concords.
gaurav Nov 25, 2025
e313d75
Added runtime and mem to gene_compendia and protein_compendia.
gaurav Nov 25, 2025
6a8c237
Increased gene_compendia memory and protein_compendia timeout.
gaurav Nov 27, 2025
02e6efe
Turn on --keep-going when building on Slurm.
gaurav Nov 27, 2025
569c09e
Increased memory for generate_pubmed_compendia (128G).
gaurav Nov 28, 2025
4b74709
Added runtime=6h to chemical_compendia.
gaurav Nov 28, 2025
7e5d135
Increased memory and runtime for export_compendia_to_duckdb.
gaurav Nov 28, 2025
930ba30
Increased CPUs per task and timeout for rule protein.
gaurav Nov 28, 2025
5e940dd
Increased runtime for drugchemical_conflated_synonyms.
gaurav Nov 28, 2025
31a5434
It's "runtime", not "timeout".
gaurav Nov 29, 2025
d62b4c6
Increased memory for export_compendia_to_duckdb.
gaurav Nov 29, 2025
940b8c8
Increased memory and runtime for geneprotein_conflated_synonyms.
gaurav Nov 30, 2025
3f40151
Increased memory for export_compendia_to_duckdb to 750G.
gaurav Nov 30, 2025
bc048f8
Increased runtime for generate_sapbert_training_data.
gaurav Dec 1, 2025
ac16b30
Added some settings to DuckDB initialization.
gaurav Dec 2, 2025
b123832
Attempt to improve compendium load in DuckDB.
gaurav Dec 3, 2025
b768e1f
Tried to reduce requirements.
gaurav Dec 3, 2025
293eaf4
Fixed cpus_per_task.
gaurav Dec 3, 2025
564ba9e
Tried to reduce the memory limit.
gaurav Dec 3, 2025
29ccc9c
Let's try increasing the memory limit again.
gaurav Dec 3, 2025
704395b
Reduce cpus_per_task.
gaurav Dec 3, 2025
5a6bb70
Increased memory further.
gaurav Dec 3, 2025
b403821
What if we reduce the memory limit?
gaurav Dec 3, 2025
0af5e40
Tried some new things, laid out a possible solution.
gaurav Dec 3, 2025
07dec38
Removed duckdb_config, fixed (?) Node generation.
gaurav Dec 5, 2025
c6f483b
Fully removed duckdb_config.
gaurav Dec 5, 2025
2710d37
Attempt to fix Node.parquet generation.
gaurav Dec 5, 2025
6e74a5d
Added an extraneous print to hopefully trigger a rebuild.
gaurav Dec 5, 2025
1684171
Attempted new syntax for Node generation.
gaurav Dec 5, 2025
672b624
Next attempt at fixing nodes.
gaurav Dec 5, 2025
8c471b8
Next attempt at getting the JSON export working correctly.
gaurav Dec 5, 2025
8da0389
Next attempt.
gaurav Dec 5, 2025
6271568
Moved Node generation to the top so it will fail first.
gaurav Dec 5, 2025
672cfb4
Added Node.parquet as an explicit output.
gaurav Dec 5, 2025
c91f64e
Tweaked rule to force a rebuild.
gaurav Dec 5, 2025
38d92e4
Okay, maybe this will work.
gaurav Dec 5, 2025
7844dbc
Maybe this works.
gaurav Dec 5, 2025
a4f55a4
First stab at a compendium splitter.
gaurav Dec 5, 2025
8853d48
Small improvements.
gaurav Dec 5, 2025
806dbd6
Fail if the row counts don't match.
gaurav Dec 5, 2025
cfc73c2
Improved outputs.
gaurav Dec 5, 2025
10564ea
Wait, we actually wouldn't expect these to line up.
gaurav Dec 5, 2025
49dbeac
Reorganized/cleaned up code a bit.
gaurav Dec 5, 2025
7821ff2
Add a TODO that will come in handy later.
gaurav Dec 5, 2025
6bc0726
Increased resources for DuckDB reports.
gaurav Dec 5, 2025
2a4f860
Oops, missed one.
gaurav Dec 5, 2025
8fb24ea
Further increased memory to generate_prefix_report.
gaurav Dec 5, 2025
90dd9b5
Increased memory for a bunch of DuckDB reports.
gaurav Dec 5, 2025
95725da
Maybe we're going to 1024G everywhere.
gaurav Dec 5, 2025
dd3d46d
Increased memory even further.
gaurav Dec 5, 2025
c0db3a1
Cleaned up code for CURIE prefix summary.
gaurav Dec 8, 2025
0572709
Improved check_for_identically_labeled_cliques and reduced memory.
gaurav Dec 8, 2025
cb2c8f1
Tried to improve check_for_duplicate_curies.
gaurav Dec 8, 2025
db88887
Tried to improve check_for_duplicate_clique_leaders.
gaurav Dec 8, 2025
93b9012
Gave generate_prefix_report 512G of memory.
gaurav Dec 8, 2025
d7d316a
Fixed SQL error.
gaurav Dec 8, 2025
dba7b41
Simplified check_for_duplicate_clique_leaders.
gaurav Dec 8, 2025
a3189bb
Increased all memory to its max.
gaurav Dec 8, 2025
7813dff
Turn off preserve_insertion_order=false for all reports.
gaurav Dec 8, 2025
9f424f2
Separated db.sql() from writing out the result.
gaurav Dec 8, 2025
a2e22c5
Added some memory tracking.
gaurav Dec 8, 2025
19cf893
Tried to improve logging.
gaurav Dec 8, 2025
3fb266d
More debugging.
gaurav Dec 8, 2025
2081e82
Incorporated additional settings.
gaurav Dec 8, 2025
862e711
Moved progress bar to overall DuckDB settings.
gaurav Dec 8, 2025
9b10bc6
Improved DuckDB configuration documentation.
gaurav Dec 8, 2025
682fb09
Improved DuckDB settings output.
gaurav Dec 8, 2025
801fde9
Moved progress bar as a local setting.
gaurav Dec 8, 2025
45ecc7c
Gah.
gaurav Dec 8, 2025
3abeccd
Increased memory for check_for_duplicate_curies.
gaurav Dec 8, 2025
e859e8a
Partially improved generate_prefix_report.
gaurav Dec 8, 2025
997210f
Removed orders from generate_prefix_report.
gaurav Dec 8, 2025
6c49983
Increased memory availability.
gaurav Dec 8, 2025
cb0c3c3
Split prefix report into by-clique and by-prefix reports.
gaurav Dec 8, 2025
4e809f5
Fixed Snakemake rules.
gaurav Dec 8, 2025
40eb14a
Cleaned up some `all` rules.
gaurav Dec 8, 2025
0903059
We don't want a list of all cliques labeled ''.
gaurav Dec 9, 2025
881cfbd
Look out for double-quotes too.
gaurav Dec 9, 2025
20e0803
Let's compress identically_labeled_cliques.tsv.
gaurav Dec 9, 2025
8b0f443
Fixed filename in dependencies.
gaurav Dec 9, 2025
82b4998
Improved script.
gaurav Dec 9, 2025
e7f88db
Simplified by-clique report.
gaurav Dec 9, 2025
7cbea97
Added retries to some Ubergraph queries.
gaurav Dec 9, 2025
cdb4b2e
Improved by-clique report.
gaurav Dec 9, 2025
0e332e3
Fixed script.
gaurav Dec 9, 2025
2920f65
More fixes maybe.
gaurav Dec 9, 2025
0b859e4
Fixed SQL error.
gaurav Dec 9, 2025
84bf4fa
For by_clique report, only look at non-conflated edges.
gaurav Dec 9, 2025
c2adc9b
For CURIE report, use only non-conflated edges.
gaurav Dec 9, 2025
857e3b6
Improved command line.
gaurav Dec 9, 2025
ddd01b3
Updated pull_via_urllib() to be able to verify a gzipped file.
gaurav Dec 10, 2025
8215381
Add verify_gzip to UniChem downloads because they are being wobbly.
gaurav Dec 10, 2025
203ddd0
Wow.
gaurav Dec 11, 2025
b3c85a1
Fail verification if the downloaded file is too small.
gaurav Dec 11, 2025
345e194
Replaced UniChem download with wget download.
gaurav Dec 11, 2025
393209e
Reverted to urllib, added retries.
gaurav Dec 11, 2025
4f1e93c
Fixed references to datetime.
gaurav Dec 14, 2025
17 changes: 11 additions & 6 deletions Snakefile
@@ -1,6 +1,5 @@
configfile: "config.yaml"


include: "src/snakefiles/datacollect.snakefile"
include: "src/snakefiles/anatomy.snakefile"
include: "src/snakefiles/cell_line.snakefile"
@@ -20,13 +19,15 @@ include: "src/snakefiles/duckdb.snakefile"
include: "src/snakefiles/reports.snakefile"
include: "src/snakefiles/exports.snakefile"

# Some general imports.
import shutil
from src.snakefiles.util import write_done

# Some global settings.
import os

os.environ["TMPDIR"] = config["tmp_directory"]


# Top-level rules.
rule all:
input:
@@ -41,10 +42,14 @@ rule all:
# Build all the exports.
config["output_directory"] + "/kgx/done",
config["output_directory"] + "/sapbert-training-data/done",
# Store the config.yaml file used to produce the output.
config_file = "config.yaml",
output:
x=config["output_directory"] + "/reports/all_done",
shell:
"echo 'done' >> {output.x}"
output_config_file=config["output_directory"] + "/config.yaml",
run:
shutil.copyfile(input.config_file, output.output_config_file)
write_done(output.x)


rule all_outputs:
@@ -65,8 +70,8 @@ rule all_outputs:
config["output_directory"] + "/reports/publications_done",
output:
x=config["output_directory"] + "/reports/outputs_done",
shell:
"echo 'done' >> {output.x}"
run:
write_done(output.x)


rule clean_compendia:
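The `rule all` and `rule all_outputs` changes above replace the old shell command `echo 'done' >> {output.x}` with a call to write_done(), imported from src/snakefiles/util. That helper is not part of this diff; the following is only a rough, hypothetical sketch of the behaviour it presumably provides, based on the shell command it replaces.

# Hypothetical sketch only: the real write_done lives in src/snakefiles/util and is not shown in this diff.
def write_done(path: str) -> None:
    # Append a "done" marker line, mirroring the old `>>` shell redirection.
    with open(path, "a") as marker_file:
        marker_file.write("done\n")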
9 changes: 7 additions & 2 deletions config.yaml
@@ -16,6 +16,13 @@ intermediate_directory: babel_outputs/intermediate
output_directory: babel_outputs
tmp_directory: babel_downloads/tmp

#
# SHARED
#

# DuckDB settings for use in all DuckDB connections:
duckdb_config: {}

#
# UMLS
#
@@ -413,7 +420,5 @@ ensembl_datasets_to_skip:
- otshawytscha_gene_ensembl
- aocellaris_gene_ensembl

duckdb_config: {}

demote_labels_longer_than: 25
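The new duckdb_config block moved above is an empty mapping by default. How Babel feeds it into its DuckDB connections is not shown in this part of the diff; the sketch below only illustrates how such a mapping can be handed to the duckdb Python API. The database filename and example settings are hypothetical.

import duckdb
import yaml

# Load the shared DuckDB settings from config.yaml (an empty dict by default).
with open("config.yaml") as config_file:
    duckdb_config = yaml.safe_load(config_file).get("duckdb_config") or {}

# duckdb_config might hold settings such as {"memory_limit": "64GB", "preserve_insertion_order": False};
# duckdb.connect() accepts them via its config argument. "babel.duckdb" is a made-up filename.
connection = duckdb.connect(database="babel.duckdb", config=duckdb_config)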

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ version = "1.14"
description = "Babel creates cliques of equivalent identifiers across many biomedical vocabularies. "
readme = "README.md"
license = "MIT"
requires-python = ">=3.11"
requires-python = ">=3.11,<3.14"
dependencies = [
"apybiomart",
"beautifulsoup4>=4.14.2",
@@ -28,6 +28,7 @@ dependencies = [
"pyyaml>=6.0.3",
"requests>=2.32.5",
"snakemake>=9.13.3",
"snakemake-executor-plugin-slurm>=1.9.2",
"sparqlwrapper>=2.0.0",
"wheel>=0.45.1",
"xmltodict>=1.0.2",
37 changes: 37 additions & 0 deletions slurm/config.yaml
@@ -0,0 +1,37 @@
# This is a Snakemake profile (https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles) that provides
# configuration options that should be applied when Snakemake is run on the RENCI Hatteras cluster using SLURM.
#
# To use this profile, run:
# $ snakemake --profile slurm
#
executor: slurm
jobs: 50 # maximum number of parallel cluster jobs
latency-wait: 60 # seconds
slurm-delete-logfiles-older-than: 0 # Don't delete log files automatically.
rerun-incomplete: true # Re-run any jobs that failed with an incomplete status previously.
keep-going: true # Keep going with independent jobs if a job fails.

# Wrap Python execution with `time -v` to report on memory usage for each rule.
python:
executable: "/usr/bin/time -v python"
log_stderr: true

# Set up Hatteras partitions as per https://renci.atlassian.net/wiki/spaces/RENCI/pages/254443570/Cluster+Info#Hardware-Information
partitions:
batch:
max_mem_mb: 191000 # 191 GB
largemem:
max_mem_mb: 1530329 # 1.5 TB

# Default resource settings for all rules
default-resources:
mem: 64G
runtime: 120 # minutes
cpus_per_task: 4

# Set up the Slurm efficiency report.
slurm-efficiency-report: True
slurm-efficiency-report-path: babel_outputs/reports/slurm/slurm_efficiency_report.csv

# Write Slurm logs into a `logs/` directory so we can look at them later.
slurm-logdir: babel_outputs/logs
24 changes: 24 additions & 0 deletions slurm/job/run_babel_mutiple_nodes.job
@@ -0,0 +1,24 @@
#!/bin/bash -l
#SBATCH --job-name=babel-test-cluster
#SBATCH --output=babel-test-cluster.out
#SBATCH --time=1:00:00
#SBATCH --mem=2G
#SBATCH -n 1

source ~/.bashrc
conda activate babel

# Go to Babel project directory
cd /projects/babel/babel-ht-test/Babel

export UMLS_API_KEY="YOUR UMLS API KEY"
export PYTHONPATH=.

# Build anatomy-related compendia in a distributed fashion, as defined in the slurm/config.yaml profile.
#
# Note that because Snakemake supports the Slurm executor plugin natively, submitting this as a SLURM
# batch job is not normally recommended: it creates an outer SLURM job running Snakemake, which then
# submits inner SLURM jobs for the workflow rules as specified in the profile. The recommended approach
# is to run Snakemake directly on the login or head node. However, a long-running process on the
# login/head node is undesirable, so a good compromise is to keep the sbatch wrapper but request
# minimal resources for the outer job, as this job script does.
snakemake --profile slurm anatomy
18 changes: 18 additions & 0 deletions slurm/job/run_babel_one_node.job
@@ -0,0 +1,18 @@
#!/bin/bash -l
#SBATCH --job-name=babel-test-local
#SBATCH --output=babel-test-local.out
#SBATCH --time=5:00:00
#SBATCH --mem=256G
#SBATCH -n 1

source ~/.bashrc
conda activate babel

# Go to Babel project directory
cd /projects/babel/babel-ht-test/Babel_standalone

export UMLS_API_KEY="YOUR UMLS API KEY"
export PYTHONPATH=.

# Build anatomy-related compendia locally using 4 cores
snakemake --cores 4 anatomy --rerun-incomplete --latency-wait 60
30 changes: 30 additions & 0 deletions slurm/run-babel-on-slurm.sh
@@ -0,0 +1,30 @@
#!/bin/bash

sbatch <<EOF
#!/bin/bash
#SBATCH --job-name=babel-${BABEL_VERSION:-current}
#SBATCH --output=babel_outputs/logs/sbatch-${BABEL_VERSION:-babel-current}.out
#SBATCH --error=babel_outputs/logs/sbatch-${BABEL_VERSION:-babel-current}.err
#SBATCH --time=${BABEL_TIMEOUT:-24:00:00}
#SBATCH --mem=16G
#SBATCH --nodes=1
#SBATCH --chdir=$PWD

# Notes:
# --chdir: Change the directory to whatever directory the sbatch job was
# started from. So you should run: BABEL_VERSION=babel-1.14 bash slurm/run-babel-on-slurm.sh

source ~/.bashrc

# Run Babel in a distributed fashion, as defined in the slurm/config.yaml profile.
#
# Note that because Snakemake supports the Slurm executor plugin natively, submitting this as a SLURM
# batch job is not normally recommended: it creates an outer SLURM job running Snakemake, which then
# submits inner SLURM jobs for the workflow rules as specified in the profile. The recommended approach
# is to run Snakemake directly on the login or head node. However, a long-running process on the
# login/head node is undesirable, so a good compromise is to keep the sbatch wrapper but request
# minimal resources for the outer job, as this script does.

uv run snakemake --profile slurm $@

EOF
127 changes: 89 additions & 38 deletions src/babel_utils.py
@@ -4,8 +4,10 @@
from ftplib import FTP
from io import BytesIO
import gzip
from datetime import timedelta
from datetime import timedelta, datetime
import time
from pathlib import Path

import requests
import os
import urllib
@@ -23,6 +25,7 @@

# Configuration items
WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES = 1_000_000
MAX_DOWNLOAD_ERROR = 10

# Set up a logger.
logger = get_logger(__name__)
@@ -141,15 +144,15 @@
self.delta = timedelta(milliseconds=delta_ms)

def get(self, url):
now = dt.now()
now = datetime.now()
throttled = False
if self.last_time is not None:
cdelta = now - self.last_time
if cdelta < self.delta:
waittime = self.delta - cdelta
time.sleep(waittime.microseconds / 1e6)
throttled = True
self.last_time = dt.now()
self.last_time = datetime.now()
response = requests.get(url)
return response, throttled

@@ -166,16 +169,32 @@
ntries += 1


def pull_via_urllib(url: str, in_file_name: str, decompress=True, subpath=None):
def pull_via_urllib(url: str, in_file_name: str, decompress=True, subpath=None, verify_gzip=False):
"""
Retrieve files via urllib, optionally decompresses it, and writes it locally into downloads
url: str - the url with the correct version attached
in_file_name: str - the name of the target file to work
returns: str - the output file name
Download a file via the given URL, optionally decompress it, and save it
to the specified local path. Handles HTTP redirects gracefully.

:param url: The base URL of the remote server (e.g., "http://example.com/").
It is combined with the provided filename to determine the full file path.
:type url: str
:param in_file_name: The name of the file to download, specified as the filename
on the remote server.
:type in_file_name: str
:param decompress: Whether to decompress the downloaded file if it is gzipped.
Defaults to True.
:type decompress: bool, optional
:param subpath: An optional subpath under the main download directory to save the file.
If None, the file is saved directly in the download directory.
:type subpath: str, optional
:param verify_gzip: If downloading a Gzip file that isn't being decompressed, verify that the
file is valid (by reading it). Has no effect if decompress=True.
:type verify_gzip: bool, optional
:return: The path to the downloaded (and optionally decompressed) file.
:rtype: str
"""
# Everything goes in downloads
download_dir = get_config()["download_directory"]
working_dir = download_dir

[GitHub Actions check failure (Ruff F841) at src/babel_utils.py:197:5: Local variable `working_dir` is assigned to but never used]

# get the (local) download file name, derived from the input file name
if subpath is None:
@@ -187,38 +206,70 @@
opener = urllib.request.build_opener(urllib.request.HTTPRedirectHandler())

# get a handle to the ftp file
print(url + in_file_name)
handle = opener.open(url + in_file_name)
download_url = url + in_file_name
logger.info(f"Downloading {download_url}")
handle = opener.open(download_url)

# create the compressed file
with open(dl_file_name, "wb") as compressed_file:
# while there is data
while True:
# read a block of data
data = handle.read(1024)

# if nothing was read, we're done
if len(data) == 0:
break

# write out the data to the output file
compressed_file.write(data)

if decompress:
out_file_name = dl_file_name[:-3]

# create the output text file
with open(out_file_name, "w") as output_file:
# open the compressed file
with gzip.open(dl_file_name, "rt") as compressed_file:
for line in compressed_file:
# write the data to the output file
output_file.write(line)

# remove the compressed file
os.remove(dl_file_name)
else:
out_file_name = dl_file_name
download_verified = False
download_attempt = 0
while not download_verified:
Path(dl_file_name).unlink(missing_ok=True)
download_attempt += 1
if download_attempt > MAX_DOWNLOAD_ERROR:
raise RuntimeError(f"Could not download and verify {download_url}: more than {MAX_DOWNLOAD_ERROR} attempts.")
logger.info(f"Downloading {dl_file_name} using urllib, attempt {download_attempt}...")

with open(dl_file_name, "wb") as compressed_file:
# while there is data
while True:
# read a block of data
data = handle.read(1024)

# if nothing was read, we're done
if len(data) == 0:
break

# write out the data to the output file
compressed_file.write(data)

if decompress:
out_file_name = dl_file_name[:-3]

# create the output text file
with open(out_file_name, "w") as output_file:
# open the compressed file
with gzip.open(dl_file_name, "rt") as compressed_file:
for line in compressed_file:
# write the data to the output file
output_file.write(line)

# remove the compressed file
os.remove(dl_file_name)

download_verified = True
else:
out_file_name = dl_file_name

# Do we need to verify this gzip file?
download_verified = True
if verify_gzip:
# Is it blank/very small? If so, we immediately fail verification.
file_size = os.path.getsize(out_file_name)
if file_size < 1024:
logger.warning(f"Downloaded Gzip file {out_file_name} is too small ({file_size} bytes), skipping verification.")
download_verified = False
continue

# To verify a Gzip file, we need to read it entirely.
try:
with gzip.open(out_file_name, "rb") as f:
for _ in iter(lambda: f.read(1024 * 1024), b""):
pass
download_verified = True
except Exception as e:
logger.warning(f"Error while verifying downloaded Gzip file {out_file_name}: {e}")
download_verified = False

# return the filename to the caller
return out_file_name
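As a usage note (not part of the diff): with the new signature above, a caller that wants to keep a gzipped download on disk but still confirm it is readable might invoke the function as sketched below. The URL, filename, and subpath are placeholders, not values taken from this pull request.

from src.babel_utils import pull_via_urllib

# Placeholder URL, filename, and subpath for illustration only.
downloaded_path = pull_via_urllib(
    "https://example.org/downloads/",   # base URL (hypothetical)
    "some_table.tsv.gz",                # remote filename (hypothetical)
    decompress=False,                   # keep the .gz file as-is
    subpath="EXAMPLE",                  # save under <download_directory>/EXAMPLE
    verify_gzip=True,                   # re-read the gzip; a failure triggers a re-download
)
print(downloaded_path)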
@@ -538,11 +589,11 @@
possible_labels = map(lambda identifier: identifier.get("label", ""), node["identifiers"])

# Step 2. Filter out any suspicious labels.
filtered_possible_labels = [l for l in possible_labels if l] # Ignore blank or empty names.

[GitHub Actions check failure (Ruff E741) at src/babel_utils.py:592:51: Ambiguous variable name: `l`]

# Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at
# least one label shorter than this limit.
labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config["demote_labels_longer_than"]]

[GitHub Actions check failure (Ruff E741) at src/babel_utils.py:596:52: Ambiguous variable name: `l`]
if labels_shorter_than_limit:
filtered_possible_labels = labels_shorter_than_limit

@@ -731,7 +782,7 @@
shit_prefixes = set(["KEGG", "PUBCHEM"])
test_id = "xUBERON:0002262"
debugit = False
excised = set()

[GitHub Actions check failure (Ruff F841) at src/babel_utils.py:785:5: Local variable `excised` is assigned to but never used]
for xgroup in newgroups:
if isinstance(xgroup, frozenset):
group = set(xgroup)
@@ -751,7 +802,7 @@
existing_sets_w_x = [(conc_set[x], x) for x in group if x in conc_set]
# All of these sets are now going to be combined through the equivalence of our new set.
existing_sets = [es[0] for es in existing_sets_w_x]
x = [es[1] for es in existing_sets_w_x]

[GitHub Actions check failure (Ruff F841) at src/babel_utils.py:805:9: Local variable `x` is assigned to but never used]
newset = set().union(*existing_sets)
if debugit:
print("merges:", existing_sets)
@@ -779,7 +830,7 @@
for up in unique_prefixes:
if test_id in group:
print("up?", up)
idents = [e if type(e) == str else e.identifier for e in newset]

[GitHub Actions check failure (Ruff E721) at src/babel_utils.py:833:28: Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks]
if len(set([e for e in idents if (e.split(":")[0] == up)])) > 1:
bad += 1
setok = False
@@ -789,14 +840,14 @@
wrote.add(fs)
for gel in group:
if Text.get_prefix_or_none(gel) == pref:
killer = gel

[GitHub Actions check failure (Ruff F841) at src/babel_utils.py:843:25: Local variable `killer` is assigned to but never used]
# for preset in wrote:
# print(f'{killer}\t{set(group).intersection(preset)}\t{preset}\n')
# print('------------')
NPC = sum(1 for s in newset if s.startswith("PUBCHEM.COMPOUND:"))
if ("PUBCHEM.COMPOUND:3100" in newset) and (NPC > 3):
if debugit:
l = sorted(list(newset))

[GitHub Actions check failure (Ruff E741) at src/babel_utils.py:850:17: Ambiguous variable name: `l`]
print("bad")
for li in l:
print(li)
7 changes: 0 additions & 7 deletions src/cluster_config.yml

This file was deleted.
