5 changes: 4 additions & 1 deletion Snakefile
@@ -1,5 +1,6 @@
configfile: "config.yaml"


include: "src/snakefiles/datacollect.snakefile"
include: "src/snakefiles/anatomy.snakefile"
include: "src/snakefiles/cell_line.snakefile"
@@ -19,6 +20,7 @@ include: "src/snakefiles/duckdb.snakefile"
include: "src/snakefiles/reports.snakefile"
include: "src/snakefiles/exports.snakefile"


# Some general imports.
import shutil
from src.snakefiles.util import write_done
@@ -28,6 +30,7 @@ import os

os.environ["TMPDIR"] = config["tmp_directory"]


# Top-level rules.
rule all:
input:
@@ -43,7 +46,7 @@ rule all:
config["output_directory"] + "/kgx/done",
config["output_directory"] + "/sapbert-training-data/done",
# Store the config.yaml file used to produce the output.
config_file = "config.yaml",
config_file="config.yaml",
output:
x=config["output_directory"] + "/reports/all_done",
output_config_file=config["output_directory"] + "/config.yaml",
6 changes: 3 additions & 3 deletions input_data/parse_bad_mappings.py
@@ -1,18 +1,18 @@
from collections import defaultdict
from ast import literal_eval
from collections import defaultdict


def read_bad_hp_mappings(fn):
drops = defaultdict(set)
with open(fn, "r") as infile:
with open(fn) as infile:
for line in infile:
if line.startswith("-"):
continue
x = line.strip().split("\t")
hps = x[0]
commaindex = hps.index(",")
curie = hps[1:commaindex]
name = hps[commaindex + 1 : -1]
# name = hps[commaindex + 1 : -1]
badset = literal_eval(x[1])
drops[curie].update(badset)
return drops
22 changes: 20 additions & 2 deletions pyproject.toml
@@ -48,13 +48,31 @@ apybiomart = { git = "https://github.com/gaurav/apybiomart.git", rev = "change-c

[dependency-groups]
dev = [
"ruff>=0.14.9",
"snakefmt>=0.11.2",
]

# Linting/formatting configuration
[tool.ruff]
line-length = 160
line-length = 120

[tool.snakefmt]
line_length = 160
line_length = 120
include = '\.snakefile$|^Snakefile'

[tool.ruff.lint]
# Lint rule families to enforce alongside ruff's formatter
select = [
"E", # pycodestyle errors
"F", # pyflakes
"I", # isort (import sorting)
"UP", # pyupgrade
]

# Rules intentionally ignored
ignore = [
"E501", # let Ruff handle wrapping consistently
]

fixable = ["ALL"]
unfixable = []
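
The `select` list above drives most of the mechanical edits in this PR. As a minimal sketch (not code from the repository; the function names and file layout are invented for illustration), this is the kind of before/after rewrite the pyupgrade ("UP") and import-sorting ("I") families produce, matching the changes seen throughout this diff:

from typing import List, Tuple  # before: typing-module generics


def read_pairs_before(path) -> List[Tuple[str, str]]:
    # before: redundant "r" mode that pyupgrade removes
    with open(path, "r") as inf:
        return [(x[0], x[1]) for x in (line.rstrip("\n").split("\t") for line in inf)]


def read_pairs_after(path) -> list[tuple[str, str]]:
    # after: builtin generics and a bare open() call
    with open(path) as inf:
        return [(x[0], x[1]) for x in (line.rstrip("\n").split("\t") for line in inf)]

With fixable = ["ALL"], `ruff check --fix` applies these rewrites automatically, and ignoring E501 leaves line wrapping to the formatter's line-length = 120.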
4 changes: 2 additions & 2 deletions releases/summaries/compare.py
@@ -5,10 +5,10 @@
file1 = "2024mar24.json"
file2 = "2024jul13.json"

with open(file1, "r") as f:
with open(file1) as f:
summary1 = json.load(f)

with open(file2, "r") as f:
with open(file2) as f:
summary2 = json.load(f)


16 changes: 9 additions & 7 deletions src/assess_compendia.py
@@ -1,7 +1,9 @@
import os
from collections import defaultdict
from os import path

import jsonlines
from collections import defaultdict

from src.util import Text


@@ -11,7 +13,7 @@ def assess_completeness(input_dir, compendia, reportfile):
id_files = os.listdir(input_dir)
all_identifiers = set()
for idf in id_files:
with open(path.join(input_dir, idf), "r") as inf:
with open(path.join(input_dir, idf)) as inf:
for line in inf:
x = line.strip().split("\t")[0]
all_identifiers.add(x)
@@ -23,11 +25,11 @@ def assess_completeness(input_dir, compendia, reportfile):
for identifier in ids:
all_identifiers.discard(identifier)
with open(reportfile, "w") as outf:
l = list(all_identifiers)
l.sort()
print(f"Missing identifiers: {len(l)}\n")
outf.write(f"Missing identifiers: {len(l)}\n")
for missing_id in l:
list_all_identifiers = list(all_identifiers)
list_all_identifiers.sort()
print(f"Missing identifiers: {len(list_all_identifiers)}\n")
outf.write(f"Missing identifiers: {len(list_all_identifiers)}\n")
for missing_id in list_all_identifiers:
outf.write(f"{missing_id}\n")


55 changes: 25 additions & 30 deletions src/babel_utils.py
@@ -1,27 +1,26 @@
import gzip
import os
import sqlite3
import subprocess
import time
import traceback
import urllib
from collections import defaultdict
from datetime import datetime, timedelta
from enum import Enum
from ftplib import FTP
from io import BytesIO
import gzip
from datetime import timedelta
import time
from pathlib import Path

import requests
import os
import urllib
import jsonlines
import requests
from humanfriendly import format_timespan

from src.metadata.provenance import write_combined_metadata
from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory
from src.properties import PropertyList, HAS_ALTERNATIVE_ID
from src.util import Text, get_config, get_memory_usage_summary, get_logger
from src.LabeledID import LabeledID
from collections import defaultdict
import sqlite3
from typing import List, Tuple
from src.metadata.provenance import write_combined_metadata
from src.node import DescriptionFactory, InformationContentFactory, NodeFactory, SynonymFactory, TaxonFactory
from src.properties import HAS_ALTERNATIVE_ID, PropertyList
from src.util import Text, get_config, get_logger, get_memory_usage_summary

# Configuration items
WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES = 1_000_000
@@ -144,15 +143,15 @@ def __init__(self, delta_ms):
self.delta = timedelta(milliseconds=delta_ms)

def get(self, url):
now = dt.now()
now = datetime.now()
throttled = False
if self.last_time is not None:
cdelta = now - self.last_time
if cdelta < self.delta:
waittime = self.delta - cdelta
time.sleep(waittime.microseconds / 1e6)
throttled = True
self.last_time = dt.now()
self.last_time = datetime.now()
response = requests.get(url)
return response, throttled

@@ -194,7 +193,7 @@ def pull_via_urllib(url: str, in_file_name: str, decompress=True, subpath=None,
"""
# Everything goes in downloads
download_dir = get_config()["download_directory"]
working_dir = download_dir

# get the (local) download file name, derived from the input file name
if subpath is None:
@@ -589,11 +587,11 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non
possible_labels = map(lambda identifier: identifier.get("label", ""), node["identifiers"])

# Step 2. Filter out any suspicious labels.
filtered_possible_labels = [l for l in possible_labels if l] # Ignore blank or empty names.
filtered_possible_labels = [label for label in possible_labels if label] # Ignore blank or empty names.

# Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at
# least one label shorter than this limit.
labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config["demote_labels_longer_than"]]
labels_shorter_than_limit = [label for label in filtered_possible_labels if label and len(label) <= config["demote_labels_longer_than"]]
if labels_shorter_than_limit:
filtered_possible_labels = labels_shorter_than_limit
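
Read together, the two steps above form a small fallback filter: drop empty labels, then prefer labels under the configured length cap only when at least one such label exists. A self-contained sketch of the same logic (the labels and the length cap are invented for illustration):

demote_labels_longer_than = 20  # hypothetical stand-in for config["demote_labels_longer_than"]
possible_labels = ["", "heart", "anatomical structure of the cardiovascular system"]

filtered_possible_labels = [label for label in possible_labels if label]
labels_shorter_than_limit = [label for label in filtered_possible_labels if len(label) <= demote_labels_longer_than]
if labels_shorter_than_limit:
    filtered_possible_labels = labels_shorter_than_limit
# filtered_possible_labels is now ["heart"]; if every surviving label exceeded the cap,
# the longer labels would be kept rather than dropping everything.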

@@ -782,7 +780,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
shit_prefixes = set(["KEGG", "PUBCHEM"])
test_id = "xUBERON:0002262"
debugit = False
excised = set()
# excised = set()
for xgroup in newgroups:
if isinstance(xgroup, frozenset):
group = set(xgroup)
@@ -802,7 +800,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
existing_sets_w_x = [(conc_set[x], x) for x in group if x in conc_set]
# All of these sets are now going to be combined through the equivalence of our new set.
existing_sets = [es[0] for es in existing_sets_w_x]
x = [es[1] for es in existing_sets_w_x]
# x = [es[1] for es in existing_sets_w_x]
newset = set().union(*existing_sets)
if debugit:
print("merges:", existing_sets)
@@ -830,7 +828,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
for up in unique_prefixes:
if test_id in group:
print("up?", up)
idents = [e if type(e) == str else e.identifier for e in newset]
idents = [e if isinstance(e, str) else e.identifier for e in newset]
if len(set([e for e in idents if (e.split(":")[0] == up)])) > 1:
bad += 1
setok = False
@@ -840,18 +838,15 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
wrote.add(fs)
for gel in group:
if Text.get_prefix_or_none(gel) == pref:
killer = gel
# killer = gel
pass
# for preset in wrote:
# print(f'{killer}\t{set(group).intersection(preset)}\t{preset}\n')
# print('------------')
NPC = sum(1 for s in newset if s.startswith("PUBCHEM.COMPOUND:"))
if ("PUBCHEM.COMPOUND:3100" in newset) and (NPC > 3):
if debugit:
l = sorted(list(newset))
print("bad")
for li in l:
print(li)
exit()
raise ValueError(f"Debugging information: {sorted(list(newset))}")
if not setok:
# Our new group created a new set that merged stuff we didn't want to merge.
# Previously we did a lot of fooling around at this point. But now we're just going to say, I have a
@@ -894,7 +889,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
# Now check the 'close' dictionary to see if we've accidentally gotten to a close match becoming an exact match
setok = True
for cpref, closedict in close.items():
idents = set([e if type(e) == str else e.identifier for e in newset])
idents = set([e if isinstance(e, str) else e.identifier for e in newset])
prefidents = [e for e in idents if e.startswith(cpref)]
for pident in prefidents:
for cd in closedict[pident]:
@@ -978,7 +973,7 @@ def read_identifier_file(infile):
a hint to the normalizer about the proper biolink type for this entity."""
types = {}
identifiers = list()
with open(infile, "r") as inf:
with open(infile) as inf:
for line in inf:
x = line.strip().split("\t")
identifiers.append((x[0],))
Expand All @@ -987,7 +982,7 @@ def read_identifier_file(infile):
return identifiers, types


def remove_overused_xrefs(pairlist: List[Tuple], bothways: bool = False):
def remove_overused_xrefs(pairlist: list[tuple], bothways: bool = False):
"""Given a list of tuples (id1, id2) meaning id1-[xref]->id2, remove any id2 that are associated with more
than one id1. The idea is that if e.g. id1 is made up of UBERONS and 2 of those have an xref to say a UMLS
then it doesn't mean that all of those should be identified. We don't really know what it means, so remove it."""
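
The docstring above describes the contract, but the function body is collapsed in this view, so the following is only an illustration of the documented behaviour, with hypothetical CURIEs:

pairs = [
    ("UBERON:0000001", "UMLS:C0000001"),  # UMLS:C0000001 is claimed by two different UBERON ids,
    ("UBERON:0000002", "UMLS:C0000001"),  # so per the docstring neither of these pairs should survive
    ("UBERON:0000003", "UMLS:C0000002"),  # a uniquely claimed xref, expected to be kept
]
# remove_overused_xrefs(pairs) should therefore return [("UBERON:0000003", "UMLS:C0000002")];
# bothways=True presumably applies the same filter in the id2 -> id1 direction as well.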
18 changes: 9 additions & 9 deletions src/createcompendia/anatomy.py
@@ -1,16 +1,16 @@
from collections import defaultdict

import requests

import src.datahandlers.mesh as mesh
import src.datahandlers.obo as obo
import src.datahandlers.umls as umls
from src.babel_utils import get_prefixes, glom, read_identifier_file, remove_overused_xrefs, write_compendium
from src.categories import ANATOMICAL_ENTITY, CELL, CELLULAR_COMPONENT, GROSS_ANATOMICAL_STRUCTURE
from src.metadata.provenance import write_concord_metadata
from src.util import Text

from src.prefixes import MESH, NCIT, CL, GO, UBERON, SNOMEDCT, WIKIDATA, UMLS, FMA
from src.categories import ANATOMICAL_ENTITY, GROSS_ANATOMICAL_STRUCTURE, CELL, CELLULAR_COMPONENT
from src.prefixes import CL, FMA, GO, MESH, NCIT, SNOMEDCT, UBERON, UMLS, WIKIDATA
from src.ubergraph import build_sets
from src.babel_utils import write_compendium, glom, get_prefixes, read_identifier_file, remove_overused_xrefs
import src.datahandlers.umls as umls
import src.datahandlers.mesh as mesh
from src.util import Text


def remove_overused_xrefs_dict(kv):
@@ -146,7 +146,7 @@
pairs = []
for row in rows:
umls_curie = f"{UMLS}:{row['umls']['value']}"
wd_curie = f"{WIKIDATA}:{row['wd']['value']}"

[Check failure on line 149 in src/createcompendia/anatomy.py, from GitHub Actions / Check Python formatting with ruff]
Ruff (F841): src/createcompendia/anatomy.py:149:9: F841 Local variable `wd_curie` is assigned to but never used
cl_curie = Text.obo_to_curie(row["cl"]["value"])
pairs.append((umls_curie, cl_curie))
counts[umls_curie] += 1
@@ -190,7 +190,7 @@
# them added. So we want to limit concordances to terms that are already in the dicts. But that's ONLY for the
# UMLS concord. We trust the others to retrieve decent identifiers.
bs = frozenset([UMLS, GO])
with open(infile, "r") as inf:
with open(infile) as inf:
for line in inf:
x = line.strip().split("\t")
prefixes = frozenset([xi.split(":")[0] for xi in x[0:3:2]]) # leave out the predicate
Expand All @@ -202,7 +202,7 @@
use = False
if not use:
continue
pairs.append(([x[0], x[2]]))
pairs.append([x[0], x[2]])
newpairs = remove_overused_xrefs(pairs)
setpairs = [set(x) for x in newpairs]
glom(dicts, setpairs, unique_prefixes=[UBERON, GO])
3 changes: 1 addition & 2 deletions src/createcompendia/cell_line.py
@@ -1,7 +1,6 @@
from src.babel_utils import glom, read_identifier_file, write_compendium
from src.categories import CELL_LINE

from src.babel_utils import read_identifier_file, glom, write_compendium


def build_compendia(ifile, metadata_yamls, icrdf_filename):
""":identifiers: a list of files from which to read identifiers and optional categories"""