diff --git a/Snakefile b/Snakefile index 21527417..af6a18d4 100644 --- a/Snakefile +++ b/Snakefile @@ -1,5 +1,6 @@ configfile: "config.yaml" + include: "src/snakefiles/datacollect.snakefile" include: "src/snakefiles/anatomy.snakefile" include: "src/snakefiles/cell_line.snakefile" @@ -19,6 +20,7 @@ include: "src/snakefiles/duckdb.snakefile" include: "src/snakefiles/reports.snakefile" include: "src/snakefiles/exports.snakefile" + # Some general imports. import shutil from src.snakefiles.util import write_done @@ -28,6 +30,7 @@ import os os.environ["TMPDIR"] = config["tmp_directory"] + # Top-level rules. rule all: input: @@ -43,7 +46,7 @@ rule all: config["output_directory"] + "/kgx/done", config["output_directory"] + "/sapbert-training-data/done", # Store the config.yaml file used to produce the output. - config_file = "config.yaml", + config_file="config.yaml", output: x=config["output_directory"] + "/reports/all_done", output_config_file=config["output_directory"] + "/config.yaml", diff --git a/input_data/parse_bad_mappings.py b/input_data/parse_bad_mappings.py index 8705f924..0677b5ba 100644 --- a/input_data/parse_bad_mappings.py +++ b/input_data/parse_bad_mappings.py @@ -1,10 +1,10 @@ -from collections import defaultdict from ast import literal_eval +from collections import defaultdict def read_bad_hp_mappings(fn): drops = defaultdict(set) - with open(fn, "r") as infile: + with open(fn) as infile: for line in infile: if line.startswith("-"): continue @@ -12,7 +12,7 @@ def read_bad_hp_mappings(fn): hps = x[0] commaindex = hps.index(",") curie = hps[1:commaindex] - name = hps[commaindex + 1 : -1] + # name = hps[commaindex + 1 : -1] badset = literal_eval(x[1]) drops[curie].update(badset) return drops diff --git a/pyproject.toml b/pyproject.toml index e5daef09..b8dfcd32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,13 +48,31 @@ apybiomart = { git = "https://github.com/gaurav/apybiomart.git", rev = "change-c [dependency-groups] dev = [ + "ruff>=0.14.9", "snakefmt>=0.11.2", ] # Linting/formatting configuration [tool.ruff] -line-length = 160 +line-length = 120 [tool.snakefmt] -line_length = 160 +line_length = 120 include = '\.snakefile$|^Snakefile' + +[tool.ruff.lint] +# Enable all rules that ruff format would normally apply +select = [ + "E", # pycodestyle errors + "F", # pyflakes + "I", # isort (import sorting) + "UP", # pyupgrade +] + +# Optional but common +ignore = [ + "E501", # let Ruff handle wrapping consistently +] + +fixable = ["ALL"] +unfixable = [] diff --git a/releases/summaries/compare.py b/releases/summaries/compare.py index 6223a668..59e7cf6f 100644 --- a/releases/summaries/compare.py +++ b/releases/summaries/compare.py @@ -5,10 +5,10 @@ file1 = "2024mar24.json" file2 = "2024jul13.json" -with open(file1, "r") as f: +with open(file1) as f: summary1 = json.load(f) -with open(file2, "r") as f: +with open(file2) as f: summary2 = json.load(f) diff --git a/src/assess_compendia.py b/src/assess_compendia.py index b15452a9..90ae4df4 100644 --- a/src/assess_compendia.py +++ b/src/assess_compendia.py @@ -1,7 +1,9 @@ import os +from collections import defaultdict from os import path + import jsonlines -from collections import defaultdict + from src.util import Text @@ -11,7 +13,7 @@ def assess_completeness(input_dir, compendia, reportfile): id_files = os.listdir(input_dir) all_identifiers = set() for idf in id_files: - with open(path.join(input_dir, idf), "r") as inf: + with open(path.join(input_dir, idf)) as inf: for line in inf: x = line.strip().split("\t")[0] 
all_identifiers.add(x) @@ -23,11 +25,11 @@ def assess_completeness(input_dir, compendia, reportfile): for identifier in ids: all_identifiers.discard(identifier) with open(reportfile, "w") as outf: - l = list(all_identifiers) - l.sort() - print(f"Missing identifiers: {len(l)}\n") - outf.write(f"Missing identifiers: {len(l)}\n") - for missing_id in l: + list_all_identifiers = list(all_identifiers) + list_all_identifiers.sort() + print(f"Missing identifiers: {len(list_all_identifiers)}\n") + outf.write(f"Missing identifiers: {len(list_all_identifiers)}\n") + for missing_id in list_all_identifiers: outf.write(f"{missing_id}\n") diff --git a/src/babel_utils.py b/src/babel_utils.py index ccd1420b..19981327 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -1,27 +1,26 @@ +import gzip +import os +import sqlite3 import subprocess +import time import traceback +import urllib +from collections import defaultdict +from datetime import datetime, timedelta from enum import Enum from ftplib import FTP from io import BytesIO -import gzip -from datetime import timedelta -import time from pathlib import Path -import requests -import os -import urllib import jsonlines +import requests from humanfriendly import format_timespan -from src.metadata.provenance import write_combined_metadata -from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory -from src.properties import PropertyList, HAS_ALTERNATIVE_ID -from src.util import Text, get_config, get_memory_usage_summary, get_logger from src.LabeledID import LabeledID -from collections import defaultdict -import sqlite3 -from typing import List, Tuple +from src.metadata.provenance import write_combined_metadata +from src.node import DescriptionFactory, InformationContentFactory, NodeFactory, SynonymFactory, TaxonFactory +from src.properties import HAS_ALTERNATIVE_ID, PropertyList +from src.util import Text, get_config, get_logger, get_memory_usage_summary # Configuration items WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES = 1_000_000 @@ -144,7 +143,7 @@ def __init__(self, delta_ms): self.delta = timedelta(milliseconds=delta_ms) def get(self, url): - now = dt.now() + now = datetime.now() throttled = False if self.last_time is not None: cdelta = now - self.last_time @@ -152,7 +151,7 @@ def get(self, url): waittime = self.delta - cdelta time.sleep(waittime.microseconds / 1e6) throttled = True - self.last_time = dt.now() + self.last_time = datetime.now() response = requests.get(url) return response, throttled @@ -194,7 +193,6 @@ def pull_via_urllib(url: str, in_file_name: str, decompress=True, subpath=None, """ # Everything goes in downloads download_dir = get_config()["download_directory"] - working_dir = download_dir # get the (local) download file name, derived from the input file name if subpath is None: @@ -589,11 +587,11 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non possible_labels = map(lambda identifier: identifier.get("label", ""), node["identifiers"]) # Step 2. Filter out any suspicious labels. - filtered_possible_labels = [l for l in possible_labels if l] # Ignore blank or empty names. + filtered_possible_labels = [label for label in possible_labels if label] # Ignore blank or empty names. # Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at # least one label shorter than this limit. 
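Most of the mechanical churn in this patch follows from the [tool.ruff.lint] selection added to pyproject.toml above: E (pycodestyle), F (pyflakes), I (import sorting, which accounts for the large import-reordering hunks), and UP (pyupgrade). These are lint rules applied by `ruff check --fix`, separate from the `ruff format`/`snakefmt` line-length change to 120. As a rough illustration of what the non-import rules rewrite, here is a hypothetical before/after sketch; the function names and the concord-style TSV layout are invented for the example, and the rule codes in the comments are the standard ruff codes for these fixes, not anything taken from this repository.

    # Before: the kind of code this patch mechanically cleans up.
    from typing import List, Tuple  # UP035: deprecated typing imports


    def read_pairs_old(path: str) -> List[Tuple[str, str]]:  # UP006: use list/tuple
        l = []  # E741: ambiguous variable name
        with open(path, "r") as inf:  # UP015: "r" is already the default mode
            for line in inf:
                x = line.strip().split("\t")
                l.append((x[0], x[2]))
        return l


    # After: roughly what `ruff check --fix` followed by `ruff format` leaves behind.
    def read_pairs(path: str) -> list[tuple[str, str]]:
        pairs = []
        with open(path) as inf:
            for line in inf:
                x = line.strip().split("\t")
                pairs.append((x[0], x[2]))
        return pairs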
- labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config["demote_labels_longer_than"]] + labels_shorter_than_limit = [label for label in filtered_possible_labels if label and len(label) <= config["demote_labels_longer_than"]] if labels_shorter_than_limit: filtered_possible_labels = labels_shorter_than_limit @@ -782,7 +780,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={}) shit_prefixes = set(["KEGG", "PUBCHEM"]) test_id = "xUBERON:0002262" debugit = False - excised = set() + # excised = set() for xgroup in newgroups: if isinstance(xgroup, frozenset): group = set(xgroup) @@ -802,7 +800,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={}) existing_sets_w_x = [(conc_set[x], x) for x in group if x in conc_set] # All of these sets are now going to be combined through the equivalence of our new set. existing_sets = [es[0] for es in existing_sets_w_x] - x = [es[1] for es in existing_sets_w_x] + # x = [es[1] for es in existing_sets_w_x] newset = set().union(*existing_sets) if debugit: print("merges:", existing_sets) @@ -830,7 +828,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={}) for up in unique_prefixes: if test_id in group: print("up?", up) - idents = [e if type(e) == str else e.identifier for e in newset] + idents = [e if isinstance(e, str) else e.identifier for e in newset] if len(set([e for e in idents if (e.split(":")[0] == up)])) > 1: bad += 1 setok = False @@ -840,18 +838,15 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={}) wrote.add(fs) for gel in group: if Text.get_prefix_or_none(gel) == pref: - killer = gel + # killer = gel + pass # for preset in wrote: # print(f'{killer}\t{set(group).intersection(preset)}\t{preset}\n') # print('------------') NPC = sum(1 for s in newset if s.startswith("PUBCHEM.COMPOUND:")) if ("PUBCHEM.COMPOUND:3100" in newset) and (NPC > 3): if debugit: - l = sorted(list(newset)) - print("bad") - for li in l: - print(li) - exit() + raise ValueError(f"Debugging information: {sorted(list(newset))}") if not setok: # Our new group created a new set that merged stuff we didn't want to merge. # Previously we did a lot of fooling around at this point. But now we're just going to say, I have a @@ -894,7 +889,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={}) # Now check the 'close' dictionary to see if we've accidentally gotten to a close match becoming an exact match setok = True for cpref, closedict in close.items(): - idents = set([e if type(e) == str else e.identifier for e in newset]) + idents = set([e if isinstance(e, str) else e.identifier for e in newset]) prefidents = [e for e in idents if e.startswith(cpref)] for pident in prefidents: for cd in closedict[pident]: @@ -978,7 +973,7 @@ def read_identifier_file(infile): a hint to the normalizer about the proper biolink type for this entity.""" types = {} identifiers = list() - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: x = line.strip().split("\t") identifiers.append((x[0],)) @@ -987,7 +982,7 @@ def read_identifier_file(infile): return identifiers, types -def remove_overused_xrefs(pairlist: List[Tuple], bothways: bool = False): +def remove_overused_xrefs(pairlist: list[tuple], bothways: bool = False): """Given a list of tuples (id1, id2) meaning id1-[xref]->id2, remove any id2 that are associated with more than one id1. The idea is that if e.g. 
id1 is made up of UBERONS and 2 of those have an xref to say a UMLS then it doesn't mean that all of those should be identified. We don't really know what it means, so remove it.""" diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index e8dcf7a0..0c043020 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -1,16 +1,16 @@ from collections import defaultdict + import requests +import src.datahandlers.mesh as mesh import src.datahandlers.obo as obo +import src.datahandlers.umls as umls +from src.babel_utils import get_prefixes, glom, read_identifier_file, remove_overused_xrefs, write_compendium +from src.categories import ANATOMICAL_ENTITY, CELL, CELLULAR_COMPONENT, GROSS_ANATOMICAL_STRUCTURE from src.metadata.provenance import write_concord_metadata -from src.util import Text - -from src.prefixes import MESH, NCIT, CL, GO, UBERON, SNOMEDCT, WIKIDATA, UMLS, FMA -from src.categories import ANATOMICAL_ENTITY, GROSS_ANATOMICAL_STRUCTURE, CELL, CELLULAR_COMPONENT +from src.prefixes import CL, FMA, GO, MESH, NCIT, SNOMEDCT, UBERON, UMLS, WIKIDATA from src.ubergraph import build_sets -from src.babel_utils import write_compendium, glom, get_prefixes, read_identifier_file, remove_overused_xrefs -import src.datahandlers.umls as umls -import src.datahandlers.mesh as mesh +from src.util import Text def remove_overused_xrefs_dict(kv): @@ -190,7 +190,7 @@ def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): # them added. So we want to limit concordances to terms that are already in the dicts. But that's ONLY for the # UMLS concord. We trust the others to retrieve decent identifiers. bs = frozenset([UMLS, GO]) - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: x = line.strip().split("\t") prefixes = frozenset([xi.split(":")[0] for xi in x[0:3:2]]) # leave out the predicate @@ -202,7 +202,7 @@ def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): use = False if not use: continue - pairs.append(([x[0], x[2]])) + pairs.append([x[0], x[2]]) newpairs = remove_overused_xrefs(pairs) setpairs = [set(x) for x in newpairs] glom(dicts, setpairs, unique_prefixes=[UBERON, GO]) diff --git a/src/createcompendia/cell_line.py b/src/createcompendia/cell_line.py index e4e55a5a..67782c32 100644 --- a/src/createcompendia/cell_line.py +++ b/src/createcompendia/cell_line.py @@ -1,7 +1,6 @@ +from src.babel_utils import glom, read_identifier_file, write_compendium from src.categories import CELL_LINE -from src.babel_utils import read_identifier_file, glom, write_compendium - def build_compendia(ifile, metadata_yamls, icrdf_filename): """:identifiers: a list of files from which to read identifiers and optional categories""" diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index f55ead18..628ec0d9 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -1,3 +1,5 @@ +import ast +import gzip import logging import os from collections import defaultdict @@ -5,22 +7,39 @@ import jsonlines import requests -import ast -import gzip - -from src.properties import Property, HAS_ALTERNATIVE_ID -from src.metadata.provenance import write_concord_metadata, write_combined_metadata -from src.ubergraph import UberGraph -from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND, GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI -from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, 
POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, DRUG -from src.sdfreader import read_sdf - -from src.datahandlers.unichem import data_sources as unichem_data_sources -from src.babel_utils import write_compendium, glom, get_prefixes, read_identifier_file, remove_overused_xrefs import src.datahandlers.mesh as mesh import src.datahandlers.umls as umls -from src.util import get_memory_usage_summary, Text, get_logger +from src.babel_utils import get_prefixes, glom, read_identifier_file, remove_overused_xrefs, write_compendium +from src.categories import ( + CHEMICAL_ENTITY, + CHEMICAL_MIXTURE, + COMPLEX_MOLECULAR_MIXTURE, + DRUG, + MOLECULAR_MIXTURE, + POLYPEPTIDE, + SMALL_MOLECULE, +) +from src.datahandlers.unichem import data_sources as unichem_data_sources +from src.metadata.provenance import write_combined_metadata, write_concord_metadata +from src.prefixes import ( + CHEBI, + CHEMBLCOMPOUND, + DRUGBANK, + DRUGCENTRAL, + GTOPDB, + INCHIKEY, + KEGGCOMPOUND, + MESH, + PUBCHEMCOMPOUND, + RXCUI, + UMLS, + UNII, +) +from src.properties import HAS_ALTERNATIVE_ID, Property +from src.sdfreader import read_sdf +from src.ubergraph import UberGraph +from src.util import Text, get_logger, get_memory_usage_summary logger = get_logger(__name__) @@ -96,7 +115,7 @@ def build_chemical_rxnorm_relationships(conso, idfile, outfile, metadata_yaml): def write_pubchem_ids(labelfile, smilesfile, outfile): # Trying to be memory efficient here. We could just ingest the whole smilesfile which would make this code easier # but since they're already sorted, let's give it a shot - with open(labelfile, "r") as inlabels, gzip.open(smilesfile, "rt", encoding="utf-8") as insmiles, open(outfile, "w") as outf: + with open(labelfile) as inlabels, gzip.open(smilesfile, "rt", encoding="utf-8") as insmiles, open(outfile, "w") as outf: sn = -1 flag_file_ended = False for labelline in inlabels: @@ -196,7 +215,7 @@ def write_chebi_ids(outfile): def write_unii_ids(infile, outfile): """UNII contains a bunch of junk like leaves. We are going to try to clean it a bit to get things that are actually chemicals. 
In biolink 2.0 we cn revisit exactly what happens here.""" - with open(infile, "r", encoding="windows-1252") as inf, open(outfile, "w") as outf: + with open(infile, encoding="windows-1252") as inf, open(outfile, "w") as outf: h = inf.readline().strip().split("\t") bad_cols = ["NCBI", "PLANTS", "GRIN", "MPNS"] bad_colnos = [h.index(bc) for bc in bad_cols] @@ -220,7 +239,7 @@ def write_drugbank_ids(infile, outfile): drugbank_id = "2" assert unichem_data_sources[drugbank_id] == DRUGBANK written = set() - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: header_line = inf.readline() assert header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {infile}: {header_line}" for line in inf: @@ -235,11 +254,11 @@ def write_drugbank_ids(infile, outfile): def write_chemical_ids_from_labels_and_smiles(labelfile, smifile, outfile): smiles = {} - with open(smifile, "r") as inf: + with open(smifile) as inf: for line in inf: x = line.strip().split("\t") smiles[x[0]] = x[1] - with open(labelfile, "r") as inf, open(outfile, "w") as outf: + with open(labelfile) as inf, open(outfile, "w") as outf: for line in inf: hmdbid = line.split("\t")[0] if hmdbid in smiles: @@ -252,7 +271,7 @@ def write_chemical_ids_from_labels_and_smiles(labelfile, smifile, outfile): def parse_smifile(infile, outfile, smicol, idcol, pref, stripquotes=False): idcol_index = None smicol_index = None - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: for line in inf: if line.startswith('"# GtoPdb Version'): # Version line! Skip. @@ -318,7 +337,7 @@ def parse_smifile(infile, outfile, smicol, idcol, pref, stripquotes=False): def write_drugcentral_ids(infile, outfile): smicol = 1 idcol = 0 - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: for line in inf: x = line.strip().split("\t") if x[smicol] == "None": @@ -338,7 +357,7 @@ def write_unichem_concords(structfile, reffile, outdir): concname = f"{outdir}/UNICHEM_{name}" print(concname) concfiles[num] = open(concname, "w") - with open(reffile, "rt") as inf: + with open(reffile) as inf: header_line = inf.readline() assert header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {reffile}: {header_line}" for line in inf: @@ -380,10 +399,10 @@ def combine_unichem(concordances, output): # but out of paranoia we'll double-check that. prefixes_in_file = set() - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: x = line.strip().split("\t") - pairs.append(([x[0], x[2]])) + pairs.append([x[0], x[2]]) # Get the prefix from the first row to determine if we need to remove overused xrefs prefixes_in_file.add(Text.get_prefix(x[0])) @@ -428,7 +447,7 @@ def is_cas(thing): def make_pubchem_cas_concord(pubchemsynonyms, outfile, metadata_yaml): - with open(pubchemsynonyms, "r") as inf, open(outfile, "w") as outf: + with open(pubchemsynonyms) as inf, open(outfile, "w") as outf: for line in inf: x = line.strip().split("\t") if is_cas(x[1]): @@ -449,7 +468,7 @@ def make_pubchem_mesh_concord(pubcheminput, meshlabels, outfile, metadata_yaml): # MESH:D014867 Water # MESH:M0022883 Water # but we only want the ones that are MESH:D... or MESH:C.... 
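The comment above is the crux of the PubChem-to-MeSH concord: the MeSH label file mixes descriptor (D...) and supplementary-record (C...) identifiers with concept-level M... identifiers that carry the same label, and only the first two kinds are wanted. A minimal standalone sketch of that filter, using the two CURIEs quoted in the comment; the function name is made up for the example.

    def is_descriptor_or_supplementary(mesh_curie: str) -> bool:
        # Keep MESH:D... and MESH:C..., drop concept-level MESH:M... identifiers.
        return mesh_curie.split(":")[-1][0] in ("C", "D")


    examples = ["MESH:D014867", "MESH:M0022883"]
    print([curie for curie in examples if is_descriptor_or_supplementary(curie)])
    # prints ['MESH:D014867']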
- with open(meshlabels, "r") as inf: + with open(meshlabels) as inf: for line in inf: x = line.strip().split("\t") if x[0].split(":")[-1][0] in ["C", "D"]: @@ -458,7 +477,7 @@ def make_pubchem_mesh_concord(pubcheminput, meshlabels, outfile, metadata_yaml): # first mapping is the 'best' i.e. the one most frequently reported. # We will only use the first one used_pubchem = set() - with open(pubcheminput, "r") as inf, open(outfile, "w") as outf: + with open(pubcheminput) as inf, open(outfile, "w") as outf: for line in inf: x = line.strip().split("\t") # x[0] = puchemid (no prefix), x[1] = mesh label if x[0] in used_pubchem: @@ -494,7 +513,7 @@ def build_drugcentral_relations(infile, outfile, metadata_yaml): external_id_col = 1 external_ns_col = 2 drugcentral_id_col = 3 - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: for line in inf: parts = line.strip().split("\t") # print(parts) @@ -515,7 +534,7 @@ def build_drugcentral_relations(infile, outfile, metadata_yaml): def make_gtopdb_relations(infile, outfile, metadata_yaml): - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: h = inf.readline() # We might have a header/version line. If so, skip to the next line. if h.startswith('"# GtoPdb Version'): @@ -551,7 +570,7 @@ def make_chebi_relations(sdf, dbx, outfile, propfile_gz, metadata_yaml): # CHEBIs in the sdf by definition have structure (the sdf is a structure file) structured_chebi = set(chebi_sdf_dat.keys()) # READ xrefs - with open(dbx, "r") as inf: + with open(dbx) as inf: dbxdata = inf.read() kk = "keggcompounddatabaselinks" pk = "pubchemdatabaselinks" @@ -613,7 +632,7 @@ def make_chebi_relations(sdf, dbx, outfile, propfile_gz, metadata_yaml): def get_mesh_relationships(mesh_id_file, cas_out, unii_out, cas_metadata, unii_metadata): meshes = set() - with open(mesh_id_file, "r") as inf: + with open(mesh_id_file) as inf: for line in inf: x = line.split("\t") meshes.add(x[0]) @@ -707,10 +726,10 @@ def build_untyped_compendia(concordances, identifiers, unichem_partial, untyped_ print(infile) print("loading", infile) pairs = [] - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: x = line.strip().split("\t") - pairs.append(([x[0], x[2]])) + pairs.append([x[0], x[2]]) p = False if DRUGCENTRAL in [n.split(":")[0] for n in pairs[0]]: p = True @@ -753,14 +772,14 @@ def build_untyped_compendia(concordances, identifiers, unichem_partial, untyped_ def build_compendia(type_file, untyped_compendia_file, properties_jsonl_gz_files, metadata_yamls, icrdf_filename): types = {} - with open(type_file, "r") as inf: + with open(type_file) as inf: for line in inf: x = line.strip().split("\t") types[x[0]] = x[1] logger.info(f"Loaded {len(types)} types from {type_file}: {get_memory_usage_summary()}") untyped_sets = set() - with open(untyped_compendia_file, "r") as inf: + with open(untyped_compendia_file) as inf: for line in inf: s = ast.literal_eval(line.strip()) untyped_sets.add(frozenset(s)) diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py index e5aaa661..a2680041 100644 --- a/src/createcompendia/diseasephenotype.py +++ b/src/createcompendia/diseasephenotype.py @@ -1,18 +1,16 @@ -from os import path from collections import defaultdict +from os import path +import src.datahandlers.doid as doid +import src.datahandlers.efo as efo +import src.datahandlers.mesh as mesh import src.datahandlers.obo as obo -from 
src.metadata.provenance import write_concord_metadata - -from src.prefixes import MESH, NCIT, MONDO, OMIM, HP, SNOMEDCT, MEDDRA, ORPHANET, ICD0, ICD9, ICD10, UMLS, KEGGDISEASE +import src.datahandlers.umls as umls +from src.babel_utils import get_prefixes, glom, read_identifier_file, remove_overused_xrefs, write_compendium from src.categories import DISEASE, PHENOTYPIC_FEATURE +from src.metadata.provenance import write_concord_metadata +from src.prefixes import HP, ICD0, ICD9, ICD10, KEGGDISEASE, MEDDRA, MESH, MONDO, NCIT, OMIM, ORPHANET, SNOMEDCT, UMLS from src.ubergraph import build_sets -import src.datahandlers.umls as umls -import src.datahandlers.doid as doid -import src.datahandlers.mesh as mesh -import src.datahandlers.efo as efo - -from src.babel_utils import read_identifier_file, glom, remove_overused_xrefs, get_prefixes, write_compendium def write_obo_ids(irisandtypes, outfile, exclude=[]): @@ -47,7 +45,7 @@ def write_hp_ids(outfile): def write_omim_ids(infile, outfile): - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: for line in inf: if line.startswith("#"): continue @@ -89,7 +87,7 @@ def write_mesh_ids(outfile): def write_umls_ids(mrsty, outfile, badumlsfile): badumls = set() - with open(badumlsfile, "r") as inf: + with open(badumlsfile) as inf: for line in inf: if line.startswith("#"): continue @@ -181,7 +179,7 @@ def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfil good_ids = {} for prefix, prefixidfile in [(OMIM, omimfile), (NCIT, ncitfile)]: good_ids[prefix] = set() - with open(prefixidfile, "r") as inf: + with open(prefixidfile) as inf: for line in inf: x = line.split()[0] good_ids[prefix].add(x) @@ -229,7 +227,7 @@ def build_compendium(concordances, metadata_yamls, identifiers, mondoclose, badx glom(dicts, new_identifiers, unique_prefixes=[MONDO, HP]) types.update(new_types) # Load close Mondos - with open(mondoclose, "r") as inf: + with open(mondoclose) as inf: close_mondos = defaultdict(set) for line in inf: x = tuple(line.strip().split("\t")) @@ -245,7 +243,7 @@ def build_compendium(concordances, metadata_yamls, identifiers, mondoclose, badx else: print("no bad pairs", pref) bad_pairs = set() - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: stuff = line.strip().split("\t") if len(stuff) != 3: @@ -316,7 +314,7 @@ def create_typed_sets(eqsets, types): def read_badxrefs(fn): morebad = set() - with open(fn, "r") as inf: + with open(fn) as inf: for line in inf: if line.startswith("#"): continue diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index daa39f2c..dabf65ed 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -1,10 +1,15 @@ import csv +import json +import logging import sys import time +from collections import defaultdict import jsonlines from humanfriendly import format_timespan +from src.babel_utils import get_numerical_curie_suffix, glom + # from src.categories import ( # SMALL_MOLECULE, # POLYPEPTIDE, @@ -23,13 +28,8 @@ from src.categories import CHEMICAL_ENTITY from src.metadata.provenance import write_combined_metadata, write_concord_metadata from src.node import InformationContentFactory -from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS -from src.babel_utils import glom, get_numerical_curie_suffix -from collections import defaultdict -import json - -import logging -from src.util import LoggingUtil, get_config, get_memory_usage_summary, 
get_biolink_model_toolkit, Text +from src.prefixes import PUBCHEMCOMPOUND, RXCUI, UMLS +from src.util import LoggingUtil, Text, get_biolink_model_toolkit, get_config, get_memory_usage_summary logger = LoggingUtil.init_logging(__name__, level=logging.INFO) @@ -138,7 +138,7 @@ def get_aui_to_cui(consofile): aui_to_cui = {} sdui_to_cui = defaultdict(set) # consofile = os.path.join('input_data', 'private', "RXNCONSO.RRF") - with open(consofile, "r") as inf: + with open(consofile) as inf: for line in inf: x = line.strip().split("|") aui = x[7] @@ -229,7 +229,7 @@ def build_rxnorm_relationships(conso, relfile, outfile, metadata_yaml): one_to_one_relations = {} # one_to_one_relations = {"has_tradename": {"subject": defaultdict(set), # "object": defaultdict(set)}} - with open(relfile, "r") as inf, open(outfile, "w") as outf: + with open(relfile) as inf, open(outfile, "w") as outf: for line in inf: x = line.strip().split("|") # UMLS always has the CUI in it, while RXNORM does not. @@ -274,7 +274,7 @@ def build_rxnorm_relationships(conso, relfile, outfile, metadata_yaml): def load_cliques_containing_rxcui(compendium): rx_to_clique = {} - with open(compendium, "r") as infile: + with open(compendium) as infile: for line in infile: if RXCUI not in line: continue @@ -287,7 +287,7 @@ def load_cliques_containing_rxcui(compendium): def build_pubchem_relationships(infile, outfile, metadata_yaml): - with open(infile, "r") as inf: + with open(infile) as inf: document = json.load(inf) with open(outfile, "w") as outf: for annotation in document["Annotations"]["Annotation"]: @@ -341,7 +341,7 @@ def build_conflation( manual_concords_curies = set() manual_concords_predicate_counts = defaultdict(int) manual_concords_curie_prefix_counts = defaultdict(int) - with open(manual_concord_filename, "r") as manualf: + with open(manual_concord_filename) as manualf: csv_reader = csv.DictReader(manualf, dialect=csv.excel_tab) for row in csv_reader: # We're only interested in two fields, so you can add additional files ('comment', 'notes', etc.) as needed. @@ -364,7 +364,7 @@ def build_conflation( type_for_preferred_curie = {} clique_for_preferred_curie = {} for chemical_compendium in chemical_compendia: - with open(chemical_compendium, "r") as compendiumf: + with open(chemical_compendium) as compendiumf: logger.info(f"Loading {chemical_compendium}: {get_memory_usage_summary()}") for line in compendiumf: clique = json.loads(line) @@ -388,7 +388,7 @@ def build_conflation( pairs = [] for concfile in [rxn_concord, umls_concord]: - with open(concfile, "r") as infile: + with open(concfile) as infile: for line in infile: x = line.strip().split("\t") subject = x[0] @@ -417,15 +417,17 @@ def build_conflation( pairs.extend(manual_concords) # We've had some issues with non-chemical types getting conflated, so we filter those out here. 
- biolink_model_toolkit = get_biolink_model_toolkit(config['biolink_version']) - biolink_chemical_types = set(biolink_model_toolkit.get_descendants( - CHEMICAL_ENTITY, - reflexive=True, - formatted=True, - mixin=True, - )) + biolink_model_toolkit = get_biolink_model_toolkit(config["biolink_version"]) + biolink_chemical_types = set( + biolink_model_toolkit.get_descendants( + CHEMICAL_ENTITY, + reflexive=True, + formatted=True, + mixin=True, + ) + ) logging.info(f"Filtering RxCUI pairs to those in these Biolink chemical types: {sorted(biolink_chemical_types)}") - with open(pubchem_rxn_concord, "r") as infile: + with open(pubchem_rxn_concord) as infile: for line in infile: x = line.strip().split("\t") subject = x[0] @@ -493,9 +495,9 @@ def build_conflation( # # So, instead, I'm going to group them by prefix and then to sort it using the ChemicalEntity # prefix sort order. - biolink_model_toolkit = get_biolink_model_toolkit(config['biolink_version']) + biolink_model_toolkit = get_biolink_model_toolkit(config["biolink_version"]) biolink_chemical_entity = biolink_model_toolkit.get_element(CHEMICAL_ENTITY) - conflation_prefix_order = biolink_chemical_entity['id_prefixes'] + conflation_prefix_order = biolink_chemical_entity["id_prefixes"] if not conflation_prefix_order: raise RuntimeError(f"Biolink model {config['biolink_version']} doesn't have a ChemicalEntity prefix order: {biolink_chemical_entity}") @@ -628,9 +630,11 @@ def build_conflation( # The final conflation list won't match the initial list only if some of the Biolink types weren't # chemical types, and so were skipped that way. if set(final_conflation_id_list) != set(normalized_conflation_id_list): - logger.warning("Final conflation ID list does not match the normalized conflation ID list:\n" + - f" - Final conflation ID list: {sorted(final_conflation_id_list)}\n" + - f" - Normalized conflation ID list: {sorted(normalized_conflation_id_list)}") + logger.warning( + "Final conflation ID list does not match the normalized conflation ID list:\n" + + f" - Final conflation ID list: {sorted(final_conflation_id_list)}\n" + + f" - Normalized conflation ID list: {sorted(normalized_conflation_id_list)}" + ) # Write out all the identifiers. 
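The build_conflation() changes above keep the behaviour where the final DrugChemical conflation member list is ordered by the ChemicalEntity id_prefixes list obtained from the Biolink model toolkit. A minimal sketch of that ordering idea; the shortened prefix list and the CURIEs below are placeholders for illustration, not the real Biolink prefix order or a real conflation.

    # Sort CURIEs by where their prefix falls in a preferred-prefix list,
    # pushing unknown prefixes to the end.
    preferred_prefixes = ["PUBCHEM.COMPOUND", "CHEBI", "UNII", "RXCUI", "UMLS"]


    def prefix_rank(curie: str) -> tuple[int, str]:
        prefix = curie.split(":", 1)[0]
        rank = preferred_prefixes.index(prefix) if prefix in preferred_prefixes else len(preferred_prefixes)
        return (rank, curie)


    members = ["UMLS:C0012345", "RXCUI:99999", "PUBCHEM.COMPOUND:11111"]
    print(sorted(members, key=prefix_rank))
    # ['PUBCHEM.COMPOUND:11111', 'RXCUI:99999', 'UMLS:C0012345']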
logger.info(f"Ordered DrugChemical conflation {final_conflation_id_list} with IC values {clique_ics}.") diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py index 88bf4a66..d8a2265f 100644 --- a/src/createcompendia/gene.py +++ b/src/createcompendia/gene.py @@ -1,18 +1,14 @@ +import gzip +import json +import logging +import os import re -from src.metadata.provenance import write_concord_metadata -from src.prefixes import OMIM, ENSEMBL, NCBIGENE, WORMBASE, MGI, ZFIN, DICTYBASE, FLYBASE, RGD, SGD, HGNC, UMLS -from src.categories import GENE - import src.datahandlers.umls as umls - -from src.babel_utils import read_identifier_file, glom, write_compendium - -import os -import json -import gzip - -import logging +from src.babel_utils import glom, read_identifier_file, write_compendium +from src.categories import GENE +from src.metadata.provenance import write_concord_metadata +from src.prefixes import DICTYBASE, ENSEMBL, FLYBASE, HGNC, MGI, NCBIGENE, OMIM, RGD, SGD, UMLS, WORMBASE, ZFIN from src.util import LoggingUtil logger = LoggingUtil.init_logging(__name__, level=logging.ERROR) @@ -20,7 +16,7 @@ def write_mods_ids(dd, id, modlist): for mod in modlist: - with open(f"{dd}/{mod}/labels", "r") as inf, open(f"{id}/gene/ids/{mod}", "w") as outf: + with open(f"{dd}/{mod}/labels") as inf, open(f"{id}/gene/ids/{mod}", "w") as outf: for line in inf: x = line.split("\t")[0] outf.write(f"{x}\n") @@ -48,7 +44,7 @@ def build_gene_ensembl_relationships(ensembl_dir, outfile, metadata_yaml): infname = os.path.join(dlpath, "BioMart.tsv") if os.path.exists(infname): # open each ensembl file, find the id column, and put it in the output - with open(infname, "r") as inf: + with open(infname) as inf: wrote = set() h = inf.readline() x = h[:-1].split("\t") @@ -95,7 +91,7 @@ def build_gene_ensembl_relationships(ensembl_dir, outfile, metadata_yaml): def write_zfin_ids(infile, outfile): - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: for line in inf: x = line.strip().split() if "GENE" in x[0]: @@ -103,7 +99,7 @@ def write_zfin_ids(infile, outfile): def write_hgnc_ids(infile, outfile): - with open(infile, "r") as inf: + with open(infile) as inf: hgnc_json = json.load(inf) with open(outfile, "w") as outf: for gene in hgnc_json["response"]["docs"]: @@ -111,7 +107,7 @@ def write_hgnc_ids(infile, outfile): def write_omim_ids(infile, outfile): - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: for line in inf: if line.startswith("#"): continue @@ -137,7 +133,7 @@ def write_umls_ids(mrconso, mrsty, outfile): ] ) umls_keepers = set() - with open(mrsty, "r") as inf: + with open(mrsty) as inf: for line in inf: x = line.strip().split("|") cat = x[2] @@ -145,7 +141,7 @@ def write_umls_ids(mrconso, mrsty, outfile): umls_keepers.add(x[0]) umls_keepers.difference_update(blacklist) # Now filter out OMIM variants - with open(mrconso, "r") as inf: + with open(mrconso) as inf: for line in inf: x = line.strip().split("|") cui = x[0] @@ -174,7 +170,7 @@ def write_umls_ids(mrconso, mrsty, outfile): def read_ncbi_idfile(ncbi_idfile): ncbi_ids = set() - with open(ncbi_idfile, "r") as inf: + with open(ncbi_idfile) as inf: for line in inf: x = line.strip().split("\t")[0] ncbi_ids.add(x) @@ -258,7 +254,7 @@ def build_gene_ncbigene_xrefs(infile, ncbi_idfile, outfile, metadata_yaml): def build_gene_medgen_relationships(infile, outfile, metadata_yaml): - with open(infile, "r") as inf, 
open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: h = inf.readline() for line in inf: x = line.strip().split("\t") @@ -297,7 +293,7 @@ def write_ensembl_gene_ids(ensembl_dir, outfile): infname = os.path.join(dlpath, "BioMart.tsv") if os.path.exists(infname): # open each ensembl file, find the id column, and put it in the output - with open(infname, "r") as inf: + with open(infname) as inf: wrote = set() h = inf.readline() x = h[:-1].split("\t") @@ -336,7 +332,7 @@ def build_gene_compendia(concordances, metadata_yamls, identifiers, icrdf_filena print(infile) print("loading", infile) pairs = [] - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: x = line.strip().split("\t") pairs.append(set([x[0], x[2]])) diff --git a/src/createcompendia/genefamily.py b/src/createcompendia/genefamily.py index 7361d5bb..64934700 100644 --- a/src/createcompendia/genefamily.py +++ b/src/createcompendia/genefamily.py @@ -1,7 +1,6 @@ +from src.babel_utils import glom, read_identifier_file, write_compendium from src.categories import GENE_FAMILY -from src.babel_utils import read_identifier_file, glom, write_compendium - def build_compendia(identifiers, metadata_yamls, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/createcompendia/geneprotein.py b/src/createcompendia/geneprotein.py index 096767d9..5ed8aab6 100644 --- a/src/createcompendia/geneprotein.py +++ b/src/createcompendia/geneprotein.py @@ -1,11 +1,11 @@ -from src.metadata.provenance import write_concord_metadata -from src.prefixes import UNIPROTKB, NCBIGENE -from src.babel_utils import glom +import logging from collections import defaultdict import jsonlines -import logging +from src.babel_utils import glom +from src.metadata.provenance import write_concord_metadata +from src.prefixes import NCBIGENE, UNIPROTKB from src.util import LoggingUtil logger = LoggingUtil.init_logging(__name__, level=logging.ERROR) @@ -16,7 +16,7 @@ def build_uniprotkb_ncbigene_relationships(infile, outfile, metadata_yaml): # Our model is 1 gene, many proteins, so this causes trouble. 
# For the moment, we will not include that have more than one gene per protein mappings = defaultdict(list) - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: x = line.strip().split() if x[1] == "GeneID": @@ -87,7 +87,7 @@ def build_conflation(geneprotein_concord, gene_compendium, protein_compendium, o collect_valid_ids(protein_compendium, all_ids) conf = {} pairs = [] - with open(geneprotein_concord, "r") as inf: + with open(geneprotein_concord) as inf: for line in inf: x = line.strip().split("\t") if (x[0] in all_ids) and (x[2] in all_ids): @@ -115,7 +115,7 @@ def build_compendium(gene_compendium, protein_compendium, geneprotein_concord, o """ uniprot2ncbi = {} ncbi2uniprot = defaultdict(list) - with open(geneprotein_concord, "r") as inf: + with open(geneprotein_concord) as inf: for line in inf: x = line.strip().split("\t") uniprot2ncbi[x[0]] = x[2] diff --git a/src/createcompendia/leftover_umls.py b/src/createcompendia/leftover_umls.py index 1e264281..323636a0 100644 --- a/src/createcompendia/leftover_umls.py +++ b/src/createcompendia/leftover_umls.py @@ -1,14 +1,14 @@ import json import logging +from pathlib import Path import jsonlines -from pathlib import Path -from src.node import NodeFactory -from src.util import get_biolink_model_toolkit +from src.categories import ACTIVITY, AGENT, DEVICE, DRUG, FOOD, PHYSICAL_ENTITY, PROCEDURE, PUBLICATION, SMALL_MOLECULE from src.datahandlers import umls +from src.node import NodeFactory from src.prefixes import UMLS -from src.categories import ACTIVITY, AGENT, DEVICE, DRUG, FOOD, SMALL_MOLECULE, PHYSICAL_ENTITY, PUBLICATION, PROCEDURE +from src.util import get_biolink_model_toolkit def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonyms, umls_compendium, umls_synonyms, report, biolink_version): @@ -56,7 +56,7 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym logging.info(f"Starting compendium: {compendium}") umls_ids = set() - with open(compendium, "r") as f: + with open(compendium) as f: for row in f: cluster = json.loads(row) for id in cluster["identifiers"]: @@ -75,7 +75,7 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym preferred_name_by_id = dict() types_by_id = dict() types_by_tui = dict() - with open(mrsty, "r") as inf: + with open(mrsty) as inf: for line in inf: x = line.strip().split("|") umls_id = f"{UMLS}:{x[0]}" @@ -104,7 +104,7 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym # Create a compendium that consists solely of all MRCONSO entries that haven't been referenced. count_no_umls_type = 0 count_multiple_umls_type = 0 - with open(mrconso, "r") as inf: + with open(mrconso) as inf: for line in inf: if not umls.check_mrconso_line(line): continue @@ -194,7 +194,7 @@ def umls_type_to_biolink_type(umls_tui): # Collected synonyms for all IDs in this compendium. 
synonyms_by_id = dict() - with open(synonyms, "r") as synonymsf: + with open(synonyms) as synonymsf: for line in synonymsf: id, relation, synonym = line.rstrip().split("\t") if id in umls_ids_in_this_compendium: diff --git a/src/createcompendia/macromolecular_complex.py b/src/createcompendia/macromolecular_complex.py index e7d81279..0d0ca54d 100644 --- a/src/createcompendia/macromolecular_complex.py +++ b/src/createcompendia/macromolecular_complex.py @@ -1,7 +1,6 @@ -from src.prefixes import COMPLEXPORTAL +from src.babel_utils import glom, read_identifier_file, write_compendium from src.categories import MACROMOLECULAR_COMPLEX - -from src.babel_utils import read_identifier_file, glom, write_compendium +from src.prefixes import COMPLEXPORTAL def build_compendia(identifiers, metadata_yamls, icrdf_filename): diff --git a/src/createcompendia/processactivitypathway.py b/src/createcompendia/processactivitypathway.py index 859b0fbf..68b2f43f 100644 --- a/src/createcompendia/processactivitypathway.py +++ b/src/createcompendia/processactivitypathway.py @@ -1,18 +1,16 @@ from collections import defaultdict +import src.datahandlers.ec as ec import src.datahandlers.obo as obo import src.datahandlers.reactome as reactome import src.datahandlers.rhea as rhea -import src.datahandlers.ec as ec import src.datahandlers.umls as umls -from src.metadata.provenance import write_concord_metadata - -from src.prefixes import GO, REACT, WIKIPATHWAYS, TCDB +from src.babel_utils import get_prefixes, glom, read_identifier_file, remove_overused_xrefs, write_compendium from src.categories import BIOLOGICAL_PROCESS, MOLECULAR_ACTIVITY, PATHWAY +from src.metadata.provenance import write_concord_metadata +from src.prefixes import GO, REACT, TCDB, WIKIPATHWAYS from src.ubergraph import build_sets -from src.babel_utils import read_identifier_file, glom, remove_overused_xrefs, get_prefixes, write_compendium - def write_obo_ids(irisandtypes, outfile, exclude=[]): order = [PATHWAY, BIOLOGICAL_PROCESS, MOLECULAR_ACTIVITY] @@ -102,7 +100,7 @@ def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): # them added. So we want to limit concordances to terms that are already in the dicts. But that's ONLY for the # UMLS concord. We trust the others to retrieve decent identifiers. 
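The comment above describes a pattern that repeats across the compendium builders touched in this patch (anatomy, chemicals, disease, taxon, process/activity/pathway): read concord triples, keep only the subject/object pair, optionally strip overused xrefs, then "glom" the pairs into equivalence sets before writing the compendium. A deliberately simplified sketch of that flow; simple_glom() is a stand-in for glom() in src/babel_utils.py, which additionally enforces unique prefixes, close-match checks, and type tracking, and the CURIEs at the end are placeholders, not real mappings.

    def load_pairs(concord_path: str) -> list[set[str]]:
        pairs = []
        with open(concord_path) as inf:
            for line in inf:
                x = line.strip().split("\t")
                pairs.append({x[0], x[2]})  # x[1] is the predicate and is ignored
        return pairs


    def simple_glom(conc_set: dict[str, set[str]], newgroups: list[set[str]]) -> None:
        # Merge each new group with any existing sets its members belong to,
        # then point every member of the merged set at that same set object.
        for group in newgroups:
            merged = set(group)
            for ident in group:
                merged |= conc_set.get(ident, set())
            for ident in merged:
                conc_set[ident] = merged


    cliques: dict[str, set[str]] = {}
    simple_glom(cliques, [{"UBERON:0000000", "UMLS:C0000000"}])
    simple_glom(cliques, [{"UMLS:C0000000", "MESH:D000000"}])
    print(cliques["UBERON:0000000"])  # all three identifiers end up in one set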
pairs = [] - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: x = line.strip().split("\t") if infile.endswith("UMLS"): diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 055a66cd..79f66112 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -1,24 +1,21 @@ +import os import re -from src.metadata.provenance import write_concord_metadata -from src.prefixes import ENSEMBL, PR, UNIPROTKB, NCIT, NCBITAXON, MESH, DRUGBANK -from src.categories import PROTEIN - -import src.datahandlers.umls as umls import src.datahandlers.obo as obo +import src.datahandlers.umls as umls +from src.babel_utils import Text, glom, read_identifier_file, write_compendium +from src.categories import PROTEIN +from src.metadata.provenance import write_concord_metadata +from src.prefixes import DRUGBANK, ENSEMBL, MESH, NCBITAXON, NCIT, PR, UNIPROTKB from src.ubergraph import UberGraph - -from src.babel_utils import read_identifier_file, glom, write_compendium, Text - -import os -from src.util import get_memory_usage_summary, get_logger +from src.util import get_logger, get_memory_usage_summary logger = get_logger(__name__) def extract_taxon_ids_from_uniprotkb(idmapping_filename, uniprotkb_taxa_filename): """Extract NCBIGene identifiers from the UniProtKB mapping file.""" - with open(idmapping_filename, "r") as inf, open(uniprotkb_taxa_filename, "w") as outf: + with open(idmapping_filename) as inf, open(uniprotkb_taxa_filename, "w") as outf: for line in inf: x = line.strip().split("\t") if x[1] == "NCBI_TaxID": @@ -51,7 +48,7 @@ def write_ensembl_protein_ids(ensembl_dir, outfile): print(f"write_ensembl_ids for input filename {infname}") if os.path.exists(infname): # open each ensembl file, find the id column, and put it in the output - with open(infname, "r") as inf: + with open(infname) as inf: wrote = set() h = inf.readline() x = h[:-1].split("\t") @@ -97,7 +94,7 @@ def build_pr_uniprot_relationships(outfile, ignore_list=[], metadata_yaml=None): def build_protein_uniprotkb_ensemble_relationships(infile, outfile, metadata_yaml): - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: for line in inf: x = line.strip().split() if x[1] == "Ensembl_PRO": @@ -129,7 +126,7 @@ def build_protein_uniprotkb_ensemble_relationships(infile, outfile, metadata_yam def build_ncit_uniprot_relationships(infile, outfile, metadata_yaml): - with open(infile, "r") as inf, open(outfile, "w") as outf: + with open(infile) as inf, open(outfile, "w") as outf: for line in inf: # These lines are sometimes empty (I think because the # input file can have DOS line endings). 
If so, we can @@ -188,7 +185,7 @@ def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_fil for infile in concordances: logger.info(f"Loading concordance file {infile}") pairs = [] - with open(infile, "r") as inf: + with open(infile) as inf: for line_index, line in enumerate(inf): if line_index % 1000000 == 0: logger.info(f"Loading concordance file {infile}: line {line_index:,}") diff --git a/src/createcompendia/publications.py b/src/createcompendia/publications.py index b9d70468..90c4a8fd 100644 --- a/src/createcompendia/publications.py +++ b/src/createcompendia/publications.py @@ -4,15 +4,15 @@ import logging import os import time +import xml.etree.ElementTree as ET from collections import defaultdict from mmap import ACCESS_READ, mmap from pathlib import Path -import xml.etree.ElementTree as ET -from src.babel_utils import pull_via_wget, WgetRecursionOptions, glom, read_identifier_file, write_compendium +from src.babel_utils import WgetRecursionOptions, glom, pull_via_wget, read_identifier_file, write_compendium from src.categories import JOURNAL_ARTICLE, PUBLICATION from src.metadata.provenance import write_concord_metadata -from src.prefixes import PMID, DOI, PMC +from src.prefixes import DOI, PMC, PMID def download_pubmed(download_file, pubmed_base="ftp://ftp.ncbi.nlm.nih.gov/pubmed/", pmc_base="https://ftp.ncbi.nlm.nih.gov/pub/pmc/"): @@ -69,7 +69,7 @@ def verify_pubmed_download_against_md5(pubmed_filename, md5_filename): logging.warning(f"Could not verify {pubmed_filename}: no MD5 file found at {md5_filename}.") return False - with open(md5_filename, "r") as md5f: + with open(md5_filename) as md5f: md5_line = md5f.readline().strip() expected_md5 = md5_line.split("= ")[1] if len(expected_md5) != 32: @@ -273,7 +273,7 @@ def generate_compendium(concordances, metadata_yamls, identifiers, titles, publi print(infile) print("loading", infile) pairs = [] - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: x = line.strip().split("\t") pairs.append({x[0], x[2]}) @@ -283,7 +283,7 @@ def generate_compendium(concordances, metadata_yamls, identifiers, titles, publi labels = dict() for title_filename in titles: print("loading titles from", title_filename) - with open(title_filename, "r") as titlef: + with open(title_filename) as titlef: for line in titlef: id, title = line.strip().split("\t") if id in labels: diff --git a/src/createcompendia/taxon.py b/src/createcompendia/taxon.py index 57e2c100..275f1e1d 100644 --- a/src/createcompendia/taxon.py +++ b/src/createcompendia/taxon.py @@ -1,13 +1,11 @@ -from src.metadata.provenance import write_concord_metadata -from src.prefixes import NCBITAXON, MESH, UMLS -from src.categories import ORGANISM_TAXON +import logging import src.datahandlers.mesh as mesh import src.datahandlers.umls as umls - -from src.babel_utils import read_identifier_file, glom, write_compendium - -import logging +from src.babel_utils import glom, read_identifier_file, write_compendium +from src.categories import ORGANISM_TAXON +from src.metadata.provenance import write_concord_metadata +from src.prefixes import MESH, NCBITAXON, UMLS from src.util import LoggingUtil logger = LoggingUtil.init_logging(__name__, level=logging.ERROR) @@ -74,7 +72,7 @@ def build_taxon_umls_relationships(mrconso, idfile, outfile, metadata_yaml): def build_relationships(outfile, mesh_ids, metadata_yaml): regis = mesh.pull_mesh_registry() - with open(mesh_ids, "r") as inf: + with open(mesh_ids) as inf: lines = inf.read().strip().split("\n") all_mesh_taxa = 
set([x.split("\t")[0] for x in lines]) with open(outfile, "w") as outf: @@ -118,7 +116,7 @@ def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): print(infile) print("loading", infile) pairs = [] - with open(infile, "r") as inf: + with open(infile) as inf: for line in inf: x = line.strip().split("\t") pairs.append(set([x[0], x[2]])) diff --git a/src/datahandlers/chebi.py b/src/datahandlers/chebi.py index 147b6a61..80a2b794 100644 --- a/src/datahandlers/chebi.py +++ b/src/datahandlers/chebi.py @@ -3,7 +3,9 @@ def pull_chebi(): pull_via_ftp("ftp.ebi.ac.uk", "/pub/databases/chebi/SDF", "chebi.sdf.gz", decompress_data=True, outfilename="CHEBI/ChEBI_complete.sdf") - pull_via_ftp("ftp.ebi.ac.uk", "/pub/databases/chebi/flat_files", "database_accession.tsv.gz", decompress_data=True, outfilename="CHEBI/database_accession.tsv") + pull_via_ftp( + "ftp.ebi.ac.uk", "/pub/databases/chebi/flat_files", "database_accession.tsv.gz", decompress_data=True, outfilename="CHEBI/database_accession.tsv" + ) def x(inputfile, labelfile, synfile): diff --git a/src/datahandlers/chembl.py b/src/datahandlers/chembl.py index e0272998..d9ed663a 100644 --- a/src/datahandlers/chembl.py +++ b/src/datahandlers/chembl.py @@ -1,8 +1,10 @@ -from src.prefixes import CHEMBLCOMPOUND -from src.babel_utils import pull_via_ftp import ftplib + import pyoxigraph +from src.babel_utils import pull_via_ftp +from src.prefixes import CHEMBLCOMPOUND + def pull_chembl(moleculefilename): fname = get_latest_chembl_name() diff --git a/src/datahandlers/clo.py b/src/datahandlers/clo.py index de9196d7..7620494a 100644 --- a/src/datahandlers/clo.py +++ b/src/datahandlers/clo.py @@ -1,12 +1,13 @@ import logging import re +import pyoxigraph + +from src.babel_utils import pull_via_urllib +from src.categories import CELL_LINE from src.metadata.provenance import write_download_metadata from src.prefixes import CLO -from src.categories import CELL_LINE -from src.babel_utils import pull_via_urllib -from src.util import Text, LoggingUtil -import pyoxigraph +from src.util import LoggingUtil, Text logger = LoggingUtil.init_logging(__name__, level=logging.WARNING) diff --git a/src/datahandlers/complexportal.py b/src/datahandlers/complexportal.py index b9a289c8..3bc973e8 100644 --- a/src/datahandlers/complexportal.py +++ b/src/datahandlers/complexportal.py @@ -9,7 +9,7 @@ def pull_complexportal(): def make_labels_and_synonyms(infile, labelfile, synfile, metadata_yaml): usedsyns = set() - with open(infile, "r") as inf, open(labelfile, "w") as outl, open(synfile, "w") as outsyn: + with open(infile) as inf, open(labelfile, "w") as outl, open(synfile, "w") as outsyn: next(inf) # skip header for line in inf: sline = line.split("\t") diff --git a/src/datahandlers/datacollect.py b/src/datahandlers/datacollect.py index 9b23aa9d..f5563e67 100644 --- a/src/datahandlers/datacollect.py +++ b/src/datahandlers/datacollect.py @@ -1,7 +1,8 @@ -from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib import gzip from json import loads +from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib + def pull_pubchem_labels(): print("LABEL PUBCHEM") @@ -64,7 +65,7 @@ def pull_prot(which, refresh): swissprot_labels = {} nlines = 0 maxn = 1000 - with open(swissname, "r") as inf: + with open(swissname) as inf: for line in inf: nlines += 1 if line.startswith(">"): diff --git a/src/datahandlers/doid.py b/src/datahandlers/doid.py index e31539a8..3157b83b 100644 --- a/src/datahandlers/doid.py +++ b/src/datahandlers/doid.py @@ -1,7 
+1,8 @@ -from src.prefixes import DOID, OIO -from src.babel_utils import pull_via_urllib, norm import json +from src.babel_utils import norm, pull_via_urllib +from src.prefixes import DOID, OIO + def pull_doid(): pull_via_urllib("https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/", "doid.json", subpath="DOID", decompress=False) @@ -9,7 +10,7 @@ def pull_doid(): def pull_doid_labels_and_synonyms(infile, labelfile, synonymfile): # Everything in DOID is a disease. - with open(infile, "r") as inf: + with open(infile) as inf: j = json.load(inf) with open(labelfile, "w") as labels, open(synonymfile, "w") as syns: for entry in j["graphs"][0]["nodes"]: @@ -30,7 +31,7 @@ def pull_doid_labels_and_synonyms(infile, labelfile, synonymfile): def build_xrefs(infile, xreffile, other_prefixes={}): # Everything in DOID is a disease. - with open(infile, "r") as inf: + with open(infile) as inf: j = json.load(inf) with open(xreffile, "w") as xrefs: for entry in j["graphs"][0]["nodes"]: diff --git a/src/datahandlers/drugbank.py b/src/datahandlers/drugbank.py index e751f2a6..ccadecfb 100644 --- a/src/datahandlers/drugbank.py +++ b/src/datahandlers/drugbank.py @@ -32,7 +32,7 @@ def extract_drugbank_labels_and_synonyms(drugbank_vocab_csv, labels, synonyms): :param synonyms: The file to write synonyms into. """ - with open(drugbank_vocab_csv, "r") as fin, open(labels, "w") as labelsf, open(synonyms, "w") as synonymsf: + with open(drugbank_vocab_csv) as fin, open(labels, "w") as labelsf, open(synonyms, "w") as synonymsf: reader = csv.DictReader(fin) assert "DrugBank ID" in reader.fieldnames assert "Common name" in reader.fieldnames diff --git a/src/datahandlers/drugcentral.py b/src/datahandlers/drugcentral.py index 22dabb0f..c5ace307 100644 --- a/src/datahandlers/drugcentral.py +++ b/src/datahandlers/drugcentral.py @@ -1,6 +1,7 @@ -from src.prefixes import DRUGCENTRAL import psycopg2 +from src.prefixes import DRUGCENTRAL + def pull_drugcentral(structfile, labelfile, xreffile): # DrugCentral is only available as a postgres db, but fortunately they run a public instance. 
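The comment above is the only hint in this hunk about how pull_drugcentral() works: it queries DrugCentral's publicly hosted Postgres instance via psycopg2. A minimal, generic sketch of that access pattern follows; the helper name, connection parameters, and query are placeholders, not the actual DrugCentral endpoint, schema, or repository code.

    import csv

    import psycopg2


    def dump_query_to_tsv(query: str, outfile: str, **connect_kwargs) -> None:
        # connect_kwargs would carry host/port/dbname/user/password for the server.
        conn = psycopg2.connect(**connect_kwargs)
        try:
            with conn.cursor() as cur:
                cur.execute(query)
                with open(outfile, "w", newline="") as outf:
                    writer = csv.writer(outf, delimiter="\t")
                    for row in cur:
                        writer.writerow(row)
        finally:
            conn.close()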
diff --git a/src/datahandlers/ec.py b/src/datahandlers/ec.py index f4ef3ecd..469afa2f 100644 --- a/src/datahandlers/ec.py +++ b/src/datahandlers/ec.py @@ -1,9 +1,9 @@ -from src.prefixes import EC -from src.categories import MOLECULAR_ACTIVITY -from src.babel_utils import pull_via_urllib -from src.babel_utils import make_local_name import pyoxigraph +from src.babel_utils import make_local_name, pull_via_urllib +from src.categories import MOLECULAR_ACTIVITY +from src.prefixes import EC + def pull_ec(): outputfile = pull_via_urllib("https://ftp.expasy.org/databases/enzyme/", "enzyme.rdf", subpath="EC", decompress=False) diff --git a/src/datahandlers/efo.py b/src/datahandlers/efo.py index 808d8a93..64613f23 100644 --- a/src/datahandlers/efo.py +++ b/src/datahandlers/efo.py @@ -1,12 +1,12 @@ import logging import re +import pyoxigraph + +from src.babel_utils import pull_via_urllib from src.metadata.provenance import write_concord_metadata from src.prefixes import EFO, ORPHANET -from src.babel_utils import pull_via_urllib -from src.babel_utils import make_local_name -from src.util import Text, LoggingUtil -import pyoxigraph +from src.util import LoggingUtil, Text logger = LoggingUtil.init_logging(__name__, level=logging.WARNING) @@ -31,7 +31,7 @@ def __init__(self, efo_owl_file_path): logger.info(f"Loading EFO from {efo_owl_file_path}.") start = dt.now() self.m = pyoxigraph.Store() - with open(efo_owl_file_path, "r") as inf: + with open(efo_owl_file_path) as inf: self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.RDF_XML, base_iri="http://example.org/") end = dt.now() logger.info(f"EFO loading complete in {end - start}.") @@ -168,7 +168,7 @@ def make_ids(roots, owlfile, idfname): def make_concords(owlfile, idfilename, outfilename, provenance_metadata=None): """Given a list of identifiers, find out all of the equivalent identifiers from the owl""" m = EFOgraph(owlfile) - with open(idfilename, "r") as inf, open(outfilename, "w") as concfile: + with open(idfilename) as inf, open(outfilename, "w") as concfile: for line in inf: efo_id = line.split("\t")[0] nexacts = m.get_exacts(efo_id, concfile) diff --git a/src/datahandlers/ensembl.py b/src/datahandlers/ensembl.py index 8f8606d3..582a695c 100644 --- a/src/datahandlers/ensembl.py +++ b/src/datahandlers/ensembl.py @@ -1,10 +1,11 @@ import json - -from src.util import get_config -from apybiomart import find_datasets, query, find_attributes import logging import os +from apybiomart import find_attributes, find_datasets, query + +from src.util import get_config + # As per https://support.bioconductor.org/p/39744/#39751, more attributes than this result in an # error from BioMart: Too many attributes selected for External References # This is the real MAX minus one: for every batch, we'll query the ensembl_gene_id so that we can diff --git a/src/datahandlers/gtopdb.py b/src/datahandlers/gtopdb.py index 19da92d6..efe033b9 100644 --- a/src/datahandlers/gtopdb.py +++ b/src/datahandlers/gtopdb.py @@ -1,8 +1,8 @@ -from src.prefixes import GTOPDB -from src.babel_utils import pull_via_urllib - from bs4 import BeautifulSoup +from src.babel_utils import pull_via_urllib +from src.prefixes import GTOPDB + def pull_gtopdb_ligands(): pull_via_urllib("https://www.guidetopharmacology.org/DATA/", "ligands.tsv", decompress=False, subpath="GTOPDB") @@ -27,7 +27,7 @@ def make_labels_and_synonyms(inputfile, labelfile, synfile): idcol = 0 labelcol = 1 syncol = 13 - with open(inputfile, "r") as inf, open(labelfile, "w") as lf, open(synfile, "w") as sf: + with 
open(inputfile) as inf, open(labelfile, "w") as lf, open(synfile, "w") as sf: h = inf.readline() # Everything in this file is double quoted, hence all the [1:-1] stuff for line in inf: diff --git a/src/datahandlers/hgnc.py b/src/datahandlers/hgnc.py index fbbac905..3d89bdd3 100644 --- a/src/datahandlers/hgnc.py +++ b/src/datahandlers/hgnc.py @@ -1,6 +1,7 @@ -from src.babel_utils import make_local_name, pull_via_urllib import json +from src.babel_utils import make_local_name, pull_via_urllib + def pull_hgnc(): # As per the "quick links" from https://www.genenames.org/download/archive/ @@ -8,7 +9,7 @@ def pull_hgnc(): def pull_hgnc_labels_and_synonyms(infile): - with open(infile, "r") as data: + with open(infile) as data: hgnc_json = json.load(data) lname = make_local_name("labels", subpath="HGNC") sname = make_local_name("synonyms", subpath="HGNC") diff --git a/src/datahandlers/hgncfamily.py b/src/datahandlers/hgncfamily.py index f7a79700..95cd233f 100644 --- a/src/datahandlers/hgncfamily.py +++ b/src/datahandlers/hgncfamily.py @@ -1,6 +1,5 @@ import csv - from src.babel_utils import pull_via_urllib from src.metadata.provenance import write_metadata from src.prefixes import HGNCFAMILY @@ -14,7 +13,7 @@ def pull_hgncfamily(): def pull_labels(infile, labelsfile, descriptionsfile, metadata_yaml): - with open(infile, "r") as inf, open(labelsfile, "w") as labelsf, open(descriptionsfile, "w") as descriptionsf: + with open(infile) as inf, open(labelsfile, "w") as labelsf, open(descriptionsfile, "w") as descriptionsf: reader = csv.DictReader(inf) for row in reader: curie = f"{HGNCFAMILY}:{row['id']}" diff --git a/src/datahandlers/hmdb.py b/src/datahandlers/hmdb.py index 2ebcc7d3..7ef8bb30 100644 --- a/src/datahandlers/hmdb.py +++ b/src/datahandlers/hmdb.py @@ -1,9 +1,11 @@ -from zipfile import ZipFile from os import path -from src.prefixes import HMDB -from src.babel_utils import pull_via_urllib +from zipfile import ZipFile + import xmltodict +from src.babel_utils import pull_via_urllib +from src.prefixes import HMDB + def pull_hmdb(): dname = pull_via_urllib("https://hmdb.ca/system/downloads/current/", "hmdb_metabolites.zip", decompress=False, subpath="HMDB") @@ -31,7 +33,7 @@ def handle_metabolite(metabolite, lfile, synfile, smifile): def make_labels_and_synonyms_and_smiles(inputfile, labelfile, synfile, smifile): - with open(inputfile, "r") as inf: + with open(inputfile) as inf: xml = inf.read() parsed = xmltodict.parse(xml) metabolites = parsed["hmdb"]["metabolite"] diff --git a/src/datahandlers/kegg.py b/src/datahandlers/kegg.py index 64f69572..4cc5ee0e 100644 --- a/src/datahandlers/kegg.py +++ b/src/datahandlers/kegg.py @@ -1,8 +1,9 @@ import re +import traceback import requests -import traceback from more_itertools import chunked + from src.prefixes import KEGGCOMPOUND ### diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py index d4bf18ac..53058c85 100644 --- a/src/datahandlers/mesh.py +++ b/src/datahandlers/mesh.py @@ -1,6 +1,8 @@ -from src.babel_utils import make_local_name, pull_via_ftp -import pyoxigraph from collections import defaultdict + +import pyoxigraph + +from src.babel_utils import make_local_name, pull_via_ftp from src.prefixes import MESH diff --git a/src/datahandlers/mods.py b/src/datahandlers/mods.py index 41a9c699..41f90f63 100644 --- a/src/datahandlers/mods.py +++ b/src/datahandlers/mods.py @@ -1,8 +1,9 @@ -from src.prefixes import WORMBASE -from src.babel_utils import pull_via_urllib import json import os +from src.babel_utils import pull_via_urllib +from 
src.prefixes import WORMBASE + mods = ["WB", "FB", "ZFIN", "MGI", "RGD", "SGD"] modmap = {x: x for x in mods} modmap["WB"] = WORMBASE @@ -29,7 +30,7 @@ def pull_mods(): def write_labels(dd): for mod, prefix in modmap.items(): - with open(f"{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}.json", "r") as inf: + with open(f"{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}.json") as inf: j = json.load(inf) with open(f"{dd}/{prefix}/labels", "w") as outf: for gene in j["data"]: diff --git a/src/datahandlers/ncbigene.py b/src/datahandlers/ncbigene.py index 1fe8e10d..c02b00c2 100644 --- a/src/datahandlers/ncbigene.py +++ b/src/datahandlers/ncbigene.py @@ -1,6 +1,7 @@ -from src.babel_utils import pull_via_urllib import gzip +from src.babel_utils import pull_via_urllib + def pull_ncbigene(filenames): for fn in filenames: diff --git a/src/datahandlers/ncbitaxon.py b/src/datahandlers/ncbitaxon.py index b9a8bab6..7d10d3ea 100644 --- a/src/datahandlers/ncbitaxon.py +++ b/src/datahandlers/ncbitaxon.py @@ -1,9 +1,9 @@ import gzip import logging +import tarfile from src.babel_utils import pull_via_ftp from src.prefixes import NCBITAXON -import tarfile def pull_ncbitaxon(): diff --git a/src/datahandlers/obo.py b/src/datahandlers/obo.py index c078c5d3..a502ebb9 100644 --- a/src/datahandlers/obo.py +++ b/src/datahandlers/obo.py @@ -1,11 +1,10 @@ import json import logging +import os +from collections import defaultdict from pathlib import Path from src.ubergraph import UberGraph -from collections import defaultdict -import os - from src.util import Text diff --git a/src/datahandlers/orphanet.py b/src/datahandlers/orphanet.py index 0b45deb8..3c132e66 100644 --- a/src/datahandlers/orphanet.py +++ b/src/datahandlers/orphanet.py @@ -1,9 +1,9 @@ -from src.prefixes import OIO, ORPHANET -from src.babel_utils import pull_via_urllib -from zipfile import ZipFile - # ugh XML import xml.etree.ElementTree as ET +from zipfile import ZipFile + +from src.babel_utils import pull_via_urllib +from src.prefixes import OIO, ORPHANET def pull_orphanet(): diff --git a/src/datahandlers/pantherfamily.py b/src/datahandlers/pantherfamily.py index 4a54d3f9..88e62773 100644 --- a/src/datahandlers/pantherfamily.py +++ b/src/datahandlers/pantherfamily.py @@ -15,7 +15,7 @@ def pull_labels(infile, outfile, metadata_yaml): MAINFAMILY_NAME_COLUMN = 4 SUBFAMILY_NAME_COLUMN = 5 done = set() - with open(infile, "r") as inf, open(outfile, "w") as labelf: + with open(infile) as inf, open(outfile, "w") as labelf: for raw_line in inf: line = raw_line.strip() parts = line.split("\t") diff --git a/src/datahandlers/pantherpathways.py b/src/datahandlers/pantherpathways.py index b464f076..4a1948ae 100644 --- a/src/datahandlers/pantherpathways.py +++ b/src/datahandlers/pantherpathways.py @@ -1,5 +1,5 @@ -from src.prefixes import PANTHERPATHWAY from src.babel_utils import pull_via_urllib +from src.prefixes import PANTHERPATHWAY def pull_panther_pathways(): @@ -9,7 +9,7 @@ def pull_panther_pathways(): def make_pathway_labels(infile, outfile): - with open(infile, "r") as inf: + with open(infile) as inf: data = inf.read() lines = data.split("\n") labels = {} diff --git a/src/datahandlers/pubchem.py b/src/datahandlers/pubchem.py index e22748b8..eac6e6d5 100644 --- a/src/datahandlers/pubchem.py +++ b/src/datahandlers/pubchem.py @@ -1,9 +1,11 @@ -from src.prefixes import PUBCHEMCOMPOUND -from src.babel_utils import pull_via_wget import gzip -import requests import json +import requests + +from src.babel_utils import pull_via_wget +from src.prefixes import 
PUBCHEMCOMPOUND + def pull_pubchem(): files = ["CID-MeSH", "CID-Synonym-filtered.gz", "CID-Title.gz"] diff --git a/src/datahandlers/reactome.py b/src/datahandlers/reactome.py index 1f9b071a..25473653 100644 --- a/src/datahandlers/reactome.py +++ b/src/datahandlers/reactome.py @@ -1,8 +1,10 @@ -from src.prefixes import REACT -from src.categories import PATHWAY, BIOLOGICAL_PROCESS, MOLECULAR_ACTIVITY -import requests import json +import requests + +from src.categories import BIOLOGICAL_PROCESS, MOLECULAR_ACTIVITY, PATHWAY +from src.prefixes import REACT + # Reactome doesn't have a great download, but it does have a decent service that lets you get the files you could have # downloaded. In reactome, there are "events" which have subclasses of "pathway" and "reaction like event". @@ -19,7 +21,7 @@ def pull_reactome(outfile): def make_labels(infile, labelfile): - with open(infile, "r") as inf: + with open(infile) as inf: elements = json.load(inf) with open(labelfile, "w") as labels: for element in elements: @@ -37,7 +39,7 @@ def parse_element_for_labels(e, lfile): def write_ids(infile, idfile): - with open(infile, "r") as inf: + with open(infile) as inf: elements = json.load(inf) with open(idfile, "w") as outf: for element in elements: diff --git a/src/datahandlers/rhea.py b/src/datahandlers/rhea.py index 46abe244..c25572ce 100644 --- a/src/datahandlers/rhea.py +++ b/src/datahandlers/rhea.py @@ -1,9 +1,9 @@ -from src.metadata.provenance import write_concord_metadata -from src.prefixes import RHEA, EC -from src.babel_utils import pull_via_urllib -from src.babel_utils import make_local_name import pyoxigraph +from src.babel_utils import make_local_name, pull_via_urllib +from src.metadata.provenance import write_concord_metadata +from src.prefixes import EC, RHEA + def pull_rhea(): outputfile = pull_via_urllib("https://ftp.expasy.org/databases/rhea/rdf/", "rhea.rdf.gz", subpath="RHEA", decompress=True) diff --git a/src/datahandlers/smpdb.py b/src/datahandlers/smpdb.py index d24ccff8..f46a8843 100644 --- a/src/datahandlers/smpdb.py +++ b/src/datahandlers/smpdb.py @@ -1,7 +1,8 @@ -from zipfile import ZipFile from os import path -from src.prefixes import SMPDB +from zipfile import ZipFile + from src.babel_utils import pull_via_urllib +from src.prefixes import SMPDB def pull_smpdb(): @@ -13,7 +14,7 @@ def pull_smpdb(): def make_labels(inputfile, labelfile): """Get the SMPDB file. It's not good - there are \n and commas, and commas are also the delimiter. 
I mean, what?""" - with open(inputfile, "r") as inf, open(labelfile, "w") as outf: + with open(inputfile) as inf, open(labelfile, "w") as outf: h = inf.readline() for line in inf: if "," not in line: diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index 958815b2..408edf55 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -1,15 +1,16 @@ -from src.metadata.provenance import write_concord_metadata -from src.prefixes import UMLS, RXCUI -from src.babel_utils import make_local_name -from src.categories import DRUG, CHEMICAL_ENTITY, MOLECULAR_MIXTURE - +import logging +import os +import re import shutil +from collections import defaultdict from zipfile import ZipFile + import requests -from collections import defaultdict -import os -import re -import logging + +from src.babel_utils import make_local_name +from src.categories import CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE +from src.metadata.provenance import write_concord_metadata +from src.prefixes import RXCUI, UMLS def check_mrconso_line(line): @@ -80,7 +81,7 @@ def write_umls_ids(mrsty, category_map, umls_output, prefix=UMLS, blocklist_umls output_lines = defaultdict(list) semantic_type_trees = defaultdict(set) tree_names = defaultdict(set) - with open(mrsty, "r") as inf, open(umls_output, "w") as outf: + with open(mrsty) as inf, open(umls_output, "w") as outf: for line in inf: x = line.strip().split("|") cat = x[2] @@ -159,7 +160,7 @@ def write_rxnorm_ids(category_map, bad_categories, infile, outfile, prefix=RXCUI If there is an IN or PIN TTY, then it's a ChemicalEntity, otherwise a Drug. """ rxnconso = infile # os.path.join('input_data', 'private', "RXNCONSO.RRF") - with open(rxnconso, "r") as inf, open(outfile, "w") as outf: + with open(rxnconso) as inf, open(outfile, "w") as outf: current_id = None current_ttys = set() has_rxnorm = False @@ -227,7 +228,7 @@ def build_sets( acceptable_drugbank_tty = set(["IN", "PIN", "MIN"]) pairs = set() # test_cui = 'C0026827' - with open(mrconso, "r") as inf, open(umls_output, "w") as concordfile: + with open(mrconso) as inf, open(umls_output, "w") as concordfile: for line in inf: if not check_mrconso_line(line): continue @@ -279,7 +280,7 @@ def build_sets( def read_umls_priority(): mrp = os.path.join("input_data", "umls_precedence.txt") pris = [] - with open(mrp, "r") as inf: + with open(mrp) as inf: h = inf.readline() for line in inf: x = line.strip().split() @@ -398,7 +399,7 @@ def pull_umls(mrconso): priority = read_umls_priority() snomed_label_name = make_local_name("labels", subpath="SNOMEDCT") snomed_syn_name = make_local_name("synonyms", subpath="SNOMEDCT") - with open(mrconso, "r") as inf, open(snomed_label_name, "w") as snolabels, open(snomed_syn_name, "w") as snosyns: + with open(mrconso) as inf, open(snomed_label_name, "w") as snolabels, open(snomed_syn_name, "w") as snosyns: for line in inf: if not check_mrconso_line(line): continue diff --git a/src/datahandlers/unichem.py b/src/datahandlers/unichem.py index 3680fed8..e81a9dc5 100644 --- a/src/datahandlers/unichem.py +++ b/src/datahandlers/unichem.py @@ -1,7 +1,7 @@ import gzip -from src.babel_utils import pull_via_urllib, pull_via_wget -from src.prefixes import CHEMBLCOMPOUND, DRUGCENTRAL, DRUGBANK, GTOPDB, KEGGCOMPOUND, CHEBI, UNII, HMDB, PUBCHEMCOMPOUND +from src.babel_utils import pull_via_urllib +from src.prefixes import CHEBI, CHEMBLCOMPOUND, DRUGBANK, DRUGCENTRAL, GTOPDB, HMDB, KEGGCOMPOUND, PUBCHEMCOMPOUND, UNII # global for this file data_sources: dict = { @@ -19,14 +19,18 @@ def 
pull_unichem(): """Download UniChem files.""" - pull_via_urllib("http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/table_dumps/", "structure.tsv.gz", decompress=False, subpath="UNICHEM", verify_gzip=True) - pull_via_urllib("http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/table_dumps/", "reference.tsv.gz", decompress=False, subpath="UNICHEM", verify_gzip=True) + pull_via_urllib( + "http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/table_dumps/", "structure.tsv.gz", decompress=False, subpath="UNICHEM", verify_gzip=True + ) + pull_via_urllib( + "http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/table_dumps/", "reference.tsv.gz", decompress=False, subpath="UNICHEM", verify_gzip=True + ) def filter_unichem(ref_file, ref_filtered): """Filter UniChem reference file to those sources we're interested in.""" srclist = [str(k) for k in data_sources.keys()] - with gzip.open(ref_file, "rt") as rf, open(ref_filtered, "wt") as ref_filtered: + with gzip.open(ref_file, "rt") as rf, open(ref_filtered, "w") as ref_filtered: header_line = rf.readline() assert header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {ref_file}: {header_line}" ref_filtered.write(header_line) diff --git a/src/datahandlers/unii.py b/src/datahandlers/unii.py index 5fc1ad38..7615f199 100644 --- a/src/datahandlers/unii.py +++ b/src/datahandlers/unii.py @@ -1,5 +1,5 @@ +from os import listdir, path, rename from zipfile import ZipFile -from os import path, listdir, rename import requests @@ -39,7 +39,7 @@ def make_labels_and_synonyms(inputfile, labelfile, synfile): syncol = 0 wrotelabels = set() wrotesyns = set() - with open(inputfile, "r", encoding="latin-1") as inf, open(labelfile, "w") as lf, open(synfile, "w") as sf: + with open(inputfile, encoding="latin-1") as inf, open(labelfile, "w") as lf, open(synfile, "w") as sf: h = inf.readline() for line in inf: parts = line.strip().split("\t") diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py index 1a1a8ad5..d04cb38e 100644 --- a/src/datahandlers/uniprotkb.py +++ b/src/datahandlers/uniprotkb.py @@ -10,7 +10,7 @@ def readlabels(which): swissname = make_local_name(f"UniProtKB/uniprot_{which}.fasta") swissprot_labels = {} - with open(swissname, "r") as inf: + with open(swissname) as inf: for line in inf: if line.startswith(">"): # example fasta line: @@ -58,7 +58,7 @@ def download_umls_gene_protein_mappings(umls_uniprotkb_raw_url, umls_uniprotkb_f os.makedirs(os.path.dirname(umls_protein_concords), exist_ok=True) count_rows = 0 - with open(umls_uniprotkb_filename, "r") as f, open(umls_gene_concords, "w") as genef, open(umls_protein_concords, "w") as proteinf: + with open(umls_uniprotkb_filename) as f, open(umls_gene_concords, "w") as genef, open(umls_protein_concords, "w") as proteinf: csv_reader = csv.DictReader(f, dialect="excel-tab") for row in csv_reader: count_rows += 1 diff --git a/src/eutil.py b/src/eutil.py index 18ecda3a..57e85532 100644 --- a/src/eutil.py +++ b/src/eutil.py @@ -1,5 +1,6 @@ import itertools import os + from src.babel_utils import ThrottledRequester diff --git a/src/exporters/duckdb_exporters.py b/src/exporters/duckdb_exporters.py index b466d0f1..37fd9fe8 100644 --- a/src/exporters/duckdb_exporters.py +++ b/src/exporters/duckdb_exporters.py @@ -13,6 +13,7 @@ MIN_FILE_SIZE_FOR_SPLITTING_LOAD = 44_000_000_000 CHUNK_LINE_SIZE = 60_000_000 + def setup_duckdb(duckdb_filename, duckdb_config=None): """ Set up a DuckDB instance using the settings in the config. 
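The duckdb_config dicts handed to setup_duckdb() by the Snakemake rules later in this diff (memory_limit, threads, preserve_insertion_order) are ordinary DuckDB settings. A rough sketch of one way such a dict can be applied at connect time, assuming nothing beyond those settings is needed; setup_duckdb() itself also applies Babel-wide options such as temp_directory.

import duckdb


def open_duckdb(duckdb_filename, settings=None):
    # The DuckDB Python API accepts configuration options as a dict at connect time,
    # e.g. {"memory_limit": "512G", "threads": 2, "preserve_insertion_order": False}.
    return duckdb.connect(duckdb_filename, config=settings or {})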
@@ -28,7 +29,7 @@ def setup_duckdb(duckdb_filename, duckdb_config=None): # Apply some Babel-wide settings to DuckDB. config = get_config() - if 'tmp_directory' in config: + if "tmp_directory" in config: db.execute(f"SET temp_directory = '{config['tmp_directory']}'") db.execute("SET max_temp_directory_size = '500GB';") @@ -75,7 +76,8 @@ def export_compendia_to_parquet(compendium_filename, clique_parquet_filename, du if compendium_filesize < MIN_FILE_SIZE_FOR_SPLITTING_LOAD: # This seems to be around the threshold where 500G is inadequate on Hatteras. So let's try splitting it. logger.info(f"Loading {compendium_filename} into DuckDB (size {compendium_filesize}) in a single direct ingest.") - db.execute("""INSERT INTO Node + db.execute( + """INSERT INTO Node WITH extracted AS ( SELECT json_extract_string(identifier_row.value, ['i', 'l', 'd', 't']) AS extracted_list FROM read_json($1, format='newline_delimited') AS clique, @@ -87,14 +89,16 @@ def export_compendia_to_parquet(compendium_filename, clique_parquet_filename, du LOWER(label) AS label_lc, extracted_list[3] AS description, extracted_list[4] AS taxa - FROM extracted""", [compendium_filename]) + FROM extracted""", + [compendium_filename], + ) else: logger.info(f"Loading {compendium_filename} into DuckDB (size {compendium_filesize}) in multiple chunks of {CHUNK_LINE_SIZE:,} lines:") chunk_filenames = [] lines_added = 0 lines_added_file = 0 output_file = None - with open(compendium_filename, "r", encoding="utf-8") as inf: + with open(compendium_filename, encoding="utf-8") as inf: for line in inf: if output_file is None: output_file = tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8") @@ -115,7 +119,8 @@ def export_compendia_to_parquet(compendium_filename, clique_parquet_filename, du logger.info(f"Loaded {len(chunk_filenames)} containing {lines_added:,} lines into chunk files.") for chunk_filename in chunk_filenames: # TODO: maybe add the PREFIX in a different column here so we can SELECT on that later? 
- db.execute("""INSERT INTO Node + db.execute( + """INSERT INTO Node WITH extracted AS ( SELECT json_extract_string(identifier_row.value, ['i', 'l', 'd', 't']) AS extracted_list FROM read_json($1, format='newline_delimited') AS clique, @@ -127,7 +132,9 @@ def export_compendia_to_parquet(compendium_filename, clique_parquet_filename, du LOWER(label) AS label_lc, extracted_list[3] AS description, extracted_list[4] AS taxa - FROM extracted""", [chunk_filename]) + FROM extracted""", + [chunk_filename], + ) logger.info(f" - Loaded chunk file {chunk_filename} into DuckDB.") os.remove(chunk_filename) logger.info(f" - Deleted chunk file {chunk_filename}.") @@ -135,7 +142,7 @@ def export_compendia_to_parquet(compendium_filename, clique_parquet_filename, du logger.info(f"Completed loading {compendium_filename} into DuckDB.") logger.info(f" - Line count: {lines_added:,}.") - node_count = db.execute('SELECT COUNT(*) FROM Node').fetchone()[0] + node_count = db.execute("SELECT COUNT(*) FROM Node").fetchone()[0] logger.info(f" - Identifier count: {node_count:,}.") db.table("Node").write_parquet(node_parquet_filename) @@ -144,22 +151,28 @@ def export_compendia_to_parquet(compendium_filename, clique_parquet_filename, du db.sql("""CREATE TABLE Clique (clique_leader STRING, preferred_name STRING, clique_identifier_count INT, biolink_type STRING, information_content FLOAT)""") - db.execute("""INSERT INTO Clique SELECT + db.execute( + """INSERT INTO Clique SELECT json_extract_string(identifiers, '$[0].i') AS clique_leader, preferred_name, len(identifiers) AS clique_identifier_count, type AS biolink_type, ic AS information_content - FROM read_json(?, format='newline_delimited')""", [compendium_filename]) + FROM read_json(?, format='newline_delimited')""", + [compendium_filename], + ) db.table("Clique").write_parquet(clique_parquet_filename) # Step 2. Create an Edge table with all the clique/CURIE relationships from this file. db.sql("CREATE TABLE Edge (clique_leader STRING, curie STRING, conflation STRING)") - db.execute("""INSERT INTO Edge SELECT + db.execute( + """INSERT INTO Edge SELECT json_extract_string(identifiers, '$[0].i') AS clique_leader, UNNEST(json_extract_string(identifiers, '$[*].i')) AS curie, 'None' AS conflation - FROM read_json(?, format='newline_delimited')""", [compendium_filename]) + FROM read_json(?, format='newline_delimited')""", + [compendium_filename], + ) db.table("Edge").write_parquet(edge_parquet_filename) diff --git a/src/exporters/kgx.py b/src/exporters/kgx.py index 551374f4..c6954b9f 100644 --- a/src/exporters/kgx.py +++ b/src/exporters/kgx.py @@ -5,10 +5,10 @@ import gzip import hashlib import json +import logging import os from itertools import combinations -import logging from src.util import LoggingUtil, get_memory_usage_summary # Default logger for this file. @@ -46,7 +46,7 @@ def convert_compendium_to_kgx(compendium_filename, kgx_nodes_filename, kgx_edges os.makedirs(os.path.dirname(kgx_edges_filename), exist_ok=True) # Open the compendium file for reading. - with open(compendium_filename, "r", encoding="utf-8") as compendium: + with open(compendium_filename, encoding="utf-8") as compendium: # Open the nodes and edges files for writing. 
with gzip.open(kgx_nodes_filename, "wt", encoding="utf-8") as node_file, gzip.open(kgx_edges_filename, "wt", encoding="utf-8") as edge_file: # set the flag for suppressing the first ",\n" in the written data diff --git a/src/exporters/sapbert.py b/src/exporters/sapbert.py index 3b7a521a..84bfbb68 100644 --- a/src/exporters/sapbert.py +++ b/src/exporters/sapbert.py @@ -10,12 +10,11 @@ import gzip import itertools import json +import logging import os import random import re -import logging - from src.util import LoggingUtil # Default logger for this file. diff --git a/src/make_cliques.py b/src/make_cliques.py index 12234905..770660d4 100644 --- a/src/make_cliques.py +++ b/src/make_cliques.py @@ -1,5 +1,6 @@ -import json import ast +import json + # Starting with a conflation file, and a set of compendia, create a new compendium merging conflated cliques. @@ -9,7 +10,7 @@ def get_conflation_ids(conffilename): return a set of all the ids in the file. """ ids = set() - with open(conffilename, "r") as inf: + with open(conffilename) as inf: for line in inf: ids.update(ast.literal_eval(line.strip())) if "RXCUI:1092396" in ids: @@ -30,7 +31,7 @@ def get_compendia_names(cdir, compendia, ids): """ id2name = {} for compendium in compendia: - with open(f"{cdir}/{compendium}", "r") as inf: + with open(f"{cdir}/{compendium}") as inf: print(compendium) for line in inf: row = json.loads(line) @@ -56,7 +57,7 @@ def label_cliques(conflation_fname, id2name): [{"i": "RXCUI:1092396", "l": "Acetinophem"}, {"i": "RXCUI:849078", "l": "100 mg Tylenol"}, ...] """ print(len(id2name)) - with open("labeled.txt", "w") as outf, open(conflation_fname, "r") as conflation: + with open("labeled.txt", "w") as outf, open(conflation_fname) as conflation: for line in conflation: clique = [] ids = ast.literal_eval(line) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 7505543b..e375a8aa 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -23,7 +23,7 @@ def write_concord_metadata(filename, *, name, concord_filename, url="", descript distinct_curies = set() predicate_counts = defaultdict(int) curie_prefix_counts = defaultdict(int) - with open(concord_filename, "r") as concordf: + with open(concord_filename) as concordf: for line in concordf: row = line.strip().split("\t") if len(row) != 3: @@ -71,7 +71,7 @@ def write_combined_metadata( ) combined_from_filenames = [combined_from_filenames] for metadata_yaml in combined_from_filenames: - with open(metadata_yaml, "r") as metaf: + with open(metadata_yaml) as metaf: metadata_block = yaml.safe_load(metaf) if metadata_block is None or metadata_block == {}: raise ValueError("Metadata file {metadata_yaml} is empty.") diff --git a/src/node.py b/src/node.py index 4d18584f..e52fce13 100644 --- a/src/node.py +++ b/src/node.py @@ -7,16 +7,16 @@ import curies +from src.LabeledID import LabeledID +from src.prefixes import PUBCHEMCOMPOUND from src.util import ( Text, - get_config, get_biolink_model_toolkit, get_biolink_prefix_map, + get_config, get_logger, get_memory_usage_summary, ) -from src.LabeledID import LabeledID -from src.prefixes import PUBCHEMCOMPOUND logger = get_logger(__name__) @@ -52,7 +52,7 @@ def __init__(self, syndir): for common_synonyms_file in self.config["common"]["synonyms"]: common_synonyms_path = os.path.join(self.config["download_directory"], "common", common_synonyms_file) count_common_file_synonyms = 0 - with open(common_synonyms_path, "r") as synonymsf: + with open(common_synonyms_path) as synonymsf: # Note that these 
files may contain ANY prefix -- we should only fallback to this if we have no other # option. for line in synonymsf: @@ -70,14 +70,14 @@ def load_synonyms(self, prefix): count_labels = 0 count_synonyms = 0 if os.path.exists(labelfname): - with open(labelfname, "r") as inf: + with open(labelfname) as inf: for line in inf: x = line.strip().split("\t") lbs[x[0]].add(("http://www.geneontology.org/formats/oboInOwl#hasExactSynonym", x[1])) count_labels += 1 synfname = os.path.join(self.synonym_dir, prefix, "synonyms") if os.path.exists(synfname): - with open(synfname, "r") as inf: + with open(synfname) as inf: for line in inf: x = line.strip().split("\t") if len(x) < 3: @@ -113,7 +113,7 @@ def __init__(self, rootdir): for common_descriptions_file in self.config["common"]["descriptions"]: common_descriptions_path = os.path.join(self.config["download_directory"], "common", common_descriptions_file) count_common_file_descriptions = 0 - with open(common_descriptions_path, "r") as descriptionsf: + with open(common_descriptions_path) as descriptionsf: # Note that these files may contain ANY CURIE -- we should only fallback to this if we have no other # option. for line in descriptionsf: @@ -130,7 +130,7 @@ def load_descriptions(self, prefix): descfname = os.path.join(self.root_dir, prefix, "descriptions") desc_count = 0 if os.path.exists(descfname): - with open(descfname, "r") as inf: + with open(descfname) as inf: for line in inf: x = line.strip().split("\t") descs[x[0]].add("\t".join(x[1:])) @@ -230,7 +230,7 @@ def load_prefix(self, prefix): logger.info(f"Reading records from {tsv_filename} into memory to load into SQLite: {get_memory_usage_summary()}") records = [] record_count = 0 - with open(tsv_filename, "r") as inf: + with open(tsv_filename) as inf: for line in inf: x = line.strip().split("\t", maxsplit=1) records.append([x[0].upper(), x[1]]) @@ -344,7 +344,7 @@ def __init__(self, ic_file): ubergraph_iri_stem_to_prefix_map = curies.Converter.from_reverse_prefix_map(config["ubergraph_iri_stem_to_prefix_map"]) count_by_prefix = defaultdict(int) - with open(ic_file, "r") as inf: + with open(ic_file) as inf: for line in inf: x = line.strip().split("\t") # We talk in CURIEs, but the infores download is in URLs. We can use the Biolink @@ -502,7 +502,7 @@ def load_extra_labels(self, prefix): labelfname = os.path.join(self.label_dir, prefix, "labels") lbs = {} if os.path.exists(labelfname): - with open(labelfname, "r") as inf: + with open(labelfname) as inf: for line in inf: x = line.strip().split("\t") lbs[x[0]] = x[1] @@ -518,7 +518,7 @@ def apply_labels(self, input_identifiers, labels): for common_labels_file in config["common"]["labels"]: common_labels_path = os.path.join(config["download_directory"], "common", common_labels_file) count_common_file_labels = 0 - with open(common_labels_path, "r") as labelf: + with open(common_labels_path) as labelf: # Note that these files may contain ANY prefix -- we should only fallback to this if we have no other # option. 
for line in labelf: diff --git a/src/reports/compendia_per_file_reports.py b/src/reports/compendia_per_file_reports.py index cf54dd86..349492c5 100644 --- a/src/reports/compendia_per_file_reports.py +++ b/src/reports/compendia_per_file_reports.py @@ -58,7 +58,7 @@ def generate_content_report_for_compendium(compendium_path, report_path): """ with open(report_path, "w") as report_file: - with open(compendium_path, "r") as compendium_file: + with open(compendium_path) as compendium_file: # This is a JSONL file, so we need to read each line as a JSON object. # Track CURIE breakdowns for this compendium. @@ -148,7 +148,7 @@ def summarize_content_report_for_compendia(compendia_report_paths, summary_path) # Read all the summary reports -- these are small, so we can just read them all in. for report_path in compendia_report_paths: - with open(report_path, "r") as report_file: + with open(report_path) as report_file: report = json.load(report_file) # name = report['name'] diff --git a/src/reports/duckdb_reports.py b/src/reports/duckdb_reports.py index 50a89bfa..09b2a29c 100644 --- a/src/reports/duckdb_reports.py +++ b/src/reports/duckdb_reports.py @@ -1,13 +1,13 @@ -import csv import json import os -from collections import Counter, defaultdict +from collections import defaultdict from src import util from src.exporters.duckdb_exporters import setup_duckdb logger = util.get_logger(__name__) + def check_for_identically_labeled_cliques(parquet_root, duckdb_filename, identically_labeled_cliques_tsv, duckdb_config=None): """ Generate a list of identically labeled cliques. @@ -174,7 +174,7 @@ def generate_curie_report(parquet_root, duckdb_filename, curie_report_json, duck # Add total counts back in. for curie_prefix in by_curie_prefix_results.keys(): - by_curie_prefix_results[curie_prefix]['_totals'] = prefix_totals_report_by_curie_prefix[curie_prefix] + by_curie_prefix_results[curie_prefix]["_totals"] = prefix_totals_report_by_curie_prefix[curie_prefix] with open(curie_report_json, "w") as fout: json.dump(by_curie_prefix_results, fout, indent=2, sort_keys=True) @@ -260,16 +260,18 @@ def generate_clique_leaders_report(parquet_root, duckdb_filename, by_clique_repo # Step 3. Add total counts back in. for filename, clique_leader_prefix_entries in clique_leaders_by_filename.items(): if filename in clique_totals_by_curie_prefix: - clique_leaders_by_filename[filename]['_totals'] = clique_totals_by_curie_prefix[filename] + clique_leaders_by_filename[filename]["_totals"] = clique_totals_by_curie_prefix[filename] # Step 4. Write out by-clique report in JSON. with open(by_clique_report_json, "w") as fout: - json.dump(clique_leaders_by_filename, + json.dump( + clique_leaders_by_filename, fout, indent=2, sort_keys=True, ) + def get_label_distribution(duckdb_filename, output_filename): db = setup_duckdb(duckdb_filename) diff --git a/src/reports/report_tables.py b/src/reports/report_tables.py index fc4f375e..758f60e5 100644 --- a/src/reports/report_tables.py +++ b/src/reports/report_tables.py @@ -15,7 +15,6 @@ # import csv import json -from collections import defaultdict def generate_prefix_table(prefix_report_json: str, prefix_report_table_csv: str): @@ -26,7 +25,7 @@ def generate_prefix_table(prefix_report_json: str, prefix_report_table_csv: str) :param prefix_report_table_csv: The report table CSV file to generate. 
""" - with open(prefix_report_json, 'r') as f: + with open(prefix_report_json) as f: prefix_report = json.load(f) curie_entries = [] @@ -38,130 +37,112 @@ def generate_prefix_table(prefix_report_json: str, prefix_report_table_csv: str) raise ValueError(f"Duplicate filename {filename} for prefix {prefix}!") filename_entries[filename] = { - 'prefix': prefix, - 'curie_count': entry['curie_count'], - 'curie_distinct_count': entry['curie_distinct_count'], + "prefix": prefix, + "curie_count": entry["curie_count"], + "curie_distinct_count": entry["curie_distinct_count"], } - if '_totals' not in filename_entries: + if "_totals" not in filename_entries: raise ValueError(f"No totals entry for prefix {prefix}!") - sorted_entries = sorted(filename_entries.items(), key=lambda x: x[1]['curie_distinct_count'], reverse=True) + sorted_entries = sorted(filename_entries.items(), key=lambda x: x[1]["curie_distinct_count"], reverse=True) filename_rows = [] for filename, entry in sorted_entries: - if filename == '_totals': + if filename == "_totals": continue - if entry['curie_count'] == entry['curie_distinct_count']: + if entry["curie_count"] == entry["curie_distinct_count"]: filename_rows.append(f"- {filename}: {entry['curie_count']:,} CURIEs") else: filename_rows.append(f"- {filename}: {entry['curie_count']:,} CURIEs ({entry['curie_distinct_count']:,} distinct)") - curie_entries.append({ - 'prefix': prefix, - 'curie_count': filename_entries['_totals']['curie_count'], - 'curie_distinct_count': filename_entries['_totals']['curie_distinct_count'], - 'filenames': "\n".join(filename_rows), - }) + curie_entries.append( + { + "prefix": prefix, + "curie_count": filename_entries["_totals"]["curie_count"], + "curie_distinct_count": filename_entries["_totals"]["curie_distinct_count"], + "filenames": "\n".join(filename_rows), + } + ) # Before writing it out, sort by distinct CURIE count descending. - with open(prefix_report_table_csv, 'w') as f: - writer = csv.DictWriter(f, [ - 'Prefix', - 'CURIE count', - 'Distinct CURIE count', - 'Filenames' - ]) + with open(prefix_report_table_csv, "w") as f: + writer = csv.DictWriter(f, ["Prefix", "CURIE count", "Distinct CURIE count", "Filenames"]) writer.writeheader() - for entry in sorted(curie_entries, key=lambda x: x['curie_distinct_count'], reverse=True): + for entry in sorted(curie_entries, key=lambda x: x["curie_distinct_count"], reverse=True): row = { - 'Prefix': entry['prefix'], - 'CURIE count': "{:,}".format(entry['curie_count']), - 'Distinct CURIE count': "{:,}".format(entry['curie_distinct_count']), - 'Filenames': entry['filenames'], + "Prefix": entry["prefix"], + "CURIE count": "{:,}".format(entry["curie_count"]), + "Distinct CURIE count": "{:,}".format(entry["curie_distinct_count"]), + "Filenames": entry["filenames"], } writer.writerow(row) + def generate_cliques_table(cliques_report_json: str, cliques_table_csv: str): - with open(cliques_report_json, 'r') as f: + with open(cliques_report_json) as f: cliques_report = json.load(f) # To improve the table somewhat, we'll include pipeline descriptions that group filenames. 
pipeline_descriptions = { - 'Anatomy': { - 'description': 'Anatomical entities at all scales, from brains to endothelium to pancreatic beta cells', - 'filenames': [ - 'AnatomicalEntity', - 'Cell', - 'CellularComponent', - 'GrossAnatomicalStructure' - ], + "Anatomy": { + "description": "Anatomical entities at all scales, from brains to endothelium to pancreatic beta cells", + "filenames": ["AnatomicalEntity", "Cell", "CellularComponent", "GrossAnatomicalStructure"], }, - 'CellLine': { - 'description': 'Cell lines from different species', - 'filenames': ['CellLine'], + "CellLine": { + "description": "Cell lines from different species", + "filenames": ["CellLine"], }, - 'Chemicals': { - 'description': 'All kinds of chemicals, including drugs, small molecules, molecular mixtures, and so on', - 'filenames': [ - 'MolecularMixture', - 'SmallMolecule', - 'Polypeptide', - 'ComplexMolecularMixture', - 'ChemicalEntity', - 'ChemicalMixture', - 'Drug' - ], + "Chemicals": { + "description": "All kinds of chemicals, including drugs, small molecules, molecular mixtures, and so on", + "filenames": ["MolecularMixture", "SmallMolecule", "Polypeptide", "ComplexMolecularMixture", "ChemicalEntity", "ChemicalMixture", "Drug"], }, - 'DiseasePhenotype': { - 'description': 'Conflation of drugs with their active ingredients as chemicals', - 'filenames': [ - 'Disease', - 'PhenotypicFeature' - ], + "DiseasePhenotype": { + "description": "Diseases and phenotypic features", + "filenames": ["Disease", "PhenotypicFeature"], }, - 'DrugChemical': { - 'description': 'Conflation of drugs with their active ingredients as chemicals', - 'filenames': [], + "DrugChemical": { + "description": "Conflation of drugs with their active ingredients as chemicals", + "filenames": [], }, - 'Gene': { - 'description': 'Genes from all species', - 'filenames': ['Gene'], + "Gene": { + "description": "Genes from all species", + "filenames": ["Gene"], }, - 'GeneFamily': { - 'description': 'Families of genes', - 'filenames': ['GeneFamily'], + "GeneFamily": { + "description": "Families of genes", + "filenames": ["GeneFamily"], }, - 'GeneProtein': { - 'description': 'Conflation of genes with the proteins they code for.', - 'filenames': [], + "GeneProtein": { + "description": "Conflation of genes with the proteins they code for.", + "filenames": [], }, - 'Leftover UMLS': { - 'description': 'A special pipeline that adds every UMLS concept not already added elsewhere in Babel', - 'filenames': ['umls'], + "Leftover UMLS": { + "description": "A special pipeline that adds every UMLS concept not already added elsewhere in Babel", + "filenames": ["umls"], }, - 'Macromolecular Complex': { - 'description': '', - 'filenames': ['MacromolecularComplex'], + "Macromolecular Complex": { + "description": "", + "filenames": ["MacromolecularComplex"], }, - 'ProcessActivityPathway': { - 'description': 'Biological processes, activities and pathways', - 'filenames': ['Pathway', 'BiologicalProcess', 'MolecularActivity'], + "ProcessActivityPathway": { + "description": "Biological processes, activities and pathways", + "filenames": ["Pathway", "BiologicalProcess", "MolecularActivity"], }, - 'Protein': { - 'description': 'Proteins from all species', - 'filenames': ['Protein'], + "Protein": { + "description": "Proteins from all species", + "filenames": ["Protein"], }, - 'Publications': { - 'description': 'All publications from PubMed', - 'filenames': ['Publication'], + "Publications": { + "description": "All publications from PubMed", +
"filenames": ["Publication"], + }, + "Taxon": { + "description": "Taxonomic entities, including species, genera, families, and so on from the NCBI Taxonomy", + "filenames": ["OrganismTaxon"], }, - 'Taxon': { - 'description': 'Taxonomic entities, including species, genera, families, and so on from the NCBI Taxonomy', - 'filenames': ['OrganismTaxon'], - } } clique_leader_entries = {} @@ -171,18 +152,16 @@ def generate_cliques_table(cliques_report_json: str, cliques_table_csv: str): totals = {} for clique_leader_prefix, inner2 in inner.items(): - if clique_leader_prefix == '_totals': + if clique_leader_prefix == "_totals": totals = inner2 continue clique_leader_prefixes.add(clique_leader_prefix) for curie_prefix, entry in inner2.items(): - curie_prefix_entries.append({ - 'curie_prefix': curie_prefix, - 'curie_count': entry['curie_count'], - 'distinct_curie_count': entry['distinct_curie_count'] - }) + curie_prefix_entries.append( + {"curie_prefix": curie_prefix, "curie_count": entry["curie_count"], "distinct_curie_count": entry["distinct_curie_count"]} + ) if not totals: raise ValueError(f"No totals entry for filename {filename}!") @@ -190,71 +169,80 @@ def generate_cliques_table(cliques_report_json: str, cliques_table_csv: str): if filename in clique_leader_entries: raise ValueError(f"Duplicate filename {filename}!") - curie_prefixes = map(lambda e: f"{e['curie_prefix']}", sorted(curie_prefix_entries, key=lambda x: x['distinct_curie_count'], reverse=True)) + curie_prefixes = map(lambda e: f"{e['curie_prefix']}", sorted(curie_prefix_entries, key=lambda x: x["distinct_curie_count"], reverse=True)) unique_curie_prefixes = [] for prefix in curie_prefixes: if prefix not in unique_curie_prefixes: unique_curie_prefixes.append(prefix) clique_leader_entries[filename] = { - 'curie_count': totals['curie_count'], - 'distinct_curie_count': totals['distinct_curie_count'], - 'total_synonyms': '', - 'clique_leader_prefixes': ", ".join(sorted(clique_leader_prefixes)), - 'curie_prefixes': ", ".join(unique_curie_prefixes), + "curie_count": totals["curie_count"], + "distinct_curie_count": totals["distinct_curie_count"], + "total_synonyms": "", + "clique_leader_prefixes": ", ".join(sorted(clique_leader_prefixes)), + "curie_prefixes": ", ".join(unique_curie_prefixes), } filenames_not_written = set(clique_leader_entries.keys()) - with open(cliques_table_csv, 'w') as f: - writer = csv.DictWriter(f, [ - 'Pipeline', - 'Description', - 'Biolink Types', - 'Number of CURIEs', - 'Number of distinct CURIEs', - 'Clique leader prefixes', - 'CURIE prefixes', - ]) + with open(cliques_table_csv, "w") as f: + writer = csv.DictWriter( + f, + [ + "Pipeline", + "Description", + "Biolink Types", + "Number of CURIEs", + "Number of distinct CURIEs", + "Clique leader prefixes", + "CURIE prefixes", + ], + ) writer.writeheader() for pipeline, entry in pipeline_descriptions.items(): - description = entry['description'] + description = entry["description"] - filenames = entry.get('filenames', []) + filenames = entry.get("filenames", []) if len(filenames) == 0: - writer.writerow({ - 'Pipeline': pipeline, - 'Description': description, - 'Biolink Types': 'N/A', - 'Number of CURIEs': '', - 'Number of distinct CURIEs': '', - 'Clique leader prefixes': '', - 'CURIE prefixes': '', - }) + writer.writerow( + { + "Pipeline": pipeline, + "Description": description, + "Biolink Types": "N/A", + "Number of CURIEs": "", + "Number of distinct CURIEs": "", + "Clique leader prefixes": "", + "CURIE prefixes": "", + } + ) for filename in filenames: if filename 
not in clique_leader_entries: raise ValueError(f"Pipeline {pipeline} references filename {filename} that isn't in clique_leader_entries!") - writer.writerow({ - 'Pipeline': pipeline, - 'Description': description, - 'Biolink Types': filename, - 'Number of CURIEs': "{:,}".format(clique_leader_entries[filename]['curie_count']), - 'Number of distinct CURIEs': "{:,}".format(clique_leader_entries[filename]['distinct_curie_count']), - 'Clique leader prefixes': clique_leader_entries[filename]['clique_leader_prefixes'], - 'CURIE prefixes': clique_leader_entries[filename]['curie_prefixes'], - }) + writer.writerow( + { + "Pipeline": pipeline, + "Description": description, + "Biolink Types": filename, + "Number of CURIEs": "{:,}".format(clique_leader_entries[filename]["curie_count"]), + "Number of distinct CURIEs": "{:,}".format(clique_leader_entries[filename]["distinct_curie_count"]), + "Clique leader prefixes": clique_leader_entries[filename]["clique_leader_prefixes"], + "CURIE prefixes": clique_leader_entries[filename]["curie_prefixes"], + } + ) filenames_not_written.remove(filename) for filename in sorted(filenames_not_written): - writer.writerow({ - 'Pipeline': '**NONE**', - 'Description': '', - 'Biolink Types': filename, - 'Number of CURIEs': "{:,}".format(clique_leader_entries[filename]['curie_count']), - 'Number of distinct CURIEs': "{:,}".format(clique_leader_entries[filename]['distinct_curie_count']), - 'Clique leader prefixes': clique_leader_entries[filename]['clique_leader_prefixes'], - 'CURIE prefixes': clique_leader_entries[filename]['curie_prefixes'], - }) + writer.writerow( + { + "Pipeline": "**NONE**", + "Description": "", + "Biolink Types": filename, + "Number of CURIEs": "{:,}".format(clique_leader_entries[filename]["curie_count"]), + "Number of distinct CURIEs": "{:,}".format(clique_leader_entries[filename]["distinct_curie_count"]), + "Clique leader prefixes": clique_leader_entries[filename]["clique_leader_prefixes"], + "CURIE prefixes": clique_leader_entries[filename]["curie_prefixes"], + } + ) diff --git a/src/sdfreader.py b/src/sdfreader.py index 7c991403..ef1fd861 100644 --- a/src/sdfreader.py +++ b/src/sdfreader.py @@ -1,7 +1,7 @@ def read_sdf(infile, interesting_keys): """Given an sdf file name and a set of keys that we'd like to extract, return a dictionary going chebiid -> {properties} where the properties are chosen from the interesting keys""" - with open(infile, "r") as inf: + with open(infile) as inf: chebisdf = inf.read() lines = chebisdf.split("\n") chunk = [] diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index 7edbbc93..49e166bf 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -52,7 +52,7 @@ rule anatomy_umls_ids: rule get_anatomy_obo_relationships: - retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. + retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. output: config["intermediate_directory"] + "/anatomy/concords/UBERON", config["intermediate_directory"] + "/anatomy/concords/CL", diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 8a09ac93..dfbb101f 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -101,7 +101,7 @@ rule chemical_drugcentral_ids: rule chemical_chebi_ids: - retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. + retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. 
output: outfile=config["intermediate_directory"] + "/chemicals/ids/CHEBI", run: diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 44540911..5d210e79 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -212,7 +212,7 @@ rule get_obo_labels: download_directory=config["download_directory"], prefix=config["generate_dirs_for_labels_and_synonyms_prefixes"], ), - retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. + retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. run: obo.pull_uber_labels(output.obo_labels, output.generated_labels) @@ -228,7 +228,7 @@ rule get_obo_synonyms: download_directory=config["download_directory"], prefix=config["generate_dirs_for_labels_and_synonyms_prefixes"], ), - retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. + retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. run: obo.pull_uber_synonyms(output.obo_synonyms, output.generated_synonyms) @@ -236,10 +236,11 @@ rule get_obo_synonyms: rule get_obo_descriptions: output: obo_descriptions=config["download_directory"] + "/common/ubergraph/descriptions.jsonl", - retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. + retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. run: obo.pull_uber_descriptions(output.obo_descriptions) + rule get_icrdf: input: # Ideally, we would correctly mark all the dependencies for Ubergraph labels, synonyms and descriptions @@ -251,7 +252,7 @@ rule get_icrdf: config["download_directory"] + "/common/ubergraph/descriptions.jsonl", output: icrdf_filename=config["download_directory"] + "/icRDF.tsv", - retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. + retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry. 
run: obo.pull_uber_icRDF(output.icrdf_filename) diff --git a/src/snakefiles/duckdb.snakefile b/src/snakefiles/duckdb.snakefile index 8c0474ac..ec7a1534 100644 --- a/src/snakefiles/duckdb.snakefile +++ b/src/snakefiles/duckdb.snakefile @@ -87,11 +87,16 @@ rule check_for_identically_labeled_cliques: duckdb_filename=temp(config["output_directory"] + "/duckdb/duckdbs/identically_labeled_clique.duckdb"), identically_labeled_cliques_tsv=config["output_directory"] + "/reports/duckdb/identically_labeled_cliques.tsv.gz", run: - src.reports.duckdb_reports.check_for_identically_labeled_cliques(params.parquet_dir, output.duckdb_filename, output.identically_labeled_cliques_tsv, { - 'memory_limit': '512G', - 'threads': 2, - 'preserve_insertion_order': False, - }) + src.reports.duckdb_reports.check_for_identically_labeled_cliques( + params.parquet_dir, + output.duckdb_filename, + output.identically_labeled_cliques_tsv, + { + "memory_limit": "512G", + "threads": 2, + "preserve_insertion_order": False, + }, + ) rule check_for_duplicate_curies: @@ -106,11 +111,16 @@ rule check_for_duplicate_curies: duckdb_filename=temp(config["output_directory"] + "/duckdb/duckdbs/duplicate_curies.duckdb"), duplicate_curies=config["output_directory"] + "/reports/duckdb/duplicate_curies.tsv", run: - src.reports.duckdb_reports.check_for_duplicate_curies(params.parquet_dir, output.duckdb_filename, output.duplicate_curies, { - 'memory_limit': '1500G', - 'threads': 1, - 'preserve_insertion_order': False, - }) + src.reports.duckdb_reports.check_for_duplicate_curies( + params.parquet_dir, + output.duckdb_filename, + output.duplicate_curies, + { + "memory_limit": "1500G", + "threads": 1, + "preserve_insertion_order": False, + }, + ) rule check_for_duplicate_clique_leaders: @@ -125,11 +135,17 @@ rule check_for_duplicate_clique_leaders: duckdb_filename=temp(config["output_directory"] + "/duckdb/duckdbs/duplicate_clique_leaders.duckdb"), duplicate_clique_leaders_tsv=config["output_directory"] + "/reports/duckdb/duplicate_clique_leaders.tsv", run: - src.reports.duckdb_reports.check_for_duplicate_clique_leaders(params.parquet_dir, output.duckdb_filename, output.duplicate_clique_leaders_tsv, { - 'memory_limit': '512G', - 'threads': 2, - 'preserve_insertion_order': False, - }) + src.reports.duckdb_reports.check_for_duplicate_clique_leaders( + params.parquet_dir, + output.duckdb_filename, + output.duplicate_clique_leaders_tsv, + { + "memory_limit": "512G", + "threads": 2, + "preserve_insertion_order": False, + }, + ) + rule generate_curie_report: resources: @@ -144,12 +160,17 @@ rule generate_curie_report: duckdb_filename=temp(config["output_directory"] + "/duckdb/duckdbs/curie_report.duckdb"), curie_report_json=config["output_directory"] + "/reports/duckdb/curie_report.json", run: - src.reports.duckdb_reports.generate_curie_report(params.parquet_dir, output.duckdb_filename, output.curie_report_json, { - # 'memory_limit': '20G', -- this actually worked! - 'memory_limit': '100G', - 'threads': 5, - 'preserve_insertion_order': False, - }) + src.reports.duckdb_reports.generate_curie_report( + params.parquet_dir, + output.duckdb_filename, + output.curie_report_json, + { + # 'memory_limit': '20G', -- this actually worked! 
+ "memory_limit": "100G", + "threads": 5, + "preserve_insertion_order": False, + }, + ) rule generate_clique_leader_report: @@ -164,11 +185,16 @@ rule generate_clique_leader_report: duckdb_filename=temp(config["output_directory"] + "/duckdb/duckdbs/clique_leaders.duckdb"), clique_leaders_json=config["output_directory"] + "/reports/duckdb/clique_leaders.json", run: - src.reports.duckdb_reports.generate_clique_leaders_report(params.parquet_dir, output.duckdb_filename, output.clique_leaders_json, { - 'memory_limit': '20G', - 'threads': 3, - 'preserve_insertion_order': False, - }) + src.reports.duckdb_reports.generate_clique_leaders_report( + params.parquet_dir, + output.duckdb_filename, + output.clique_leaders_json, + { + "memory_limit": "20G", + "threads": 3, + "preserve_insertion_order": False, + }, + ) rule all_duckdb_reports: diff --git a/src/snakefiles/reports.snakefile b/src/snakefiles/reports.snakefile index 1999319e..116fe3f7 100644 --- a/src/snakefiles/reports.snakefile +++ b/src/snakefiles/reports.snakefile @@ -80,10 +80,12 @@ rule generate_summary_content_report_for_compendia: run: summarize_content_report_for_compendia(input.expected_content_reports, output.report_path) + # # REPORT TABLES # + # Generate a prefix table. rule generate_prefix_table: input: @@ -93,6 +95,7 @@ rule generate_prefix_table: run: report_tables.generate_prefix_table(input.curie_report, output.prefix_table) + # Generate a cliques table. rule generate_cliques_table: input: @@ -102,6 +105,7 @@ rule generate_cliques_table: run: report_tables.generate_cliques_table(input.cliques_report, output.cliques_table) + # Check that all the reports were built correctly. rule all_reports: input: diff --git a/src/snakefiles/util.py b/src/snakefiles/util.py index 1967b59e..77735809 100644 --- a/src/snakefiles/util.py +++ b/src/snakefiles/util.py @@ -1,6 +1,6 @@ # Shared code used by Snakemake files -import shutil import gzip +import shutil import src.util diff --git a/src/synonyms/synonymconflation.py b/src/synonyms/synonymconflation.py index 174b205c..5ac4eb52 100644 --- a/src/synonyms/synonymconflation.py +++ b/src/synonyms/synonymconflation.py @@ -14,6 +14,7 @@ logger = util.get_logger(__name__) + # click.command() # click.option('--conflation-file', multiple=True, type=click.Path(exists=True)) # click.option('--output', type=click.Path(exists=False), default='-') @@ -41,7 +42,7 @@ def conflate_synonyms(synonym_files_gz, compendia_files, conflation_file, output # Step 1. Load all the conflations. We only need to work on these identifiers, so that simplifies our work. 
for conflation_filename in conflation_file: logger.info(f"Reading conflation file {conflation_filename}") - with open(conflation_filename, "r") as conflationf: + with open(conflation_filename) as conflationf: count_primary = 0 count_secondary = 0 for line in conflationf: @@ -74,7 +75,7 @@ def conflate_synonyms(synonym_files_gz, compendia_files, conflation_file, output for compendium_filename in compendia_files: logger.info(f"Reading compendium file {compendium_filename}") - with open(compendium_filename, "r") as compendiumf: + with open(compendium_filename) as compendiumf: for line in compendiumf: clique = json.loads(line) identifiers = clique.get("identifiers", []) diff --git a/src/triplestore.py b/src/triplestore.py index 4031658c..4bf832e3 100644 --- a/src/triplestore.py +++ b/src/triplestore.py @@ -1,14 +1,15 @@ +import logging import os -from src.util import LoggingUtil -from SPARQLWrapper import SPARQLWrapper2, JSON, POSTDIRECTLY, POST from string import Template -import logging +from SPARQLWrapper import JSON, POST, POSTDIRECTLY, SPARQLWrapper2 + +from src.util import LoggingUtil logger = LoggingUtil.init_logging(__name__, logging.ERROR) -class TripleStore(object): +class TripleStore: """Connect to a SPARQL endpoint and provide services for loading and executing queries.""" def __init__(self, hostname): @@ -21,8 +22,8 @@ def get_template(self, query_name): def get_template_text(self, query_name): """Get the text of a template given its name""" query = None - fn = os.path.join(os.path.dirname(__file__), "query", "{0}.sparql".format(query_name)) - with open(fn, "r") as stream: + fn = os.path.join(os.path.dirname(__file__), "query", f"{query_name}.sparql") + with open(fn) as stream: query = stream.read() return query diff --git a/src/ubergraph.py b/src/ubergraph.py index a03438d4..7dee22f2 100644 --- a/src/ubergraph.py +++ b/src/ubergraph.py @@ -1,12 +1,12 @@ import logging +from collections import defaultdict from time import sleep +from src.babel_utils import norm from src.triplestore import TripleStore from src.util import Text -from collections import defaultdict -from src.babel_utils import norm -SLEEP_BETWEEN_UBERGRAPH_QUERIES = 5 # seconds +SLEEP_BETWEEN_UBERGRAPH_QUERIES = 5 # seconds class UberGraph: @@ -301,7 +301,7 @@ def get_subclasses_and_xrefs(self, iri): # Sometimes we're getting back just strings that aren't curies, skip those (but complain) try: dcurie = Text.opt_to_curie(row["descendent"]) - results[dcurie].add((Text.opt_to_curie(row["xref"]))) + results[dcurie].add(Text.opt_to_curie(row["xref"])) except ValueError as verr: print(f"Bad XREF from {row['descendent']} to {row['xref']}: {verr}") continue @@ -402,7 +402,7 @@ def get_subclasses_and_close(self, iri): results[desc] += [] else: try: - results[desc].append((Text.opt_to_curie(row["match"]))) + results[desc].append(Text.opt_to_curie(row["match"])) except ValueError as verr: # Sometimes, if there are no exact_matches, we'll get some kind of blank node id # like 't19830198'. Want to filter those out. 
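Two defensive habits recur around the Ubergraph code in this diff: the download rules set retries: 10 because Ubergraph sometimes fails mid-download, and ubergraph.py paces successive queries with SLEEP_BETWEEN_UBERGRAPH_QUERIES. A minimal sketch of that retry-plus-pause idea, using a stand-in callable rather than the UberGraph class's real SPARQL call:

from time import sleep

SLEEP_BETWEEN_UBERGRAPH_QUERIES = 5  # seconds, as in src/ubergraph.py
MAX_ATTEMPTS = 10  # mirrors the retries: 10 on the Snakemake download rules


def query_with_retries(run_query, sparql):
    # Retry a flaky query, pausing between attempts; run_query is a placeholder callable.
    last_error = None
    for attempt in range(MAX_ATTEMPTS):
        if attempt > 0:
            sleep(SLEEP_BETWEEN_UBERGRAPH_QUERIES)
        try:
            return run_query(sparql)
        except Exception as err:  # narrow to the endpoint's real error types in practice
            last_error = err
    raise last_error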
diff --git a/src/util.py b/src/util.py index 90f77b3e..276f2b7b 100644 --- a/src/util.py +++ b/src/util.py @@ -1,22 +1,21 @@ -import logging +import copy import json +import logging import os import sys +from collections import namedtuple +from logging.handlers import RotatingFileHandler from time import gmtime import curies -import yaml import psutil -from collections import namedtuple -import copy -from logging.handlers import RotatingFileHandler - +import yaml from bmt import Toolkit from humanfriendly import format_size -from src.LabeledID import LabeledID -from src.prefixes import OMIM, OMIMPS, UMLS, SNOMEDCT, KEGGPATHWAY, KEGGREACTION, NCIT, ICD10, ICD10CM, ICD11FOUNDATION import src.prefixes as prefixes +from src.LabeledID import LabeledID +from src.prefixes import ICD10, ICD10CM, ICD11FOUNDATION, KEGGPATHWAY, KEGGREACTION, NCIT, OMIM, OMIMPS, SNOMEDCT, UMLS def get_logger(name, loglevel=logging.INFO): @@ -46,7 +45,7 @@ def get_logger(name, loglevel=logging.INFO): # loggers = {} -class LoggingUtil(object): +class LoggingUtil: """Logging utility controlling format and setting initial logging level""" @staticmethod @@ -101,7 +100,7 @@ def init_logging(name, level=logging.INFO, format="short", logFilePath=None, log return logger -class Munge(object): +class Munge: @staticmethod def gene(gene): return gene.split("/")[-1:][0] if gene.startswith("http://") else gene @@ -255,14 +254,14 @@ def get_resource_path(resource_name): @staticmethod def load_json(path): result = None - with open(path, "r") as stream: + with open(path) as stream: result = json.loads(stream.read()) return result @staticmethod def load_yaml(path): result = None - with open(path, "r") as stream: + with open(path) as stream: result = yaml.load(stream.read()) return result @@ -345,7 +344,7 @@ def get_config(): return config_yaml cname = os.path.join(os.path.dirname(__file__), "..", "config.yaml") - with open(cname, "r") as yaml_file: + with open(cname) as yaml_file: config_yaml = yaml.safe_load(yaml_file) return config_yaml diff --git a/tests/datahandlers/test_ensembl.py b/tests/datahandlers/test_ensembl.py index a10aaae7..2a22c89f 100644 --- a/tests/datahandlers/test_ensembl.py +++ b/tests/datahandlers/test_ensembl.py @@ -79,7 +79,7 @@ def test_pull_ensembl(tmp_path): split_tsv = download_as_splits / "choffmanni_gene_ensembl" / "BioMart.tsv" assert unsplit_tsv.exists() assert split_tsv.exists() - with open(unsplit_tsv, "r") as unsplit_file, open(split_tsv, "r") as split_file: + with open(unsplit_tsv) as unsplit_file, open(split_tsv) as split_file: # So we can't compare these files directly, because rows with the same ensembl_gene_id shows up in an # undetermined order. So we need to load them, group them by ENSEMBL gene ID, and then compare those sets. unsplit_rows = list(read_biomart_file(unsplit_file)) diff --git a/tests/test_ThrottledRequester.py b/tests/test_ThrottledRequester.py index 87c194c0..333a3eaa 100644 --- a/tests/test_ThrottledRequester.py +++ b/tests/test_ThrottledRequester.py @@ -1,5 +1,6 @@ from datetime import datetime as dt from datetime import timedelta + from src.babel_utils import ThrottledRequester diff --git a/tests/test_ftp.py b/tests/test_ftp.py index c9dac12c..a6cc4bbd 100644 --- a/tests/test_ftp.py +++ b/tests/test_ftp.py @@ -1,6 +1,8 @@ +import gzip + import pytest + from src.babel_utils import pull_via_ftp -import gzip # FTP doesn't play nicely with travis-ci, so these are marked so they can be excluded. 
 # See: https://blog.travis-ci.com/2018-07-23-the-tale-of-ftp-at-travis-ci
@@ -21,7 +23,7 @@ def test_pull_text_to_file():
     """Pull a text file into local file"""
     ofname = "test_text"
     outname = pull_via_ftp("ftp.ncbi.nlm.nih.gov", "gene/DATA/", "stopwords_gene", outfilename=ofname)
-    with open(outname, "r") as inf:
+    with open(outname) as inf:
         lines = inf.read().split("\n")
     assert len(lines) > 100
     assert lines[0] == "a"
@@ -41,7 +43,7 @@ def test_pull_gzip_to_uncompressed_file():
     """Pull a gzipped file into memory, decompressed"""
     ofname = "test_gz_text"
     outname = pull_via_ftp("ftp.ncbi.nlm.nih.gov", "gene/DATA/", "gene_group.gz", decompress_data=True, outfilename=ofname)
-    with open(outname, "r") as inf:
+    with open(outname) as inf:
         lines = inf.read().split("\n")
     assert len(lines) > 1000
     assert lines[0].startswith("#tax_id")
diff --git a/tests/test_geneproteiny.py b/tests/test_geneproteiny.py
index e1029468..326e7053 100644
--- a/tests/test_geneproteiny.py
+++ b/tests/test_geneproteiny.py
@@ -1,6 +1,7 @@
-from src.createcompendia.geneprotein import build_compendium
 import os
 
+from src.createcompendia.geneprotein import build_compendium
+
 
 def test_gp():
     here = os.path.abspath(os.path.dirname(__file__))
@@ -9,7 +10,7 @@ def test_gp():
     geneprotein_concord = os.path.join(here, "testdata", "gp_UniProtNCBI.txt")
     outfile = os.path.join(here, "testdata", "gp_output.txt")
     build_compendium(gene_compendium, protein_compendium, geneprotein_concord, outfile)
-    with open(outfile, "r") as inf:
+    with open(outfile) as inf:
         x = inf.read()
     assert len(x) > 0
     print(x)
diff --git a/tests/test_node_factory.py b/tests/test_node_factory.py
index 223af0c8..24959eff 100644
--- a/tests/test_node_factory.py
+++ b/tests/test_node_factory.py
@@ -1,7 +1,8 @@
 import os
-from src.node import NodeFactory
-from src.LabeledID import LabeledID
+
 import src.prefixes as pref
+from src.LabeledID import LabeledID
+from src.node import NodeFactory
 
 
 def test_get_ancestors():
diff --git a/uv.lock b/uv.lock
index df33f81d..56bdd516 100644
--- a/uv.lock
+++ b/uv.lock
@@ -215,6 +215,7 @@ dependencies = [
 
 [package.dev-dependencies]
 dev = [
+    { name = "ruff" },
     { name = "snakefmt" },
 ]
 
@@ -249,7 +250,10 @@ requires-dist = [
 ]
 
 [package.metadata.requires-dev]
-dev = [{ name = "snakefmt", specifier = ">=0.11.2" }]
+dev = [
+    { name = "ruff", specifier = ">=0.14.9" },
+    { name = "snakefmt", specifier = ">=0.11.2" },
+]
 
 [[package]]
 name = "bcp47"
@@ -2614,6 +2618,32 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ce/08/4349bdd5c64d9d193c360aa9db89adeee6f6682ab8825dca0a3f535f434f/rpds_py-0.27.1-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:dc23e6820e3b40847e2f4a7726462ba0cf53089512abe9ee16318c366494c17a", size = 556523, upload-time = "2025-08-27T12:16:12.188Z" },
 ]
 
+[[package]]
+name = "ruff"
+version = "0.14.9"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f6/1b/ab712a9d5044435be8e9a2beb17cbfa4c241aa9b5e4413febac2a8b79ef2/ruff-0.14.9.tar.gz", hash = "sha256:35f85b25dd586381c0cc053f48826109384c81c00ad7ef1bd977bfcc28119d5b", size = 5809165, upload-time = "2025-12-11T21:39:47.381Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b8/1c/d1b1bba22cffec02351c78ab9ed4f7d7391876e12720298448b29b7229c1/ruff-0.14.9-py3-none-linux_armv6l.whl", hash = "sha256:f1ec5de1ce150ca6e43691f4a9ef5c04574ad9ca35c8b3b0e18877314aba7e75", size = 13576541, upload-time = "2025-12-11T21:39:14.806Z" },
+    { url = "https://files.pythonhosted.org/packages/94/ab/ffe580e6ea1fca67f6337b0af59fc7e683344a43642d2d55d251ff83ceae/ruff-0.14.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ed9d7417a299fc6030b4f26333bf1117ed82a61ea91238558c0268c14e00d0c2", size = 13779363, upload-time = "2025-12-11T21:39:20.29Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/f8/2be49047f929d6965401855461e697ab185e1a6a683d914c5c19c7962d9e/ruff-0.14.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d5dc3473c3f0e4a1008d0ef1d75cee24a48e254c8bed3a7afdd2b4392657ed2c", size = 12925292, upload-time = "2025-12-11T21:39:38.757Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/e9/08840ff5127916bb989c86f18924fd568938b06f58b60e206176f327c0fe/ruff-0.14.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84bf7c698fc8f3cb8278830fb6b5a47f9bcc1ed8cb4f689b9dd02698fa840697", size = 13362894, upload-time = "2025-12-11T21:39:02.524Z" },
+    { url = "https://files.pythonhosted.org/packages/31/1c/5b4e8e7750613ef43390bb58658eaf1d862c0cc3352d139cd718a2cea164/ruff-0.14.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aa733093d1f9d88a5d98988d8834ef5d6f9828d03743bf5e338bf980a19fce27", size = 13311482, upload-time = "2025-12-11T21:39:17.51Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/3a/459dce7a8cb35ba1ea3e9c88f19077667a7977234f3b5ab197fad240b404/ruff-0.14.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a1cfb04eda979b20c8c19550c8b5f498df64ff8da151283311ce3199e8b3648", size = 14016100, upload-time = "2025-12-11T21:39:41.948Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/31/f064f4ec32524f9956a0890fc6a944e5cf06c63c554e39957d208c0ffc45/ruff-0.14.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1e5cb521e5ccf0008bd74d5595a4580313844a42b9103b7388eca5a12c970743", size = 15477729, upload-time = "2025-12-11T21:39:23.279Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/6d/f364252aad36ccd443494bc5f02e41bf677f964b58902a17c0b16c53d890/ruff-0.14.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd429a8926be6bba4befa8cdcf3f4dd2591c413ea5066b1e99155ed245ae42bb", size = 15122386, upload-time = "2025-12-11T21:39:33.125Z" },
+    { url = "https://files.pythonhosted.org/packages/20/02/e848787912d16209aba2799a4d5a1775660b6a3d0ab3944a4ccc13e64a02/ruff-0.14.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab208c1b7a492e37caeaf290b1378148f75e13c2225af5d44628b95fd7834273", size = 14497124, upload-time = "2025-12-11T21:38:59.33Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/51/0489a6a5595b7760b5dbac0dd82852b510326e7d88d51dbffcd2e07e3ff3/ruff-0.14.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72034534e5b11e8a593f517b2f2f2b273eb68a30978c6a2d40473ad0aaa4cb4a", size = 14195343, upload-time = "2025-12-11T21:39:44.866Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/53/3bb8d2fa73e4c2f80acc65213ee0830fa0c49c6479313f7a68a00f39e208/ruff-0.14.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:712ff04f44663f1b90a1195f51525836e3413c8a773574a7b7775554269c30ed", size = 14346425, upload-time = "2025-12-11T21:39:05.927Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/04/bdb1d0ab876372da3e983896481760867fc84f969c5c09d428e8f01b557f/ruff-0.14.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a111fee1db6f1d5d5810245295527cda1d367c5aa8f42e0fca9a78ede9b4498b", size = 13258768, upload-time = "2025-12-11T21:39:08.691Z" },
+    { url = "https://files.pythonhosted.org/packages/40/d9/8bf8e1e41a311afd2abc8ad12be1b6c6c8b925506d9069b67bb5e9a04af3/ruff-0.14.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8769efc71558fecc25eb295ddec7d1030d41a51e9dcf127cbd63ec517f22d567", size = 13326939, upload-time = "2025-12-11T21:39:53.842Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/56/a213fa9edb6dd849f1cfbc236206ead10913693c72a67fb7ddc1833bf95d/ruff-0.14.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:347e3bf16197e8a2de17940cd75fd6491e25c0aa7edf7d61aa03f146a1aa885a", size = 13578888, upload-time = "2025-12-11T21:39:35.988Z" },
+    { url = "https://files.pythonhosted.org/packages/33/09/6a4a67ffa4abae6bf44c972a4521337ffce9cbc7808faadede754ef7a79c/ruff-0.14.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:7715d14e5bccf5b660f54516558aa94781d3eb0838f8e706fb60e3ff6eff03a8", size = 14314473, upload-time = "2025-12-11T21:39:50.78Z" },
+    { url = "https://files.pythonhosted.org/packages/12/0d/15cc82da5d83f27a3c6b04f3a232d61bc8c50d38a6cd8da79228e5f8b8d6/ruff-0.14.9-py3-none-win32.whl", hash = "sha256:df0937f30aaabe83da172adaf8937003ff28172f59ca9f17883b4213783df197", size = 13202651, upload-time = "2025-12-11T21:39:26.628Z" },
+    { url = "https://files.pythonhosted.org/packages/32/f7/c78b060388eefe0304d9d42e68fab8cffd049128ec466456cef9b8d4f06f/ruff-0.14.9-py3-none-win_amd64.whl", hash = "sha256:c0b53a10e61df15a42ed711ec0bda0c582039cf6c754c49c020084c55b5b0bc2", size = 14702079, upload-time = "2025-12-11T21:39:11.954Z" },
+    { url = "https://files.pythonhosted.org/packages/26/09/7a9520315decd2334afa65ed258fed438f070e31f05a2e43dd480a5e5911/ruff-0.14.9-py3-none-win_arm64.whl", hash = "sha256:8e821c366517a074046d92f0e9213ed1c13dbc5b37a7fc20b07f79b64d62cc84", size = 13744730, upload-time = "2025-12-11T21:39:29.659Z" },
+]
+
 [[package]]
 name = "semsql"
 version = "0.4.0"