5 changes: 4 additions & 1 deletion Snakefile
@@ -1,5 +1,6 @@
configfile: "config.yaml"


include: "src/snakefiles/datacollect.snakefile"
include: "src/snakefiles/anatomy.snakefile"
include: "src/snakefiles/cell_line.snakefile"
@@ -19,6 +20,7 @@ include: "src/snakefiles/duckdb.snakefile"
include: "src/snakefiles/reports.snakefile"
include: "src/snakefiles/exports.snakefile"


# Some general imports.
import shutil
from src.snakefiles.util import write_done
@@ -28,6 +30,7 @@ import os

os.environ["TMPDIR"] = config["tmp_directory"]


# Top-level rules.
rule all:
input:
@@ -43,7 +46,7 @@ rule all:
config["output_directory"] + "/kgx/done",
config["output_directory"] + "/sapbert-training-data/done",
# Store the config.yaml file used to produce the output.
config_file = "config.yaml",
config_file="config.yaml",
output:
x=config["output_directory"] + "/reports/all_done",
output_config_file=config["output_directory"] + "/config.yaml",
6 changes: 3 additions & 3 deletions input_data/parse_bad_mappings.py
@@ -1,18 +1,18 @@
from collections import defaultdict
from ast import literal_eval
from collections import defaultdict


def read_bad_hp_mappings(fn):
drops = defaultdict(set)
with open(fn, "r") as infile:
with open(fn) as infile:
for line in infile:
if line.startswith("-"):
continue
x = line.strip().split("\t")
hps = x[0]
commaindex = hps.index(",")
curie = hps[1:commaindex]
name = hps[commaindex + 1 : -1]
# name = hps[commaindex + 1 : -1]
badset = literal_eval(x[1])
drops[curie].update(badset)
return drops
22 changes: 20 additions & 2 deletions pyproject.toml
@@ -48,13 +48,31 @@ apybiomart = { git = "https://github.com/gaurav/apybiomart.git", rev = "change-c

[dependency-groups]
dev = [
"ruff>=0.14.9",
"snakefmt>=0.11.2",
]

# Linting/formatting configuration
[tool.ruff]
line-length = 160
line-length = 120

[tool.snakefmt]
line_length = 160
line_length = 120
include = '\.snakefile$|^Snakefile'

[tool.ruff.lint]
# Lint rule families to enforce alongside ruff's formatter
select = [
"E", # pycodestyle errors
"F", # pyflakes
"I", # isort (import sorting)
"UP", # pyupgrade
]

# Rules intentionally ignored
ignore = [
"E501", # let Ruff handle wrapping consistently
]

fixable = ["ALL"]
unfixable = []
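
The `select` list above drives most of the mechanical edits in this PR. As a minimal sketch (not code from the repository; the function names and file layout are invented for illustration), this is the kind of before/after rewrite the pyupgrade ("UP") and import-sorting ("I") families produce, matching the changes seen throughout this diff:

from typing import List, Tuple  # before: typing-module generics


def read_pairs_before(path) -> List[Tuple[str, str]]:
    # before: redundant "r" mode that pyupgrade removes
    with open(path, "r") as inf:
        return [(x[0], x[1]) for x in (line.rstrip("\n").split("\t") for line in inf)]


def read_pairs_after(path) -> list[tuple[str, str]]:
    # after: builtin generics and a bare open() call
    with open(path) as inf:
        return [(x[0], x[1]) for x in (line.rstrip("\n").split("\t") for line in inf)]

With fixable = ["ALL"], `ruff check --fix` applies these rewrites automatically, and ignoring E501 leaves line wrapping to the formatter's line-length = 120.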
4 changes: 2 additions & 2 deletions releases/summaries/compare.py
@@ -5,10 +5,10 @@
file1 = "2024mar24.json"
file2 = "2024jul13.json"

with open(file1, "r") as f:
with open(file1) as f:
summary1 = json.load(f)

with open(file2, "r") as f:
with open(file2) as f:
summary2 = json.load(f)


16 changes: 9 additions & 7 deletions src/assess_compendia.py
@@ -1,7 +1,9 @@
import os
from collections import defaultdict
from os import path

import jsonlines
from collections import defaultdict

from src.util import Text


@@ -11,7 +13,7 @@ def assess_completeness(input_dir, compendia, reportfile):
id_files = os.listdir(input_dir)
all_identifiers = set()
for idf in id_files:
with open(path.join(input_dir, idf), "r") as inf:
with open(path.join(input_dir, idf)) as inf:
for line in inf:
x = line.strip().split("\t")[0]
all_identifiers.add(x)
@@ -23,11 +25,11 @@ def assess_completeness(input_dir, compendia, reportfile):
for identifier in ids:
all_identifiers.discard(identifier)
with open(reportfile, "w") as outf:
l = list(all_identifiers)
l.sort()
print(f"Missing identifiers: {len(l)}\n")
outf.write(f"Missing identifiers: {len(l)}\n")
for missing_id in l:
list_all_identifiers = list(all_identifiers)
list_all_identifiers.sort()
print(f"Missing identifiers: {len(list_all_identifiers)}\n")
outf.write(f"Missing identifiers: {len(list_all_identifiers)}\n")
for missing_id in list_all_identifiers:
outf.write(f"{missing_id}\n")


55 changes: 25 additions & 30 deletions src/babel_utils.py
@@ -1,27 +1,26 @@
import gzip
import os
import sqlite3
import subprocess
import time
import traceback
import urllib
from collections import defaultdict
from datetime import datetime, timedelta
from enum import Enum
from ftplib import FTP
from io import BytesIO
import gzip
from datetime import timedelta
import time
from pathlib import Path

import requests
import os
import urllib
import jsonlines
import requests
from humanfriendly import format_timespan

from src.metadata.provenance import write_combined_metadata
from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory
from src.properties import PropertyList, HAS_ALTERNATIVE_ID
from src.util import Text, get_config, get_memory_usage_summary, get_logger
from src.LabeledID import LabeledID
from collections import defaultdict
import sqlite3
from typing import List, Tuple
from src.metadata.provenance import write_combined_metadata
from src.node import DescriptionFactory, InformationContentFactory, NodeFactory, SynonymFactory, TaxonFactory
from src.properties import HAS_ALTERNATIVE_ID, PropertyList
from src.util import Text, get_config, get_logger, get_memory_usage_summary

# Configuration items
WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES = 1_000_000
@@ -144,15 +143,15 @@ def __init__(self, delta_ms):
self.delta = timedelta(milliseconds=delta_ms)

def get(self, url):
now = dt.now()
now = datetime.now()
throttled = False
if self.last_time is not None:
cdelta = now - self.last_time
if cdelta < self.delta:
waittime = self.delta - cdelta
time.sleep(waittime.microseconds / 1e6)
throttled = True
self.last_time = dt.now()
self.last_time = datetime.now()
response = requests.get(url)
return response, throttled

@@ -194,7 +193,7 @@ def pull_via_urllib(url: str, in_file_name: str, decompress=True, subpath=None,
"""
# Everything goes in downloads
download_dir = get_config()["download_directory"]
working_dir = download_dir

# get the (local) download file name, derived from the input file name
if subpath is None:
@@ -589,11 +587,11 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non
possible_labels = map(lambda identifier: identifier.get("label", ""), node["identifiers"])

# Step 2. Filter out any suspicious labels.
filtered_possible_labels = [l for l in possible_labels if l] # Ignore blank or empty names.
filtered_possible_labels = [label for label in possible_labels if label] # Ignore blank or empty names.

# Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at
# least one label shorter than this limit.
labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config["demote_labels_longer_than"]]
labels_shorter_than_limit = [label for label in filtered_possible_labels if label and len(label) <= config["demote_labels_longer_than"]]
if labels_shorter_than_limit:
filtered_possible_labels = labels_shorter_than_limit
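
Read together, the two steps above form a small fallback filter: drop empty labels, then prefer labels under the configured length cap only when at least one such label exists. A self-contained sketch of the same logic (the labels and the length cap are invented for illustration):

demote_labels_longer_than = 20  # hypothetical stand-in for config["demote_labels_longer_than"]
possible_labels = ["", "heart", "anatomical structure of the cardiovascular system"]

filtered_possible_labels = [label for label in possible_labels if label]
labels_shorter_than_limit = [label for label in filtered_possible_labels if len(label) <= demote_labels_longer_than]
if labels_shorter_than_limit:
    filtered_possible_labels = labels_shorter_than_limit
# filtered_possible_labels is now ["heart"]; if every surviving label exceeded the cap,
# the longer labels would be kept rather than dropping everything.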

@@ -782,7 +780,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
shit_prefixes = set(["KEGG", "PUBCHEM"])
test_id = "xUBERON:0002262"
debugit = False
excised = set()
# excised = set()
for xgroup in newgroups:
if isinstance(xgroup, frozenset):
group = set(xgroup)
@@ -802,7 +800,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
existing_sets_w_x = [(conc_set[x], x) for x in group if x in conc_set]
# All of these sets are now going to be combined through the equivalence of our new set.
existing_sets = [es[0] for es in existing_sets_w_x]
x = [es[1] for es in existing_sets_w_x]
# x = [es[1] for es in existing_sets_w_x]
newset = set().union(*existing_sets)
if debugit:
print("merges:", existing_sets)
@@ -830,7 +828,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
for up in unique_prefixes:
if test_id in group:
print("up?", up)
idents = [e if type(e) == str else e.identifier for e in newset]
idents = [e if isinstance(e, str) else e.identifier for e in newset]
if len(set([e for e in idents if (e.split(":")[0] == up)])) > 1:
bad += 1
setok = False
@@ -840,18 +838,15 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
wrote.add(fs)
for gel in group:
if Text.get_prefix_or_none(gel) == pref:
killer = gel
# killer = gel
pass
# for preset in wrote:
# print(f'{killer}\t{set(group).intersection(preset)}\t{preset}\n')
# print('------------')
NPC = sum(1 for s in newset if s.startswith("PUBCHEM.COMPOUND:"))
if ("PUBCHEM.COMPOUND:3100" in newset) and (NPC > 3):
if debugit:
l = sorted(list(newset))
print("bad")
for li in l:
print(li)
exit()
raise ValueError(f"Debugging information: {sorted(list(newset))}")
if not setok:
# Our new group created a new set that merged stuff we didn't want to merge.
# Previously we did a lot of fooling around at this point. But now we're just going to say, I have a
@@ -894,7 +889,7 @@ def glom(conc_set, newgroups, unique_prefixes=["INCHIKEY"], pref="HP", close={})
# Now check the 'close' dictionary to see if we've accidentally gotten to a close match becoming an exact match
setok = True
for cpref, closedict in close.items():
idents = set([e if type(e) == str else e.identifier for e in newset])
idents = set([e if isinstance(e, str) else e.identifier for e in newset])
prefidents = [e for e in idents if e.startswith(cpref)]
for pident in prefidents:
for cd in closedict[pident]:
@@ -978,7 +973,7 @@ def read_identifier_file(infile):
a hint to the normalizer about the proper biolink type for this entity."""
types = {}
identifiers = list()
with open(infile, "r") as inf:
with open(infile) as inf:
for line in inf:
x = line.strip().split("\t")
identifiers.append((x[0],))
Expand All @@ -987,7 +982,7 @@ def read_identifier_file(infile):
return identifiers, types


def remove_overused_xrefs(pairlist: List[Tuple], bothways: bool = False):
def remove_overused_xrefs(pairlist: list[tuple], bothways: bool = False):
"""Given a list of tuples (id1, id2) meaning id1-[xref]->id2, remove any id2 that are associated with more
than one id1. The idea is that if e.g. id1 is made up of UBERONS and 2 of those have an xref to say a UMLS
then it doesn't mean that all of those should be identified. We don't really know what it means, so remove it."""
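
The docstring above describes the contract, but the function body is collapsed in this view, so the following is only an illustration of the documented behaviour, with hypothetical CURIEs:

pairs = [
    ("UBERON:0000001", "UMLS:C0000001"),  # UMLS:C0000001 is claimed by two different UBERON ids,
    ("UBERON:0000002", "UMLS:C0000001"),  # so per the docstring neither of these pairs should survive
    ("UBERON:0000003", "UMLS:C0000002"),  # a uniquely claimed xref, expected to be kept
]
# remove_overused_xrefs(pairs) should therefore return [("UBERON:0000003", "UMLS:C0000002")];
# bothways=True presumably applies the same filter in the id2 -> id1 direction as well.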
18 changes: 9 additions & 9 deletions src/createcompendia/anatomy.py
@@ -1,16 +1,16 @@
from collections import defaultdict

import requests

import src.datahandlers.mesh as mesh
import src.datahandlers.obo as obo
import src.datahandlers.umls as umls
from src.babel_utils import get_prefixes, glom, read_identifier_file, remove_overused_xrefs, write_compendium
from src.categories import ANATOMICAL_ENTITY, CELL, CELLULAR_COMPONENT, GROSS_ANATOMICAL_STRUCTURE
from src.metadata.provenance import write_concord_metadata
from src.util import Text

from src.prefixes import MESH, NCIT, CL, GO, UBERON, SNOMEDCT, WIKIDATA, UMLS, FMA
from src.categories import ANATOMICAL_ENTITY, GROSS_ANATOMICAL_STRUCTURE, CELL, CELLULAR_COMPONENT
from src.prefixes import CL, FMA, GO, MESH, NCIT, SNOMEDCT, UBERON, UMLS, WIKIDATA
from src.ubergraph import build_sets
from src.babel_utils import write_compendium, glom, get_prefixes, read_identifier_file, remove_overused_xrefs
import src.datahandlers.umls as umls
import src.datahandlers.mesh as mesh
from src.util import Text


def remove_overused_xrefs_dict(kv):
@@ -146,7 +146,7 @@
pairs = []
for row in rows:
umls_curie = f"{UMLS}:{row['umls']['value']}"
wd_curie = f"{WIKIDATA}:{row['wd']['value']}"

[Check failure on line 149 in src/createcompendia/anatomy.py, from GitHub Actions / Check Python formatting with ruff]
Ruff (F841): src/createcompendia/anatomy.py:149:9: F841 Local variable `wd_curie` is assigned to but never used
cl_curie = Text.obo_to_curie(row["cl"]["value"])
pairs.append((umls_curie, cl_curie))
counts[umls_curie] += 1
@@ -190,7 +190,7 @@
# them added. So we want to limit concordances to terms that are already in the dicts. But that's ONLY for the
# UMLS concord. We trust the others to retrieve decent identifiers.
bs = frozenset([UMLS, GO])
with open(infile, "r") as inf:
with open(infile) as inf:
for line in inf:
x = line.strip().split("\t")
prefixes = frozenset([xi.split(":")[0] for xi in x[0:3:2]]) # leave out the predicate
Expand All @@ -202,7 +202,7 @@
use = False
if not use:
continue
pairs.append(([x[0], x[2]]))
pairs.append([x[0], x[2]])
newpairs = remove_overused_xrefs(pairs)
setpairs = [set(x) for x in newpairs]
glom(dicts, setpairs, unique_prefixes=[UBERON, GO])
3 changes: 1 addition & 2 deletions src/createcompendia/cell_line.py
@@ -1,7 +1,6 @@
from src.babel_utils import glom, read_identifier_file, write_compendium
from src.categories import CELL_LINE

from src.babel_utils import read_identifier_file, glom, write_compendium


def build_compendia(ifile, metadata_yamls, icrdf_filename):
""":identifiers: a list of files from which to read identifiers and optional categories"""