Draft
40 commits
025ff48  First stab at bulk normalization. (gaurav, Sep 9, 2025)
6a39345  Better wildcards. (gaurav, Sep 9, 2025)
df08aad  Added normalize files rules. (gaurav, Sep 9, 2025)
750d647  Maybe? (gaurav, Sep 9, 2025)
852e8c0  Maybe?? (gaurav, Sep 9, 2025)
2506fc2  Maybe??? (gaurav, Sep 9, 2025)
eb93603  Maybe?? (gaurav, Sep 9, 2025)
19c25a9  Basic bulk normalization. (gaurav, Sep 9, 2025)
9a10769  No checkpoints for you. (gaurav, Sep 9, 2025)
4913e3e  Merge branch 'master' into bulk-normalize (gaurav, Sep 25, 2025)
1c3f7b3  First stab at a report. (gaurav, Sep 25, 2025)
0a638f7  Removed prereqs. (gaurav, Sep 25, 2025)
f94b415  Some improvements. (gaurav, Sep 25, 2025)
556a1f6  Add unique percentage. (gaurav, Sep 25, 2025)
6f04bd9  Added normalizer requirements back in. (gaurav, Sep 25, 2025)
c021841  Added inverse for normalized CURIEs, rearranged columns. (gaurav, Sep 25, 2025)
53db902  Set fetch-depth=1 and turn off persist-credentials. (gaurav, Oct 29, 2025)
d1ddef6  Added `build` block in config.yaml. (gaurav, Oct 29, 2025)
85ee351  Added on:push trigger for testing. (gaurav, Oct 29, 2025)
cff4e3c  Removed on:push trigger after testing. (gaurav, Oct 29, 2025)
1b99951  Changed image tag to latest. (gaurav, Oct 29, 2025)
4007d5b  Updated ChEBI download URLs. (gaurav, Oct 31, 2025)
a836128  Added taxon_specific to conflated outputs. (gaurav, Nov 4, 2025)
7c28c44  Replaced logging with logger in SynonymConflation. (gaurav, Nov 4, 2025)
e0c14d4  Add a taxon_specific flag to leftover UMLS. (gaurav, Nov 5, 2025)
e116107  Improved decompress when downloading with wget. (gaurav, Nov 6, 2025)
774fb55  Improve debugging, remove redundant statement. (gaurav, Nov 13, 2025)
dd9d2ca  Moved PREFERRED_CONFLATION_TYPE_ORDER back into drugchemical.py. (gaurav, Nov 13, 2025)
3414f55  Moved biolink:Drug to the bottom of the conflation order. (gaurav, Nov 17, 2025)
40fd3c6  Added filter for non-chemicals. (gaurav, Nov 17, 2025)
3aca0b0  Fixed bug in chemical type test. (gaurav, Nov 17, 2025)
dac5222  Skip non-chemical Biolink types. (gaurav, Nov 17, 2025)
04bcbd6  Normalize identifiers in DrugChemical conflation before writing pairs. (gaurav, Nov 17, 2025)
d031d2a  Group conflation IDs by prefix instead of Biolink type. (gaurav, Nov 19, 2025)
ec1aaf1  Fixed call to get_element()['id_prefixes']. (gaurav, Nov 19, 2025)
00e8445  Removed redundant preferred_conflation_type_order. (gaurav, Nov 21, 2025)
960fe03  Merge branch 'babel-1.14' into bulk-normalize (gaurav, Dec 3, 2025)
ffa7568  Merge branch 'master' into bulk-normalize (gaurav, Dec 17, 2025)
3393de7  Cleaned up merge, added RXCUI to the end. (gaurav, Dec 17, 2025)
d9d934f  Added some documentation. (gaurav, Dec 18, 2025)
Snakefile (3 changes: 1 addition & 2 deletions)
@@ -1,6 +1,5 @@
configfile: "config.yaml"


include: "src/snakefiles/datacollect.snakefile"
include: "src/snakefiles/anatomy.snakefile"
include: "src/snakefiles/cell_line.snakefile"
@@ -19,7 +18,7 @@ include: "src/snakefiles/publications.snakefile"
include: "src/snakefiles/duckdb.snakefile"
include: "src/snakefiles/reports.snakefile"
include: "src/snakefiles/exports.snakefile"

include: "src/snakefiles/bulk_normalizer.snakefile"

# Some global settings.
import os
src/createcompendia/drugchemical.py (5 changes: 4 additions & 1 deletion)
@@ -499,7 +499,10 @@ def build_conflation(
if not conflation_prefix_order:
raise RuntimeError(f"Biolink model {config['biolink_version']} doesn't have a ChemicalEntity prefix order: {biolink_chemical_entity}")

# Add RXCUI at the bottom.
# Remove RXCUI from the prefix order if it is present...
if "RXCUI" in conflation_prefix_order:
    conflation_prefix_order.remove("RXCUI")

# ... and add it to the bottom.
conflation_prefix_order.append("RXCUI")

# Turn it into a sort order.
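A minimal sketch of what this reordering does, assuming a hypothetical starting list (the real one comes from the Biolink model's ChemicalEntity id_prefixes):

# Hypothetical prefix order; the real one is read from the Biolink model.
conflation_prefix_order = ['CHEBI', 'RXCUI', 'PUBCHEM.COMPOUND']
if 'RXCUI' in conflation_prefix_order:
    conflation_prefix_order.remove('RXCUI')
conflation_prefix_order.append('RXCUI')
# Now ['CHEBI', 'PUBCHEM.COMPOUND', 'RXCUI']: RXCUI sorts after every
# other prefix when choosing a preferred conflation identifier.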
src/snakefiles/bulk_normalizer.snakefile (162 changes: 162 additions & 0 deletions)
@@ -0,0 +1,162 @@
#
# bulk_normalizer.snakefile - Rules for bulk normalizing various files
#
# These are generally intended for bulk normalizing files that we hope to eventually incorporate into Babel,
# but I won't judge you if you use them for one-off bulk normalization.
#
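# To produce the summary report (and everything it depends on), one might run
# something like the following, where <output_directory> stands for the
# output_directory setting in config.yaml (a sketch, not a tested invocation):
#
#   snakemake --cores 4 <output_directory>/bulk-normalizer/report.tsv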
import csv
import gzip
import json
import os
from collections import defaultdict

from src.babel_utils import pull_via_wget, WgetRecursionOptions
from src.exporters.duckdb_exporters import setup_duckdb
from src.util import get_logger

logger = get_logger(__name__)

# Step 1. Download bulk identifiers to normalize.
rule download_bulk_normalizer_files:
output:
bulk_normalizer_dir = directory(config['download_directory'] + '/bulk-normalizer/bulk_normalizer'),
file_list = config['download_directory'] + '/bulk-normalizer/file-list.txt',
run:
pull_via_wget(
url_prefix='https://stars.renci.org/var/babel_outputs/',
in_file_name='bulk_normalizer/',
subpath='bulk-normalizer',
decompress=False,
recurse=WgetRecursionOptions.RECURSE_DIRECTORY_ONLY,
)
with open(output.file_list, 'w') as f:
# Get the list of files in bulk_normalizer_dir
all_files = os.listdir(output.bulk_normalizer_dir)
filtered_files = [fname for fname in all_files if '.tsv' in fname or '.txt' in fname]
f.write("\n".join(filtered_files))

# Step 2. Normalize the input files.
rule bulk_normalize_files:
# TODO: If this were broken up into two rules, Snakemake could normalize all these files in parallel (see the commented sketch after this rule).
input:
duckdb_done = config['output_directory'] + '/reports/duckdb/done',
file_list = config['download_directory'] + '/bulk-normalizer/file-list.txt',
bulk_normalizer_input_dir = config['download_directory'] + '/bulk-normalizer/bulk_normalizer',
output:
bulk_normalizer_output_dir = directory(config['output_directory'] + '/bulk-normalizer'),
normalizer_done = config['output_directory'] + '/bulk-normalizer/done',
run:
os.makedirs(output.bulk_normalizer_output_dir + '/duckdbs', exist_ok=True)
os.makedirs(output.bulk_normalizer_output_dir + '/bulk_normalizer', exist_ok=True)
for filename in os.listdir(input.bulk_normalizer_input_dir):
if '.txt' in filename or '.tsv' in filename:
# Step 0. Set up DuckDB and filenames.
duckdb_filename = output.bulk_normalizer_output_dir + '/duckdbs/' + filename + '.duckdb'
db = setup_duckdb(duckdb_filename)

input_filename = os.path.join(input.bulk_normalizer_input_dir, filename)
output_filename = output.bulk_normalizer_output_dir + '/bulk_normalizer/' + filename

# Step 1. Load the file into DuckDB.
logger.info(f"Loading {input_filename} ...")
input_csv = db.read_csv(input_filename)

# Step 2. Normalize the identifiers.
cliques = db.from_parquet(config['output_directory'] + '/duckdb/parquet/filename=*/Clique.parquet')
edges = db.from_parquet(config['output_directory'] + '/duckdb/parquet/filename=*/Edge.parquet')
conflations = db.from_parquet(config['output_directory'] + '/duckdb/parquet/filename=*/Conflation.parquet')
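# The query below joins each input row to its clique leader (edges), that
# clique's preferred name and Biolink type (cliques), and any DrugChemical or
# GeneProtein conflation (conflations); CURIEs are compared case-insensitively.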

result = db.sql("""SELECT
input_csv.*,
edges.clique_leader AS normalized_curie,
COALESCE(conflations.preferred_curie, edges.clique_leader) AS normalized_conflated_curie,
conflations.conflation AS conflation_type,
cliques.preferred_name AS preferred_name,
cliques.biolink_type AS biolink_type
FROM input_csv
LEFT JOIN edges ON UPPER(input_csv.id) = UPPER(edges.curie)
LEFT JOIN cliques ON UPPER(edges.clique_leader) = UPPER(cliques.clique_leader)
LEFT JOIN conflations ON UPPER(input_csv.id) = UPPER(conflations.curie) AND
(conflations.conflation = 'DrugChemical' OR conflations.conflation = 'GeneProtein')""")

# Step 3. Write out the output file.
logger.info(f"Writing {output_filename} ...")
result.to_csv(output_filename, sep='\t')

with open(output.normalizer_done, 'w') as f:
f.write("done")
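# A possible two-rule split, so Snakemake can normalize the files in parallel.
# This is a sketch only: the rule names, the {stem} wildcard and the
# read_file_list() helper below are hypothetical and untested.
#
# def read_file_list(path):
#     with open(path) as f:
#         return [line.strip() for line in f if line.strip()]
#
# rule bulk_normalize_one_file:
#     input:
#         duckdb_done = config['output_directory'] + '/reports/duckdb/done',
#         input_file = config['download_directory'] + '/bulk-normalizer/bulk_normalizer/{stem}',
#     output:
#         output_file = config['output_directory'] + '/bulk-normalizer/bulk_normalizer/{stem}',
#     run:
#         pass  # the per-file DuckDB load/join/write logic from the rule above
#
# rule bulk_normalize_all_files:
#     input:
#         expand(
#             config['output_directory'] + '/bulk-normalizer/bulk_normalizer/{stem}',
#             stem=read_file_list(config['download_directory'] + '/bulk-normalizer/file-list.txt'),
#         ),
#     output:
#         normalizer_done = config['output_directory'] + '/bulk-normalizer/done',
#     run:
#         with open(output.normalizer_done, 'w') as f:
#             f.write('done')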

# Generate a report for each normalized file.
rule bulk_normalize_reports:
input:
bulk_normalizer_output_dir = config['output_directory'] + '/bulk-normalizer',
normalizer_done = config['output_directory'] + '/bulk-normalizer/done',
output:
bulk_normalizer_report = config['output_directory'] + '/bulk-normalizer/report.tsv',
run:
# Prepare to write the output file.
with open(output.bulk_normalizer_report, 'w') as outf:
writer = csv.DictWriter(outf, delimiter='\t', fieldnames=[
'filename',
'rows',
'unique_id_count',
'unique_normalized_curie_count',
'unique_normalized_curie_percent',
'unique_normalized_curie_absent',
'unique_normalized_curie_absent_percent',
'biolink_types',
# The following columns probably aren't very useful.
'rows_with_normalized_curie',
'rows_with_normalized_curie_percent',
'rows_without_normalized_curie',
'rows_without_normalized_curie_percent'
])
writer.writeheader()

# Iterate over all the files in the bulk-normalizer output directory.
for filename in os.listdir(input.bulk_normalizer_output_dir + '/bulk_normalizer'):
filename_lc = filename.lower()
if '.txt' in filename_lc or '.tsv' in filename_lc:
logger.info(f"Generating report for bulk normalized file {filename} ...")
if '.gz' in filename_lc:
file = gzip.open(input.bulk_normalizer_output_dir + '/bulk_normalizer/' + filename, 'rt')
else:
file = open(input.bulk_normalizer_output_dir + '/bulk_normalizer/' + filename, 'r')

# Get the number of input records and the number of normalized records.
row_count = 0
rows_with_normalized_curie_count = 0
rows_without_normalized_curie_count = 0
unique_id = set()
unique_normalized_curie = set()
biolink_types = defaultdict(int)

reader = csv.DictReader(file, delimiter='\t')
for row in reader:
row_count += 1
unique_id.add(row['id'])
if not row['normalized_curie']:
rows_without_normalized_curie_count += 1
else:
unique_normalized_curie.add(row['normalized_curie'])
rows_with_normalized_curie_count += 1
biolink_types[row['biolink_type']] += 1

file.close()

# Skip files with no rows: the percentage columns below would divide by zero.
if row_count == 0:
    logger.warning(f"Skipping empty file {filename}.")
    continue

# Write out the report line.
writer.writerow({
'filename': filename,
'rows': row_count,
'unique_id_count': len(unique_id),
'unique_normalized_curie_count': len(unique_normalized_curie),
'unique_normalized_curie_percent': round(len(unique_normalized_curie) / len(unique_id) * 100, 2),
'unique_normalized_curie_absent': len(unique_id) - len(unique_normalized_curie),
'unique_normalized_curie_absent_percent': round((len(unique_id) - len(unique_normalized_curie)) / len(unique_id) * 100, 2),
'biolink_types': json.dumps(biolink_types),

# The following columns probably aren't very useful.
'rows_with_normalized_curie': rows_with_normalized_curie_count,
'rows_with_normalized_curie_percent': round(rows_with_normalized_curie_count / row_count * 100, 2),
'rows_without_normalized_curie': rows_without_normalized_curie_count,
'rows_without_normalized_curie_percent': round(rows_without_normalized_curie_count / row_count * 100, 2),
})