Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/minify_ontologies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
types: [opened] # Only trigger on PR "opened" event
# push: # Uncomment, update branches to develop / debug
# branches:
# jb-metadata-boolean
# jlc_show_gene_name

jobs:
build:
Expand Down
61 changes: 47 additions & 14 deletions ingest/de.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import scanpy as sc
import re
import glob
from anndata import AnnData

try:
from monitor import setup_logger, log_exception
Expand Down Expand Up @@ -171,7 +172,7 @@ def subset_adata(adata, de_cells):
"subsetting matrix on cells in clustering"
)
matrix_subset_list = np.in1d(adata.obs_names, de_cells)
adata = adata[matrix_subset_list]
adata = adata[matrix_subset_list].copy()
return adata

def execute_de(self):
Expand Down Expand Up @@ -219,8 +220,10 @@ def execute_de(self):
msg,
)
raise ValueError(msg)
except:
raise
except Exception as e:
print(e)



@staticmethod
def get_genes(genes_path):
Expand All @@ -241,7 +244,7 @@ def get_genes(genes_path):
"dev_info: Features file contains duplicate identifiers in column 2"
)
print(warning)
genes_df['new_id'] = genes_df[[0, 1]].agg('|'.join, axis=1)
genes_df['new_id'] = genes_df[[1, 0]].agg('|'.join, axis=1)
if genes_df['new_id'].count() != genes_df['new_id'].nunique():
msg = "Duplicates in features file even after joining gene_id and gene_name"
log_exception(
Expand Down Expand Up @@ -306,10 +309,16 @@ def delimiter_in_gene_name(rank):
@staticmethod
def extract_gene_id_for_out_file(rank):
"""Separate out gene name from gene ID"""
rank['gene_id'] = rank['names'].str.split('|').str[0]
rank['names'] = rank['names'].str.split('|').str[1]
rank['feature_id'] = rank['names'].str.split('|').str[1]
rank['names'] = rank['names'].str.split('|').str[0]

return rank

@staticmethod
def all_match_ensembl_id_regex(adata, col_name):
regex = r'ENS[A-Z]{0,4}\d{11}'
return adata[col_name].astype(str).str.match(regex).all()

@staticmethod
def run_scanpy_de(
cluster,
Expand Down Expand Up @@ -340,23 +349,40 @@ def run_scanpy_de(
elif matrix_file_type == "h5ad":
matrix_object = IngestFiles(matrix_file_path, None)
local_file_path = matrix_object.resolve_path(matrix_file_path)[1]
adata = matrix_object.open_anndata(local_file_path)
orig_adata = matrix_object.open_anndata(local_file_path)
else:
# MTX reconstitution UNTESTED (SCP-4203)
# will want try/except here to catch failed data object composition
adata = DifferentialExpression.adata_from_mtx(
matrix_file_path, genes_path, barcodes_path
)

if not matrix_file_type == "h5ad":
if matrix_file_type == "h5ad":
if orig_adata.raw is not None:
adata = AnnData(
# using .copy() for the AnnData components is good practice
# but we won't be using orig_adata for analyses
# choosing to avoid .copy() for memory efficiency
X = orig_adata.raw.X,
obs = orig_adata.obs,
var = orig_adata.var,
)
else:
msg = f'{matrix_file_path} does not have a .raw attribute'
print(msg)
log_exception(
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg
)
raise ValueError(msg)
# AnnData expects gene x cell so dense and mtx matrices require transposition
else:
adata = adata.transpose()

use_raw = True if matrix_file_type == "h5ad" else False

adata = DifferentialExpression.subset_adata(adata, de_cells)

# will need try/except (SCP-4205)
adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)
# h5ad inputs will already have obs data, only non-h5ad need this step
if not matrix_file_type == "h5ad":
adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)

adata = DifferentialExpression.remove_single_sample_data(adata, annotation)

Expand All @@ -369,7 +395,6 @@ def run_scanpy_de(
adata,
annotation,
key_added=rank_key,
use_raw=use_raw,
method=method,
pts=True,
)
Expand All @@ -393,13 +418,21 @@ def run_scanpy_de(
for group in groups:
clean_group = DifferentialExpression.sanitize_string(group)
clean_annotation = DifferentialExpression.sanitize_string(annotation)
DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
if DifferentialExpression.all_match_ensembl_id_regex(rank, "names"):
feature_name_rank = rank.merge(adata.var['feature_name'], left_on="names", right_index=True, how='left')
feature_name_rank.rename(columns={'names':'feature_id'}, inplace=True)
feature_name_rank.rename(columns={'feature_name':'names'}, inplace=True)
new_column_order = ["names"] + [col for col in feature_name_rank.columns if col not in {"names", "feature_id"}] + ["feature_id"]
feature_name_rank = feature_name_rank[new_column_order]

rank = feature_name_rank
if DifferentialExpression.delimiter_in_gene_name(rank):
DifferentialExpression.extract_gene_id_for_out_file(rank)
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
# Round numbers to 4 significant digits while respecting fixed point
# and scientific notation (note: trailing zeros are removed)
DifferentialExpression.de_logger.info(f"Writing DE output for {clean_group}")
rank.to_csv(out_file, sep='\t', float_format='%.4g')

# Provide h5ad of DE analysis as reference computable object
Expand Down
Binary file modified ingest/validation/ontologies/efo.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/mondo.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/ncbitaxon.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/uberon.min.tsv.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion ingest/validation/ontologies/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1729700083 # validation cache key
1733232455 # validation cache key
Binary file added tests/data/anndata/compliant_liver.h5ad
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading