Skip to content

Commit 9652c86

Browse files
authored
Merge pull request #372 from broadinstitute/jlc_show_gene_name
Show gene name in DE table for CELLxGENE-style AnnData files (SCP-5859)
2 parents 00109a5 + bfb9260 commit 9652c86

13 files changed

+33431
-36
lines changed

.github/workflows/minify_ontologies.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ on:
55
types: [opened] # Only trigger on PR "opened" event
66
# push: # Uncomment, update branches to develop / debug
77
# branches:
8-
# jb-metadata-boolean
8+
# jlc_show_gene_name
99

1010
jobs:
1111
build:

ingest/de.py

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import scanpy as sc
55
import re
66
import glob
7+
from anndata import AnnData
78

89
try:
910
from monitor import setup_logger, log_exception
@@ -171,7 +172,7 @@ def subset_adata(adata, de_cells):
171172
"subsetting matrix on cells in clustering"
172173
)
173174
matrix_subset_list = np.in1d(adata.obs_names, de_cells)
174-
adata = adata[matrix_subset_list]
175+
adata = adata[matrix_subset_list].copy()
175176
return adata
176177

177178
def execute_de(self):
@@ -219,8 +220,10 @@ def execute_de(self):
219220
msg,
220221
)
221222
raise ValueError(msg)
222-
except:
223-
raise
223+
except Exception as e:
224+
print(e)
225+
226+
224227

225228
@staticmethod
226229
def get_genes(genes_path):
@@ -241,7 +244,7 @@ def get_genes(genes_path):
241244
"dev_info: Features file contains duplicate identifiers in column 2"
242245
)
243246
print(warning)
244-
genes_df['new_id'] = genes_df[[0, 1]].agg('|'.join, axis=1)
247+
genes_df['new_id'] = genes_df[[1, 0]].agg('|'.join, axis=1)
245248
if genes_df['new_id'].count() != genes_df['new_id'].nunique():
246249
msg = "Duplicates in features file even after joining gene_id and gene_name"
247250
log_exception(
@@ -306,10 +309,16 @@ def delimiter_in_gene_name(rank):
306309
@staticmethod
307310
def extract_gene_id_for_out_file(rank):
308311
"""Separate out gene name from gene ID"""
309-
rank['gene_id'] = rank['names'].str.split('|').str[0]
310-
rank['names'] = rank['names'].str.split('|').str[1]
312+
rank['feature_id'] = rank['names'].str.split('|').str[1]
313+
rank['names'] = rank['names'].str.split('|').str[0]
314+
311315
return rank
312316

317+
@staticmethod
318+
def all_match_ensembl_id_regex(adata, col_name):
319+
regex = r'ENS[A-Z]{0,4}\d{11}'
320+
return adata[col_name].astype(str).str.match(regex).all()
321+
313322
@staticmethod
314323
def run_scanpy_de(
315324
cluster,
@@ -340,23 +349,40 @@ def run_scanpy_de(
340349
elif matrix_file_type == "h5ad":
341350
matrix_object = IngestFiles(matrix_file_path, None)
342351
local_file_path = matrix_object.resolve_path(matrix_file_path)[1]
343-
adata = matrix_object.open_anndata(local_file_path)
352+
orig_adata = matrix_object.open_anndata(local_file_path)
344353
else:
345354
# MTX reconstitution UNTESTED (SCP-4203)
346355
# will want try/except here to catch failed data object composition
347356
adata = DifferentialExpression.adata_from_mtx(
348357
matrix_file_path, genes_path, barcodes_path
349358
)
350359

351-
if not matrix_file_type == "h5ad":
360+
if matrix_file_type == "h5ad":
361+
if orig_adata.raw is not None:
362+
adata = AnnData(
363+
# using .copy() for the AnnData components is good practice
364+
# but we won't be using orig_adata for analyses
365+
# choosing to avoid .copy() for memory efficiency
366+
X = orig_adata.raw.X,
367+
obs = orig_adata.obs,
368+
var = orig_adata.var,
369+
)
370+
else:
371+
msg = f'{matrix_file_path} does not have a .raw attribute'
372+
print(msg)
373+
log_exception(
374+
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg
375+
)
376+
raise ValueError(msg)
377+
# AnnData expects gene x cell so dense and mtx matrices require transposition
378+
else:
352379
adata = adata.transpose()
353380

354-
use_raw = True if matrix_file_type == "h5ad" else False
355-
356381
adata = DifferentialExpression.subset_adata(adata, de_cells)
357382

358-
# will need try/except (SCP-4205)
359-
adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)
383+
# h5ad inputs will already have obs data, only non-h5ad need this step
384+
if not matrix_file_type == "h5ad":
385+
adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)
360386

361387
adata = DifferentialExpression.remove_single_sample_data(adata, annotation)
362388

@@ -369,7 +395,6 @@ def run_scanpy_de(
369395
adata,
370396
annotation,
371397
key_added=rank_key,
372-
use_raw=use_raw,
373398
method=method,
374399
pts=True,
375400
)
@@ -393,13 +418,21 @@ def run_scanpy_de(
393418
for group in groups:
394419
clean_group = DifferentialExpression.sanitize_string(group)
395420
clean_annotation = DifferentialExpression.sanitize_string(annotation)
396-
DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
397421
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
422+
if DifferentialExpression.all_match_ensembl_id_regex(rank, "names"):
423+
feature_name_rank = rank.merge(adata.var['feature_name'], left_on="names", right_index=True, how='left')
424+
feature_name_rank.rename(columns={'names':'feature_id'}, inplace=True)
425+
feature_name_rank.rename(columns={'feature_name':'names'}, inplace=True)
426+
new_column_order = ["names"] + [col for col in feature_name_rank.columns if col not in {"names", "feature_id"}] + ["feature_id"]
427+
feature_name_rank = feature_name_rank[new_column_order]
428+
429+
rank = feature_name_rank
398430
if DifferentialExpression.delimiter_in_gene_name(rank):
399431
DifferentialExpression.extract_gene_id_for_out_file(rank)
400432
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
401433
# Round numbers to 4 significant digits while respecting fixed point
402434
# and scientific notation (note: trailing zeros are removed)
435+
DifferentialExpression.de_logger.info(f"Writing DE output for {clean_group}")
403436
rank.to_csv(out_file, sep='\t', float_format='%.4g')
404437

405438
# Provide h5ad of DE analysis as reference computable object
56 Bytes
Binary file not shown.
1.38 KB
Binary file not shown.
6.58 KB
Binary file not shown.
113 Bytes
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1729700083 # validation cache key
1+
1733232455 # validation cache key
8.27 MB
Binary file not shown.
Binary file not shown.
7.54 KB
Binary file not shown.

0 commit comments

Comments
 (0)