44import scanpy as sc
55import re
66import glob
7+ from anndata import AnnData
78
89try :
910 from monitor import setup_logger , log_exception
@@ -171,7 +172,7 @@ def subset_adata(adata, de_cells):
171172 "subsetting matrix on cells in clustering"
172173 )
173174 matrix_subset_list = np .in1d (adata .obs_names , de_cells )
174- adata = adata [matrix_subset_list ]
175+ adata = adata [matrix_subset_list ]. copy ()
175176 return adata
176177
177178 def execute_de (self ):
@@ -219,8 +220,10 @@ def execute_de(self):
219220 msg ,
220221 )
221222 raise ValueError (msg )
222- except :
223- raise
223+ except Exception as e :
224+ print (e )
225+
226+
224227
225228 @staticmethod
226229 def get_genes (genes_path ):
@@ -241,7 +244,7 @@ def get_genes(genes_path):
241244 "dev_info: Features file contains duplicate identifiers in column 2"
242245 )
243246 print (warning )
244- genes_df ['new_id' ] = genes_df [[0 , 1 ]].agg ('|' .join , axis = 1 )
247+ genes_df ['new_id' ] = genes_df [[1 , 0 ]].agg ('|' .join , axis = 1 )
245248 if genes_df ['new_id' ].count () != genes_df ['new_id' ].nunique ():
246249 msg = "Duplicates in features file even after joining gene_id and gene_name"
247250 log_exception (
@@ -306,10 +309,16 @@ def delimiter_in_gene_name(rank):
306309 @staticmethod
307310 def extract_gene_id_for_out_file (rank ):
308311 """Separate out gene name from gene ID"""
309- rank ['gene_id' ] = rank ['names' ].str .split ('|' ).str [0 ]
310- rank ['names' ] = rank ['names' ].str .split ('|' ).str [1 ]
312+ rank ['feature_id' ] = rank ['names' ].str .split ('|' ).str [1 ]
313+ rank ['names' ] = rank ['names' ].str .split ('|' ).str [0 ]
314+
311315 return rank
312316
317+ @staticmethod
318+ def all_match_ensembl_id_regex (adata , col_name ):
319+ regex = r'ENS[A-Z]{0,4}\d{11}'
320+ return adata [col_name ].astype (str ).str .match (regex ).all ()
321+
313322 @staticmethod
314323 def run_scanpy_de (
315324 cluster ,
@@ -340,23 +349,40 @@ def run_scanpy_de(
340349 elif matrix_file_type == "h5ad" :
341350 matrix_object = IngestFiles (matrix_file_path , None )
342351 local_file_path = matrix_object .resolve_path (matrix_file_path )[1 ]
343- adata = matrix_object .open_anndata (local_file_path )
352+ orig_adata = matrix_object .open_anndata (local_file_path )
344353 else :
345354 # MTX reconstitution UNTESTED (SCP-4203)
346355 # will want try/except here to catch failed data object composition
347356 adata = DifferentialExpression .adata_from_mtx (
348357 matrix_file_path , genes_path , barcodes_path
349358 )
350359
351- if not matrix_file_type == "h5ad" :
360+ if matrix_file_type == "h5ad" :
361+ if orig_adata .raw is not None :
362+ adata = AnnData (
363+ # using .copy() for the AnnData components is good practice
364+ # but we won't be using orig_adata for analyses
365+ # choosing to avoid .copy() for memory efficiency
366+ X = orig_adata .raw .X ,
367+ obs = orig_adata .obs ,
368+ var = orig_adata .var ,
369+ )
370+ else :
371+ msg = f'{ matrix_file_path } does not have a .raw attribute'
372+ print (msg )
373+ log_exception (
374+ DifferentialExpression .dev_logger , DifferentialExpression .de_logger , msg
375+ )
376+ raise ValueError (msg )
377+ # AnnData expects gene x cell so dense and mtx matrices require transposition
378+ else :
352379 adata = adata .transpose ()
353380
354- use_raw = True if matrix_file_type == "h5ad" else False
355-
356381 adata = DifferentialExpression .subset_adata (adata , de_cells )
357382
358- # will need try/except (SCP-4205)
359- adata .obs = DifferentialExpression .order_annots (de_annots , adata .obs_names )
383+ # h5ad inputs will already have obs data, only non-h5ad need this step
384+ if not matrix_file_type == "h5ad" :
385+ adata .obs = DifferentialExpression .order_annots (de_annots , adata .obs_names )
360386
361387 adata = DifferentialExpression .remove_single_sample_data (adata , annotation )
362388
@@ -369,7 +395,6 @@ def run_scanpy_de(
369395 adata ,
370396 annotation ,
371397 key_added = rank_key ,
372- use_raw = use_raw ,
373398 method = method ,
374399 pts = True ,
375400 )
@@ -393,13 +418,21 @@ def run_scanpy_de(
393418 for group in groups :
394419 clean_group = DifferentialExpression .sanitize_string (group )
395420 clean_annotation = DifferentialExpression .sanitize_string (annotation )
396- DifferentialExpression .de_logger .info (f"Writing DE output for { group } " )
397421 rank = sc .get .rank_genes_groups_df (adata , key = rank_key , group = group )
422+ if DifferentialExpression .all_match_ensembl_id_regex (rank , "names" ):
423+ feature_name_rank = rank .merge (adata .var ['feature_name' ], left_on = "names" , right_index = True , how = 'left' )
424+ feature_name_rank .rename (columns = {'names' :'feature_id' }, inplace = True )
425+ feature_name_rank .rename (columns = {'feature_name' :'names' }, inplace = True )
426+ new_column_order = ["names" ] + [col for col in feature_name_rank .columns if col not in {"names" , "feature_id" }] + ["feature_id" ]
427+ feature_name_rank = feature_name_rank [new_column_order ]
428+
429+ rank = feature_name_rank
398430 if DifferentialExpression .delimiter_in_gene_name (rank ):
399431 DifferentialExpression .extract_gene_id_for_out_file (rank )
400432 out_file = f'{ cluster_name } --{ clean_annotation } --{ clean_group } --{ annot_scope } --{ method } .tsv'
401433 # Round numbers to 4 significant digits while respecting fixed point
402434 # and scientific notation (note: trailing zeros are removed)
435+ DifferentialExpression .de_logger .info (f"Writing DE output for { clean_group } " )
403436 rank .to_csv (out_file , sep = '\t ' , float_format = '%.4g' )
404437
405438 # Provide h5ad of DE analysis as reference computable object
0 commit comments