# MsstatsIN functionality now available in MzTabIndexer
  from quantmsio.core.quantms.mztab import MzTabIndexer
- from quantmsio.core.quantms.psm import Psm
- from quantmsio.core.sdrf import SDRFHandler
- from quantmsio.operate.tools import get_protein_accession, get_ahocorasick
+ from quantmsio.operate.tools import get_protein_accession
  from quantmsio.utils.file_utils import (
-     close_file,
      extract_protein_list,
-     save_slice_file,
      ParquetBatchWriter,
  )
@@ -61,11 +57,17 @@ def extract_psm_msg(self, protein_str=None):
          """
          # Get PSMs from the indexer
          try:
+             self.logger.info("Get PSMs from the indexer.")
              psms_df = self._indexer.get_psms()
              if psms_df.empty:
                  return {}, {}

+             self.logger.info("Get metadata from the indexer.")
              metadata_df = self._indexer.get_metadata()
+
+             self.logger.info(
+                 "Calling get_ms_run_reference_map to retrieve MS run reference map."
+             )
              ms_reference_map = self.get_ms_run_reference_map(psms_df, metadata_df)
              psms_df["reference_file_name"] = psms_df["spectra_ref_file"].map(
                  ms_reference_map
@@ -81,6 +83,7 @@ def extract_psm_msg(self, protein_str=None):
              pep_dict = {}

              # Create mapping dictionaries
+             self.logger.info("Start creating mapping dictionaries.")
              for _, row in psms_df.iterrows():
                  # Map key: (reference_file_name, peptidoform, precursor_charge)
                  reference_file_name = row.get("reference_file_name", "")
@@ -116,6 +119,8 @@ def extract_psm_msg(self, protein_str=None):
                      scan_info[1] if len(scan_info) > 1 else "",
                  ]

+             self.logger.info("Finished creating mapping dictionaries.")
+
              return map_dict, pep_dict

          except Exception as e:
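
The added logging brackets the mapping pass, which keys each PSM by `(reference_file_name, peptidoform, precursor_charge)` and stores its scan information. Below is a minimal, self-contained sketch of that pattern; the toy data and the assumed `scan` string layout are illustrative only, not the project's real PSM schema:

```python
import pandas as pd

# Toy PSM table with only the columns the mapping loop relies on.
psms_df = pd.DataFrame(
    {
        "reference_file_name": ["run_01", "run_01"],
        "peptidoform": ["PEPTIDEK", "PEPTIDEK"],
        "precursor_charge": [2, 3],
        "scan": ["controllerType=0 scan=1501", "controllerType=0 scan=2200"],
    }
)

map_dict = {}
for _, row in psms_df.iterrows():
    # Map key: (reference_file_name, peptidoform, precursor_charge), as in the diff.
    key = (row["reference_file_name"], row["peptidoform"], row["precursor_charge"])
    scan_info = row["scan"].split(" ")
    # Keep the second token when present, mirroring scan_info[1] in the diff.
    map_dict[key] = [scan_info[1] if len(scan_info) > 1 else ""]

print(map_dict)
# {('run_01', 'PEPTIDEK', 2): ['scan=1501'], ('run_01', 'PEPTIDEK', 3): ['scan=2200']}
```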
@@ -367,6 +372,17 @@ def transform_msstats_in(self, file_num=10, protein_str=None):
              ]

              if not batch.empty:
+
+                 # Unique peptide indicator from PSM Table
+                 unique_peptide_df = self._indexer.get_unique_from_psm_table()
+
+                 batch = pd.merge(
+                     batch,
+                     unique_peptide_df,
+                     on=["pg_accessions", "peptidoform"],
+                     how="left",
+                 )
+
                  # Aggregate data to create feature-level records with intensities array
                  aggregated_features = self._aggregate_msstats_to_features(
                      batch, experiment_type
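
The new block left-joins a unique-peptide indicator from the PSM table onto each msstats batch on `pg_accessions` and `peptidoform`. A small sketch of what that merge produces; the toy frames and the exact column layout returned by `get_unique_from_psm_table()` are assumptions:

```python
import pandas as pd

batch = pd.DataFrame(
    {
        "pg_accessions": ["P12345", "P12345", "Q67890"],
        "peptidoform": ["PEPTIDEK", "SEQUENCEK", "ANOTHERK"],
        "intensity": [1.0e6, 2.5e6, 3.1e5],
    }
)

# Assumed shape: one indicator row per (protein group, peptidoform) pair.
unique_peptide_df = pd.DataFrame(
    {
        "pg_accessions": ["P12345", "Q67890"],
        "peptidoform": ["PEPTIDEK", "ANOTHERK"],
        "unique": [1, 0],
    }
)

# A left join keeps every msstats row; rows without a PSM match get NaN in "unique".
merged = pd.merge(
    batch, unique_peptide_df, on=["pg_accessions", "peptidoform"], how="left"
)
print(merged)
```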
@@ -412,7 +428,7 @@ def _aggregate_msstats_to_features(self, msstats_batch, experiment_type):
                          if "channel" in row and row["channel"] is not None
                          else ("LFQ" if experiment_type == "LFQ" else "Unknown")
                      ),
-                     "intensity": float(row.get("Intensity", 0.0)),
+                     "intensity": float(row.get("intensity", 0.0)),
                  }
                  intensities.append(intensity_entry)

@@ -428,6 +444,7 @@ def _aggregate_msstats_to_features(self, msstats_batch, experiment_type):
                  "pg_accessions": [protein_name] if protein_name else [],
                  "anchor_protein": protein_name or "",
                  "rt": first_row.get("rt", None),
+                 "unique": float(first_row.get("unique", 1)),
                  # Will add more fields in subsequent processing steps
              }

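
The aggregation then reads the joined indicator with `first_row.get("unique", 1)`. Worth noting: with dict-like access the `1` default only applies when the key is absent, not when a left-join miss left `NaN` behind, as the sketch below shows (toy rows, assuming `first_row` behaves like a pandas `Series`):

```python
import math

import pandas as pd

matched = pd.Series({"rt": 12.3, "unique": 1.0})             # join found a PSM row
unmatched = pd.Series({"rt": 45.6, "unique": float("nan")})  # left-join miss -> NaN
no_column = pd.Series({"rt": 78.9})                          # column absent entirely

# Series.get falls back to the default only when the key is missing.
print(float(matched.get("unique", 1)))    # 1.0
print(float(unmatched.get("unique", 1)))  # nan, not 1.0
print(float(no_column.get("unique", 1)))  # 1.0
print(math.isnan(float(unmatched.get("unique", 1))))  # True
```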
@@ -471,8 +488,16 @@ def merge_psm(rows, index):
          )

      def generate_feature(self, file_num=10, protein_str=None):
+
+         feature_count = 0
          for msstats in self.generate_feature_report(file_num, protein_str):
              feature = self.transform_feature(msstats)
+
+             feature_count += len(feature)
+             self.logger.info(
+                 f"Generated {len(feature)} features, {feature_count} in total."
+             )
+
              yield feature

      def generate_feature_report(self, file_num=10, protein_str=None):
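
The new counters call `len()` on the value returned by `transform_feature`, which, as the next hunk's context shows, is a `pyarrow.Table`; `len()` on a Table is its row count. A minimal sketch with toy records:

```python
import pyarrow as pa

# transform_feature builds a pyarrow Table; len(table) equals table.num_rows.
feature = pa.Table.from_pylist(
    [
        {"peptidoform": "PEPTIDEK", "precursor_charge": 2},
        {"peptidoform": "ANOTHERK", "precursor_charge": 3},
    ]
)

feature_count = 0
feature_count += len(feature)
print(f"Generated {len(feature)} features, {feature_count} in total.")  # 2 and 2
```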
@@ -525,34 +550,6 @@ def merge_psm(rows, index):
              axis=1,
          )

-     @staticmethod
-     def slice(df, partitions):
-         cols = df.columns
-         if not isinstance(partitions, list):
-             raise Exception(f"{partitions} is not a list")
-         if len(partitions) == 0:
-             raise Exception(f"{partitions} is empty")
-         for partion in partitions:
-             if partion not in cols:
-                 raise Exception(f"{partion} does not exist")
-         for key, df in df.groupby(partitions):
-             yield key, df
-
-     def generate_slice_feature(
-         self,
-         partitions,
-         file_num=10,
-         protein_str=None,
-         duckdb_max_memory="16GB",
-         duckdb_threads=4,
-     ):
-         for msstats in self.generate_feature_report(
-             file_num, protein_str, duckdb_max_memory, duckdb_threads
-         ):
-             for key, df in self.slice(msstats, partitions):
-                 feature = self.transform_feature(df)
-                 yield key, feature
-
      @staticmethod
      def transform_feature(df):
          return pa.Table.from_pandas(df, schema=FEATURE_SCHEMA)
@@ -577,10 +574,7 @@ def write_feature_to_file(
          try:
              for feature_df in self.generate_feature(file_num, protein_str):
                  if feature_df.num_rows > 0:
-                     # The schema is applied when creating the table
-                     feature_df = feature_df.to_pandas()
-                     records = feature_df.to_dict("records")
-                     batch_writer.write_batch(records)
+                     batch_writer.write_batch(feature_df.to_pylist())
          finally:
              batch_writer.close()

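
The rewrite replaces the pandas round trip with `pyarrow.Table.to_pylist()`, which yields the same list of per-row dictionaries directly from the Arrow table. A quick equivalence sketch on a toy table:

```python
import pyarrow as pa

feature_df = pa.Table.from_pylist(
    [
        {"peptidoform": "PEPTIDEK", "intensity": 1.0e6},
        {"peptidoform": "ANOTHERK", "intensity": 3.1e5},
    ]
)

# Old path: Arrow -> pandas -> list of dicts.
records_via_pandas = feature_df.to_pandas().to_dict("records")

# New path: list of dicts straight from the Arrow table, no pandas copy.
records_via_pylist = feature_df.to_pylist()

assert records_via_pandas == records_via_pylist
print(records_via_pylist)
```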
@@ -592,36 +586,6 @@ def write_feature_to_file(
          # Clean up the temporary MzTabIndexer
          self._indexer.cleanup_duckdb()

-     # def write_features_to_file(
-     #     self,
-     #     output_folder,
-     #     filename,
-     #     partitions,
-     #     file_num=10,
-     #     protein_file=None,
-     #     duckdb_max_memory="16GB",
-     #     duckdb_threads=4,
-     # ):
-     #     logger = logging.getLogger("quantmsio.core.feature")
-
-     #     # Log input and output paths
-     #     logger.info(f"Input mzTab file: {self._indexer._mztab_path}")
-     #     logger.info(f"Output folder: {output_folder}")
-     #     logger.info(f"Base filename: {filename}")
-     #     if protein_file:
-     #         logger.info(f"Protein filter file: {protein_file}")
-
-     #     pqwriters = {}
-     #     protein_list = extract_protein_list(protein_file) if protein_file else None
-     #     protein_str = "|".join(protein_list) if protein_list else None
-     #     for key, feature in self.generate_slice_feature(
-     #         partitions, file_num, protein_str, duckdb_max_memory, duckdb_threads
-     #     ):
-     #         pqwriters = save_slice_file(
-     #             feature, pqwriters, output_folder, key, filename
-     #         )
-     #     close_file(pqwriters)
-
      @staticmethod
      def generate_best_scan(rows, pep_dict):
          key = (rows["peptidoform"], rows["precursor_charge"])
@@ -679,7 +643,6 @@ def add_additional_msg(self, msstats, pep_dict):
          msstats.loc[:, "ion_mobility"] = None
          msstats.loc[:, "start_ion_mobility"] = None
          msstats.loc[:, "stop_ion_mobility"] = None
-         msstats.loc[:, "unique"] = None  # Will be set based on protein mapping

      def _extract_sequence_from_peptidoform(self, peptidoform):
          """Extract plain sequence from peptidoform by removing modifications"""
@@ -839,8 +802,13 @@ def convert_to_parquet_format(res):
          for col in complex_columns:
              if col in res.columns:
                  # Ensure proper structure for complex fields
-                 if col == "intensities" or col == "modifications":
+                 if (
+                     col == "intensities"
+                     or col == "modifications"
+                     or col == "additional_scores"
+                 ):
                      res[col] = res[col].apply(Feature._ensure_list_type)
+
                  elif col == "file_metadata":
                      # file_metadata should be a dict for each record
                      res[col] = res[col].apply(Feature._ensure_dict_type)
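
A style note on the widened condition: the three chained `or` comparisons are equivalent to a single membership test, sketched below (an alternative only; the diff keeps the explicit comparisons):

```python
# Equivalent to: col == "intensities" or col == "modifications" or col == "additional_scores"
LIST_TYPED_COLUMNS = {"intensities", "modifications", "additional_scores"}

for col in ["intensities", "file_metadata", "additional_scores", "rt"]:
    if col in LIST_TYPED_COLUMNS:
        print(f"{col}: apply list-type normalization")
    elif col == "file_metadata":
        print(f"{col}: apply dict-type normalization")
```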