
Commit d625a5e

Merge pull request #109 from yueqixuan/dev
Fix the quantms LFQ/TMT modules
2 parents: faeb920 + 7e02000

File tree

14 files changed: +1124 -1770 lines

quantmsio/commands/convert/quantms.py

Lines changed: 12 additions & 4 deletions
@@ -88,7 +88,10 @@ def convert_quantms_feature_cmd(
     indexer = None
     if database_path and Path(database_path).exists():
         logger.info(f"Opening existing MzTabIndexer at {database_path}")
-        indexer = MzTabIndexer.open(str(database_path))
+        indexer = MzTabIndexer.open(
+            database_path=str(database_path),
+            sdrf_path=sdrf_file,
+        )
     elif database_path and mztab_path:
         logger.info(
             f"Creating new MzTabIndexer at {database_path} from {mztab_path}"
@@ -177,7 +180,9 @@ def convert_quantms_psm_cmd(
     # Determine how to open or create the indexer
     if database_path and Path(database_path).exists():
         logger.info(f"Opening existing MzTabIndexer at {database_path}")
-        indexer = MzTabIndexer.open(str(database_path))
+        indexer = MzTabIndexer.open(
+            database_path=str(database_path),
+        )
     elif database_path and mztab_path:
         logger.info(
             f"Creating new MzTabIndexer at {database_path} from {mztab_path}"
@@ -363,7 +368,10 @@ def convert_quantms_pg_cmd(
     indexer = None
     if database_path and Path(database_path).exists():
         logger.info(f"Opening existing MzTabIndexer at {database_path}")
-        indexer = MzTabIndexer.open(str(database_path))
+        indexer = MzTabIndexer.open(
+            database_path=str(database_path),
+            sdrf_path=sdrf_file,
+        )
     elif database_path and mztab_path:
         logger.info(
             f"Creating new MzTabIndexer at {database_path} from {mztab_path}"
@@ -403,7 +411,7 @@ def convert_quantms_pg_cmd(
         # Convert to parquet and write
         table = mztab_pg._convert_to_parquet_format(result_df)
         pq.write_table(table, str(output_file))
-        logger.info("Successfully wrote protein groups to parquet file")
+        logger.info(f"[Writer] Successfully wrote protein groups to: {output_file}")
 
     except Exception as e:
         logger.exception(f"Error in mzTab protein group conversion: {str(e)}")
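
For context, a minimal sketch of how the reworked MzTabIndexer.open call is driven (the keyword names database_path and sdrf_path come from the diff above; the import path and surrounding variables are assumptions for illustration):

from pathlib import Path

from quantmsio.core.quantms.mztab import MzTabIndexer  # import path assumed

database_path = "quantms.duckdb"  # hypothetical paths
sdrf_file = "experiment.sdrf.tsv"

if Path(database_path).exists():
    # The SDRF is now passed explicitly when reopening an existing index,
    # so sample metadata is available to the feature/pg converters.
    indexer = MzTabIndexer.open(
        database_path=database_path,
        sdrf_path=sdrf_file,
    )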

quantmsio/core/duckdb.py

Lines changed: 22 additions & 1 deletion
@@ -47,7 +47,7 @@ def __init__(
             },
         )
         self.logger.info(
-            f"Time to initialize duckdb {time.time() - start_time} seconds"
+            f"Time to initialize duckdb {(time.time() - start_time):.2f} seconds"
         )
 
     def destroy_database(self):
@@ -56,6 +56,27 @@ def destroy_database(self):
             self._duckdb.close()
             self._duckdb = None
 
+    def cleanup_duckdb(self):
+        """Check if DuckDB connection is closed, then delete the database file."""
+        # Close connection if it is still open
+        if self._duckdb:
+            try:
+                self._duckdb.close()
+                self.logger.info("[Check] DuckDB connection closed.")
+            except Exception as e:
+                self.logger.info(f"Failed to close DuckDB connection: {e}")
+            finally:
+                self._duckdb = None
+
+        db_file = Path(self._database_path)
+        # Delete the database file using pathlib
+        if db_file.exists():
+            try:
+                db_file.unlink()
+                self.logger.info(f"[CleanUp] Database file deleted: {db_file}")
+            except Exception as e:
+                self.logger.info(f"Failed to delete database file: {e}")
+
     def query_to_df(self, query: str) -> pd.DataFrame:
         """Execute query and return result as DataFrame."""
         return self._duckdb.execute(query).df()
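
A minimal sketch of the intended lifecycle of the new cleanup_duckdb method, assuming a DuckDB-backed class exposing these methods (the class name and constructor signature here are illustrative, not confirmed by the diff):

from quantmsio.core.duckdb import DuckDB  # class and import path assumed

db = DuckDB(database_path="quantms.duckdb")  # constructor signature assumed
try:
    df = db.query_to_df("SELECT 1 AS ok")
    print(df)
finally:
    # Unlike destroy_database(), which only closes the connection,
    # cleanup_duckdb() also deletes the on-disk database file.
    db.cleanup_duckdb()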

quantmsio/core/quantms/feature.py

Lines changed: 41 additions & 49 deletions
@@ -1,11 +1,7 @@
 import logging
-import tempfile
 from pathlib import Path
-from typing import Union
-
 import pandas as pd
 import pyarrow as pa
-import pyarrow.parquet as pq
 
 from quantmsio.core.common import FEATURE_SCHEMA
 
@@ -202,7 +198,7 @@ def generate_modifications_details(self, peptidoform, modifications_dict):
                 [
                     {
                         "score_name": "localization_probability",
-                        "score_value": 1.0,
+                        "score_value": None,
                     }
                 ]
                 if mod_name in select_mods
@@ -350,10 +346,6 @@ def _create_file_metadata(self):
         }
 
     def transform_msstats_in(self, file_num=10, protein_str=None):
-        # Check if msstats data is already loaded in the indexer
-        # if not self._indexer._msstats_path:
-        #     # Add msstats data to the existing indexer
-        #     self._indexer.add_msstats_table(self._msstats_in)
 
         # Determine experiment type (LFQ vs TMT)
         experiment_type = self._indexer.get_msstats_experiment_type()
@@ -382,17 +374,15 @@ def _aggregate_msstats_to_features(self, msstats_batch, experiment_type):
         """
 
         # Group by feature identifier (peptidoform + charge + reference file + protein)
-        grouping_cols = ["PeptideSequence", "ProteinName", "reference_file_name"]
+        grouping_cols = ["peptidoform", "pg_accessions", "reference_file_name"]
 
         # Add charge column if available, otherwise use default
-        if "Charge" in msstats_batch.columns:
-            grouping_cols.append("Charge")
-        elif "PrecursorCharge" in msstats_batch.columns:
-            grouping_cols.append("PrecursorCharge")
+        if "charge" in msstats_batch.columns:
+            grouping_cols.append("charge")
         else:
             # Add a default charge if not available
-            msstats_batch["Charge"] = 3
-            grouping_cols.append("Charge")
+            msstats_batch["charge"] = 3
+            grouping_cols.append("charge")
 
         features_list = []
 
@@ -430,7 +420,7 @@ def _aggregate_msstats_to_features(self, msstats_batch, experiment_type):
                 "intensities": intensities,
                 "pg_accessions": [protein_name] if protein_name else [],
                 "anchor_protein": protein_name or "",
-                "rt": first_row.get("RetentionTime", None),
+                "rt": first_row.get("rt", None),
                 # Will add more fields in subsequent processing steps
             }
@@ -588,40 +578,42 @@ def write_feature_to_file(
         batch_writer.close()
 
         if Path(output_path).exists():
-            self.logger.info(f"Feature file written to {output_path}")
+            self.logger.info(
+                f"[Writer] Successfully wrote Feature to: {output_path}"
+            )
 
         # Clean up the temporary MzTabIndexer
-        self._indexer.destroy_database()
-
-    def write_features_to_file(
-        self,
-        output_folder,
-        filename,
-        partitions,
-        file_num=10,
-        protein_file=None,
-        duckdb_max_memory="16GB",
-        duckdb_threads=4,
-    ):
-        logger = logging.getLogger("quantmsio.core.feature")
-
-        # Log input and output paths
-        logger.info(f"Input mzTab file: {self._indexer._mztab_path}")
-        logger.info(f"Output folder: {output_folder}")
-        logger.info(f"Base filename: {filename}")
-        if protein_file:
-            logger.info(f"Protein filter file: {protein_file}")
-
-        pqwriters = {}
-        protein_list = extract_protein_list(protein_file) if protein_file else None
-        protein_str = "|".join(protein_list) if protein_list else None
-        for key, feature in self.generate_slice_feature(
-            partitions, file_num, protein_str, duckdb_max_memory, duckdb_threads
-        ):
-            pqwriters = save_slice_file(
-                feature, pqwriters, output_folder, key, filename
-            )
-        close_file(pqwriters)
+        self._indexer.cleanup_duckdb()
+
+    # def write_features_to_file(
+    #     self,
+    #     output_folder,
+    #     filename,
+    #     partitions,
+    #     file_num=10,
+    #     protein_file=None,
+    #     duckdb_max_memory="16GB",
+    #     duckdb_threads=4,
+    # ):
+    #     logger = logging.getLogger("quantmsio.core.feature")
+
+    #     # Log input and output paths
+    #     logger.info(f"Input mzTab file: {self._indexer._mztab_path}")
+    #     logger.info(f"Output folder: {output_folder}")
+    #     logger.info(f"Base filename: {filename}")
+    #     if protein_file:
+    #         logger.info(f"Protein filter file: {protein_file}")

+    #     pqwriters = {}
+    #     protein_list = extract_protein_list(protein_file) if protein_file else None
+    #     protein_str = "|".join(protein_list) if protein_list else None
+    #     for key, feature in self.generate_slice_feature(
+    #         partitions, file_num, protein_str, duckdb_max_memory, duckdb_threads
+    #     ):
+    #         pqwriters = save_slice_file(
+    #             feature, pqwriters, output_folder, key, filename
+    #         )
+    #     close_file(pqwriters)
 
     @staticmethod
     def generate_best_scan(rows, pep_dict):
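
The _aggregate_msstats_to_features change above standardizes on lowercase column names (peptidoform, pg_accessions, charge, rt). A toy pandas sketch of the grouping logic it implements, under the assumption of an Intensity column and made-up values:

import pandas as pd

# Toy msstats-style batch using the new lowercase column names
msstats_batch = pd.DataFrame(
    {
        "peptidoform": ["PEPTIDER", "PEPTIDER", "ANOTHERK"],
        "pg_accessions": ["P12345", "P12345", "Q67890"],
        "reference_file_name": ["run1", "run1", "run1"],
        "Intensity": [1.0e6, 2.0e6, 5.0e5],
    }
)

grouping_cols = ["peptidoform", "pg_accessions", "reference_file_name"]
if "charge" in msstats_batch.columns:
    grouping_cols.append("charge")
else:
    # Mirror the diff: default to charge 3 when the column is absent
    msstats_batch["charge"] = 3
    grouping_cols.append("charge")

for key, group in msstats_batch.groupby(grouping_cols):
    print(key, group["Intensity"].sum())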
