@@ -1,11 +1,7 @@
 import logging
-import tempfile
 from pathlib import Path
-from typing import Union
-
 import pandas as pd
 import pyarrow as pa
-import pyarrow.parquet as pq
 
 from quantmsio.core.common import FEATURE_SCHEMA
 
@@ -202,7 +198,7 @@ def generate_modifications_details(self, peptidoform, modifications_dict):
                 [
                     {
                         "score_name": "localization_probability",
-                        "score_value": 1.0,
+                        "score_value": None,
                     }
                 ]
                 if mod_name in select_mods
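The change above stops hard-coding a localization probability of 1.0 and writes None instead, so the score is left empty rather than fabricated. A minimal standalone sketch of the resulting score entry (the helper name and signature are made up for illustration; the real code builds this inside generate_modifications_details):

```python
def build_localization_scores(mod_name, select_mods):
    # Only selected modifications get a localization score entry;
    # the value stays None until a real probability is available.
    if mod_name not in select_mods:
        return None
    return [{"score_name": "localization_probability", "score_value": None}]


print(build_localization_scores("Phospho", {"Phospho"}))
# [{'score_name': 'localization_probability', 'score_value': None}]
```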
@@ -350,10 +346,6 @@ def _create_file_metadata(self):
         }
 
     def transform_msstats_in(self, file_num=10, protein_str=None):
-        # Check if msstats data is already loaded in the indexer
-        # if not self._indexer._msstats_path:
-        #     # Add msstats data to the existing indexer
-        #     self._indexer.add_msstats_table(self._msstats_in)
 
         # Determine experiment type (LFQ vs TMT)
         experiment_type = self._indexer.get_msstats_experiment_type()
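With the stale commented-out guard removed, transform_msstats_in now goes straight to asking the indexer for the experiment type so label-free and isobaric (TMT) data can be handled differently. A hedged sketch of that kind of dispatch (the "LFQ" string and the handler names are assumptions, not taken from this diff):

```python
def dispatch_by_experiment_type(indexer, handle_lfq, handle_tmt):
    # Ask the mzTab/msstats indexer which quantification scheme was used.
    experiment_type = indexer.get_msstats_experiment_type()

    # Assumed convention: plain label-free data is reported as "LFQ",
    # anything else is treated as an isobaric (TMT-style) design.
    if str(experiment_type).upper() == "LFQ":
        return handle_lfq()
    return handle_tmt()
```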
@@ -382,17 +374,15 @@ def _aggregate_msstats_to_features(self, msstats_batch, experiment_type):
         """
 
         # Group by feature identifier (peptidoform + charge + reference file + protein)
-        grouping_cols = ["PeptideSequence", "ProteinName", "reference_file_name"]
+        grouping_cols = ["peptidoform", "pg_accessions", "reference_file_name"]
 
         # Add charge column if available, otherwise use default
-        if "Charge" in msstats_batch.columns:
-            grouping_cols.append("Charge")
-        elif "PrecursorCharge" in msstats_batch.columns:
-            grouping_cols.append("PrecursorCharge")
+        if "charge" in msstats_batch.columns:
+            grouping_cols.append("charge")
         else:
             # Add a default charge if not available
-            msstats_batch["Charge"] = 3
-            grouping_cols.append("Charge")
+            msstats_batch["charge"] = 3
+            grouping_cols.append("charge")
 
         features_list = []
 
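The grouping now keys on quantms.io column names (peptidoform, pg_accessions, reference_file_name, charge) instead of raw MSstats headers, with charge defaulting to 3 when the column is missing. A small pandas sketch of the same grouping logic (standalone; the example batch and its intensity column are hypothetical):

```python
import pandas as pd


def group_msstats_batch(msstats_batch: pd.DataFrame):
    # Feature identity: peptidoform + protein group + reference file (+ charge).
    grouping_cols = ["peptidoform", "pg_accessions", "reference_file_name"]
    if "charge" in msstats_batch.columns:
        grouping_cols.append("charge")
    else:
        # Fall back to a default charge of 3 when the column is absent.
        msstats_batch["charge"] = 3
        grouping_cols.append("charge")
    return msstats_batch.groupby(grouping_cols, dropna=False)


# Hypothetical two-row batch; both rows collapse into one feature group.
batch = pd.DataFrame(
    {
        "peptidoform": ["PEPTIDEK", "PEPTIDEK"],
        "pg_accessions": ["P12345", "P12345"],
        "reference_file_name": ["run_01", "run_01"],
        "intensity": [1.0e6, 1.2e6],
    }
)
for key, rows in group_msstats_batch(batch):
    print(key, len(rows))  # ('PEPTIDEK', 'P12345', 'run_01', 3) 2
```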
@@ -430,7 +420,7 @@ def _aggregate_msstats_to_features(self, msstats_batch, experiment_type):
                 "intensities": intensities,
                 "pg_accessions": [protein_name] if protein_name else [],
                 "anchor_protein": protein_name or "",
-                "rt": first_row.get("RetentionTime", None),
+                "rt": first_row.get("rt", None),
                 # Will add more fields in subsequent processing steps
             }
 
@@ -588,40 +578,42 @@ def write_feature_to_file(
         batch_writer.close()
 
         if Path(output_path).exists():
-            self.logger.info(f"Feature file written to {output_path}")
+            self.logger.info(
+                f"[Writer] Successfully wrote Feature to: {output_path}"
+            )
 
         # Clean up the temporary MzTabIndexer
-        self._indexer.destroy_database()
-
-    def write_features_to_file(
-        self,
-        output_folder,
-        filename,
-        partitions,
-        file_num=10,
-        protein_file=None,
-        duckdb_max_memory="16GB",
-        duckdb_threads=4,
-    ):
-        logger = logging.getLogger("quantmsio.core.feature")
-
-        # Log input and output paths
-        logger.info(f"Input mzTab file: {self._indexer._mztab_path}")
-        logger.info(f"Output folder: {output_folder}")
-        logger.info(f"Base filename: {filename}")
-        if protein_file:
-            logger.info(f"Protein filter file: {protein_file}")
-
-        pqwriters = {}
-        protein_list = extract_protein_list(protein_file) if protein_file else None
-        protein_str = "|".join(protein_list) if protein_list else None
-        for key, feature in self.generate_slice_feature(
-            partitions, file_num, protein_str, duckdb_max_memory, duckdb_threads
-        ):
-            pqwriters = save_slice_file(
-                feature, pqwriters, output_folder, key, filename
-            )
-        close_file(pqwriters)
+        self._indexer.cleanup_duckdb()
+
+    # def write_features_to_file(
+    #     self,
+    #     output_folder,
+    #     filename,
+    #     partitions,
+    #     file_num=10,
+    #     protein_file=None,
+    #     duckdb_max_memory="16GB",
+    #     duckdb_threads=4,
+    # ):
+    #     logger = logging.getLogger("quantmsio.core.feature")
+
+    #     # Log input and output paths
+    #     logger.info(f"Input mzTab file: {self._indexer._mztab_path}")
+    #     logger.info(f"Output folder: {output_folder}")
+    #     logger.info(f"Base filename: {filename}")
+    #     if protein_file:
+    #         logger.info(f"Protein filter file: {protein_file}")
+
+    #     pqwriters = {}
+    #     protein_list = extract_protein_list(protein_file) if protein_file else None
+    #     protein_str = "|".join(protein_list) if protein_list else None
+    #     for key, feature in self.generate_slice_feature(
+    #         partitions, file_num, protein_str, duckdb_max_memory, duckdb_threads
+    #     ):
+    #         pqwriters = save_slice_file(
+    #             feature, pqwriters, output_folder, key, filename
+    #         )
+    #     close_file(pqwriters)
 
     @staticmethod
     def generate_best_scan(rows, pep_dict):
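Besides the friendlier [Writer] log line, the writer now releases the temporary mzTab index with cleanup_duckdb() instead of destroy_database(), and the old eager write_features_to_file path survives only as commented-out reference code. A rough sketch of the resulting write-then-cleanup flow (function and parameter names here are placeholders, not the class's real API):

```python
import logging
from pathlib import Path


def finalize_feature_write(batch_writer, indexer, output_path):
    logger = logging.getLogger("quantmsio.core.feature")

    # Flush and close the Parquet batch writer first.
    batch_writer.close()

    # Only report success if the file actually landed on disk.
    if Path(output_path).exists():
        logger.info(f"[Writer] Successfully wrote Feature to: {output_path}")

    # Release the temporary DuckDB resources held by the mzTab indexer.
    indexer.cleanup_duckdb()
```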