bigbio
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 2 additions & 1 deletion
diff --git a/quantmsio/commands/convert/maxquant.py b/quantmsio/commands/convert/maxquant.py
Lines changed: 35 additions & 37 deletions
diff --git a/quantmsio/core/common.py b/quantmsio/core/common.py
Lines changed: 6 additions & 1 deletion
diff --git a/quantmsio/core/diann/diann.py b/quantmsio/core/diann/diann.py
Lines changed: 13 additions & 15 deletions
diff --git a/quantmsio/core/duckdb.py b/quantmsio/core/duckdb.py
Lines changed: 1 addition & 4 deletions
diff --git a/quantmsio/core/format.py b/quantmsio/core/format.py
Lines changed: 2 additions & 0 deletions
@@ -32,7 +32,7 @@ biopython = "*"
 seaborn = "*"
 numpy = "*"
 matplotlib = "*"
-duckdb = "*"
+duckdb = ">=1.1.3,<=1.3.0"
 mygene = "*"
 pyahocorasick = "*"
 swifter = "*"
@@ -62,4 +62,5 @@ pattern = "^(?P<base>\\d+\\.\\d+\\.\\d+)$"
 [tool.pytest.ini_options]
 markers = [
     "integration: marks tests as integration tests (deselect with '-m \"not integration\"')",
+    "large_data: marks tests as requiring large data files (deselect with '-m \"not large_data\"')",
 ]
@@ -74,22 +74,20 @@ def convert_maxquant_psm_cmd(
         if not all([msms_file, output_folder]):
             raise click.UsageError("ERROR: Please provide all required parameters")
 
-        # Ensure output directory exists
         output_folder = Path(output_folder)
         output_folder.mkdir(parents=True, exist_ok=True)
         logger.info(f"Using output directory: {output_folder}")
 
-        # Set default prefix if not provided
         prefix = output_prefix or "psm"
         filename = create_uuid_filename(prefix, ".psm.parquet")
         output_path = output_folder / filename
         logger.info(f"Will save PSM file as: {filename}")
 
         logger.info("Initializing MaxQuant PSM converter...")
-        mq = MaxQuant()
+        processor = MaxQuant()
 
         logger.info(f"Starting PSM conversion (batch size: {batch_size:,})...")
-        mq.write_psm_to_file(
+        processor.process_psm_file(
             msms_path=str(msms_file), output_path=str(output_path), chunksize=batch_size
         )
         logger.info(f"PSM file successfully saved to: {output_path}")
@@ -126,6 +124,11 @@ def convert_maxquant_psm_cmd(
     help="Protein file with specific requirements",
     type=click.Path(exists=True, dir_okay=False, path_type=Path),
 )
+@click.option(
+    "--protein-groups-file",
+    help="MaxQuant proteinGroups.txt file for Q-value mapping (optional)",
+    type=click.Path(exists=True, dir_okay=False, path_type=Path),
+)
 @click.option(
     "--partitions",
     help="Field(s) used for splitting files (comma-separated)",
@@ -146,6 +149,7 @@ def convert_maxquant_feature_cmd(
     sdrf_file: Path,
     output_folder: Path,
     protein_file: Optional[Path],
+    protein_groups_file: Optional[Path],
     partitions: Optional[str],
     batch_size: int,
     output_prefix: Optional[str],
@@ -162,6 +166,7 @@ def convert_maxquant_feature_cmd(
             --evidence-file evidence.txt \\
             --sdrf-file data.sdrf.tsv \\
             --output-folder ./output \\
+            --protein-groups-file proteinGroups.txt \\
             --batch-size 1000000
     """
     logger = get_logger("quantmsio.commands.maxquant")
@@ -173,44 +178,43 @@ def convert_maxquant_feature_cmd(
         if not all([evidence_file, sdrf_file, output_folder]):
             raise click.UsageError("ERROR: Please provide all required parameters")
 
-        # Ensure output directory exists
         output_folder = Path(output_folder)
         output_folder.mkdir(parents=True, exist_ok=True)
         logger.info(f"Using output directory: {output_folder}")
 
-        # Set default prefix if not provided
         prefix = output_prefix or "feature"
         filename = create_uuid_filename(prefix, ".feature.parquet")
         output_path = output_folder / filename
         logger.info(f"Will save feature file as: {filename}")
 
         logger.info("Initializing MaxQuant feature converter...")
-        mq = MaxQuant()
+        processor = MaxQuant()
 
         if not partitions:
             logger.info(f"Starting feature conversion (batch size: {batch_size:,})...")
-            mq.write_feature_to_file(
+
+            if protein_groups_file:
+                logger.info(
+                    f"Using proteinGroups file for Q-value mapping: {protein_groups_file}"
+                )
+                processor._init_protein_group_qvalue_mapping(str(protein_groups_file))
+            else:
+                logger.info(
+                    "No proteinGroups file provided, auto-detection will be used"
+                )
+
+            processor.process_feature_file(
                 evidence_path=str(evidence_file),
-                sdrf_path=str(sdrf_file),
                 output_path=str(output_path),
-                chunksize=batch_size,
+                sdrf_path=str(sdrf_file),
                 protein_file=str(protein_file) if protein_file else None,
+                chunksize=batch_size,
             )
             logger.info(f"Feature file successfully saved to: {output_path}")
         else:
-            logger.info(f"Starting partitioned feature conversion using: {partitions}")
-            partition_list = partitions.split(",")
-            mq.write_features_to_file(
-                evidence_path=str(evidence_file),
-                sdrf_path=str(sdrf_file),
-                output_folder=str(output_folder),
-                filename=filename,
-                partitions=partition_list,
-                chunksize=batch_size,
-                protein_file=str(protein_file) if protein_file else None,
-            )
-            logger.info(
-                f"Partitioned feature files successfully saved to: {output_folder}"
+            logger.error("Partitioned conversion not implemented")
+            raise click.ClickException(
+                "Partitioned conversion feature is not yet available. Please use the standard conversion without --partitions."
             )
 
     except Exception as e:
@@ -240,14 +244,9 @@ def convert_maxquant_feature_cmd(
     required=True,
     type=click.Path(file_okay=False, path_type=Path),
 )
-@click.option(
-    "--protein-file",
-    help="Protein file with specific requirements",
-    type=click.Path(exists=True, dir_okay=False, path_type=Path),
-)
 @click.option(
     "--batch-size",
-    help="Read batch size",
+    help="Batch size (for logging purposes only)",
     default=1000000,
     type=int,
 )
@@ -260,7 +259,6 @@ def convert_maxquant_pg_cmd(
     protein_groups_file: Path,
     sdrf_file: Path,
     output_folder: Path,
-    protein_file: Optional[Path],
     batch_size: int,
     output_prefix: Optional[str],
     verbose: bool = False,
@@ -287,29 +285,25 @@ def convert_maxquant_pg_cmd(
         if not all([protein_groups_file, sdrf_file, output_folder]):
             raise click.UsageError("ERROR: Please provide all required parameters")
 
-        # Ensure output directory exists
         output_folder = Path(output_folder)
         output_folder.mkdir(parents=True, exist_ok=True)
         logger.info(f"Using output directory: {output_folder}")
 
-        # Set default prefix if not provided
         prefix = output_prefix or "pg"
         filename = create_uuid_filename(prefix, ".pg.parquet")
         output_path = output_folder / filename
         logger.info(f"Will save protein groups file as: {filename}")
 
         logger.info("Initializing MaxQuant protein groups converter...")
-        mq = MaxQuant()
+        processor = MaxQuant()
 
         logger.info(
             f"Starting protein groups conversion (batch size: {batch_size:,})..."
         )
-        mq.write_protein_groups_to_file(
+        processor.process_pg_file(
             protein_groups_path=str(protein_groups_file),
-            sdrf_path=str(sdrf_file),
             output_path=str(output_path),
-            chunksize=batch_size,
-            protein_file=str(protein_file) if protein_file else None,
+            sdrf_path=str(sdrf_file),
         )
         logger.info(f"Protein groups file successfully saved to: {output_path}")
 
@@ -318,3 +312,7 @@ def convert_maxquant_pg_cmd(
             f"Error in MaxQuant protein groups conversion: {str(e)}", exc_info=True
         )
         raise click.ClickException(f"Error: {str(e)}\nCheck the logs for more details.")
+
+
+if __name__ == "__main__":
+    convert()
@@ -124,7 +124,7 @@
 
 MAXQUANT_PSM_MAP = {
     "Sequence": "sequence",
-    "Proteins": "mp_accessions",
+    "Proteins": "protein_accessions",
     "PEP": "posterior_error_probability",
     "Modified sequence": "peptidoform",
     "Reverse": "is_decoy",
@@ -136,12 +136,16 @@
     "Score": "andromeda_score",
     "Delta score": "andromeda_delta_score",
     "PIF": "parent_ion_fraction",
+    "Masses": "mz_array",
+    "Intensities": "intensity_array",
+    "Number of matches": "number_peaks",
 }
 
 MAXQUANT_FEATURE_MAP = {
     "Sequence": "sequence",
     "Proteins": "mp_accessions",
     "Leading proteins": "pg_accessions",
+    "Leading razor protein": "anchor_protein",
     "Gene names": "gg_names",
     "PEP": "posterior_error_probability",
     "Modified sequence": "peptidoform",
@@ -156,6 +160,7 @@
     "Calibrated retention time": "rt",
     "Calibrated retention time start": "rt_start",
     "Calibrated retention time finish": "rt_stop",
+    "Intensity": "intensity",
 }
 
 IBAQ_USECOLS = [
 
@@ -40,7 +40,7 @@
 DIANN_PG_SQL = ", ".join([f'"{name}"' for name in DIANN_PG_USECOLS])
 
 
-class DiaNNConvert:
+class DiaNNConvert(DiannDuckDB):
     """Convert DIA-NN report to quantms.io format."""
 
     def __init__(
@@ -51,8 +51,11 @@ def __init__(
         duckdb_max_memory="16GB",
         duckdb_threads=4,
     ):
-        super(DiaNNConvert, self).__init__(
-            diann_report, duckdb_max_memory, duckdb_threads
+        super().__init__(
+            diann_report_path=diann_report,
+            max_memory=duckdb_max_memory,
+            worker_threads=duckdb_threads,
+            pg_matrix_path=pg_matrix_path,
         )
         if pg_matrix_path:
             self.pg_matrix = self.get_pg_matrix(pg_matrix_path)
@@ -62,8 +65,7 @@ def __init__(
 
     def destroy_duckdb_database(self):
         """Clean up DuckDB resources."""
-        if self._duckdb:
-            self._duckdb.destroy_database()
+        self.destroy_database()
 
     def get_report_from_database(
         self, runs: list, sql: str = DIANN_SQL
@@ -78,7 +80,7 @@ def get_report_from_database(
             DataFrame with report data
         """
         s = time.time()
-        report = self._duckdb.query_to_df(
+        report = self.query_to_df(
             """
             select {}
             from report
@@ -92,7 +94,7 @@ def get_report_from_database(
         return report
 
     def get_masses_and_modifications_map(self):
-        database = self._duckdb.query_to_df(
+        database = self.query_to_df(
             """
             select DISTINCT "Modified.Sequence" from report
             """
@@ -105,7 +107,7 @@ def get_masses_and_modifications_map(self):
 
     def get_peptide_map_from_database(self):
         s = time.time()
-        database = self._duckdb.query_to_df(
+        database = self.query_to_df(
             """
             SELECT "Precursor.Id","Q.Value","Run"
             FROM (
@@ -192,9 +194,7 @@ def generate_pg_matrix(self, report):
         ].apply(
             lambda rows: [
                 {
-                    "sample_accession": self._sample_map[
-                        rows["reference_file_name"] + "-LFQ"
-                    ],
+                    "sample_accession": self._sample_map[rows["reference_file_name"]],
                     "channel": "LFQ",
                     "intensity": rows["pg_quantity"],
                 }
@@ -208,9 +208,7 @@ def generate_pg_matrix(self, report):
         ].apply(
             lambda rows: [
                 {
-                    "sample_accession": self._sample_map[
-                        rows["reference_file_name"] + "-LFQ"
-                    ],
+                    "sample_accession": self._sample_map[rows["reference_file_name"]],
                     "channel": "LFQ",
                     "intensities": [
                         {"intensity_name": "lfq", "intensity_value": rows["lfq"]},
@@ -604,4 +602,4 @@ def get_unique_references(self, column: str) -> list:
         Returns:
             List of unique values
         """
-        return self._duckdb.get_unique_values("report", column)
+        return self.get_unique_values("report", column)
@@ -303,10 +303,7 @@ def __init__(
         self._pg_matrix_path = str(pg_matrix_path) if pg_matrix_path else None
         self._cache_size = cache_size
         database_name = create_uuid_filename("diann-report", ".duckdb")
-        super().__init__(database_name)
-
-        # Initialize database and create report table
-        self.initialize_database(max_memory, worker_threads)
+        super().__init__(database_name, max_memory, worker_threads)
         self.create_table_from_file("report", self._report_path, [PROTEIN_GROUP, RUN])
 
         # Load protein groups matrix if provided
 
@@ -453,11 +453,13 @@
     pa.field(
         "pg_names",
         pa.list_(pa.string()),
+        nullable=True,
         metadata={"description": "Protein group names"},
     ),
     pa.field(
         "gg_accessions",
         pa.list_(pa.string()),
+        nullable=True,
         metadata={"description": "Gene group accessions, as a string array"},
     ),
     pa.field(