Skip to content

Commit 3d4d085

Browse files
authored
Merge pull request #113 from Shen-YuFei/dev
Improve MaxQuant converter with enhanced data processing
2 parents d625a5e + c2792cb commit 3d4d085

30 files changed

+2840
-1149
lines changed

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ biopython = "*"
3232
seaborn = "*"
3333
numpy = "*"
3434
matplotlib = "*"
35-
duckdb = "*"
35+
duckdb = ">=1.1.3,<=1.3.0"
3636
mygene = "*"
3737
pyahocorasick = "*"
3838
swifter = "*"
@@ -62,4 +62,5 @@ pattern = "^(?P<base>\\d+\\.\\d+\\.\\d+)$"
6262
[tool.pytest.ini_options]
6363
markers = [
6464
"integration: marks tests as integration tests (deselect with '-m \"not integration\"')",
65+
"large_data: marks tests as requiring large data files (deselect with '-m \"not large_data\"')",
6566
]

quantmsio/commands/convert/maxquant.py

Lines changed: 35 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -74,22 +74,20 @@ def convert_maxquant_psm_cmd(
7474
if not all([msms_file, output_folder]):
7575
raise click.UsageError("ERROR: Please provide all required parameters")
7676

77-
# Ensure output directory exists
7877
output_folder = Path(output_folder)
7978
output_folder.mkdir(parents=True, exist_ok=True)
8079
logger.info(f"Using output directory: {output_folder}")
8180

82-
# Set default prefix if not provided
8381
prefix = output_prefix or "psm"
8482
filename = create_uuid_filename(prefix, ".psm.parquet")
8583
output_path = output_folder / filename
8684
logger.info(f"Will save PSM file as: {filename}")
8785

8886
logger.info("Initializing MaxQuant PSM converter...")
89-
mq = MaxQuant()
87+
processor = MaxQuant()
9088

9189
logger.info(f"Starting PSM conversion (batch size: {batch_size:,})...")
92-
mq.write_psm_to_file(
90+
processor.process_psm_file(
9391
msms_path=str(msms_file), output_path=str(output_path), chunksize=batch_size
9492
)
9593
logger.info(f"PSM file successfully saved to: {output_path}")
@@ -126,6 +124,11 @@ def convert_maxquant_psm_cmd(
126124
help="Protein file with specific requirements",
127125
type=click.Path(exists=True, dir_okay=False, path_type=Path),
128126
)
127+
@click.option(
128+
"--protein-groups-file",
129+
help="MaxQuant proteinGroups.txt file for Q-value mapping (optional)",
130+
type=click.Path(exists=True, dir_okay=False, path_type=Path),
131+
)
129132
@click.option(
130133
"--partitions",
131134
help="Field(s) used for splitting files (comma-separated)",
@@ -146,6 +149,7 @@ def convert_maxquant_feature_cmd(
146149
sdrf_file: Path,
147150
output_folder: Path,
148151
protein_file: Optional[Path],
152+
protein_groups_file: Optional[Path],
149153
partitions: Optional[str],
150154
batch_size: int,
151155
output_prefix: Optional[str],
@@ -162,6 +166,7 @@ def convert_maxquant_feature_cmd(
162166
--evidence-file evidence.txt \\
163167
--sdrf-file data.sdrf.tsv \\
164168
--output-folder ./output \\
169+
--protein-groups-file proteinGroups.txt \\
165170
--batch-size 1000000
166171
"""
167172
logger = get_logger("quantmsio.commands.maxquant")
@@ -173,44 +178,43 @@ def convert_maxquant_feature_cmd(
173178
if not all([evidence_file, sdrf_file, output_folder]):
174179
raise click.UsageError("ERROR: Please provide all required parameters")
175180

176-
# Ensure output directory exists
177181
output_folder = Path(output_folder)
178182
output_folder.mkdir(parents=True, exist_ok=True)
179183
logger.info(f"Using output directory: {output_folder}")
180184

181-
# Set default prefix if not provided
182185
prefix = output_prefix or "feature"
183186
filename = create_uuid_filename(prefix, ".feature.parquet")
184187
output_path = output_folder / filename
185188
logger.info(f"Will save feature file as: {filename}")
186189

187190
logger.info("Initializing MaxQuant feature converter...")
188-
mq = MaxQuant()
191+
processor = MaxQuant()
189192

190193
if not partitions:
191194
logger.info(f"Starting feature conversion (batch size: {batch_size:,})...")
192-
mq.write_feature_to_file(
195+
196+
if protein_groups_file:
197+
logger.info(
198+
f"Using proteinGroups file for Q-value mapping: {protein_groups_file}"
199+
)
200+
processor._init_protein_group_qvalue_mapping(str(protein_groups_file))
201+
else:
202+
logger.info(
203+
"No proteinGroups file provided, auto-detection will be used"
204+
)
205+
206+
processor.process_feature_file(
193207
evidence_path=str(evidence_file),
194-
sdrf_path=str(sdrf_file),
195208
output_path=str(output_path),
196-
chunksize=batch_size,
209+
sdrf_path=str(sdrf_file),
197210
protein_file=str(protein_file) if protein_file else None,
211+
chunksize=batch_size,
198212
)
199213
logger.info(f"Feature file successfully saved to: {output_path}")
200214
else:
201-
logger.info(f"Starting partitioned feature conversion using: {partitions}")
202-
partition_list = partitions.split(",")
203-
mq.write_features_to_file(
204-
evidence_path=str(evidence_file),
205-
sdrf_path=str(sdrf_file),
206-
output_folder=str(output_folder),
207-
filename=filename,
208-
partitions=partition_list,
209-
chunksize=batch_size,
210-
protein_file=str(protein_file) if protein_file else None,
211-
)
212-
logger.info(
213-
f"Partitioned feature files successfully saved to: {output_folder}"
215+
logger.error("Partitioned conversion not implemented")
216+
raise click.ClickException(
217+
"Partitioned conversion feature is not yet available. Please use the standard conversion without --partitions."
214218
)
215219

216220
except Exception as e:
@@ -240,14 +244,9 @@ def convert_maxquant_feature_cmd(
240244
required=True,
241245
type=click.Path(file_okay=False, path_type=Path),
242246
)
243-
@click.option(
244-
"--protein-file",
245-
help="Protein file with specific requirements",
246-
type=click.Path(exists=True, dir_okay=False, path_type=Path),
247-
)
248247
@click.option(
249248
"--batch-size",
250-
help="Read batch size",
249+
help="Batch size (for logging purposes only)",
251250
default=1000000,
252251
type=int,
253252
)
@@ -260,7 +259,6 @@ def convert_maxquant_pg_cmd(
260259
protein_groups_file: Path,
261260
sdrf_file: Path,
262261
output_folder: Path,
263-
protein_file: Optional[Path],
264262
batch_size: int,
265263
output_prefix: Optional[str],
266264
verbose: bool = False,
@@ -287,29 +285,25 @@ def convert_maxquant_pg_cmd(
287285
if not all([protein_groups_file, sdrf_file, output_folder]):
288286
raise click.UsageError("ERROR: Please provide all required parameters")
289287

290-
# Ensure output directory exists
291288
output_folder = Path(output_folder)
292289
output_folder.mkdir(parents=True, exist_ok=True)
293290
logger.info(f"Using output directory: {output_folder}")
294291

295-
# Set default prefix if not provided
296292
prefix = output_prefix or "pg"
297293
filename = create_uuid_filename(prefix, ".pg.parquet")
298294
output_path = output_folder / filename
299295
logger.info(f"Will save protein groups file as: {filename}")
300296

301297
logger.info("Initializing MaxQuant protein groups converter...")
302-
mq = MaxQuant()
298+
processor = MaxQuant()
303299

304300
logger.info(
305301
f"Starting protein groups conversion (batch size: {batch_size:,})..."
306302
)
307-
mq.write_protein_groups_to_file(
303+
processor.process_pg_file(
308304
protein_groups_path=str(protein_groups_file),
309-
sdrf_path=str(sdrf_file),
310305
output_path=str(output_path),
311-
chunksize=batch_size,
312-
protein_file=str(protein_file) if protein_file else None,
306+
sdrf_path=str(sdrf_file),
313307
)
314308
logger.info(f"Protein groups file successfully saved to: {output_path}")
315309

@@ -318,3 +312,7 @@ def convert_maxquant_pg_cmd(
318312
f"Error in MaxQuant protein groups conversion: {str(e)}", exc_info=True
319313
)
320314
raise click.ClickException(f"Error: {str(e)}\nCheck the logs for more details.")
315+
316+
317+
if __name__ == "__main__":
318+
convert()

quantmsio/core/common.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,7 @@
124124

125125
MAXQUANT_PSM_MAP = {
126126
"Sequence": "sequence",
127-
"Proteins": "mp_accessions",
127+
"Proteins": "protein_accessions",
128128
"PEP": "posterior_error_probability",
129129
"Modified sequence": "peptidoform",
130130
"Reverse": "is_decoy",
@@ -136,12 +136,16 @@
136136
"Score": "andromeda_score",
137137
"Delta score": "andromeda_delta_score",
138138
"PIF": "parent_ion_fraction",
139+
"Masses": "mz_array",
140+
"Intensities": "intensity_array",
141+
"Number of matches": "number_peaks",
139142
}
140143

141144
MAXQUANT_FEATURE_MAP = {
142145
"Sequence": "sequence",
143146
"Proteins": "mp_accessions",
144147
"Leading proteins": "pg_accessions",
148+
"Leading razor protein": "anchor_protein",
145149
"Gene names": "gg_names",
146150
"PEP": "posterior_error_probability",
147151
"Modified sequence": "peptidoform",
@@ -156,6 +160,7 @@
156160
"Calibrated retention time": "rt",
157161
"Calibrated retention time start": "rt_start",
158162
"Calibrated retention time finish": "rt_stop",
163+
"Intensity": "intensity",
159164
}
160165

161166
IBAQ_USECOLS = [

quantmsio/core/diann/diann.py

Lines changed: 13 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@
4040
DIANN_PG_SQL = ", ".join([f'"{name}"' for name in DIANN_PG_USECOLS])
4141

4242

43-
class DiaNNConvert:
43+
class DiaNNConvert(DiannDuckDB):
4444
"""Convert DIA-NN report to quantms.io format."""
4545

4646
def __init__(
@@ -51,8 +51,11 @@ def __init__(
5151
duckdb_max_memory="16GB",
5252
duckdb_threads=4,
5353
):
54-
super(DiaNNConvert, self).__init__(
55-
diann_report, duckdb_max_memory, duckdb_threads
54+
super().__init__(
55+
diann_report_path=diann_report,
56+
max_memory=duckdb_max_memory,
57+
worker_threads=duckdb_threads,
58+
pg_matrix_path=pg_matrix_path,
5659
)
5760
if pg_matrix_path:
5861
self.pg_matrix = self.get_pg_matrix(pg_matrix_path)
@@ -62,8 +65,7 @@ def __init__(
6265

6366
def destroy_duckdb_database(self):
6467
"""Clean up DuckDB resources."""
65-
if self._duckdb:
66-
self._duckdb.destroy_database()
68+
self.destroy_database()
6769

6870
def get_report_from_database(
6971
self, runs: list, sql: str = DIANN_SQL
@@ -78,7 +80,7 @@ def get_report_from_database(
7880
DataFrame with report data
7981
"""
8082
s = time.time()
81-
report = self._duckdb.query_to_df(
83+
report = self.query_to_df(
8284
"""
8385
select {}
8486
from report
@@ -92,7 +94,7 @@ def get_report_from_database(
9294
return report
9395

9496
def get_masses_and_modifications_map(self):
95-
database = self._duckdb.query_to_df(
97+
database = self.query_to_df(
9698
"""
9799
select DISTINCT "Modified.Sequence" from report
98100
"""
@@ -105,7 +107,7 @@ def get_masses_and_modifications_map(self):
105107

106108
def get_peptide_map_from_database(self):
107109
s = time.time()
108-
database = self._duckdb.query_to_df(
110+
database = self.query_to_df(
109111
"""
110112
SELECT "Precursor.Id","Q.Value","Run"
111113
FROM (
@@ -192,9 +194,7 @@ def generate_pg_matrix(self, report):
192194
].apply(
193195
lambda rows: [
194196
{
195-
"sample_accession": self._sample_map[
196-
rows["reference_file_name"] + "-LFQ"
197-
],
197+
"sample_accession": self._sample_map[rows["reference_file_name"]],
198198
"channel": "LFQ",
199199
"intensity": rows["pg_quantity"],
200200
}
@@ -208,9 +208,7 @@ def generate_pg_matrix(self, report):
208208
].apply(
209209
lambda rows: [
210210
{
211-
"sample_accession": self._sample_map[
212-
rows["reference_file_name"] + "-LFQ"
213-
],
211+
"sample_accession": self._sample_map[rows["reference_file_name"]],
214212
"channel": "LFQ",
215213
"intensities": [
216214
{"intensity_name": "lfq", "intensity_value": rows["lfq"]},
@@ -604,4 +602,4 @@ def get_unique_references(self, column: str) -> list:
604602
Returns:
605603
List of unique values
606604
"""
607-
return self._duckdb.get_unique_values("report", column)
605+
return self.get_unique_values("report", column)

quantmsio/core/duckdb.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -303,10 +303,7 @@ def __init__(
303303
self._pg_matrix_path = str(pg_matrix_path) if pg_matrix_path else None
304304
self._cache_size = cache_size
305305
database_name = create_uuid_filename("diann-report", ".duckdb")
306-
super().__init__(database_name)
307-
308-
# Initialize database and create report table
309-
self.initialize_database(max_memory, worker_threads)
306+
super().__init__(database_name, max_memory, worker_threads)
310307
self.create_table_from_file("report", self._report_path, [PROTEIN_GROUP, RUN])
311308

312309
# Load protein groups matrix if provided

quantmsio/core/format.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -453,11 +453,13 @@
453453
pa.field(
454454
"pg_names",
455455
pa.list_(pa.string()),
456+
nullable=True,
456457
metadata={"description": "Protein group names"},
457458
),
458459
pa.field(
459460
"gg_accessions",
460461
pa.list_(pa.string()),
462+
nullable=True,
461463
metadata={"description": "Gene group accessions, as a string array"},
462464
),
463465
pa.field(

0 commit comments

Comments
 (0)