Skip to content

Commit cd6cc95

Browse files
authored
Merge pull request #115 from bigbio/copilot/fix-108
Add fragment annotation arrays to PSM schema
2 parents 4aa9351 + 34da27e commit cd6cc95

File tree

6 files changed

+238
-0
lines changed

6 files changed

+238
-0
lines changed

quantmsio/core/format.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,27 @@
146146
"description": "Array of intensity values for the spectrum used for the peptide spectrum match"
147147
},
148148
),
149+
pa.field(
150+
"charge_array",
151+
pa.list_(pa.int32()),
152+
metadata={
153+
"description": "Array of fragment ion charge values for the spectrum used for the peptide spectrum match"
154+
},
155+
),
156+
pa.field(
157+
"ion_type_array",
158+
pa.list_(pa.string()),
159+
metadata={
160+
"description": "Array of fragment ion type annotations (e.g., b, y, a) for the spectrum used for the peptide spectrum match"
161+
},
162+
),
163+
pa.field(
164+
"ion_mobility_array",
165+
pa.list_(pa.float32()),
166+
metadata={
167+
"description": "Array of fragment ion mobility values for the spectrum used for the peptide spectrum match"
168+
},
169+
),
149170
]
150171

151172
FEATURE_UNIQUE_FIELDS = [

quantmsio/core/fragpipe.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,15 @@ def to_arrow(cls, batch: Iterator["Spectrum"]):
279279
[None] * len(precursor_charge), type=pa.list_(pa.float32())
280280
)
281281
num_peaks_array = pa.array([None] * len(precursor_charge), type=pa.int32())
282+
charge_arrays = pa.array(
283+
[None] * len(precursor_charge), type=pa.list_(pa.int32())
284+
)
285+
ion_type_arrays = pa.array(
286+
[None] * len(precursor_charge), type=pa.list_(pa.string())
287+
)
288+
ion_mobility_arrays = pa.array(
289+
[None] * len(precursor_charge), type=pa.list_(pa.float32())
290+
)
282291
return {
283292
"precursor_charge": pa.array(precursor_charge, type=pa.int32()),
284293
"observed_mz": pa.array(observed_mz, type=pa.float32()),
@@ -289,6 +298,9 @@ def to_arrow(cls, batch: Iterator["Spectrum"]):
289298
"num_peaks": num_peaks_array,
290299
"mz_array": mz_arrays,
291300
"intensity_array": intensity_arrays,
301+
"charge_array": charge_arrays,
302+
"ion_type_array": ion_type_arrays,
303+
"ion_mobility_array": ion_mobility_arrays,
292304
}
293305

294306
@classmethod

quantmsio/core/maxquant.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,9 @@ def transform_psm(df: pd.DataFrame):
336336
df.loc[:, "intensity_array"] = None
337337
df.loc[:, "mz_array"] = None
338338
df.loc[:, "number_peaks"] = None
339+
df.loc[:, "charge_array"] = None
340+
df.loc[:, "ion_type_array"] = None
341+
df.loc[:, "ion_mobility_array"] = None
339342

340343
def transform_feature(self, df: pd.DataFrame):
341344
self.generate_intensity_msg(

quantmsio/core/psm.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,9 @@ def add_addition_msg(df: pd.DataFrame) -> None:
215215
df.loc[:, "number_peaks"] = None
216216
df.loc[:, "mz_array"] = None
217217
df.loc[:, "intensity_array"] = None
218+
df.loc[:, "charge_array"] = None
219+
df.loc[:, "ion_type_array"] = None
220+
df.loc[:, "ion_mobility_array"] = None
218221

219222
def write_psm_to_file(
220223
self,

quantmsio/operate/tools.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ def generate_psms_of_spectrum(
5858
axis=1,
5959
result_type="expand",
6060
)
61+
# Initialize new fragment annotation arrays as None for backward compatibility
62+
table["charge_array"] = None
63+
table["ion_type_array"] = None
64+
table["ion_mobility_array"] = None
6165
pqwriters, pqwriter_no_part = save_parquet_file(
6266
partitions,
6367
table,

tests/test_fragment_annotations.py

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
"""
2+
Tests for the new fragment annotation arrays in the PSM schema.
3+
"""
4+
5+
import pytest
6+
from pathlib import Path
7+
8+
try:
9+
import pyarrow as pa
10+
from quantmsio.core.format import PSM_FIELDS, PSM_UNIQUE_FIELDS
11+
from quantmsio.core.common import PSM_SCHEMA
12+
13+
PYARROW_AVAILABLE = True
14+
except ImportError:
15+
PYARROW_AVAILABLE = False
16+
17+
18+
@pytest.mark.skipif(not PYARROW_AVAILABLE, reason="PyArrow not available")
def test_psm_schema_contains_fragment_annotation_fields():
    """Check that the fragment annotation columns are declared for PSMs.

    The three columns must be listed in ``PSM_UNIQUE_FIELDS`` and must also
    appear in ``PSM_SCHEMA``.
    """
    required_new_fields = ["charge_array", "ion_type_array", "ion_mobility_array"]

    # The PSM-specific field list has to declare every new column.
    unique_names = [entry.name for entry in PSM_UNIQUE_FIELDS]
    for field_name in required_new_fields:
        assert field_name in unique_names, f"{field_name} not found in PSM_UNIQUE_FIELDS"

    # The schema object itself must exist and be non-empty ...
    schema = PSM_SCHEMA
    assert schema is not None
    assert len(schema) > 0

    # ... and expose the same three columns.
    schema_names = [entry.name for entry in schema]
    for field_name in required_new_fields:
        assert field_name in schema_names, f"{field_name} not found in PSM_SCHEMA"
38+
39+
40+
@pytest.mark.skipif(not PYARROW_AVAILABLE, reason="PyArrow not available")
def test_fragment_annotation_fields_types():
    """Check the Arrow type and nullability of each fragment annotation field."""
    declared = {entry.name: entry for entry in PSM_UNIQUE_FIELDS}

    # (column name, expected Arrow type, type failure message, nullability failure message)
    expectations = (
        (
            "charge_array",
            pa.list_(pa.int32()),
            "charge_array should be list of int32",
            "charge_array should be nullable",
        ),
        (
            "ion_type_array",
            pa.list_(pa.string()),
            "ion_type_array should be list of string",
            "ion_type_array should be nullable",
        ),
        (
            "ion_mobility_array",
            pa.list_(pa.float32()),
            "ion_mobility_array should be list of float32",
            "ion_mobility_array should be nullable",
        ),
    )

    for column, expected_type, type_message, nullable_message in expectations:
        candidate = declared[column]
        assert candidate.type == expected_type, type_message
        assert candidate.nullable, nullable_message
67+
68+
69+
@pytest.mark.skipif(not PYARROW_AVAILABLE, reason="PyArrow not available")
def test_fragment_annotation_fields_have_descriptions():
    """Check that every fragment annotation field carries a usable description.

    Each field needs metadata with a non-empty ``description`` entry whose
    text mentions the word "fragment" (case-insensitive).
    """
    declared = {entry.name: entry for entry in PSM_UNIQUE_FIELDS}

    for field_name in ("charge_array", "ion_type_array", "ion_mobility_array"):
        metadata = declared[field_name].metadata
        assert metadata is not None, f"{field_name} should have metadata"

        # Arrow field metadata is keyed by bytes, hence the b"" literal.
        has_description = b"description" in metadata
        assert has_description, f"{field_name} should have description in metadata"

        description = metadata[b"description"].decode()
        assert len(description) > 0, f"{field_name} should have non-empty description"

        mentions_fragment = "fragment" in description.lower()
        assert mentions_fragment, f"{field_name} description should mention 'fragment'"
87+
88+
89+
@pytest.mark.skipif(not PYARROW_AVAILABLE, reason="PyArrow not available")
def test_create_table_with_fragment_annotations():
    """Build a one-row Arrow table that populates the fragment annotation columns.

    The table uses a minimal, hand-written schema (not ``PSM_SCHEMA``) and is
    checked for its shape and for the round-tripped contents of the three new
    columns: ``charge_array``, ``ion_type_array`` and ``ion_mobility_array``.
    """
    # Sample data with all fields of the minimal schema, including the new ones
    sample_data = {
        # Basic required fields
        "sequence": ["PEPTIDE"],
        "peptidoform": ["PEPTIDE"],
        "precursor_charge": [2],
        "posterior_error_probability": [0.01],
        "is_decoy": [0],
        "calculated_mz": [450.2],
        "observed_mz": [450.21],
        "mp_accessions": [["P12345"]],
        "predicted_rt": [None],
        "reference_file_name": ["sample1"],
        "scan": ["1000"],
        "rt": [120.5],
        "ion_mobility": [None],
        # Array fields
        "number_peaks": [3],
        "mz_array": [[100.1, 200.2, 300.3]],
        "intensity_array": [[1000.0, 2000.0, 3000.0]],
        # New fragment annotation fields
        "charge_array": [[1, 1, 2]],
        "ion_type_array": [["b", "y", "b"]],
        "ion_mobility_array": [[0.8, 0.9, 1.0]],
    }

    # Minimal schema for testing
    test_fields = [
        pa.field("sequence", pa.string()),
        pa.field("peptidoform", pa.string()),
        pa.field("precursor_charge", pa.int32()),
        pa.field("posterior_error_probability", pa.float32()),
        pa.field("is_decoy", pa.int32()),
        pa.field("calculated_mz", pa.float32()),
        pa.field("observed_mz", pa.float32()),
        pa.field("mp_accessions", pa.list_(pa.string())),
        pa.field("predicted_rt", pa.float32()),
        pa.field("reference_file_name", pa.string()),
        pa.field("scan", pa.string()),
        pa.field("rt", pa.float32()),
        pa.field("ion_mobility", pa.float32()),
        pa.field("number_peaks", pa.int32()),
        pa.field("mz_array", pa.list_(pa.float32())),
        pa.field("intensity_array", pa.list_(pa.float32())),
        pa.field("charge_array", pa.list_(pa.int32())),
        pa.field("ion_type_array", pa.list_(pa.string())),
        pa.field("ion_mobility_array", pa.list_(pa.float32())),
    ]

    test_schema = pa.schema(test_fields)

    # Create table - this should not raise an exception
    table = pa.table(sample_data, schema=test_schema)

    # Verify table structure
    assert table.num_rows == 1
    assert table.num_columns == len(test_fields)

    # Integer and string columns round-trip exactly.
    assert table.column("charge_array").to_pylist() == [[1, 1, 2]]
    assert table.column("ion_type_array").to_pylist() == [["b", "y", "b"]]

    # The mobility values are stored as float32, which cannot represent 0.8
    # or 0.9 exactly, so the values read back differ slightly from the Python
    # literals. pytest.approx does not accept nested lists, so the single row
    # is unwrapped and compared approximately.
    ion_mobility_rows = table.column("ion_mobility_array").to_pylist()
    assert len(ion_mobility_rows) == 1
    assert ion_mobility_rows[0] == pytest.approx([0.8, 0.9, 1.0])
155+
156+
157+
@pytest.mark.skipif(not PYARROW_AVAILABLE, reason="PyArrow not available")
def test_nullable_fragment_annotation_fields():
    """Check that the fragment annotation columns accept a null entry.

    A one-row table is built in which the spectrum arrays are populated but
    all three annotation columns hold ``None``; the nulls must round-trip.
    """
    annotation_columns = ("charge_array", "ion_type_array", "ion_mobility_array")

    # Column name -> Arrow type, in the order the table is laid out.
    column_types = [
        ("sequence", pa.string()),
        ("peptidoform", pa.string()),
        ("precursor_charge", pa.int32()),
        ("mz_array", pa.list_(pa.float32())),
        ("intensity_array", pa.list_(pa.float32())),
        ("number_peaks", pa.int32()),
        ("charge_array", pa.list_(pa.int32())),
        ("ion_type_array", pa.list_(pa.string())),
        ("ion_mobility_array", pa.list_(pa.float32())),
    ]
    test_schema = pa.schema([pa.field(name, dtype) for name, dtype in column_types])

    # One PSM row whose fragment annotation entries are all null.
    sample_data = {
        "sequence": ["PEPTIDE"],
        "peptidoform": ["PEPTIDE"],
        "precursor_charge": [2],
        "mz_array": [[100.1, 200.2, 300.3]],
        "intensity_array": [[1000.0, 2000.0, 3000.0]],
        "number_peaks": [3],
    }
    for name in annotation_columns:
        sample_data[name] = [None]

    # Building the table with null list entries should not raise.
    table = pa.table(sample_data, schema=test_schema)

    # Every annotation column must read back as a single null.
    for name in annotation_columns:
        assert table.column(name).to_pylist() == [None]

0 commit comments

Comments
 (0)