diff --git a/.gitignore b/.gitignore index 63cb5314f..7c42c05fc 100644 --- a/.gitignore +++ b/.gitignore @@ -153,4 +153,7 @@ jupyter_notebooks/analysis/temp_results/* uv.lock # Large files -*.gz \ No newline at end of file +*.gz +# Atomic-write / editor temp files +*.tmp +*.tmp.* diff --git a/CLAUDE.md b/CLAUDE.md index b7fcc6c67..343edcefc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -190,7 +190,7 @@ run_name_cleanup = "(?:\\.custom_suffix|\\.raw)$" - `[species_expected_ratio.]`: `A_vs_B` (float ratio), `color` (hex) - `[general]`: `min_count_multispec` (int), `level` ("ion" or "peptidoform") -The `MODULE_TO_CLASS` dict at the bottom of `parse_settings.py` routes module IDs to parser classes: all `quant_lfq_*` modules use `ParseSettingsQuant`, and `denovo_DDA_HCD` uses `ParseSettingsDeNovo`. (The dict contains a benign duplicate `quant_lfq_DDA_ion_Astral` key, and has no `quant_lfq_DIA_peptidoform` entry, matching that module being an unregistered stub.) +The `MODULE_TO_CLASS` dict at the bottom of `parse_settings.py` routes module IDs to parser classes: all `quant_lfq_*` modules use `ParseSettingsQuant`, and `denovo_DDA_HCD` uses `ParseSettingsDeNovo`. (It has no `quant_lfq_DIA_peptidoform` entry, matching that module being an unregistered stub.) The validation layer reuses this dict to infer a module's default validation profile. `io/parsing/utils.py` provides ProForma fixed-modification helpers: `add_fixed_mod(proforma, mod_name, aas)` and `add_maxquant_fixed_modifications(params, result_perf)`. @@ -326,6 +326,78 @@ value = st.session_state[st.session_state[variables.slider_id_uuid]] Session state serves 4 purposes: widget state (via UUID keys), data cache (DataFrames), plot cache (Plotly figures), and flow control (submit flags, highlight lists, tour state). 
+### Submission Validation Layer (`proteobench/validation/`) + +A framework-agnostic, **registry-driven** validation package that checks uploaded submissions for internal consistency before the public datapoint is created. It returns a structured `ValidationReport` instead of raising generic exceptions. Which checks run is determined by a *validation profile*, so the orchestrator is generic: adding a module of an existing category is config-only, and adding a new category only requires registering a new profile. + +**Validation does not block submission.** All findings (including `error`-severity ones) are surfaced in the UI and embedded in the pull-request description via `ValidationReport.summary()` so reviewers see them, but submission always proceeds. The `error`/`warning`/`info` severities only control display prominence and PR inclusion, not gating. (The earlier blocking behavior in `submit_to_repository` was removed.) + +**Package layout:** + +| File | Contents | +|------|----------| +| `report.py` | `Severity` enum (`error`/`warning`/`info`), `ValidationIssue` dataclass, `ValidationReport` collection | +| `exceptions.py` | `SubmissionValidationError` — wraps a report for programmatic callers | +| `fasta.py` | `FastaReference` — builds the expected protein set from FASTA text, path, bytes, zip/gzip, or URL; parses UniProt `sp|P49327|FAS_HUMAN`, `tr|...`, bare accessions, isoforms | +| `protein_ids.py` | `split_protein_groups()` (`;`/`,` separators), `extract_identifiers()`, `is_decoy_or_contaminant()` | +| `context.py` | `ValidationContext` — bundles all inputs a check might need (`standard_df`, `parameters`, `config`, `fasta`, `input_format`, generic `reference`, `extras`) so every check has the uniform signature `ctx -> list[ValidationIssue]` | +| `config.py` | `ModuleValidationConfig` — column names, contaminant flag, decoy prefixes, FASTA URL, and the resolved `validation_profile`; built via `from_parse_settings(parse_settings_dir, module_id, 
input_format)` | +| `checks.py` | Pure check functions: `check_protein_ids`, `check_charge_range`, `check_peptide_length`, `check_enzyme`, `check_modifications`, `check_max_modifications`, `check_mass_tolerances`, `check_fdr_psm`, `check_run_consistency` (kept individually unit-testable) | +| `profiles.py` | `Check` (named `ctx`-callable), `ValidationProfile` (ordered list of checks), the profile **registry** (`register_profile`/`unregister_profile`/`get_profile`/`available_profiles`), and the built-in `quant_lfq` and `denovo` profiles | +| `validator.py` | `validate_submission(standard_df, parameters, fasta, config, input_format, profile)` — resolves the profile, builds the context, runs the profile's checks; each check is fault-tolerant (unexpected exceptions become warnings) | + +**Profile registry (the extensibility surface).** A `ValidationProfile` is a named, ordered list of `Check`s; checks are reusable across profiles (e.g. `run_consistency` is shared by `quant_lfq` and `denovo`). `validate_submission` looks up `config.validation_profile` in the registry and runs only that profile's checks — there is no `if module_type == ...` branching. To support a brand-new module category: register a profile in `profiles.py` (or from third-party code via `register_profile`) and point the module at it; the orchestrator never changes. + +**Profile resolution** (`ModuleValidationConfig.from_parse_settings`, in precedence order): +1. explicit `[validation].profile` in the module's `module_settings.toml` (declarative); +2. inferred from the parser class via the existing `MODULE_TO_CLASS` registry (`ParseSettingsQuant` → `quant_lfq`, `ParseSettingsDeNovo` → `denovo`); +3. `DEFAULT_VALIDATION_PROFILE` (`quant_lfq`). + +An unregistered profile name produces a single `unknown_validation_profile` warning and runs nothing (never blocks). 
+ +**`ValidationIssue` fields:** `code` (machine-readable), `severity`, `message`, `check`, `field`, `observed`, `expected`, `examples` (up to 20 offending protein identifiers via `MAX_PROTEIN_EXAMPLES`, or up to 10 example rows for the other checks via `MAX_ROW_EXAMPLES`). + +**`quant_lfq` profile checks and their default severity** (severity affects display/PR prominence only; nothing blocks): + +| Check | Default severity | Notes | +|-------|------------------|-------| +| `protein_ids` (vs FASTA) | ERROR | Skips decoys and contaminants; splits groups; case-insensitive | +| `charge_range` | ERROR | Uses `min_precursor_charge` / `max_precursor_charge` from params | +| `peptide_length` | ERROR | Uses `min_peptide_length` / `max_peptide_length`; counts alpha chars | +| `enzyme` (missed cleavages) | WARNING | Supports trypsin, trypsin/P, Lys-C, Arg-C, Glu-C, chymotrypsin via `_ENZYME_CLEAVAGE_RULES`; N-terminal cleavers (Asp-N/Lys-N) and unknown enzymes skipped as info; heuristic, ignores ragged termini | +| `modifications` (names) | WARNING | Human-readable names in `proforma` vs declared mods; mass/UniMod tokens skipped | +| `max_modifications` | WARNING | Counts bracketed mods per peptidoform vs `max_mods`; upper bound (includes fixed mods written into the sequence) | +| `mass_tolerances` | WARNING | Sanity check of `precursor_mass_tolerance`/`fragment_mass_tolerance` (parseable, positive, plausible ppm/Da); no per-result comparison exists | +| `fdr_psm` | WARNING | `ident_fdr_psm` within `[0,1]` and ≤ `config.recommended_max_fdr_psm` (default 0.01) | +| `run_consistency` (software identity) | ERROR | `params.software_name` vs `input_format` | +| (absent parameter) | WARNING | Each param-dependent check self-reports when its constraint was not parsed; never crashes | + +The `denovo` profile currently runs only `run_consistency` plus a `denovo_pending` info placeholder (de novo uses a different standardized schema — `spectrum_id`/`peptide_str`/`aa_scores` — and a 
ground-truth table rather than a FASTA; content checks are a documented TODO in `profiles.py`). + +**Reference FASTA configuration.** Each quant module's `module_settings.toml` contains an optional section: + +```toml +[reference_database] +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + +[validation] +"profile" = "quant_lfq" # optional; usually inferred from the parser class +``` + +`[reference_database]` is populated for all 9 quant modules (HYE modules share the HYE FASTA; single-cell uses HY; Plasma uses Distler-PYE). The de novo `module_settings.toml` declares `[validation] profile = "denovo"`. If `[reference_database]` is absent the FASTA check is skipped with an info message. + +**Integration point.** Validation runs inside `submit_to_repository()` in `webinterface/pages/base_pages/tabs/tab6_submit_results.py`, after the "I really want to upload it" button press and before `create_pull_request()`. The standardized DataFrame is re-derived by rerunning the existing parser on the `input_df` already in session state (no duplicated tool-specific parsing logic). All findings (errors and warnings) are rendered in the UI and appended to the PR description via `ValidationReport.summary()`; none of them block the PR. The local Tab 2 upload path is unaffected. + +**Streamlit glue.** `webinterface/pages/base_pages/utils/validation_ui.py` provides: +- `run_submission_validation(variables, ionmodule, user_input, params)` — orchestrates the full flow (re-parse, load FASTA from cache, run checks), returns a `ValidationReport`. +- `_load_fasta_reference(fasta_url, fasta_filename)` — `@st.cache_data` wrapper around `FastaReference.from_url()`. +- `render_validation_report(report)` — renders errors as `st.error`, warnings as `st.warning`, and info items inside a collapsed `st.expander`. 
+ +**Documented limitations (intentionally skipped checks):** +- Full enzyme-specificity checks require reference protein sequences (not available); only internal cleavage-site counting (e.g. K/R for trypsin) is done for the supported enzymes. +- Cross-tool modification normalization is not implemented (MaxQuant uses human-readable names, DIA-NN UniMod accessions, Sage raw mass dicts); only matching human-readable names against declared mods is attempted. +- Run-level matching (raw-file, sample, experiment) is not possible because `ProteoBenchParameters` does not expose those fields; only software identity is compared. + ### Streamlit Coupling in Core Library Three files in `proteobench/` import Streamlit (coupling violations): @@ -347,6 +419,7 @@ Tests in `test/` use pytest. No `conftest.py` (uses defaults). ~74 `def test_` f - `test/data/quant/quant_lfq_peptidoform_DDA/` - Peptidoform sample files - `test/data/denovo/` - de novo configs and result files (~8.5MB) - `test/data/intermediate_files/` - Reference intermediate results +- `test/data/validation/ProteoBench_validation_reference.fasta` - Small 6-protein FASTA fixture for validation unit tests - `test/params/` - Parameter files for all tools (~124 top-level files, ~130 including `test/params/denovo/`, ~9MB) - `test/data/quant/quant_lfq_proteingroup_DIA_Astral/` exists but is currently empty; empty legacy dirs `test/data/dda_quant/` and `test/data/dia_quant/` remain. 
@@ -375,6 +448,7 @@ Only two module-level test files exist: `test_module_quant_ion_DDA_QExactive.py` - `test_plot_quant.py` - `TestPlotDataPoint` - `test_modules_constants.py` - parametrized check that every `MODULE_SETTINGS_DIRS` entry resolves to an existing directory - `test_github_repo.py` - tests for `GithubProteobotRepo` +- `test_validation.py` - 63 tests for the submission-validation layer; covers FASTA parsing, protein-ID matching/groups/decoy-contaminant skipping, charge/length violations, missing-parameter warnings, multi-enzyme missed-cleavage checks, modification/max-modification checks, mass-tolerance and PSM-FDR sanity checks, the profile registry (resolution, custom-profile registration, unknown-profile handling, de novo routing), report serialization, and lightweight integration through the real MaxQuant and Sage parsers ### What CI Validates @@ -406,6 +480,7 @@ Separate workflow for the webinterface (`test-streamlit.yml`): ### Adding a new benchmark module 1. Create a module class in `modules/quant/` inheriting `QuantModule`, setting `module_id`, the precursor column, and repo names + - For submission validation (see the Submission Validation Layer section): add a `[reference_database] fasta_url = "..."` entry to the new module's `module_settings.toml` (if absent, protein-identifier validation is skipped with an info message and the other checks still run). A quant module auto-resolves to the `quant_lfq` profile; a module of a new category should declare `[validation] profile = "<profile_name>"` and register a matching profile in `proteobench/validation/profiles.py`. 2. Add the settings directory to `MODULE_SETTINGS_DIRS` in `modules/constants.py` 3. Register the parse settings class in `MODULE_TO_CLASS` in `parse_settings.py` 4. 
Create TOML configs: `module_settings.toml` + per-tool parse settings in the new directory diff --git a/docs/developer-guide/adding-module.rst b/docs/developer-guide/adding-module.rst index 1ddd62e2a..0f436a489 100644 --- a/docs/developer-guide/adding-module.rst +++ b/docs/developer-guide/adding-module.rst @@ -44,7 +44,7 @@ that allow for a more modular and portable implementation. Backend ------- -The backend is organized into six main components that you can extend or customize: +The backend is organized into seven main components that you can extend or customize: **1. Module implementation** - Define how your benchmarking is performed - For quantification: Subclass :class:`~proteobench.modules.quant.quant_base_module.QuantModule` @@ -88,6 +88,17 @@ The backend is organized into six main components that you can extend or customi - Functions in :file:`proteobench/io/params` parse parameter setting files - Customize per software tool in :file:`proteobench/io/params/json/` +**7. Submission validation** - Check uploaded submissions for consistency + - Package: :file:`proteobench/validation/` (framework-agnostic, registry-driven) + - A quantification module is configuration-only: add a ``[reference_database]`` + section to its ``module_settings.toml`` and the ``quant_lfq`` profile is + resolved automatically from the parser class + - A new module category registers its own *validation profile* + - Validation is non-blocking: findings are shown to the submitter and added to + the pull-request description, but submission always proceeds + - See :doc:`submission-validation` for integrating, extending, and maintaining + the checks + Architecture example .................... @@ -1053,3 +1064,9 @@ a new type of module: ``REPO_MODULE_REGISTRY`` with the format: ``"Results_repo_name": ("module_id", ModuleClass, path_to_params_json)`` This enables automatic datapoint reprocessing for your module. +12. 
**Configure submission validation.** For a quantification module, add a + ``[reference_database]`` section (with ``fasta_url``) to the module's + ``module_settings.toml``; the ``quant_lfq`` profile is resolved automatically. + For a new module category, register a *validation profile* and point the + module at it via ``[validation] profile = "<profile_name>"``. + See :doc:`submission-validation`. diff --git a/docs/developer-guide/index.rst b/docs/developer-guide/index.rst index 428e91075..343747b53 100644 --- a/docs/developer-guide/index.rst +++ b/docs/developer-guide/index.rst @@ -20,7 +20,8 @@ Developer guide repo_layout adding-module - + submission-validation + modules/index .. toctree:: diff --git a/docs/developer-guide/submission-validation.rst b/docs/developer-guide/submission-validation.rst new file mode 100644 index 000000000..72ca83cfd --- /dev/null +++ b/docs/developer-guide/submission-validation.rst @@ -0,0 +1,336 @@ +.. _submission-validation: + +###################### +Submission validation +###################### + +ProteoBench validates an uploaded submission before the public datapoint is +created. The validation layer checks that the standardized results and the +parsed parameters are internally consistent and consistent with the module +reference database, and returns a structured ``ValidationReport``. + +Validation is **non-blocking**. Every finding, including ``error``-severity +ones, is shown to the submitter and embedded in the pull-request description for +the reviewers, but submission always proceeds. Severity controls only display +prominence and inclusion in the pull-request summary. It does not gate the +submission flow. + +The layer is framework-agnostic and **registry-driven**. Each module maps to a +*validation profile* (a named, ordered set of checks). Adding a new module of an +existing category is configuration-only. Adding a genuinely new category only +requires registering a new profile. The orchestrator never needs to change. 
+ +The code lives in the ``proteobench.validation`` package. The Streamlit glue +lives in :file:`webinterface/pages/base_pages/utils/validation_ui.py`. + + +Package layout +============== + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - File + - Contents + * - ``report.py`` + - ``Severity`` enum (``error`` / ``warning`` / ``info``), ``ValidationIssue``, + and ``ValidationReport`` (issue collection plus ``has_errors``, ``passed``, + ``summary()``, ``raise_if_errors()``). + * - ``context.py`` + - ``ValidationContext``: bundles everything a check might need + (``standard_df``, ``parameters``, ``config``, ``fasta``, ``input_format``, + a generic ``reference``, and an ``extras`` dict) so every check has the + uniform signature ``ctx -> list[ValidationIssue]``. + * - ``config.py`` + - ``ModuleValidationConfig``: column names, contaminant flag, decoy + prefixes, reference FASTA location, and the resolved + ``validation_profile``. Built with ``from_parse_settings(...)``. + * - ``checks.py`` + - Pure, individually testable check functions (protein IDs, charge range, + peptide length, enzyme, modifications, maximum modifications, mass + tolerances, PSM FDR, run consistency). + * - ``profiles.py`` + - ``Check``, ``ValidationProfile``, the profile **registry** + (``register_profile`` / ``unregister_profile`` / ``get_profile`` / + ``available_profiles``), and the built-in ``quant_lfq`` and ``denovo`` + profiles. This is the extensibility surface. + * - ``validator.py`` + - ``validate_submission(...)``: resolves the profile, builds the context, + runs the profile's checks, and returns the report. Each check is + fault-tolerant: an unexpected exception becomes a warning. + * - ``fasta.py`` + - ``FastaReference``: builds the expected protein set from FASTA text, a + path, bytes, a zip / gzip, or a URL. + * - ``protein_ids.py`` + - Helpers to split protein groups, extract identifiers, and skip decoys and + contaminants. 
+ * - ``exceptions.py`` + - ``SubmissionValidationError``, which wraps a report for programmatic + callers that opt in to raising. + + +Data flow +========= + +.. code-block:: text + + module_settings.toml + parser --> ModuleValidationConfig.from_parse_settings(...) + reference FASTA --> FastaReference.from_url(...) + | + standardized DataFrame + params -----+--> validate_submission(...) + | | + | +-- resolve profile (registry) + | +-- build ValidationContext + | +-- run each Check (fault-tolerant) + v + ValidationReport --> UI display + PR summary + +The core validator performs no I/O. Any reference data (a FASTA, a ground-truth +table) is supplied through the arguments. The front end is responsible for +obtaining the standardized DataFrame and the reference, which is what the +Streamlit glue does. + + +Built-in profiles +================== + +``quant_lfq`` + Runs, in order: ``protein_ids`` (against the reference FASTA), + ``charge_range``, ``peptide_length``, ``enzyme``, ``modifications``, + ``max_modifications``, ``mass_tolerances``, ``fdr_psm``, and + ``run_consistency``. ``protein_ids``, ``charge_range``, and ``peptide_length`` + default to ``error`` severity; the rest default to ``warning``. + +``denovo`` + Runs ``run_consistency`` plus a ``denovo_pending`` informational placeholder. + De novo uses a different standardized schema and a ground-truth table rather + than a FASTA, so content checks are a documented to-do in ``profiles.py``. + +Checks are reusable across profiles. For example, ``run_consistency`` is shared +by both built-in profiles. + + +Integrating validation for a new module +======================================== + +Existing category (quantification) +----------------------------------- + +For a quantification module no code is required. Two configuration steps are +enough: + +1. Add a reference database to the module's ``module_settings.toml`` (beside + ``[species_expected_ratio]`` and ``[general]``): + + .. 
code-block:: toml + + [reference_database] + "fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + # "fasta_filename" = "optional_member_name_inside_the_archive.fasta" + + If ``[reference_database]`` is absent, the protein-identifier check is skipped + with an informational message and the other checks still run. + +2. The profile resolves automatically to ``quant_lfq`` from the module's parser + class (``ParseSettingsQuant``), so no profile declaration is needed. You may + pin it explicitly if you prefer: + + .. code-block:: toml + + [validation] + "profile" = "quant_lfq" + +The orchestrator and the submission tab already run the resolved profile. The +profile is resolved by ``ModuleValidationConfig.from_parse_settings`` in this +order of precedence: + +1. an explicit ``[validation].profile`` key in ``module_settings.toml``; +2. inference from the parser class via the ``MODULE_TO_CLASS`` registry + (``ParseSettingsQuant`` -> ``quant_lfq``, ``ParseSettingsDeNovo`` -> ``denovo``); +3. the ``DEFAULT_VALIDATION_PROFILE`` fallback (``quant_lfq``). + +An unregistered profile name produces a single ``unknown_validation_profile`` +warning and runs nothing. It never blocks. + +New category +------------ + +A genuinely new category of module needs a profile of its own: + +1. Write the checks it needs (see below), or reuse existing ones. +2. Register a ``ValidationProfile`` in ``profiles.py`` (or, from third-party + code, via ``register_profile``). +3. Point the module at it with ``[validation] profile = "<profile_name>"`` in + ``module_settings.toml``. If you want the profile to be inferred from the + parser class instead, add an entry to ``_PROFILE_BY_PARSER_CLASS`` in + ``config.py``. + +The orchestrator (``validate_submission``) is generic and does not change. 
+ + +Extending and maintaining the checks +===================================== + +Adding a check +-------------- + +A check is a pure function with the signature ``ctx -> list[ValidationIssue]``. +Add it to :file:`proteobench/validation/checks.py`, keeping it independently +unit-testable, then register it in the relevant profile in ``profiles.py``. + +.. code-block:: python + + # proteobench/validation/checks.py + from proteobench.validation.context import ValidationContext + from proteobench.validation.report import ValidationReport, ValidationIssue + from typing import List + + + def check_my_constraint(standard_df, parameters, config) -> List[ValidationIssue]: + """Check some property of the standardized results against a parameter.""" + report = ValidationReport() + + # Parameter-dependent checks must self-report when the value was not + # parsed, and never crash. + limit = getattr(parameters, "my_limit", None) + if limit is None: + report.add_warning( + "my_limit_absent", + "The parameter 'my_limit' could not be parsed; the check was skipped.", + "my_constraint", + ) + return report.issues + + offending = standard_df[standard_df["some_column"] > limit] + if not offending.empty: + report.add_error( + "my_constraint_violated", + f"{len(offending)} row(s) exceed my_limit ({limit}).", + "my_constraint", + observed=int(offending["some_column"].max()), + expected=f"<= {limit}", + examples=offending["some_column"].head(10).tolist(), + ) + return report.issues + +Then add it to a profile. Trivial checks that only forward context fields are +written as a lambda adapter; checks that need orchestration logic (such as +deciding whether a reference is available) are written as named functions in +``profiles.py``. + +.. code-block:: python + + # proteobench/validation/profiles.py + QUANT_LFQ_PROFILE = ValidationProfile( + name="quant_lfq", + checks=[ + # ... existing checks ... 
+ Check( + "my_constraint", + lambda ctx: check_my_constraint(ctx.standard_df, ctx.parameters, ctx.config), + "What this check verifies.", + ), + ], + ) + +Guidelines for checks: + +- Use ``ValidationContext`` fields: ``ctx.standard_df``, ``ctx.parameters``, + ``ctx.config``, ``ctx.fasta``, ``ctx.input_format``. +- Choose severity by intent only. ``error`` and ``warning`` differ in display + prominence and pull-request inclusion. Neither blocks submission. +- A parameter-dependent check should emit a ``warning`` when the constraint was + not parsed, rather than failing. The orchestrator also wraps every check so an + unexpected exception becomes a ``check_failed`` warning, but checks should not + rely on that. + +Registering a profile +---------------------- + +.. code-block:: python + + from proteobench.validation import Check, ValidationProfile, register_profile + + register_profile( + ValidationProfile( + name="my_module", + description="Checks for the new module category.", + checks=[Check("my_check", my_check_func, "what it does")], + ) + ) + +The registry helpers are ``register_profile`` (with ``overwrite=False`` by +default), ``unregister_profile``, ``get_profile``, and ``available_profiles``. + +The report object +------------------ + +``validate_submission`` returns a ``ValidationReport``. Useful members: + +- ``report.errors`` / ``report.warnings`` / ``report.infos``: issues by severity. +- ``report.has_errors`` and ``report.passed``: informational only. The Streamlit + flow does not gate on them. +- ``report.summary()``: a compact Markdown summary embedded in the pull-request + description. +- ``report.raise_if_errors()``: optional path for programmatic callers that + prefer an exception (``SubmissionValidationError``). + +Each ``ValidationIssue`` carries a machine-readable ``code``, a ``severity``, a +human-readable ``message``, the originating ``check`` name, and optional +``field``, ``observed``, ``expected``, and ``examples`` values. 
+ + +Web integration +================ + +Validation runs inside ``submit_to_repository`` in +:file:`webinterface/pages/base_pages/tabs/tab6_submit_results.py`, after the +confirmation button and before the pull request is created. The standardized +DataFrame is re-derived by rerunning the existing parser on the input DataFrame +already in session state, so no tool-specific parsing logic is duplicated. The +glue functions are in +:file:`webinterface/pages/base_pages/utils/validation_ui.py`: + +- ``run_submission_validation(variables, ionmodule, user_input, params)`` runs + the full flow (re-parse, load and cache the FASTA, run the checks) and returns + a ``ValidationReport``. It is fault-tolerant: any infrastructure problem + (missing input, parser failure, FASTA download failure) becomes a warning. +- ``render_validation_report(report)`` displays errors and warnings, with info + items inside a collapsed expander. + +The findings are rendered in the UI and appended to the pull-request description +through ``report.summary()``. None of them block the pull request. The local +Tab 2 upload path is unaffected. + + +Testing +======= + +Validation is covered by :file:`test/test_validation.py`, which tests FASTA +parsing, protein-identifier matching, the individual checks, the profile +registry (resolution, custom-profile registration, unknown-profile handling, de +novo routing), report serialization, and integration through the real MaxQuant +and Sage parsers. A small reference FASTA fixture lives at +:file:`test/data/validation/ProteoBench_validation_reference.fasta`. + +When you add a check, add unit tests for it directly (it is a pure function). +When you add a profile, add a test that the profile resolves for the module and +that its checks run. + + +Documented limitations +======================= + +Some checks are intentionally limited because the required reference data is not +available: + +- Full enzyme-specificity checks need reference protein sequences. 
Only internal + cleavage-site counting is done for the supported enzymes, as a heuristic. +- Cross-tool modification normalization is not implemented. Tools encode + modifications differently (human-readable names, UniMod accessions, raw mass + dictionaries); only matching human-readable names against the declared + modifications is attempted. +- Run-level matching (raw file, sample, experiment) is not possible because + ``ProteoBenchParameters`` does not expose those fields. Only software identity + is compared in ``run_consistency``. diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/Astral/module_settings.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/Astral/module_settings.toml index 8835759b3..aecbc659b 100644 --- a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/Astral/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/Astral/module_settings.toml @@ -14,3 +14,13 @@ [general] "min_count_multispec" = 1 "level" = "ion" + +[reference_database] +# Reference protein database used to validate uploaded protein identifiers. +# Read by proteobench.validation (ModuleValidationConfig). +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). 
+"profile" = "quant_lfq" diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/QExactive/module_settings.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/QExactive/module_settings.toml index 8835759b3..aecbc659b 100644 --- a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/QExactive/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/ion/QExactive/module_settings.toml @@ -14,3 +14,13 @@ [general] "min_count_multispec" = 1 "level" = "ion" + +[reference_database] +# Reference protein database used to validate uploaded protein identifiers. +# Read by proteobench.validation (ModuleValidationConfig). +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). +"profile" = "quant_lfq" diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/peptidoform/module_settings.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/peptidoform/module_settings.toml index 7453c6885..302aa6636 100644 --- a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/peptidoform/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DDA/peptidoform/module_settings.toml @@ -14,3 +14,13 @@ [general] "min_count_multispec" = 1 "level" = "peptidoform" + +[reference_database] +# Reference protein database used to validate uploaded protein identifiers. +# Read by proteobench.validation (ModuleValidationConfig). +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). 
+"profile" = "quant_lfq" diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/AIF/module_settings.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/AIF/module_settings.toml index 8835759b3..aecbc659b 100644 --- a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/AIF/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/AIF/module_settings.toml @@ -14,3 +14,13 @@ [general] "min_count_multispec" = 1 "level" = "ion" + +[reference_database] +# Reference protein database used to validate uploaded protein identifiers. +# Read by proteobench.validation (ModuleValidationConfig). +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). +"profile" = "quant_lfq" diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/Astral/module_settings.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/Astral/module_settings.toml index 8835759b3..aecbc659b 100644 --- a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/Astral/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/Astral/module_settings.toml @@ -14,3 +14,13 @@ [general] "min_count_multispec" = 1 "level" = "ion" + +[reference_database] +# Reference protein database used to validate uploaded protein identifiers. +# Read by proteobench.validation (ModuleValidationConfig). +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). 
+"profile" = "quant_lfq" diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/ZenoTOF/module_settings.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/ZenoTOF/module_settings.toml index 8835759b3..aecbc659b 100644 --- a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/ZenoTOF/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/ZenoTOF/module_settings.toml @@ -14,3 +14,13 @@ [general] "min_count_multispec" = 1 "level" = "ion" + +[reference_database] +# Reference protein database used to validate uploaded protein identifiers. +# Read by proteobench.validation (ModuleValidationConfig). +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). +"profile" = "quant_lfq" diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/diaPASEF/module_settings.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/diaPASEF/module_settings.toml index 8835759b3..aecbc659b 100644 --- a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/diaPASEF/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/diaPASEF/module_settings.toml @@ -14,3 +14,13 @@ [general] "min_count_multispec" = 1 "level" = "ion" + +[reference_database] +# Reference protein database used to validate uploaded protein identifiers. +# Read by proteobench.validation (ModuleValidationConfig). +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). 
+"profile" = "quant_lfq" diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/plasma/module_settings.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/plasma/module_settings.toml index 31740dfba..13722ca85 100644 --- a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/plasma/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/plasma/module_settings.toml @@ -14,3 +14,13 @@ [general] "min_count_multispec" = 1 "level" = "ion" + +[reference_database] +# Reference protein database used to validate uploaded protein identifiers. +# Read by proteobench.validation (ModuleValidationConfig). +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_Distler_PYE.zip" + +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). +"profile" = "quant_lfq" diff --git a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/singlecell/module_settings.toml b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/singlecell/module_settings.toml index 97f8c7934..4a91bb890 100644 --- a/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/singlecell/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/Quant/lfq/DIA/ion/singlecell/module_settings.toml @@ -10,3 +10,13 @@ [general] "min_count_multispec" = 1 "level" = "ion" + +[reference_database] +# Reference protein database used to validate uploaded protein identifiers. +# Read by proteobench.validation (ModuleValidationConfig). +"fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HY.zip" + +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). 
+"profile" = "quant_lfq" diff --git a/proteobench/io/parsing/io_parse_settings/denovo/DDA/HCD/module_settings.toml b/proteobench/io/parsing/io_parse_settings/denovo/DDA/HCD/module_settings.toml index e69de29bb..60818bad0 100644 --- a/proteobench/io/parsing/io_parse_settings/denovo/DDA/HCD/module_settings.toml +++ b/proteobench/io/parsing/io_parse_settings/denovo/DDA/HCD/module_settings.toml @@ -0,0 +1,6 @@ +[validation] +# Validation profile applied to submissions for this module. +# Resolved by proteobench.validation (ModuleValidationConfig.from_parse_settings). +# De novo modules use a different standardized schema than quant modules, so the +# quant checks (protein IDs, charge, peptide length) do not apply. +"profile" = "denovo" diff --git a/proteobench/io/parsing/parse_settings.py b/proteobench/io/parsing/parse_settings.py index 49f4097ed..b5ffa439e 100644 --- a/proteobench/io/parsing/parse_settings.py +++ b/proteobench/io/parsing/parse_settings.py @@ -848,7 +848,6 @@ def convert_to_peptidoform(proforma): "quant_lfq_DDA_ion_Astral": ParseSettingsQuant, "quant_lfq_DDA_ion_QExactive": ParseSettingsQuant, "quant_lfq_DDA_peptidoform": ParseSettingsQuant, - "quant_lfq_DDA_ion_Astral": ParseSettingsQuant, "quant_lfq_DIA_ion_AIF": ParseSettingsQuant, "quant_lfq_DIA_ion_diaPASEF": ParseSettingsQuant, "quant_lfq_DIA_ion_singlecell": ParseSettingsQuant, diff --git a/proteobench/validation/__init__.py b/proteobench/validation/__init__.py new file mode 100644 index 000000000..cf0fe5198 --- /dev/null +++ b/proteobench/validation/__init__.py @@ -0,0 +1,67 @@ +""" +Submission-validation layer for ProteoBench. + +This package validates uploaded benchmark submissions before a public datapoint +is created. 
It checks that the standardized results and parsed parameters are +internally consistent and consistent with the module reference database, and +returns a structured :class:`ValidationReport` (overall status plus per-issue +severity, machine-readable code, message, field, observed/expected values, and +example offending rows). + +The layer is generic and registry-driven. Each module maps to a *validation +profile* (a named, ordered set of checks). Adding a new module of an existing +category requires only configuration; adding a new category requires only +registering a new profile via :func:`register_profile`. + +Typical use:: + + from proteobench.validation import validate_submission, FastaReference, ModuleValidationConfig + + config = ModuleValidationConfig.from_parse_settings(parse_settings_dir, module_id, input_format) + fasta = FastaReference.from_url(config.fasta_url, member_filename=config.fasta_filename) + report = validate_submission(standard_df, parameters=params, fasta=fasta, config=config, + input_format=input_format) + if report.has_errors: + ... 
 # surface the findings; validation never blocks submission + +Registering a custom profile:: + + from proteobench.validation import Check, ValidationProfile, register_profile + + register_profile(ValidationProfile( + name="my_module", + checks=[Check("my_check", my_check_func, "what it does")], + )) +""" + +from proteobench.validation.config import ModuleValidationConfig +from proteobench.validation.context import ValidationContext +from proteobench.validation.exceptions import SubmissionValidationError +from proteobench.validation.fasta import FastaReference +from proteobench.validation.profiles import ( + Check, + ValidationProfile, + available_profiles, + get_profile, + register_profile, + unregister_profile, +) +from proteobench.validation.report import Severity, ValidationIssue, ValidationReport +from proteobench.validation.validator import validate_submission + +__all__ = [ + "validate_submission", + "ValidationReport", + "ValidationIssue", + "Severity", + "ValidationContext", + "FastaReference", + "ModuleValidationConfig", + "SubmissionValidationError", + "Check", + "ValidationProfile", + "register_profile", + "unregister_profile", + "get_profile", + "available_profiles", +] diff --git a/proteobench/validation/checks.py b/proteobench/validation/checks.py new file mode 100644 index 000000000..eb83ff657 --- /dev/null +++ b/proteobench/validation/checks.py @@ -0,0 +1,1078 @@ +""" +Individual validation checks operating on the standardized result DataFrame. + +Every check is a pure function that takes the standardized DataFrame, a +:class:`~proteobench.validation.config.ModuleValidationConfig`, and whatever else it needs: usually the parsed +:class:`~proteobench.io.params.ProteoBenchParameters` (or any object with the same attributes), sometimes the +reference FASTA or input format. It returns a list of :class:`~proteobench.validation.report.ValidationIssue`. + +The checks are deliberately generic: they read the standardized columns +(``Proteins``, ``Sequence``, ``Charge``, ``proforma``) and the parameter +attributes, never tool-specific result columns. 
Missing or unparsed parameters +yield warnings rather than errors, so a submission is never blocked merely +because a value could not be parsed. + +Documented limitations and intentionally skipped checks: + +* **Enzyme specificity**: a missed-cleavage heuristic is implemented for common + C-terminal cleaving enzymes (trypsin, trypsin/P, Lys-C, Arg-C, Glu-C, + chymotrypsin) and only as a *warning*. It ignores protein N-/C-termini and + ragged ends (resolving those would need the reference protein sequences), and + N-terminal cleavers (Asp-N, Lys-N) are skipped. +* **Modifications**: cross-tool modification representations are not normalized + (human-readable names, UniMod accessions, and raw masses all occur). Only + human-readable modification names observed in the ``proforma`` column are + compared, as warnings; mass-only / UniMod-only tokens are skipped. The + maximum-modifications count includes any fixed modifications written into the + sequence, so it is an upper bound (warning only). +* **Mass tolerances**: there is no per-result tolerance to compare against, so + the precursor/fragment tolerances are only sanity-checked (present, numeric, + positive), as warnings. An optional plausibility ceiling + (``max_plausible_ppm`` / ``max_plausible_dalton`` on the config) has no + default; the implausible-value check is skipped unless a module configures it. +* **PSM FDR**: validated against the valid ``[0, 1]`` range and the benchmark's + recommended maximum (configurable), as warnings. +* **Run identity**: ``ProteoBenchParameters`` does not expose raw-file, sample, + or experiment identifiers, so result-vs-parameter run matching is limited to + software identity. This is reported as info. 
+""" + +from __future__ import annotations + +import re +from typing import Any, List, Optional + +import numpy as np +import pandas as pd + +from proteobench.validation.config import ModuleValidationConfig +from proteobench.validation.fasta import FastaReference +from proteobench.validation.protein_ids import extract_identifiers, is_decoy_or_contaminant, split_protein_groups +from proteobench.validation.report import ValidationIssue, ValidationReport + +#: Maximum number of example offending protein identifiers to report. +MAX_PROTEIN_EXAMPLES = 20 + +#: Maximum number of example offending rows to report for other checks. +MAX_ROW_EXAMPLES = 10 + +#: Matches a bracketed modification label inside a ProForma string. +_PROFORMA_MOD = re.compile(r"\[([^\]]+)\]") + +#: C-terminal cleavage rules per normalized enzyme name: a tuple of +#: (residues the enzyme cleaves after, whether it cleaves when proline follows). +#: A value of ``None`` marks an N-terminal cleaver, for which the simple +#: internal-site count does not apply (skipped with an info message). +#: The rules follow the MaxQuant built-in enzyme defaults (e.g. Glu-C cleaves +#: after D and E). Because the missed-cleavage check is warning-only, these are +#: convention-dependent heuristics, not authoritative cleavage definitions. +_ENZYME_CLEAVAGE_RULES = { + "trypsin": ("KR", False), + "trypsin/p": ("KR", True), + "lysc": ("K", True), + "argc": ("R", False), + "gluc": ("DE", True), + "chymotrypsin": ("FYWL", False), + "lysn": None, + "aspn": None, +} + +#: Matches a signed number (including scientific notation) and an optional unit +#: in a tolerance string such as ``"[-20.0 ppm, 20.0 ppm]"`` or ``"2e-3 Da"``. +_TOLERANCE_TOKEN = re.compile(r"(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\s*(ppm|da|th|mmu|amu)?", re.IGNORECASE) + + +def _normalize_enzyme(name: str) -> str: + """ + Normalize an enzyme name for cleavage-rule lookup. 
+ + Lower-cases the name and removes spaces, underscores, and hyphens so that + ``"Lys-C"``, ``"lys_c"``, and ``"LysC"`` all map to ``"lysc"`` (the slash in + ``"Trypsin/P"`` is preserved). + + Parameters + ---------- + name : str + The raw enzyme name. + + Returns + ------- + str + The normalized enzyme key. + """ + text = str(name).strip().lower() + for char in (" ", "_", "-"): + text = text.replace(char, "") + return text + + +def _is_missing(value: Any) -> bool: + """ + Determine whether a parameter value should be treated as "not provided". + + Treats ``None``, ``np.nan``, and the literal strings ``"None"``/``"nan"``/``""`` + as missing (matching how :class:`ProteoBenchParameters` represents absent values). + + Parameters + ---------- + value : Any + The value to test. + + Returns + ------- + bool + ``True`` if the value is missing. + """ + if value is None: + return True + if isinstance(value, float) and np.isnan(value): + return True + if isinstance(value, str) and value.strip().lower() in {"", "none", "nan"}: + return True + return False + + +def _as_int(value: Any) -> Optional[int]: + """ + Coerce a value to ``int`` if possible. + + Parameters + ---------- + value : Any + The value to coerce. + + Returns + ------- + int or None + The integer value, or ``None`` if it is missing or not convertible. + """ + if _is_missing(value): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def _as_float(value: Any) -> Optional[float]: + """ + Coerce a value to ``float`` if possible. + + Parameters + ---------- + value : Any + The value to coerce. + + Returns + ------- + float or None + The float value, or ``None`` if it is missing or not convertible. + """ + if _is_missing(value): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _format_range(minimum: Optional[int], maximum: Optional[int]) -> str: + """ + Format an inclusive numeric range for display. 
+ + Parameters + ---------- + minimum : int or None + Lower bound (``None`` means unbounded). + maximum : int or None + Upper bound (``None`` means unbounded). + + Returns + ------- + str + A string such as ``"[2, 4]"`` or ``"[2, unbounded]"``. + """ + low = "unbounded" if minimum is None else str(minimum) + high = "unbounded" if maximum is None else str(maximum) + return f"[{low}, {high}]" + + +def _identifier_series(df: pd.DataFrame, config: ModuleValidationConfig) -> Optional[pd.Series]: + """ + Pick the best per-row identifier column for example reporting. + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame. + config : ModuleValidationConfig + Module validation configuration. + + Returns + ------- + pandas.Series or None + A series of human-readable row identifiers, or ``None`` if unavailable. + """ + for column in ("precursor ion", "peptidoform", config.sequence_column): + if column in df.columns: + return df[column].astype(str) + return None + + +def check_protein_ids( + df: pd.DataFrame, + fasta: FastaReference, + config: ModuleValidationConfig, +) -> List[ValidationIssue]: + """ + Validate protein identifiers against the reference FASTA accession set. + + Splits protein groups, skips decoy and contaminant identifiers, and reports + as an error any remaining identifier that is not found in the reference. + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame. + fasta : FastaReference + Reference protein identifiers. + config : ModuleValidationConfig + Module validation configuration. + + Returns + ------- + list of ValidationIssue + Issues describing missing protein identifiers (or an info confirming all + identifiers were found). 
+ """ + report = ValidationReport() + + if config.protein_column not in df.columns: + report.add_warning( + "protein_column_missing", + f"Protein column '{config.protein_column}' not found in the standardized results; " + "protein-identifier validation was skipped.", + "protein_ids", + field=config.protein_column, + ) + return report.issues + + # Collect unique, target (non-decoy / non-contaminant) protein tokens. + target_tokens: set = set() + for cell in df[config.protein_column].dropna().unique(): + for token in split_protein_groups(cell, config.protein_group_separators): + if is_decoy_or_contaminant(token, config.contaminant_flag, config.decoy_prefixes): + continue + target_tokens.add(token) + + if not target_tokens: + report.add_warning( + "no_protein_ids", + "No target protein identifiers were found in the results (all empty, decoy, or contaminant).", + "protein_ids", + field=config.protein_column, + ) + return report.issues + + missing = [token for token in target_tokens if not fasta.contains_any(extract_identifiers(token))] + + n_unique = len(target_tokens) + n_missing = len(missing) + n_found = n_unique - n_missing + + if n_missing > 0: + examples = sorted(missing)[:MAX_PROTEIN_EXAMPLES] + report.add_error( + "protein_not_in_fasta", + f"{n_missing} of {n_unique} unique protein identifiers are not present in the reference " + f"database ({n_found} found). 
These are non-decoy, non-contaminant identifiers and likely " + "indicate the wrong FASTA was used or proteins outside the benchmark.", + "protein_ids", + field=config.protein_column, + observed={"n_unique": n_unique, "n_found": n_found, "n_missing": n_missing}, + expected="all identifiers present in the module reference database", + examples=examples, + ) + else: + report.add_info( + "protein_ids_ok", + f"All {n_unique} unique protein identifiers were found in the reference database.", + "protein_ids", + field=config.protein_column, + observed={"n_unique": n_unique, "n_found": n_found, "n_missing": 0}, + ) + + return report.issues + + +def check_charge_range( + df: pd.DataFrame, + params: Any, + config: ModuleValidationConfig, +) -> List[ValidationIssue]: + """ + Validate that observed precursor charges fall within the parsed charge range. + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame. + params : Any + Parsed parameters (object with ``min_precursor_charge`` / + ``max_precursor_charge`` attributes). + config : ModuleValidationConfig + Module validation configuration. + + Returns + ------- + list of ValidationIssue + Issues describing out-of-range charges, or warnings when the constraint + or column is unavailable. 
+ """ + report = ValidationReport() + check = "charge_range" + + min_charge = _as_int(getattr(params, "min_precursor_charge", None)) + max_charge = _as_int(getattr(params, "max_precursor_charge", None)) + + if min_charge is None and max_charge is None: + report.add_warning( + "charge_range_not_parsed", + "Could not validate precursor charge because no minimum/maximum charge constraint " + "was parsed from the parameter file.", + check, + field="precursor_charge", + ) + return report.issues + + if config.charge_column not in df.columns: + report.add_warning( + "charge_column_missing", + f"Charge column '{config.charge_column}' not found in the standardized results; " + "charge-range validation was skipped.", + check, + field=config.charge_column, + ) + return report.issues + + charges = pd.to_numeric(df[config.charge_column], errors="coerce") + valid = charges.notna() + + mask = pd.Series(False, index=df.index) + if min_charge is not None: + mask = mask | (valid & (charges < min_charge)) + if max_charge is not None: + mask = mask | (valid & (charges > max_charge)) + + n_offending = int(mask.sum()) + if n_offending > 0: + offending_charges = sorted({int(c) for c in charges[mask].dropna().unique()}) + identifiers = _identifier_series(df, config) + if identifiers is not None: + examples = identifiers[mask].unique().tolist()[:MAX_ROW_EXAMPLES] + else: + examples = offending_charges[:MAX_ROW_EXAMPLES] + report.add_error( + "charge_out_of_range", + f"{n_offending} result row(s) have a precursor charge outside the searched range " + f"{_format_range(min_charge, max_charge)} (observed charges: {offending_charges}).", + check, + field=config.charge_column, + observed=offending_charges, + expected=_format_range(min_charge, max_charge), + examples=examples, + ) + + return report.issues + + +def check_peptide_length( + df: pd.DataFrame, + params: Any, + config: ModuleValidationConfig, +) -> List[ValidationIssue]: + """ + Validate that peptide lengths fall within the parsed 
peptide-length range. + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame. + params : Any + Parsed parameters (object with ``min_peptide_length`` / + ``max_peptide_length`` attributes). + config : ModuleValidationConfig + Module validation configuration. + + Returns + ------- + list of ValidationIssue + Issues describing out-of-range peptide lengths, or warnings when the + constraint or column is unavailable. + """ + report = ValidationReport() + check = "peptide_length" + + min_len = _as_int(getattr(params, "min_peptide_length", None)) + max_len = _as_int(getattr(params, "max_peptide_length", None)) + + if min_len is None and max_len is None: + report.add_warning( + "peptide_length_not_parsed", + "Could not validate peptide length because no minimum/maximum peptide-length " + "constraint was parsed from the parameter file.", + check, + field="peptide_length", + ) + return report.issues + + if config.sequence_column not in df.columns: + report.add_warning( + "sequence_column_missing", + f"Sequence column '{config.sequence_column}' not found in the standardized results; " + "peptide-length validation was skipped.", + check, + field=config.sequence_column, + ) + return report.issues + + sequences = df[config.sequence_column].astype(str) + lengths = sequences.str.count(r"[A-Za-z]") + + mask = pd.Series(False, index=df.index) + if min_len is not None: + mask = mask | (lengths < min_len) + if max_len is not None: + mask = mask | (lengths > max_len) + + n_offending = int(mask.sum()) + if n_offending > 0: + examples = sequences[mask].unique().tolist()[:MAX_ROW_EXAMPLES] + offending_lengths = sorted({int(length) for length in lengths[mask].unique()}) + report.add_error( + "peptide_length_out_of_range", + f"{n_offending} result row(s) have a peptide length outside the searched range " + f"{_format_range(min_len, max_len)} (observed lengths: {offending_lengths}).", + check, + field=config.sequence_column, + observed=offending_lengths, + 
 expected=_format_range(min_len, max_len), + examples=examples, + ) + + return report.issues + + +def _count_missed_cleavages(sequence: str, residues: str, cleave_before_proline: bool) -> int: + """ + Count internal missed cleavages for a C-terminal cleaving enzyme. + + A missed cleavage is an internal cleavage residue (one not at the + C-terminus). For proline-restricted enzymes a cleavage residue immediately + followed by ``P`` does not count. + + Parameters + ---------- + sequence : str + Peptide sequence (plain amino-acid letters). + residues : str + Residues the enzyme cleaves C-terminal to (e.g. ``"KR"`` for trypsin). + cleave_before_proline : bool + ``True`` if the enzyme still cleaves when proline follows the residue. + + Returns + ------- + int + Number of internal missed cleavages. + """ + seq = "".join(ch for ch in str(sequence) if ch.isalpha()).upper() + if len(seq) < 2: + return 0 + residue_set = set(residues) + count = 0 + for i in range(len(seq) - 1): + if seq[i] in residue_set and (cleave_before_proline or seq[i + 1] != "P"): + count += 1 + return count + + +def check_enzyme( + df: pd.DataFrame, + params: Any, + config: ModuleValidationConfig, +) -> List[ValidationIssue]: + """ + Best-effort enzyme/specificity check (missed cleavages, warning only). + + Supports common C-terminal cleaving enzymes via :data:`_ENZYME_CLEAVAGE_RULES` + (trypsin, trypsin/P, Lys-C, Arg-C, Glu-C, chymotrypsin). For each unique + peptide it counts internal cleavage residues and warns when any peptide has + more than ``allowed_miscleavages`` of them. This is a heuristic: it ignores + ragged termini and protein ends, so it can only be a warning. N-terminal + cleavers (Asp-N, Lys-N) and unknown enzymes are reported as info (skipped). + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame. + params : Any + Parsed parameters (object with ``enzyme``, ``semi_enzymatic``, + ``allowed_miscleavages`` attributes). 
+ config : ModuleValidationConfig + Module validation configuration. + + Returns + ------- + list of ValidationIssue + Warnings for peptides exceeding the allowed missed cleavages, or + info/warning describing why the check was skipped. + """ + report = ValidationReport() + check = "enzyme" + + enzyme = getattr(params, "enzyme", None) + if _is_missing(enzyme): + report.add_warning( + "enzyme_not_parsed", + "Could not validate enzyme specificity because no enzyme was parsed from the parameter file.", + check, + field="enzyme", + ) + return report.issues + + normalized = _normalize_enzyme(enzyme) + if normalized not in _ENZYME_CLEAVAGE_RULES: + report.add_info( + "enzyme_check_unsupported", + f"Enzyme-specificity validation is not implemented for enzyme '{enzyme}'; check skipped.", + check, + field="enzyme", + observed=enzyme, + ) + return report.issues + + rule = _ENZYME_CLEAVAGE_RULES[normalized] + if rule is None: + report.add_info( + "enzyme_check_unsupported", + f"Enzyme '{enzyme}' cleaves N-terminal to its residue; the missed-cleavage heuristic " + "does not apply and the check was skipped.", + check, + field="enzyme", + observed=enzyme, + ) + return report.issues + residues, cleave_before_proline = rule + + if bool(getattr(params, "semi_enzymatic", False)) is True: + report.add_info( + "enzyme_semi_skipped", + "Semi-enzymatic search detected; the missed-cleavage heuristic was skipped.", + check, + field="semi_enzymatic", + ) + return report.issues + + allowed = _as_int(getattr(params, "allowed_miscleavages", None)) + if allowed is None: + report.add_warning( + "miscleavages_not_parsed", + "Could not validate missed cleavages because 'allowed_miscleavages' was not parsed " + "from the parameter file.", + check, + field="allowed_miscleavages", + ) + return report.issues + + if config.sequence_column not in df.columns: + report.add_warning( + "sequence_column_missing", + f"Sequence column '{config.sequence_column}' not found; missed-cleavage check skipped.", + 
check, + field=config.sequence_column, + ) + return report.issues + + sequences = df[config.sequence_column].astype(str) + unique_sequences = pd.Series(sequences.unique()) + missed = unique_sequences.apply(lambda s: _count_missed_cleavages(s, residues, cleave_before_proline)) + offending = unique_sequences[missed > allowed] + + if len(offending) > 0: + examples = [f"{seq} ({_count_missed_cleavages(seq, residues, cleave_before_proline)} MC)" for seq in offending][ + :MAX_ROW_EXAMPLES + ] + report.add_warning( + "missed_cleavages_exceeded", + f"{len(offending)} unique peptide sequence(s) appear to exceed the allowed missed " + f"cleavages ({allowed}) for {enzyme}. This is a heuristic (ignores ragged termini and " + "protein ends); review before submitting.", + check, + field="allowed_miscleavages", + observed=f"{len(offending)} sequences with > {allowed} internal cleavage sites", + expected=f"<= {allowed} internal missed cleavages", + examples=examples, + ) + + return report.issues + + +def check_modifications( + df: pd.DataFrame, + params: Any, + config: ModuleValidationConfig, +) -> List[ValidationIssue]: + """ + Best-effort modification compatibility check (warnings only). + + Compares human-readable modification names observed in the ``proforma`` + column against the parsed fixed/variable modification settings. Mass-only + and UniMod-only modification tokens are not compared because their + representation is not normalized across tools. + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame. + params : Any + Parsed parameters (object with ``fixed_mods`` / ``variable_mods``). + config : ModuleValidationConfig + Module validation configuration. + + Returns + ------- + list of ValidationIssue + Warnings for observed modification names not found in the declared + settings, or a warning/info describing why the check was limited. 
+ """ + report = ValidationReport() + check = "modifications" + + if config.proforma_column not in df.columns: + report.add_info( + "modifications_no_proforma", + f"No '{config.proforma_column}' column in the results; modification validation was skipped.", + check, + field=config.proforma_column, + ) + return report.issues + + observed: set = set() + for value in df[config.proforma_column].dropna().astype(str).unique(): + for match in _PROFORMA_MOD.findall(value): + observed.add(match.strip()) + + if not observed: + report.add_info( + "modifications_none_observed", + "No modifications were observed in the results; nothing to validate.", + check, + ) + return report.issues + + fixed_mods = getattr(params, "fixed_mods", None) + variable_mods = getattr(params, "variable_mods", None) + + declared_parts = [] + for value in (fixed_mods, variable_mods): + if not _is_missing(value): + declared_parts.append(str(value)) + declared_text = " ".join(declared_parts).lower() + + if not declared_text: + report.add_warning( + "modifications_not_parsed", + "Could not validate modifications because no fixed/variable modification settings were " + "parsed from the parameter file.", + check, + field="variable_mods", + observed=sorted(observed)[:MAX_ROW_EXAMPLES], + ) + return report.issues + + # Only compare clean, human-readable modification names; skip mass/UniMod tokens. + unmatched = [] + for token in sorted(observed): + name = token.replace(" ", "") + if len(name) < 3 or not name.isalpha(): + continue + if name.lower() not in declared_text: + unmatched.append(token) + + if unmatched: + report.add_warning( + "modification_not_declared", + f"{len(unmatched)} observed modification name(s) were not found in the declared " + "fixed/variable modifications. 
Modification names differ across tools, so this is a " + "heuristic; review before submitting.", + check, + field="variable_mods", + observed=unmatched[:MAX_ROW_EXAMPLES], + expected="modifications declared in the parameter file", + examples=unmatched[:MAX_ROW_EXAMPLES], + ) + + return report.issues + + +def check_run_consistency( + df: pd.DataFrame, + params: Any, + input_format: Optional[str], + config: ModuleValidationConfig, +) -> List[ValidationIssue]: + """ + Check that the parameter file matches the submitted run, where feasible. + + Only software identity can be compared, because + :class:`ProteoBenchParameters` does not expose raw-file, sample, or + experiment identifiers. A mismatch in software identity is reported as an + error; the unavailable run-level matching is reported as info. + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame (unused for software identity but kept + for signature consistency and future extension). + params : Any + Parsed parameters (object with ``software_name`` / ``software_version``). + input_format : str or None + The selected software tool used to parse the results. + config : ModuleValidationConfig + Module validation configuration. + + Returns + ------- + list of ValidationIssue + Issues describing software-identity mismatches and the documented + limitation on run-level matching. + """ + report = ValidationReport() + check = "run_consistency" + + software_name = getattr(params, "software_name", None) + if input_format and not _is_missing(software_name): + if str(software_name).strip().lower() != str(input_format).strip().lower(): + report.add_error( + "software_mismatch", + f"The parameter file reports software '{software_name}', but the results were " + f"submitted as '{input_format}'. 
The parameter file may belong to a different run.", + check, + field="software_name", + observed=software_name, + expected=input_format, + ) + + if _is_missing(getattr(params, "software_version", None)): + report.add_info( + "software_version_missing", + "The software version could not be parsed from the parameter file.", + check, + field="software_version", + ) + + report.add_info( + "run_identity_limited", + "Run-level matching (raw-file, sample, or experiment names) is not available because the " + "parsed parameters do not expose these identifiers; only software identity was checked.", + check, + ) + + return report.issues + + +def check_max_modifications( + df: pd.DataFrame, + params: Any, + config: ModuleValidationConfig, +) -> List[ValidationIssue]: + """ + Check that no peptide carries more modifications than allowed (warning only). + + Counts the bracketed modifications in each ``proforma`` string and warns + when more than ``max_mods`` are present. This is a heuristic: the count + includes any fixed modifications written into the sequence, so it is an + upper bound on the number of variable modifications. + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame. + params : Any + Parsed parameters (object with a ``max_mods`` attribute). + config : ModuleValidationConfig + Module validation configuration. + + Returns + ------- + list of ValidationIssue + A warning for peptides exceeding ``max_mods``, or a warning/info + describing why the check was skipped. 
+ """ + report = ValidationReport() + check = "max_modifications" + + max_mods = _as_int(getattr(params, "max_mods", None)) + if max_mods is None: + report.add_warning( + "max_mods_not_parsed", + "Could not validate the maximum number of modifications because 'max_mods' was not " + "parsed from the parameter file.", + check, + field="max_mods", + ) + return report.issues + + if config.proforma_column not in df.columns: + report.add_info( + "max_mods_no_proforma", + f"No '{config.proforma_column}' column in the results; the maximum-modifications check was skipped.", + check, + field=config.proforma_column, + ) + return report.issues + + proforma = df[config.proforma_column].dropna().astype(str) + unique_proforma = pd.Series(proforma.unique()) + mod_counts = unique_proforma.apply(lambda s: len(_PROFORMA_MOD.findall(s))) + offending = unique_proforma[mod_counts > max_mods] + + if len(offending) > 0: + examples = [f"{seq} ({len(_PROFORMA_MOD.findall(seq))} mods)" for seq in offending][:MAX_ROW_EXAMPLES] + report.add_warning( + "max_modifications_exceeded", + f"{len(offending)} unique peptidoform(s) carry more than the allowed {max_mods} " + "modification(s). The count includes any fixed modifications present in the sequence, " + "so it is an upper bound; review before submitting.", + check, + field="max_mods", + observed=f"{len(offending)} peptidoforms with > {max_mods} modifications", + expected=f"<= {max_mods} modifications per peptidoform", + examples=examples, + ) + + return report.issues + + +def _parse_tolerance(text: Any) -> tuple: + """ + Parse a tolerance string into a magnitude and unit. + + Handles bracketed signed ranges such as ``"[-20.0 ppm, 20.0 ppm]"`` by + returning the largest absolute magnitude and the (lower-cased) unit. + + Parameters + ---------- + text : Any + The tolerance value (typically a formatted string). 
+ + Returns + ------- + tuple + ``(magnitude, unit)`` where ``magnitude`` is a float (or ``None`` if no + number could be parsed) and ``unit`` is a lower-case string or ``None``. + """ + if _is_missing(text): + return None, None + magnitudes = [] + unit = None + for number, parsed_unit in _TOLERANCE_TOKEN.findall(str(text)): + try: + magnitudes.append(abs(float(number))) + except ValueError: + continue + if parsed_unit: + unit = parsed_unit.lower() + if not magnitudes: + return None, None + return max(magnitudes), unit + + +def _check_one_tolerance( + report: ValidationReport, value: Any, label: str, field: str, config: ModuleValidationConfig +) -> None: + """ + Sanity-check a single mass-tolerance value and append any issue. + + The "implausibly large" sub-check runs only when the relevant plausibility + ceiling is configured (``config.max_plausible_ppm`` / + ``config.max_plausible_dalton``). These have no default, so the sub-check is + skipped when they are unset. The present/numeric/positive checks always run. + + Parameters + ---------- + report : ValidationReport + Report to append issues to. + value : Any + The tolerance value from the parameters. + label : str + Human-readable label (e.g. ``"precursor mass tolerance"``). + field : str + The parameter field name (used in the issue ``field`` and codes). + config : ModuleValidationConfig + Module validation configuration (provides the plausibility ceilings). 
+ """ + check = "mass_tolerance" + if _is_missing(value): + report.add_warning( + f"{field}_not_parsed", + f"Could not validate the {label} because it was not parsed from the parameter file.", + check, + field=field, + ) + return + + magnitude, unit = _parse_tolerance(value) + if magnitude is None: + report.add_warning( + f"{field}_unparsable", + f"The {label} ('{value}') could not be interpreted as a numeric tolerance.", + check, + field=field, + observed=value, + ) + return + + if magnitude <= 0: + report.add_warning( + f"{field}_non_positive", + f"The {label} ('{value}') is zero or negative, which is not a valid search tolerance.", + check, + field=field, + observed=value, + ) + return + + if unit == "ppm": + ceiling = config.max_plausible_ppm + elif unit in {"da", "th", "amu"}: + ceiling = config.max_plausible_dalton + elif unit == "mmu": + # 1 mmu = 1e-3 Da, so the Dalton ceiling becomes 1000x larger in mmu. + ceiling = None if config.max_plausible_dalton is None else config.max_plausible_dalton * 1000 + else: + ceiling = None + + if ceiling is not None and magnitude > ceiling: + report.add_warning( + f"{field}_implausible", + f"The {label} ('{value}') is unusually large and may indicate a mis-parsed value; " + "review before submitting.", + check, + field=field, + observed=value, + expected=f"<= {ceiling:g} {unit}", + ) + + +def check_mass_tolerances( + df: pd.DataFrame, + params: Any, + config: ModuleValidationConfig, +) -> List[ValidationIssue]: + """ + Sanity-check the precursor and fragment mass tolerances (warning only). + + There is no per-result tolerance to compare against, so this validates that + the parsed ``precursor_mass_tolerance`` and ``fragment_mass_tolerance`` are + present, numeric, and positive. When the module configures a plausibility + ceiling (``config.max_plausible_ppm`` / ``config.max_plausible_dalton``, + which have no default), tolerances above it are also flagged; otherwise that + sub-check is skipped. 
Mis-parsed or nonsensical values are flagged as + warnings. + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame (unused; kept for signature consistency). + params : Any + Parsed parameters (object with ``precursor_mass_tolerance`` / + ``fragment_mass_tolerance`` attributes). + config : ModuleValidationConfig + Module validation configuration. + + Returns + ------- + list of ValidationIssue + Warnings for missing, unparsable, or implausible tolerances. + """ + report = ValidationReport() + _check_one_tolerance( + report, + getattr(params, "precursor_mass_tolerance", None), + "precursor mass tolerance", + "precursor_mass_tolerance", + config, + ) + _check_one_tolerance( + report, + getattr(params, "fragment_mass_tolerance", None), + "fragment mass tolerance", + "fragment_mass_tolerance", + config, + ) + return report.issues + + +def check_fdr_psm( + df: pd.DataFrame, + params: Any, + config: ModuleValidationConfig, +) -> List[ValidationIssue]: + """ + Sanity-check the PSM-level FDR (warning only). + + Validates that ``ident_fdr_psm`` is present, within ``[0, 1]``, and not + above the benchmark's recommended maximum + (:attr:`ModuleValidationConfig.recommended_max_fdr_psm`, default 0.01). + + Parameters + ---------- + df : pandas.DataFrame + The standardized result DataFrame (unused; kept for signature consistency). + params : Any + Parsed parameters (object with an ``ident_fdr_psm`` attribute). + config : ModuleValidationConfig + Module validation configuration (provides ``recommended_max_fdr_psm``). + + Returns + ------- + list of ValidationIssue + Warnings for a missing, out-of-range, or above-recommended PSM FDR. 
+ """ + report = ValidationReport() + check = "fdr" + + fdr = _as_float(getattr(params, "ident_fdr_psm", None)) + if fdr is None: + report.add_warning( + "fdr_psm_not_parsed", + "Could not validate the PSM FDR because 'ident_fdr_psm' was not parsed from the parameter file.", + check, + field="ident_fdr_psm", + ) + return report.issues + + if fdr < 0 or fdr > 1: + report.add_warning( + "fdr_psm_out_of_range", + f"The PSM FDR ({fdr}) is outside the valid range [0, 1]; the value may be mis-parsed.", + check, + field="ident_fdr_psm", + observed=fdr, + expected="[0, 1]", + ) + return report.issues + + recommended = getattr(config, "recommended_max_fdr_psm", None) + if recommended is not None and fdr > recommended: + report.add_warning( + "fdr_psm_above_recommended", + f"The PSM FDR ({fdr}) is higher than the recommended maximum of {recommended} for this benchmark.", + check, + field="ident_fdr_psm", + observed=fdr, + expected=f"<= {recommended}", + ) + + return report.issues diff --git a/proteobench/validation/config.py b/proteobench/validation/config.py new file mode 100644 index 000000000..b961b8c7e --- /dev/null +++ b/proteobench/validation/config.py @@ -0,0 +1,261 @@ +""" +Module-level validation configuration. + +:class:`ModuleValidationConfig` collects the small amount of per-module +information the validator needs that is not part of the standardized result +DataFrame or the parsed parameters: the standardized column names, the +protein-group separators, the contaminant flag and decoy prefixes used to skip +non-target identifiers, and the reference FASTA location. + +The ``validation_profile`` field selects which set of checks the orchestrator +runs. It is the name of a profile registered in +:mod:`proteobench.validation.profiles`. It is resolved (in order of precedence): + +1. an explicit ``[validation].profile`` key in the module's ``module_settings.toml`` + (the declarative path: adding a new module of an existing category is config-only); +2. 
inferred from the module's parser class via the existing ``MODULE_TO_CLASS`` + registry (``ParseSettingsQuant`` -> ``"quant_lfq"``, ``ParseSettingsDeNovo`` -> ``"denovo"``); +3. the :data:`DEFAULT_VALIDATION_PROFILE` fallback. + +A genuinely new category of module is supported by registering a new profile +in ``profiles.py`` (or from third-party code) and pointing the module at it via +the TOML key; the orchestrator itself never changes. + +The reference FASTA is read from an optional ``[reference_database]`` section in +the module's ``module_settings.toml`` (beside ``[species_expected_ratio]`` and +``[general]``). Module types whose reference is not a FASTA (e.g. de novo, which +compares against a ground-truth table) simply omit ``fasta_url``. + +Example ``module_settings.toml`` sections:: + + [reference_database] + "fasta_url" = "https://proteobench.cubimed.rub.de/fasta/ProteoBenchFASTA_MixedSpecies_HYE.zip" + + [validation] + "profile" = "quant_lfq" + # optional mass-tolerance plausibility ceilings (no default; skipped if unset): + # "max_plausible_ppm" = 1000.0 + # "max_plausible_dalton" = 10.0 +""" + +from __future__ import annotations + +import os +from dataclasses import dataclass, field +from typing import Optional, Tuple + +import toml + +from proteobench.validation.protein_ids import DEFAULT_GROUP_SEPARATORS + +#: Profile used when none can be resolved from config or the parser class. +DEFAULT_VALIDATION_PROFILE = "quant_lfq" + +#: Maps a parser class name to the default validation profile for that family. +#: Resolution falls back to this when no ``[validation].profile`` is declared. +_PROFILE_BY_PARSER_CLASS = { + "ParseSettingsQuant": "quant_lfq", + "ParseSettingsDeNovo": "denovo", +} + +#: Common decoy-identifier prefixes. The ParseSettings configuration marks +#: decoys via a boolean ``Reverse`` column rather than an accession prefix, so +#: these defaults provide a tool-agnostic fallback for skipping decoy proteins. 
+DEFAULT_DECOY_PREFIXES = ("rev_", "rev__", "decoy_", "decoy", "reverse_", "##") + + +def _resolve_profile(module_id: str, declared_profile: Optional[str]) -> str: + """ + Resolve the validation profile name for a module. + + Resolution order: an explicit profile declared in ``module_settings.toml`` + wins; otherwise the profile is inferred from the module's parser class via + the existing ``MODULE_TO_CLASS`` registry; otherwise + :data:`DEFAULT_VALIDATION_PROFILE` is used. + + Parameters + ---------- + module_id : str + The module identifier. + declared_profile : str or None + The profile name declared in ``[validation].profile``, if any. + + Returns + ------- + str + The resolved profile name. + """ + if isinstance(declared_profile, str) and declared_profile: + return declared_profile + + try: + from proteobench.io.parsing.parse_settings import MODULE_TO_CLASS + + parser_cls = MODULE_TO_CLASS.get(module_id) + if parser_cls is not None: + inferred = _PROFILE_BY_PARSER_CLASS.get(parser_cls.__name__) + if inferred: + return inferred + except Exception: + pass + + return DEFAULT_VALIDATION_PROFILE + + +@dataclass +class ModuleValidationConfig: + """ + Per-module configuration for submission validation. + + Attributes + ---------- + protein_column : str, optional + Column holding protein identifiers in the standardized DataFrame. + Default ``"Proteins"``. + sequence_column : str, optional + Column holding the (plain) peptide sequence. Default ``"Sequence"``. + charge_column : str, optional + Column holding the precursor charge. Default ``"Charge"``. + proforma_column : str, optional + Column holding the ProForma modified sequence. Default ``"proforma"``. + contaminant_column : str, optional + Boolean column flagging contaminant rows. Default ``"contaminant"``. + contaminant_flag : str, optional + Substring marking contaminant proteins (from the tool parse settings, + e.g. ``"Cont_"``). + decoy_prefixes : tuple of str, optional + Prefixes marking decoy proteins. 
Defaults to :data:`DEFAULT_DECOY_PREFIXES`. + protein_group_separators : tuple of str, optional + Separators used to split protein groups. Defaults to + :data:`~proteobench.validation.protein_ids.DEFAULT_GROUP_SEPARATORS`. + fasta_url : str, optional + URL of the reference FASTA / zip / gzip for the module. + fasta_filename : str, optional + Preferred FASTA member name when the resource is an archive. + species_flags : tuple of str, optional + Species names configured for the module (e.g. ``("YEAST", "ECOLI", "HUMAN")``), + derived from the tool's species mapper. Currently informational. + recommended_max_fdr_psm : float, optional + Recommended maximum PSM-level FDR for the benchmark. A parsed FDR above + this value produces a warning. Default ``0.01`` (1%). Set to ``None`` to + disable the recommendation check. + max_plausible_ppm : float, optional + Plausibility ceiling for ppm mass tolerances. A parsed tolerance above + this value produces a warning. No default (``None``); when unset, the + implausible-value check is skipped. Set via ``[validation]`` in + ``module_settings.toml``. + max_plausible_dalton : float, optional + Plausibility ceiling for absolute (Da / Th / amu) mass tolerances, scaled + by 1000 for mmu. No default (``None``); when unset, the implausible-value + check is skipped. Set via ``[validation]`` in ``module_settings.toml``. + validation_profile : str, optional + Name of the registered profile whose checks the orchestrator runs. Set + automatically by :meth:`from_parse_settings`; defaults to + :data:`DEFAULT_VALIDATION_PROFILE` for direct construction so that the + existing quant behaviour is preserved. + """ + + protein_column: str = "Proteins" + sequence_column: str = "Sequence" + charge_column: str = "Charge" + proforma_column: str = "proforma" + contaminant_column: str = "contaminant" + contaminant_flag: Optional[str] = None + decoy_prefixes: Tuple[str, ...] = DEFAULT_DECOY_PREFIXES + protein_group_separators: Tuple[str, ...] 
= tuple(DEFAULT_GROUP_SEPARATORS) + fasta_url: Optional[str] = None + fasta_filename: Optional[str] = None + species_flags: Tuple[str, ...] = field(default_factory=tuple) + recommended_max_fdr_psm: Optional[float] = 0.01 + max_plausible_ppm: Optional[float] = None + max_plausible_dalton: Optional[float] = None + validation_profile: str = DEFAULT_VALIDATION_PROFILE + + @classmethod + def from_parse_settings( + cls, + parse_settings_dir: str, + module_id: str, + input_format: str, + ) -> "ModuleValidationConfig": + """ + Build a config from the existing parse settings of a module/tool. + + This reuses :class:`~proteobench.io.parsing.parse_settings.ParseSettingsBuilder` + to read the contaminant flag and species flags for the selected tool, + reads the optional ``[reference_database]`` and ``[validation]`` sections + from the module's ``module_settings.toml``, and resolves the validation + profile. + + Parameters + ---------- + parse_settings_dir : str + Directory containing the module's parse settings (the module's + ``parse_settings_dir`` attribute). + module_id : str + The module identifier (e.g. ``"quant_lfq_DDA_ion_QExactive"``). + input_format : str + The selected software tool (e.g. ``"MaxQuant"``). + + Returns + ------- + ModuleValidationConfig + Configuration populated from the parse settings. Falls back to the + defaults for any value that cannot be read. + """ + config = cls() + + # Best effort: read the contaminant flag and species from the tool parser. + # Wrapped defensively so validation never crashes on a parser issue. 
+ try: + from proteobench.io.parsing.parse_settings import ParseSettingsBuilder + + builder = ParseSettingsBuilder(parse_settings_dir=parse_settings_dir, module_id=module_id) + parser = builder.build_parser(input_format) + config.contaminant_flag = getattr(parser, "contaminant_flag", None) + species = parser.species_dict() if hasattr(parser, "species_dict") else {} + config.species_flags = tuple(species.values()) + except Exception: + pass + + # Read the module settings directly from disk (independent of the parser) + # so the reference and profile resolve even if the parser cannot be built. + module_settings = {} + try: + module_settings = toml.load(os.path.join(parse_settings_dir, "module_settings.toml")) + except Exception: + module_settings = {} + + reference = module_settings.get("reference_database", {}) or {} + config.fasta_url = reference.get("fasta_url") + config.fasta_filename = reference.get("fasta_filename") + + validation_section = module_settings.get("validation", {}) or {} + declared_profile = validation_section.get("profile") + config.validation_profile = _resolve_profile(module_id, declared_profile) + config.max_plausible_ppm = validation_section.get("max_plausible_ppm") + config.max_plausible_dalton = validation_section.get("max_plausible_dalton") + + return config + + @staticmethod + def read_reference_database(parse_settings_dir: str) -> dict: + """ + Read the ``[reference_database]`` section of a module's settings. + + Parameters + ---------- + parse_settings_dir : str + Directory containing the module's ``module_settings.toml``. + + Returns + ------- + dict + The ``[reference_database]`` table, or an empty dict if absent. 
+ """ + path = os.path.join(parse_settings_dir, "module_settings.toml") + try: + module_settings = toml.load(path) + except Exception: + return {} + return module_settings.get("reference_database", {}) or {} diff --git a/proteobench/validation/context.py b/proteobench/validation/context.py new file mode 100644 index 000000000..9a358a14f --- /dev/null +++ b/proteobench/validation/context.py @@ -0,0 +1,61 @@ +""" +Validation context passed to every check. + +A :class:`ValidationContext` bundles all inputs a check might need behind a +single, uniform object so that every check has the signature +``check(ctx) -> list[ValidationIssue]``. This decouples individual checks from +the orchestrator and from each other: a new check can read whatever it needs +from the context without changing any call site. + +The context carries the concrete inputs available today (standardized +DataFrame, parsed parameters, reference FASTA, module config, selected tool) +plus a generic ``reference`` slot and an ``extras`` dict so future module types +can supply their own reference data (for example a de novo ground-truth table) +without changing the context shape. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +import pandas as pd + +from proteobench.validation.config import ModuleValidationConfig +from proteobench.validation.fasta import FastaReference + + +@dataclass +class ValidationContext: + """ + Inputs available to a validation check. + + Attributes + ---------- + standard_df : pandas.DataFrame + The standardized result DataFrame produced by the module parser. + parameters : Any, optional + Parsed parameters (a :class:`ProteoBenchParameters` or any object with + the same attributes). ``None`` when no parameter file was provided. + config : ModuleValidationConfig + Module validation configuration (column names, flags, FASTA location, + resolved profile). 
+ fasta : FastaReference, optional + Reference protein identifiers, for profiles that validate against a + sequence database. ``None`` when unavailable or not applicable. + input_format : str, optional + The selected software tool used to produce the results. + reference : Any, optional + Generic reference object for profiles whose reference is not a FASTA + (for example a de novo ground-truth table). ``None`` when unused. + extras : dict, optional + Escape hatch for additional, profile-specific inputs. + """ + + standard_df: pd.DataFrame + parameters: Any = None + config: ModuleValidationConfig = field(default_factory=ModuleValidationConfig) + fasta: Optional[FastaReference] = None + input_format: Optional[str] = None + reference: Any = None + extras: Dict[str, Any] = field(default_factory=dict) diff --git a/proteobench/validation/exceptions.py b/proteobench/validation/exceptions.py new file mode 100644 index 000000000..4ffd81f01 --- /dev/null +++ b/proteobench/validation/exceptions.py @@ -0,0 +1,40 @@ +""" +Exceptions for the ProteoBench submission-validation layer. + +The validation layer primarily communicates through a structured +:class:`proteobench.validation.report.ValidationReport`. This exception is a +thin convenience for callers (notebooks, CLI, programmatic submission) that +prefer to fail fast instead of inspecting the report. + +Classes +------- +SubmissionValidationError + Raised when a submission fails validation with at least one error. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from proteobench.validation.report import ValidationReport + + +class SubmissionValidationError(Exception): + """ + Raised when a submission fails validation. + + The originating :class:`~proteobench.validation.report.ValidationReport` + is attached as the ``report`` attribute so callers can inspect every issue. 
+ + Parameters + ---------- + report : ValidationReport + The validation report that triggered the error. + """ + + def __init__(self, report: "ValidationReport"): + self.report = report + n_errors = len(report.errors) + message = f"Submission validation failed with {n_errors} error(s):\n{report.summary()}" + super().__init__(message) diff --git a/proteobench/validation/fasta.py b/proteobench/validation/fasta.py new file mode 100644 index 000000000..245b16f7a --- /dev/null +++ b/proteobench/validation/fasta.py @@ -0,0 +1,331 @@ +""" +FASTA / reference-database parsing for submission validation. + +:class:`FastaReference` builds the set of accepted protein identifiers from a +FASTA file. It parses common UniProt-style headers (``sp|P49327|FAS_HUMAN``, +``tr|...|...``) as well as bare accession-like headers, indexing both the +accession and the entry name so that result protein identifiers can be matched +regardless of which form a tool reports. + +The class can be built from raw text, a local path (plain, ``.gz``, or ``.zip``), +in-memory bytes, or an explicit iterable of identifiers. Downloading from a URL +is supported via :meth:`FastaReference.from_url`; the actual network call is +performed lazily so that importing this module never requires network access. +""" + +from __future__ import annotations + +import gzip +import io +import os +import zipfile +from typing import Iterable, Optional, Set + +from proteobench.validation.protein_ids import extract_identifiers + +#: File extensions recognised as FASTA when picking a member from an archive. +_FASTA_EXTENSIONS = (".fasta", ".fa", ".faa", ".fas") + + +def _looks_like_zip(data: bytes) -> bool: + """ + Heuristically determine whether a byte string is a ZIP archive. + + Parameters + ---------- + data : bytes + Raw bytes to inspect. + + Returns + ------- + bool + ``True`` if the bytes start with the ZIP magic number. 
+ """ + return data[:2] == b"PK" + + +def _looks_like_gzip(data: bytes) -> bool: + """ + Heuristically determine whether a byte string is gzip-compressed. + + Parameters + ---------- + data : bytes + Raw bytes to inspect. + + Returns + ------- + bool + ``True`` if the bytes start with the gzip magic number. + """ + return data[:2] == b"\x1f\x8b" + + +def _pick_fasta_member(zf: zipfile.ZipFile, member_filename: Optional[str]) -> str: + """ + Choose the FASTA member to read from a ZIP archive. + + Parameters + ---------- + zf : zipfile.ZipFile + Open ZIP archive. + member_filename : str, optional + Preferred member name. If not found, the first member with a FASTA + extension is used. + + Returns + ------- + str + Name of the member to read. + + Raises + ------ + ValueError + If no suitable FASTA member can be found. + """ + names = zf.namelist() + if member_filename: + for name in names: + if os.path.basename(name) == member_filename or name == member_filename: + return name + for name in names: + if name.lower().endswith(_FASTA_EXTENSIONS): + return name + raise ValueError(f"No FASTA member found in archive (members: {names}).") + + +def parse_fasta_header(header: str) -> Set[str]: + """ + Parse a single FASTA header line into candidate identifiers. + + Parameters + ---------- + header : str + A FASTA header line, with or without the leading ``>``. + + Returns + ------- + set of str + Candidate identifiers (accession, entry name, isoform base, ...). + """ + text = header.lstrip(">").strip() + if not text: + return set() + first_token = text.split()[0] + return extract_identifiers(first_token) + + +class FastaReference: + """ + Set of protein identifiers derived from a FASTA / reference database. + + Parameters + ---------- + identifiers : iterable of str, optional + Pre-computed identifiers to seed the reference with. + """ + + def __init__(self, identifiers: Optional[Iterable[str]] = None): + """ + Initialize the reference from an optional iterable of identifiers. 
+ + Parameters + ---------- + identifiers : iterable of str, optional + Pre-computed identifiers to seed the reference with. + """ + self._ids: Set[str] = set() + if identifiers: + for identifier in identifiers: + if identifier: + self._ids.add(str(identifier).strip()) + self._ids_ci: Set[str] = {i.lower() for i in self._ids} + + def __len__(self) -> int: + """ + Return the number of indexed identifiers. + + Returns + ------- + int + Count of unique identifiers in the reference. + """ + return len(self._ids) + + @property + def identifiers(self) -> Set[str]: + """ + Return all indexed identifiers. + + Returns + ------- + set of str + The identifier set (accessions and entry names). + """ + return set(self._ids) + + def contains(self, identifier: str) -> bool: + """ + Test whether an identifier is present (case-insensitive). + + Parameters + ---------- + identifier : str + Identifier to test. + + Returns + ------- + bool + ``True`` if the identifier is in the reference. + """ + if identifier is None: + return False + return str(identifier).strip().lower() in self._ids_ci + + def contains_any(self, identifiers: Iterable[str]) -> bool: + """ + Test whether any of several identifiers is present. + + Parameters + ---------- + identifiers : iterable of str + Candidate identifiers for a single protein. + + Returns + ------- + bool + ``True`` if at least one candidate is in the reference. + """ + return any(self.contains(identifier) for identifier in identifiers) + + @classmethod + def from_text(cls, text: str) -> "FastaReference": + """ + Build a reference from raw FASTA text. + + Parameters + ---------- + text : str + FASTA content (one or more records). + + Returns + ------- + FastaReference + Reference indexing every header's identifiers. 
+ """ + ids: Set[str] = set() + for line in text.splitlines(): + if line.startswith(">"): + ids.update(parse_fasta_header(line)) + return cls(ids) + + @classmethod + def from_bytes( + cls, + data: bytes, + source_name: Optional[str] = None, + member_filename: Optional[str] = None, + encoding: str = "utf-8", + ) -> "FastaReference": + """ + Build a reference from in-memory bytes (plain, gzip, or zip). + + Parameters + ---------- + data : bytes + Raw file content. + source_name : str, optional + Original file name or URL, used to detect the compression type. + member_filename : str, optional + Preferred FASTA member name when ``data`` is a ZIP archive. + encoding : str, optional + Text encoding used to decode the FASTA content. Default ``"utf-8"``. + + Returns + ------- + FastaReference + Reference indexing every header's identifiers. + """ + name = (source_name or "").lower() + + if name.endswith(".zip") or _looks_like_zip(data): + with zipfile.ZipFile(io.BytesIO(data)) as zf: + member = _pick_fasta_member(zf, member_filename) + text = zf.read(member).decode(encoding, errors="replace") + elif name.endswith(".gz") or _looks_like_gzip(data): + text = gzip.decompress(data).decode(encoding, errors="replace") + else: + text = data.decode(encoding, errors="replace") + + return cls.from_text(text) + + @classmethod + def from_path(cls, path: str, member_filename: Optional[str] = None) -> "FastaReference": + """ + Build a reference from a local file path (plain, ``.gz``, or ``.zip``). + + Parameters + ---------- + path : str + Path to the FASTA, gzip, or zip file. + member_filename : str, optional + Preferred FASTA member name when ``path`` is a ZIP archive. + + Returns + ------- + FastaReference + Reference indexing every header's identifiers. 
+ """ + with open(path, "rb") as handle: + data = handle.read() + return cls.from_bytes(data, source_name=path, member_filename=member_filename) + + @classmethod + def from_url( + cls, + url: str, + member_filename: Optional[str] = None, + timeout: int = 60, + ) -> "FastaReference": + """ + Build a reference by downloading a FASTA / zip / gzip from a URL. + + ``requests`` is imported lazily so that importing this module does not + require network access. + + Parameters + ---------- + url : str + URL of the FASTA, gzip, or zip resource. + member_filename : str, optional + Preferred FASTA member name when the resource is a ZIP archive. + timeout : int, optional + Request timeout in seconds. Default ``60``. + + Returns + ------- + FastaReference + Reference indexing every header's identifiers. + """ + import requests + + response = requests.get(url, timeout=timeout) + response.raise_for_status() + return cls.from_bytes(response.content, source_name=url, member_filename=member_filename) + + @classmethod + def from_identifiers(cls, identifiers: Iterable[str]) -> "FastaReference": + """ + Build a reference directly from an iterable of identifiers. + + Parameters + ---------- + identifiers : iterable of str + Identifiers to index (e.g. accessions extracted elsewhere). + + Returns + ------- + FastaReference + Reference indexing the supplied identifiers. + """ + return cls(identifiers) diff --git a/proteobench/validation/profiles.py b/proteobench/validation/profiles.py new file mode 100644 index 000000000..df5cb36b5 --- /dev/null +++ b/proteobench/validation/profiles.py @@ -0,0 +1,319 @@ +""" +Validation checks, profiles, and the profile registry. + +This module is the extensibility surface of the validation layer. 
It models +validation as two composable pieces: + +* a :class:`Check` wraps a single ``ctx -> list[ValidationIssue]`` function with + a stable name and description; +* a :class:`ValidationProfile` is an ordered list of checks that applies to one + category of module. + +Profiles are looked up by name in a module-level registry. A module declares +which profile it uses (via ``[validation].profile`` in ``module_settings.toml``, +or by inference from its parser class); the orchestrator then runs that +profile's checks. Adding a new module of an existing category requires no code. +Adding a genuinely new category requires only registering a new profile here (or +from third-party code via :func:`register_profile`), without touching the +orchestrator. + +Checks are reusable across profiles: for example ``run_consistency`` is shared +by both the quant and de novo profiles. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Callable, Dict, List, Optional + +from proteobench.validation.checks import ( + check_charge_range, + check_enzyme, + check_fdr_psm, + check_mass_tolerances, + check_max_modifications, + check_modifications, + check_peptide_length, + check_protein_ids, + check_run_consistency, +) +from proteobench.validation.context import ValidationContext +from proteobench.validation.report import ValidationIssue, ValidationReport + +#: Type alias for a check function: takes a context, returns a list of issues. +CheckFunc = Callable[[ValidationContext], List[ValidationIssue]] + + +@dataclass +class Check: + """ + A single, named validation check. + + Attributes + ---------- + name : str + Stable identifier used in fallback error messages and progress display. + func : callable + A function ``ctx -> list[ValidationIssue]``. + description : str, optional + Human-readable description of what the check verifies. 
+ """ + + name: str + func: CheckFunc + description: str = "" + + def run(self, ctx: ValidationContext) -> List[ValidationIssue]: + """ + Execute the check against a validation context. + + Parameters + ---------- + ctx : ValidationContext + The inputs available to the check. + + Returns + ------- + list of ValidationIssue + Issues produced by the check (possibly empty). + """ + return self.func(ctx) + + +@dataclass +class ValidationProfile: + """ + An ordered set of checks that applies to one category of module. + + Attributes + ---------- + name : str + Unique profile name (the routing key declared by modules). + checks : list of Check + Checks to run, in order. + description : str, optional + Human-readable description of the profile. + """ + + name: str + checks: List[Check] = field(default_factory=list) + description: str = "" + + @property + def check_names(self) -> List[str]: + """ + Return the names of the checks in this profile. + + Returns + ------- + list of str + The ordered check names. + """ + return [check.name for check in self.checks] + + +# --------------------------------------------------------------------------- # +# Registry +# --------------------------------------------------------------------------- # + +_PROFILES: Dict[str, ValidationProfile] = {} + + +def register_profile(profile: ValidationProfile, overwrite: bool = False) -> None: + """ + Register a validation profile under its name. + + Parameters + ---------- + profile : ValidationProfile + The profile to register. + overwrite : bool, optional + If ``False`` (default), registering a name that already exists raises. + Set ``True`` to replace an existing profile. + + Raises + ------ + ValueError + If a profile with the same name is already registered and + ``overwrite`` is ``False``. + """ + if profile.name in _PROFILES and not overwrite: + raise ValueError( + f"A validation profile named '{profile.name}' is already registered. " "Pass overwrite=True to replace it." 
+ ) + _PROFILES[profile.name] = profile + + +def unregister_profile(name: str) -> None: + """ + Remove a profile from the registry if present. + + Parameters + ---------- + name : str + Name of the profile to remove. + """ + _PROFILES.pop(name, None) + + +def get_profile(name: str) -> Optional[ValidationProfile]: + """ + Look up a registered profile by name. + + Parameters + ---------- + name : str + Profile name. + + Returns + ------- + ValidationProfile or None + The registered profile, or ``None`` if no profile has that name (or if + ``name`` is not a string). + """ + if not isinstance(name, str): + return None + return _PROFILES.get(name) + + +def available_profiles() -> List[str]: + """ + List the names of all registered profiles. + + Returns + ------- + list of str + Sorted profile names. + """ + return sorted(_PROFILES) + + +# --------------------------------------------------------------------------- # +# Check adapters +# +# Trivial pass-throughs that simply forward context fields to the underlying +# pure check functions in ``checks.py`` are defined inline as lambdas in the +# profile definitions below. Checks that need extra orchestration logic (such +# as deciding whether a reference is available) are defined as named functions +# here. +# --------------------------------------------------------------------------- # + + +def _protein_ids_check(ctx: ValidationContext) -> List[ValidationIssue]: + """ + Validate protein identifiers against the reference FASTA, if available. + + Parameters + ---------- + ctx : ValidationContext + The validation context (uses ``fasta``, ``standard_df``, ``config``). + + Returns + ------- + list of ValidationIssue + Protein-ID issues, or an info message if no FASTA reference is available. 
+ """ + if ctx.fasta is not None and len(ctx.fasta) > 0: + return check_protein_ids(ctx.standard_df, ctx.fasta, ctx.config) + report = ValidationReport() + report.add_info( + "no_fasta_reference", + "No reference FASTA was available; protein-identifier validation was skipped.", + "protein_ids", + ) + return report.issues + + +def _denovo_pending_check(ctx: ValidationContext) -> List[ValidationIssue]: + """ + Emit an informational notice that de novo checks are not yet implemented. + + Parameters + ---------- + ctx : ValidationContext + The validation context (unused; present for the uniform signature). + + Returns + ------- + list of ValidationIssue + A single info issue. + """ + report = ValidationReport() + report.add_info( + "denovo_validation_pending", + "De novo content checks are not yet implemented. Quant checks (protein IDs, charge, " + "peptide length) do not apply to this module type. Implement de novo checks and add " + "them to the 'denovo' profile in proteobench/validation/profiles.py.", + "input", + ) + return report.issues + + +# --------------------------------------------------------------------------- # +# Built-in profiles +# --------------------------------------------------------------------------- # + +QUANT_LFQ_PROFILE = ValidationProfile( + name="quant_lfq", + description="LFQ quantification modules (HYE/PYE): protein IDs, charge, peptide length, enzyme, mods.", + checks=[ + Check("protein_ids", _protein_ids_check, "Protein identifiers present in the reference FASTA."), + Check( + "charge_range", + lambda ctx: check_charge_range(ctx.standard_df, ctx.parameters, ctx.config), + "Precursor charges within the searched charge range.", + ), + Check( + "peptide_length", + lambda ctx: check_peptide_length(ctx.standard_df, ctx.parameters, ctx.config), + "Peptide lengths within the searched length range.", + ), + Check( + "enzyme", + lambda ctx: check_enzyme(ctx.standard_df, ctx.parameters, ctx.config), + "Trypsin-family missed-cleavage heuristic 
(warning only).", + ), + Check( + "modifications", + lambda ctx: check_modifications(ctx.standard_df, ctx.parameters, ctx.config), + "Observed modifications declared in the parameter file (warning only).", + ), + Check( + "max_modifications", + lambda ctx: check_max_modifications(ctx.standard_df, ctx.parameters, ctx.config), + "Number of modifications per peptidoform within max_mods (warning only).", + ), + Check( + "mass_tolerances", + lambda ctx: check_mass_tolerances(ctx.standard_df, ctx.parameters, ctx.config), + "Precursor/fragment mass tolerances are present and positive; the plausibility " + "ceiling is checked only when configured (warning only).", + ), + Check( + "fdr_psm", + lambda ctx: check_fdr_psm(ctx.standard_df, ctx.parameters, ctx.config), + "PSM-level FDR within the valid range and recommended maximum (warning only).", + ), + Check( + "run_consistency", + lambda ctx: check_run_consistency(ctx.standard_df, ctx.parameters, ctx.input_format, ctx.config), + "Parameter file matches the submitted run (software identity).", + ), + ], +) + +DENOVO_PROFILE = ValidationProfile( + name="denovo", + description="De novo sequencing modules. Reuses run-consistency; content checks are pending.", + checks=[ + Check( + "run_consistency", + lambda ctx: check_run_consistency(ctx.standard_df, ctx.parameters, ctx.input_format, ctx.config), + "Parameter file matches the submitted run (software identity).", + ), + Check("denovo_pending", _denovo_pending_check, "Placeholder for future de novo content checks."), + ], +) + +register_profile(QUANT_LFQ_PROFILE, overwrite=True) +register_profile(DENOVO_PROFILE, overwrite=True) diff --git a/proteobench/validation/protein_ids.py b/proteobench/validation/protein_ids.py new file mode 100644 index 000000000..20c1dcd14 --- /dev/null +++ b/proteobench/validation/protein_ids.py @@ -0,0 +1,160 @@ +""" +Protein-identifier extraction helpers for submission validation. 
+ +ProteoBench tool outputs store protein identifiers in the standardized +``Proteins`` column. The representation is not fully normalized across tools: + +* a single protein may be a UniProt-style triplet such as ``sp|P49327|FAS_HUMAN`` + (the ``|`` separates database/accession/entry-name), a bare accession such as + ``P49327``, or an isoform such as ``P49327-2``; +* multiple proteins (protein groups) are joined with ``;`` (e.g. MaxQuant) or + ``,`` (e.g. the FragPipe loader combines ``Protein`` and ``Mapped Proteins``). + +These helpers split protein-group strings into individual proteins and extract +the candidate identifiers (accession, entry name, isoform base) used to match +against a FASTA-derived accession set. They are deliberately generic so the +core validator does not embed tool-specific assumptions. +""" + +from __future__ import annotations + +import re +from typing import Iterable, List, Set + +#: Default separators used to split a protein-group string into individual proteins. +#: The ``|`` character is intentionally excluded because it is a *within-protein* +#: separator in UniProt identifiers (``db|accession|entryname``). +DEFAULT_GROUP_SEPARATORS = (";", ",") + +#: Matches a UniProt-style ``db|accession|entryname`` token. +_UNIPROT_TRIPLET = re.compile(r"^(?:sp|tr|up)\|([^|]+)\|(\S+)$", re.IGNORECASE) + +#: Matches a trailing isoform suffix such as ``-2`` on an accession. +_ISOFORM_SUFFIX = re.compile(r"-\d+$") + + +def split_protein_groups(value: str, separators: Iterable[str] = DEFAULT_GROUP_SEPARATORS) -> List[str]: + """ + Split a protein-group cell into individual protein tokens. + + Parameters + ---------- + value : str + The raw value of a ``Proteins`` cell (may contain several proteins). + separators : iterable of str, optional + Characters that separate proteins within a group. Defaults to + :data:`DEFAULT_GROUP_SEPARATORS` (``;`` and ``,``). + + Returns + ------- + list of str + Stripped, non-empty individual protein tokens. 
+ """ + if value is None: + return [] + text = str(value).strip() + if not text: + return [] + + seps = [s for s in separators if s] + if not seps: + tokens = [text] + else: + pattern = "|".join(re.escape(s) for s in seps) + tokens = re.split(pattern, text) + + return [t.strip() for t in tokens if t and t.strip()] + + +def extract_identifiers(protein_token: str) -> Set[str]: + """ + Extract candidate identifiers from a single protein token. + + For a UniProt triplet such as ``sp|P49327|FAS_HUMAN`` this returns the + accession (``P49327``), the entry name (``FAS_HUMAN``), and (for isoforms) + the isoform base accession. For a bare accession it returns the accession + and its isoform base. Other ``|``-delimited tokens also yield each part. + + Parameters + ---------- + protein_token : str + A single protein identifier (one element of a protein group). + + Returns + ------- + set of str + Candidate identifiers usable for FASTA membership testing. + """ + if protein_token is None: + return set() + token = str(protein_token).strip() + if not token: + return set() + + identifiers: Set[str] = {token} + + triplet = _UNIPROT_TRIPLET.match(token) + if triplet: + accession, entry_name = triplet.group(1), triplet.group(2) + identifiers.add(accession) + identifiers.add(entry_name) + base = _ISOFORM_SUFFIX.sub("", accession) + if base != accession: + identifiers.add(base) + elif "|" in token: + # Unknown ``a|b|c`` shape: keep every part as a candidate. + for part in token.split("|"): + part = part.strip() + if part: + identifiers.add(part) + else: + base = _ISOFORM_SUFFIX.sub("", token) + if base != token: + identifiers.add(base) + + return identifiers + + +def is_decoy_or_contaminant( + protein_token: str, + contaminant_flag: str = None, + decoy_prefixes: Iterable[str] = (), +) -> bool: + """ + Determine whether a protein token is a decoy or contaminant marker.
+ + The check is case-insensitive and matches the contaminant flag as a + substring (mirroring ParseSettings contaminant detection) and the decoy + markers as case-insensitive prefixes. + + Parameters + ---------- + protein_token : str + A single protein identifier. + contaminant_flag : str, optional + Substring marking contaminant proteins (from the tool parse settings, + e.g. ``"Cont_"``). ``None`` disables contaminant detection. + decoy_prefixes : iterable of str, optional + Prefixes marking decoy proteins (e.g. ``"rev_"``, ``"DECOY_"``). + + Returns + ------- + bool + ``True`` if the token is a decoy or contaminant identifier. + """ + if protein_token is None: + return False + token = str(protein_token).strip() + if not token: + return False + + lowered = token.lower() + + if contaminant_flag and str(contaminant_flag).lower() in lowered: + return True + + for prefix in decoy_prefixes: + if prefix and lowered.startswith(str(prefix).lower()): + return True + + return False diff --git a/proteobench/validation/report.py b/proteobench/validation/report.py new file mode 100644 index 000000000..8050175ba --- /dev/null +++ b/proteobench/validation/report.py @@ -0,0 +1,375 @@ +""" +Structured validation report objects for ProteoBench submission validation. + +This module defines the data model returned by the validation layer +(:func:`proteobench.validation.validator.validate_submission`). The report is a +plain, framework-agnostic container so that it can be produced in the core +library and rendered by any front end (Streamlit, notebooks, CLI). + +It exposes three objects: ``Severity`` (the issue severity enumeration), +``ValidationIssue`` (a single machine- and human-readable finding), and +``ValidationReport`` (a collection of issues with overall pass/fail helpers). 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from dataclasses import field as dc_field +from enum import Enum +from typing import Any, Dict, List, Optional + + +class Severity(str, Enum): + """ + Severity level of a validation issue. + + Severity controls only display prominence and inclusion in the pull-request + summary; it does not gate the Streamlit submission flow (no severity blocks + submission). It also drives the optional programmatic + :meth:`ValidationReport.raise_if_errors` path. + """ + + ERROR = "error" + WARNING = "warning" + INFO = "info" + + +@dataclass +class ValidationIssue: + """ + A single validation finding. + + Attributes + ---------- + code : str + Machine-readable issue code (stable identifier, e.g. ``"protein_not_in_fasta"``). + severity : Severity + Severity of the issue. + message : str + Human-readable description of the issue. + check : str + Name of the check that produced the issue (e.g. ``"protein_ids"``). + field : str, optional + Relevant field, file, or column name the issue refers to. + observed : Any, optional + Observed value (or a short summary of it). + expected : Any, optional + Expected value or allowed range, where applicable. + examples : list, optional + A small number of example offending rows or identifiers. + """ + + code: str + severity: Severity + message: str + check: str + field: Optional[str] = None + observed: Any = None + expected: Any = None + examples: List[Any] = dc_field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + """ + Convert the issue to a JSON-serialisable dictionary. + + Returns + ------- + dict + Dictionary representation of the issue. 
+ """ + return { + "code": self.code, + "severity": self.severity.value, + "message": self.message, + "check": self.check, + "field": self.field, + "observed": self.observed, + "expected": self.expected, + "examples": list(self.examples), + } + + +@dataclass +class ValidationReport: + """ + Collection of validation issues with overall status helpers. + + Attributes + ---------- + issues : list of ValidationIssue + Issues collected during validation. + """ + + issues: List[ValidationIssue] = dc_field(default_factory=list) + + def add( + self, + code: str, + severity: Severity, + message: str, + check: str, + field: Optional[str] = None, + observed: Any = None, + expected: Any = None, + examples: Optional[List[Any]] = None, + ) -> "ValidationReport": + """ + Append a new issue to the report. + + Parameters + ---------- + code : str + Machine-readable issue code. + severity : Severity + Severity of the issue. + message : str + Human-readable description. + check : str + Name of the originating check. + field : str, optional + Relevant field, file, or column name. + observed : Any, optional + Observed value. + expected : Any, optional + Expected value or allowed range. + examples : list, optional + Example offending rows or identifiers. + + Returns + ------- + ValidationReport + The report itself, to allow chaining. + """ + self.issues.append( + ValidationIssue( + code=code, + severity=severity, + message=message, + check=check, + field=field, + observed=observed, + expected=expected, + examples=list(examples) if examples else [], + ) + ) + return self + + def add_error(self, code: str, message: str, check: str, **kwargs: Any) -> "ValidationReport": + """ + Append an ``ERROR`` issue. + + Parameters + ---------- + code : str + Machine-readable issue code. + message : str + Human-readable description. + check : str + Name of the originating check. + **kwargs : dict + Optional ``field``, ``observed``, ``expected``, and ``examples`` values. 
+ + Returns + ------- + ValidationReport + The report itself, to allow chaining. + """ + return self.add(code, Severity.ERROR, message, check, **kwargs) + + def add_warning(self, code: str, message: str, check: str, **kwargs: Any) -> "ValidationReport": + """ + Append a ``WARNING`` issue. + + Parameters + ---------- + code : str + Machine-readable issue code. + message : str + Human-readable description. + check : str + Name of the originating check. + **kwargs : dict + Optional ``field``, ``observed``, ``expected``, and ``examples`` values. + + Returns + ------- + ValidationReport + The report itself, to allow chaining. + """ + return self.add(code, Severity.WARNING, message, check, **kwargs) + + def add_info(self, code: str, message: str, check: str, **kwargs: Any) -> "ValidationReport": + """ + Append an ``INFO`` issue. + + Parameters + ---------- + code : str + Machine-readable issue code. + message : str + Human-readable description. + check : str + Name of the originating check. + **kwargs : dict + Optional ``field``, ``observed``, ``expected``, and ``examples`` values. + + Returns + ------- + ValidationReport + The report itself, to allow chaining. + """ + return self.add(code, Severity.INFO, message, check, **kwargs) + + def extend(self, issues: List[ValidationIssue]) -> "ValidationReport": + """ + Append several issues at once. + + Parameters + ---------- + issues : list of ValidationIssue + Issues to add. + + Returns + ------- + ValidationReport + The report itself, to allow chaining. + """ + self.issues.extend(issues) + return self + + @property + def errors(self) -> List[ValidationIssue]: + """ + Return all ``ERROR`` issues. + + Returns + ------- + list of ValidationIssue + The error-level issues. + """ + return [i for i in self.issues if i.severity == Severity.ERROR] + + @property + def warnings(self) -> List[ValidationIssue]: + """ + Return all ``WARNING`` issues. + + Returns + ------- + list of ValidationIssue + The warning-level issues. 
+ """ + return [i for i in self.issues if i.severity == Severity.WARNING] + + @property + def infos(self) -> List[ValidationIssue]: + """ + Return all ``INFO`` issues. + + Returns + ------- + list of ValidationIssue + The info-level issues. + """ + return [i for i in self.issues if i.severity == Severity.INFO] + + @property + def has_errors(self) -> bool: + """ + Whether the report contains any ``ERROR`` issue. + + Returns + ------- + bool + ``True`` if at least one error is present. + """ + return any(i.severity == Severity.ERROR for i in self.issues) + + @property + def passed(self) -> bool: + """ + Overall pass status (no ``ERROR`` issues). + + This is informational only: the Streamlit submission flow does not gate + on it (submission is never blocked). It is used for display and by the + optional :meth:`raise_if_errors` path. + + Returns + ------- + bool + ``True`` when there are no ``ERROR`` issues (warnings allowed). + """ + return not self.has_errors + + def to_dict(self) -> Dict[str, Any]: + """ + Convert the report to a JSON-serialisable dictionary. + + Returns + ------- + dict + Dictionary with overall status and the list of issues. + """ + return { + "passed": self.passed, + "n_errors": len(self.errors), + "n_warnings": len(self.warnings), + "n_infos": len(self.infos), + "issues": [i.to_dict() for i in self.issues], + } + + def summary(self, include_info: bool = False) -> str: + """ + Build a compact Markdown summary of the report. + + Useful for embedding the findings into pull-request text or logs. The + wording is neutral: submission validation does not block submission, it + only surfaces points for the submitter and reviewers to consider. + + Parameters + ---------- + include_info : bool, optional + Whether to include ``INFO`` issues in the summary. Default ``False``. + + Returns + ------- + str + Markdown-formatted summary. 
+ """ + lines = ["### Automated submission checks"] + + n_flagged = len(self.errors) + len(self.warnings) + if n_flagged == 0: + lines.append("All automated checks passed.") + else: + lines.append( + f"{len(self.errors)} item(s) to review and {len(self.warnings)} note(s) were flagged " + "for reviewer attention (these do not block submission)." + ) + + selected = list(self.errors) + list(self.warnings) + if include_info: + selected += list(self.infos) + + for issue in selected: + line = f"- {issue.message}" + if issue.examples: + shown = ", ".join(str(e) for e in issue.examples[:5]) + line += f" Examples: {shown}" + lines.append(line) + + return "\n".join(lines) + + def raise_if_errors(self) -> None: + """ + Raise :class:`SubmissionValidationError` if any error issue is present. + + Raises + ------ + SubmissionValidationError + If the report contains at least one ``ERROR`` issue. + """ + if self.has_errors: + from proteobench.validation.exceptions import SubmissionValidationError + + raise SubmissionValidationError(self) diff --git a/proteobench/validation/validator.py b/proteobench/validation/validator.py new file mode 100644 index 000000000..ea3b74071 --- /dev/null +++ b/proteobench/validation/validator.py @@ -0,0 +1,135 @@ +""" +Central submission-validation API. + +:func:`validate_submission` resolves the module's validation profile, builds a +:class:`~proteobench.validation.context.ValidationContext`, and runs the +profile's checks, returning a single structured +:class:`~proteobench.validation.report.ValidationReport`. The caller decides +what to do with the report (the web submission flow only surfaces findings and +never blocks; programmatic callers can use ``report.raise_if_errors()``). + +The orchestrator is generic: it does not know about any particular module type. +Which checks run is determined entirely by the resolved profile +(:mod:`proteobench.validation.profiles`).
Adding a new module of an existing +category needs no code; adding a new category needs only a new registered +profile. + +The function is framework-agnostic and performs no I/O: any reference data (a +FASTA, a ground-truth table) is supplied via the arguments / context. Front ends +are responsible for obtaining the standardized DataFrame and the reference. +""" + +from __future__ import annotations + +from typing import Any, Optional + +import pandas as pd + +from proteobench.validation.config import ModuleValidationConfig +from proteobench.validation.context import ValidationContext +from proteobench.validation.fasta import FastaReference +from proteobench.validation.profiles import Check, available_profiles, get_profile +from proteobench.validation.report import ValidationReport + + +def validate_submission( + standard_df: pd.DataFrame, + parameters: Any = None, + fasta: Optional[FastaReference] = None, + config: Optional[ModuleValidationConfig] = None, + input_format: Optional[str] = None, + profile: Optional[str] = None, +) -> ValidationReport: + """ + Validate a benchmark submission and return a structured report. + + The set of checks run is determined by the validation profile, resolved from + (in order): the explicit ``profile`` argument, ``config.validation_profile``, + or the default. Each check is fault-tolerant: a check that raises an + unexpected exception is converted to a warning so that validation itself + never crashes the submission flow. + + Parameters + ---------- + standard_df : pandas.DataFrame + The standardized result DataFrame produced by the module parser. + parameters : Any, optional + Parsed parameters (a :class:`ProteoBenchParameters` or any object with + the same attributes). Parameter-dependent checks degrade to warnings + when values are missing. + fasta : FastaReference, optional + Reference protein identifiers, for profiles that validate against a + sequence database. 
+ config : ModuleValidationConfig, optional + Module validation configuration. Defaults to a generic configuration + (which selects the default profile). + input_format : str, optional + The selected software tool, used for run-consistency checks. + profile : str, optional + Explicit profile name, overriding ``config.validation_profile``. Mostly + useful for testing. + + Returns + ------- + ValidationReport + The aggregated validation report. + """ + config = config or ModuleValidationConfig() + report = ValidationReport() + + if not isinstance(standard_df, pd.DataFrame) or standard_df.empty: + report.add_error( + "empty_results", + "The standardized results are empty; nothing could be validated.", + "input", + ) + return report + + profile_name = profile or config.validation_profile + profile_obj = get_profile(profile_name) + + if profile_obj is None: + report.add_warning( + "unknown_validation_profile", + f"No validation profile named '{profile_name}' is registered " + f"(available: {available_profiles()}); no checks were run.", + "input", + ) + return report + + ctx = ValidationContext( + standard_df=standard_df, + parameters=parameters, + config=config, + fasta=fasta, + input_format=input_format, + reference=fasta, + ) + + for check in profile_obj.checks: + _run_check(report, check, ctx) + + return report + + +def _run_check(report: ValidationReport, check: Check, ctx: ValidationContext) -> None: + """ + Run a single check and absorb unexpected failures as warnings. + + Parameters + ---------- + report : ValidationReport + The report to extend with the check's issues. + check : Check + The check to run. + ctx : ValidationContext + The validation context passed to the check. 
+ """ + try: + report.extend(check.run(ctx)) + except Exception as exc: # noqa: BLE001 - validation must never crash the flow + report.add_warning( + "check_failed", + f"The '{check.name}' validation check could not be completed ({type(exc).__name__}: {exc}).", + check.name, + ) diff --git a/test/data/validation/ProteoBench_validation_reference.fasta b/test/data/validation/ProteoBench_validation_reference.fasta new file mode 100644 index 000000000..4d04e4abd --- /dev/null +++ b/test/data/validation/ProteoBench_validation_reference.fasta @@ -0,0 +1,12 @@ +>sp|P10001|AAA_HUMAN Test protein AAA OS=Homo sapiens OX=9606 GN=AAA +MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFE +>tr|P10002|BBB_YEAST Test protein BBB OS=Saccharomyces cerevisiae OX=559292 +MSIPETQKGVIFYESHGKLEYKDIPVPKPKANELLINVKYSGVCHTDLHAWHGDWPLPVK +>P10003 +MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKK +>sp|P49327|FAS_HUMAN Fatty acid synthase OS=Homo sapiens OX=9606 GN=FASN +MEEVVIAGMSGKLPESENLQEFWDNLIGGVDMVTDDDRRWKAGLYGLPRRSGKLKDLSRFD +>P00330 ADH1_YEAST Alcohol dehydrogenase 1 +SIPETQKGVIFYESHGKLEYKDIPVPKPKANELLINVKYSGVCHTDLHAWHGDWPLPVKLP +>sp|P10005|EEE_ECOLI Test protein EEE OS=Escherichia coli OX=83333 GN=eee +MKLLNVINTQETALNDLKAFAQTSGTNNTNVTVTLLGNQKDLAEAITAEKNGKLNVTVKLP diff --git a/test/test_validation.py b/test/test_validation.py new file mode 100644 index 000000000..765382807 --- /dev/null +++ b/test/test_validation.py @@ -0,0 +1,717 @@ +""" +Unit and lightweight integration tests for the submission-validation layer +(:mod:`proteobench.validation`). 
+""" + +import io +import os +import zipfile +from types import SimpleNamespace + +import pandas as pd +import pytest + +from proteobench.io.parsing.parse_ion import load_input_file +from proteobench.io.parsing.parse_settings import ParseSettingsBuilder +from proteobench.validation import ( + Check, + FastaReference, + ModuleValidationConfig, + Severity, + SubmissionValidationError, + ValidationContext, + ValidationProfile, + ValidationReport, + available_profiles, + get_profile, + register_profile, + unregister_profile, + validate_submission, +) +from proteobench.validation.checks import ( + check_charge_range, + check_enzyme, + check_fdr_psm, + check_mass_tolerances, + check_max_modifications, + check_modifications, + check_peptide_length, + check_protein_ids, + check_run_consistency, +) +from proteobench.validation.checks import _parse_tolerance +from proteobench.validation.config import _resolve_profile +from proteobench.validation.fasta import parse_fasta_header +from proteobench.validation.protein_ids import ( + extract_identifiers, + is_decoy_or_contaminant, + split_protein_groups, +) + +HERE = os.path.dirname(__file__) +FASTA_FIXTURE = os.path.join(HERE, "data", "validation", "ProteoBench_validation_reference.fasta") + +QEXACTIVE_DATA_DIR = os.path.join(HERE, "data", "quant", "quant_lfq_ion_DDA_QExactive") +QEXACTIVE_SETTINGS_DIR = os.path.abspath( + os.path.join( + HERE, "..", "proteobench", "io", "parsing", "io_parse_settings", "Quant", "lfq", "DDA", "ion", "QExactive" + ) +) +QEXACTIVE_MODULE_ID = "quant_lfq_DDA_ion_QExactive" + + +def make_params(**overrides): + """Build a parameters stub with sensible defaults, overridable per test.""" + defaults = dict( + software_name="MaxQuant", + software_version="2.5.1.0", + search_engine="Andromeda", + min_precursor_charge=2, + max_precursor_charge=4, + min_peptide_length=6, + max_peptide_length=30, + enzyme="Trypsin", + semi_enzymatic=False, + allowed_miscleavages=2, + fixed_mods="Carbamidomethyl (C)", + 
variable_mods="Oxidation (M),Acetyl (Protein N-term)", + max_mods=3, + precursor_mass_tolerance="[-20.0 ppm, 20.0 ppm]", + fragment_mass_tolerance="[-20.0 ppm, 20.0 ppm]", + ident_fdr_psm=0.01, + ) + defaults.update(overrides) + return SimpleNamespace(**defaults) + + +def make_standard_df(**overrides): + """Build a small valid standardized result DataFrame.""" + data = dict( + Proteins=[ + "sp|P10001|AAA_HUMAN", + "tr|P10002|BBB_YEAST", + "P10003", + "sp|P49327|FAS_HUMAN", + ], + Sequence=["PEPTIDEK", "ELVISLIVESR", "SAMPLERPEPK", "VFRRDTHK"], + Charge=[2, 3, 2, 2], + proforma=["PEPTIDEK", "ELVISLIVESR", "SAMPLERPEPK", "VFR[Oxidation]RDTHK"], + ) + data["precursor ion"] = [f"{s}/{c}" for s, c in zip(data["proforma"], data["Charge"])] + df = pd.DataFrame(data) + for key, value in overrides.items(): + df[key] = value + return df + + +@pytest.fixture +def reference_fasta(): + return FastaReference.from_path(FASTA_FIXTURE) + + +# --------------------------------------------------------------------------- # +# protein-id helpers and FASTA parsing +# --------------------------------------------------------------------------- # + + +def test_split_protein_groups_handles_separators(): + assert split_protein_groups("sp|P1|A_HUMAN;sp|P2|B_YEAST") == ["sp|P1|A_HUMAN", "sp|P2|B_YEAST"] + assert split_protein_groups("sp|P1|A_HUMAN,sp|P2|B_YEAST") == ["sp|P1|A_HUMAN", "sp|P2|B_YEAST"] + assert split_protein_groups("") == [] + assert split_protein_groups(None) == [] + + +def test_extract_identifiers_uniprot_bare_and_isoform(): + assert extract_identifiers("sp|P49327|FAS_HUMAN") >= {"P49327", "FAS_HUMAN"} + assert extract_identifiers("tr|A0A024|X_HUMAN") >= {"A0A024", "X_HUMAN"} + assert extract_identifiers("P12345") >= {"P12345"} + # isoform base accession is added + assert "P49327" in extract_identifiers("P49327-2") + + +def test_is_decoy_or_contaminant(): + assert is_decoy_or_contaminant("Cont_keratin", contaminant_flag="Cont_") + assert 
is_decoy_or_contaminant("rev_sp|P1|A_HUMAN", decoy_prefixes=("rev_",)) + assert not is_decoy_or_contaminant("sp|P1|A_HUMAN", contaminant_flag="Cont_", decoy_prefixes=("rev_",)) + + +def test_parse_fasta_header_forms(): + assert parse_fasta_header(">sp|P49327|FAS_HUMAN Fatty acid synthase") >= {"P49327", "FAS_HUMAN"} + assert parse_fasta_header(">P00330 ADH1") >= {"P00330"} + assert parse_fasta_header(">") == set() + + +def test_fasta_reference_from_path(reference_fasta): + assert len(reference_fasta) > 0 + assert reference_fasta.contains("P49327") + assert reference_fasta.contains("FAS_HUMAN") + assert reference_fasta.contains("p49327") # case-insensitive + assert not reference_fasta.contains("P99999") + assert reference_fasta.contains_any(extract_identifiers("sp|P49327|FAS_HUMAN")) + + +def test_fasta_reference_from_zip_bytes(): + text = ">sp|P10001|AAA_HUMAN x\nMKWV\n>P10003\nMVLS\n" + buffer = io.BytesIO() + with zipfile.ZipFile(buffer, "w") as zf: + zf.writestr("reference.fasta", text) + ref = FastaReference.from_bytes(buffer.getvalue(), source_name="reference.zip") + assert ref.contains("P10001") + assert ref.contains("P10003") + + +# --------------------------------------------------------------------------- # +# protein-id check +# --------------------------------------------------------------------------- # + + +def test_protein_ids_all_present_passes(reference_fasta): + df = make_standard_df() + issues = check_protein_ids(df, reference_fasta, ModuleValidationConfig()) + assert not any(i.severity == Severity.ERROR for i in issues) + + +def test_protein_ids_missing_reports_error(reference_fasta): + df = make_standard_df() + df.loc[0, "Proteins"] = "sp|P99999|MISSING_HUMAN" + issues = check_protein_ids(df, reference_fasta, ModuleValidationConfig()) + errors = [i for i in issues if i.severity == Severity.ERROR] + assert len(errors) == 1 + assert errors[0].code == "protein_not_in_fasta" + assert "sp|P99999|MISSING_HUMAN" in errors[0].examples + assert 
errors[0].observed["n_missing"] == 1 + + +def test_protein_groups_partial_membership(reference_fasta): + # Group with one known + one unknown -> unknown reported, known not. + df = make_standard_df() + df.loc[0, "Proteins"] = "sp|P49327|FAS_HUMAN;sp|P98765|GHOST_HUMAN" + issues = check_protein_ids(df, reference_fasta, ModuleValidationConfig()) + errors = [i for i in issues if i.severity == Severity.ERROR] + assert len(errors) == 1 + assert errors[0].examples == ["sp|P98765|GHOST_HUMAN"] + + +def test_protein_ids_decoy_and_contaminant_ignored(reference_fasta): + df = make_standard_df() + df.loc[0, "Proteins"] = "Cont_keratin" + df.loc[1, "Proteins"] = "rev_sp|P77777|DECOY_HUMAN" + config = ModuleValidationConfig(contaminant_flag="Cont_") + issues = check_protein_ids(df, reference_fasta, config) + assert not any(i.severity == Severity.ERROR for i in issues) + + +# --------------------------------------------------------------------------- # +# charge / peptide-length checks +# --------------------------------------------------------------------------- # + + +def test_charge_out_of_range_errors(): + df = make_standard_df(Charge=[2, 3, 9, 2]) + issues = check_charge_range(df, make_params(), ModuleValidationConfig()) + errors = [i for i in issues if i.severity == Severity.ERROR] + assert len(errors) == 1 + assert errors[0].code == "charge_out_of_range" + assert 9 in errors[0].observed + + +def test_charge_within_range_passes(): + issues = check_charge_range(make_standard_df(), make_params(), ModuleValidationConfig()) + assert not any(i.severity == Severity.ERROR for i in issues) + + +def test_charge_missing_param_warns_not_errors(): + params = make_params(min_precursor_charge=None, max_precursor_charge="None") + issues = check_charge_range(make_standard_df(Charge=[9, 9, 9, 9]), params, ModuleValidationConfig()) + assert not any(i.severity == Severity.ERROR for i in issues) + assert any(i.code == "charge_range_not_parsed" and i.severity == Severity.WARNING for i in 
issues) + + +def test_peptide_length_out_of_range_errors(): + df = make_standard_df(Sequence=["AAA", "A" * 40, "PEPTIDEK", "ELVISLIVESR"]) + issues = check_peptide_length(df, make_params(), ModuleValidationConfig()) + errors = [i for i in issues if i.severity == Severity.ERROR] + assert len(errors) == 1 + assert errors[0].code == "peptide_length_out_of_range" + assert "AAA" in errors[0].examples + + +def test_peptide_length_missing_param_warns(): + params = make_params(min_peptide_length=None, max_peptide_length=None) + issues = check_peptide_length(make_standard_df(), params, ModuleValidationConfig()) + assert not any(i.severity == Severity.ERROR for i in issues) + assert any(i.code == "peptide_length_not_parsed" and i.severity == Severity.WARNING for i in issues) + + +# --------------------------------------------------------------------------- # +# enzyme / modifications / run consistency +# --------------------------------------------------------------------------- # + + +def test_enzyme_missed_cleavages_warning_only(): + df = make_standard_df(Sequence=["PEPKTIDKEKR", "ELVISLIVESR", "SAMPLEK", "VFRDTHK"]) + params = make_params(enzyme="Trypsin", allowed_miscleavages=0) + issues = check_enzyme(df, params, ModuleValidationConfig()) + assert not any(i.severity == Severity.ERROR for i in issues) + assert any(i.code == "missed_cleavages_exceeded" and i.severity == Severity.WARNING for i in issues) + + +def test_enzyme_chymotrypsin_is_supported(): + # Chymotrypsin is now a supported enzyme, so it is NOT skipped as unsupported. 
+ issues = check_enzyme(make_standard_df(), make_params(enzyme="Chymotrypsin"), ModuleValidationConfig()) + assert not any(i.code == "enzyme_check_unsupported" for i in issues) + + +def test_enzyme_missing_warns(): + issues = check_enzyme(make_standard_df(), make_params(enzyme=None), ModuleValidationConfig()) + assert any(i.code == "enzyme_not_parsed" and i.severity == Severity.WARNING for i in issues) + + +def test_modifications_missing_param_warns(): + params = make_params(fixed_mods=None, variable_mods="None") + issues = check_modifications(make_standard_df(), params, ModuleValidationConfig()) + assert not any(i.severity == Severity.ERROR for i in issues) + assert any(i.code == "modifications_not_parsed" and i.severity == Severity.WARNING for i in issues) + + +def test_modifications_declared_pass(): + issues = check_modifications(make_standard_df(), make_params(), ModuleValidationConfig()) + # 'Oxidation' is declared in the default variable_mods -> no mismatch warning. + assert not any(i.code == "modification_not_declared" for i in issues) + + +def test_modifications_undeclared_warns(): + df = make_standard_df() + df["proforma"] = ["PEP[Phospho]TIDEK", "ELVISLIVESR", "SAMPLERPEPK", "VFRDTHK"] + issues = check_modifications(df, make_params(), ModuleValidationConfig()) + assert any(i.code == "modification_not_declared" and i.severity == Severity.WARNING for i in issues) + + +def test_run_consistency_software_mismatch_error(): + params = make_params(software_name="FragPipe") + issues = check_run_consistency(make_standard_df(), params, "MaxQuant", ModuleValidationConfig()) + errors = [i for i in issues if i.severity == Severity.ERROR] + assert any(i.code == "software_mismatch" for i in errors) + + +def test_run_consistency_match_no_error(): + issues = check_run_consistency(make_standard_df(), make_params(), "MaxQuant", ModuleValidationConfig()) + assert not any(i.severity == Severity.ERROR for i in issues) + + +# 
--------------------------------------------------------------------------- # +# enzyme: multiple proteolytic enzymes +# --------------------------------------------------------------------------- # + + +def test_enzyme_trypsin_proline_rule(): + # "PEPKPTIDER": K at index 3 is followed by P -> not a missed cleavage for Trypsin, + # but IS one for Trypsin/P. + df = make_standard_df(Sequence=["PEPKPTIDER", "PEPKPTIDER", "PEPKPTIDER", "PEPKPTIDER"]) + trypsin = check_enzyme(df, make_params(enzyme="Trypsin", allowed_miscleavages=0), ModuleValidationConfig()) + assert not any(i.code == "missed_cleavages_exceeded" for i in trypsin) + trypsin_p = check_enzyme(df, make_params(enzyme="Trypsin/P", allowed_miscleavages=0), ModuleValidationConfig()) + assert any(i.code == "missed_cleavages_exceeded" for i in trypsin_p) + + +def test_enzyme_lysc_supported(): + # Lys-C cleaves after K only. "PEPKTIDER" has one internal K -> 1 missed cleavage. + df = make_standard_df(Sequence=["PEPKTIDER", "SAMPLEK", "PEPTIDEK", "ELVISK"]) + issues = check_enzyme(df, make_params(enzyme="Lys-C", allowed_miscleavages=0), ModuleValidationConfig()) + assert any(i.code == "missed_cleavages_exceeded" and i.severity == Severity.WARNING for i in issues) + + +def test_enzyme_gluc_supported(): + # Glu-C cleaves after D/E. "PEPDTIDEK" has 4 internal D/E residues + # (E1, D3, D6, E7; the C-terminal K is not a cleavage site) -> 4 missed cleavages. + df = make_standard_df(Sequence=["PEPDTIDEK", "SAMPLEE", "PEPTIDE", "AAAAD"]) + issues = check_enzyme(df, make_params(enzyme="Glu-C", allowed_miscleavages=0), ModuleValidationConfig()) + exceeded = [i for i in issues if i.code == "missed_cleavages_exceeded"] + assert exceeded + assert any("PEPDTIDEK (4 MC)" in e for e in exceeded[0].examples) + + +def test_enzyme_aspn_skipped_as_info(): + # Asp-N is an N-terminal cleaver; the heuristic does not apply. 
+ issues = check_enzyme(make_standard_df(), make_params(enzyme="Asp-N"), ModuleValidationConfig()) + assert any(i.code == "enzyme_check_unsupported" and i.severity == Severity.INFO for i in issues) + + +def test_enzyme_unknown_skipped_as_info(): + issues = check_enzyme(make_standard_df(), make_params(enzyme="ProteinaseK"), ModuleValidationConfig()) + assert any(i.code == "enzyme_check_unsupported" and i.severity == Severity.INFO for i in issues) + + +# --------------------------------------------------------------------------- # +# maximum number of modifications +# --------------------------------------------------------------------------- # + + +def test_max_modifications_exceeded_warns(): + df = make_standard_df() + df["proforma"] = [ + "PEP[Acetyl]T[Oxidation]IDE[Phospho]K", # 3 mods + "ELVISLIVESR", # 0 + "SAM[Oxidation]PLERPEPK", # 1 + "VFR[Oxidation]RDTHK", # 1 + ] + issues = check_max_modifications(df, make_params(max_mods=2), ModuleValidationConfig()) + warnings = [i for i in issues if i.severity == Severity.WARNING] + assert any(i.code == "max_modifications_exceeded" for i in warnings) + assert any("3 mods" in str(e) for i in warnings for e in i.examples) + + +def test_max_modifications_within_limit_passes(): + issues = check_max_modifications(make_standard_df(), make_params(max_mods=3), ModuleValidationConfig()) + assert not any(i.code == "max_modifications_exceeded" for i in issues) + + +def test_max_modifications_missing_param_warns(): + issues = check_max_modifications(make_standard_df(), make_params(max_mods=None), ModuleValidationConfig()) + assert not any(i.severity == Severity.ERROR for i in issues) + assert any(i.code == "max_mods_not_parsed" and i.severity == Severity.WARNING for i in issues) + + +# --------------------------------------------------------------------------- # +# precursor / fragment mass tolerances +# --------------------------------------------------------------------------- # + + +def test_mass_tolerances_valid_passes(): + 
issues = check_mass_tolerances(make_standard_df(), make_params(), ModuleValidationConfig()) + assert issues == [] + + +def test_mass_tolerance_non_positive_warns(): + params = make_params(precursor_mass_tolerance="[0.0 ppm, 0.0 ppm]") + issues = check_mass_tolerances(make_standard_df(), params, ModuleValidationConfig()) + assert any(i.code == "precursor_mass_tolerance_non_positive" and i.severity == Severity.WARNING for i in issues) + + +def test_mass_tolerance_ceilings_default_to_none(): + # The plausibility ceilings have no float/int default; they default to None. + config = ModuleValidationConfig() + assert config.max_plausible_ppm is None + assert config.max_plausible_dalton is None + + +def test_mass_tolerance_implausible_skipped_without_ceiling(): + # With no ceiling configured (default), the implausible-value sub-check is skipped. + params = make_params(fragment_mass_tolerance="[-5000.0 ppm, 5000.0 ppm]") + issues = check_mass_tolerances(make_standard_df(), params, ModuleValidationConfig()) + assert not any(i.code == "fragment_mass_tolerance_implausible" for i in issues) + + +def test_mass_tolerance_implausible_warns(): + params = make_params(fragment_mass_tolerance="[-5000.0 ppm, 5000.0 ppm]") + config = ModuleValidationConfig(max_plausible_ppm=1000.0, max_plausible_dalton=10.0) + issues = check_mass_tolerances(make_standard_df(), params, config) + assert any(i.code == "fragment_mass_tolerance_implausible" for i in issues) + + +def test_mass_tolerance_missing_warns(): + params = make_params(precursor_mass_tolerance=None, fragment_mass_tolerance="None") + issues = check_mass_tolerances(make_standard_df(), params, ModuleValidationConfig()) + codes = {i.code for i in issues} + assert "precursor_mass_tolerance_not_parsed" in codes + assert "fragment_mass_tolerance_not_parsed" in codes + assert not any(i.severity == Severity.ERROR for i in issues) + + +def test_mass_tolerance_dalton_units(): + config = ModuleValidationConfig(max_plausible_ppm=1000.0, 
max_plausible_dalton=10.0) + # Plausible Dalton tolerance: no issue. + ok = check_mass_tolerances(make_standard_df(), make_params(fragment_mass_tolerance="[-0.5 Da, 0.5 Da]"), config) + assert not any(i.code == "fragment_mass_tolerance_implausible" for i in ok) + # Implausible Dalton tolerance: warns. + bad = check_mass_tolerances(make_standard_df(), make_params(fragment_mass_tolerance="[-50.0 Da, 50.0 Da]"), config) + assert any(i.code == "fragment_mass_tolerance_implausible" for i in bad) + + +def test_mass_tolerance_mmu_units(): + config = ModuleValidationConfig(max_plausible_ppm=1000.0, max_plausible_dalton=10.0) + # 5000 mmu = 5 Da: plausible (ceiling is 10000 mmu). + ok = check_mass_tolerances(make_standard_df(), make_params(fragment_mass_tolerance="[-5000 mmu, 5000 mmu]"), config) + assert not any(i.code == "fragment_mass_tolerance_implausible" for i in ok) + # 50000 mmu = 50 Da: implausible. + bad = check_mass_tolerances( + make_standard_df(), make_params(fragment_mass_tolerance="[-50000 mmu, 50000 mmu]"), config + ) + assert any(i.code == "fragment_mass_tolerance_implausible" for i in bad) + + +def test_mass_tolerance_scientific_notation(): + # "2e-3 Da" must parse as 0.002 Da, not 3 Da (regression guard for the number regex). + magnitude, unit = _parse_tolerance("2e-3 Da") + assert magnitude == pytest.approx(0.002) + assert unit == "da" + # And a normal bracketed range still parses as before. 
+ assert _parse_tolerance("[-20.0 ppm, 20.0 ppm]") == (pytest.approx(20.0), "ppm") + + +# --------------------------------------------------------------------------- # +# PSM FDR +# --------------------------------------------------------------------------- # + + +def test_fdr_psm_valid_passes(): + issues = check_fdr_psm(make_standard_df(), make_params(ident_fdr_psm=0.01), ModuleValidationConfig()) + assert issues == [] + + +def test_fdr_psm_above_recommended_warns(): + issues = check_fdr_psm(make_standard_df(), make_params(ident_fdr_psm=0.05), ModuleValidationConfig()) + assert any(i.code == "fdr_psm_above_recommended" and i.severity == Severity.WARNING for i in issues) + + +def test_fdr_psm_out_of_range_warns(): + issues = check_fdr_psm(make_standard_df(), make_params(ident_fdr_psm=1.5), ModuleValidationConfig()) + assert any(i.code == "fdr_psm_out_of_range" for i in issues) + + +def test_fdr_psm_missing_warns(): + issues = check_fdr_psm(make_standard_df(), make_params(ident_fdr_psm="None"), ModuleValidationConfig()) + assert any(i.code == "fdr_psm_not_parsed" and i.severity == Severity.WARNING for i in issues) + + +def test_fdr_psm_recommendation_configurable(): + # Disabling the recommendation removes the above-recommended warning. 
+ config = ModuleValidationConfig(recommended_max_fdr_psm=None) + issues = check_fdr_psm(make_standard_df(), make_params(ident_fdr_psm=0.05), config) + assert not any(i.code == "fdr_psm_above_recommended" for i in issues) + + +# --------------------------------------------------------------------------- # +# orchestrator + report semantics +# --------------------------------------------------------------------------- # + + +def test_validate_submission_passes_for_valid(reference_fasta): + report = validate_submission( + make_standard_df(), + parameters=make_params(), + fasta=reference_fasta, + config=ModuleValidationConfig(), + input_format="MaxQuant", + ) + assert report.passed + assert not report.has_errors + + +def test_validate_submission_blocks_on_errors(reference_fasta): + df = make_standard_df(Charge=[2, 9, 2, 2]) + df.loc[0, "Proteins"] = "sp|P99999|MISSING_HUMAN" + report = validate_submission( + df, parameters=make_params(), fasta=reference_fasta, config=ModuleValidationConfig(), input_format="MaxQuant" + ) + assert not report.passed + assert report.has_errors + codes = {i.code for i in report.errors} + assert "protein_not_in_fasta" in codes + assert "charge_out_of_range" in codes + + +def test_validate_submission_without_fasta_skips_protein_check(): + report = validate_submission(make_standard_df(), parameters=make_params(), fasta=None) + assert report.passed + assert any(i.code == "no_fasta_reference" and i.severity == Severity.INFO for i in report.issues) + + +def test_validate_submission_without_params_warns(): + report = validate_submission(make_standard_df(), parameters=None, fasta=None) + assert report.passed + assert not report.has_errors + # Each parameter-dependent check self-reports that its constraint was not parsed. 
+ assert any(i.code == "charge_range_not_parsed" and i.severity == Severity.WARNING for i in report.warnings) + + +def test_validate_submission_empty_df_errors(): + report = validate_submission(pd.DataFrame(), parameters=make_params()) + assert report.has_errors + assert any(i.code == "empty_results" for i in report.errors) + + +def test_report_serialization_and_summary(): + report = ValidationReport() + report.add_error("x", "an error", "check_a", examples=["e1"]) + report.add_warning("y", "a warning", "check_b") + as_dict = report.to_dict() + assert as_dict["passed"] is False + assert as_dict["n_errors"] == 1 and as_dict["n_warnings"] == 1 + summary = report.summary() + assert "Automated submission checks" in summary + assert "an error" in summary and "a warning" in summary + # Wording must stay neutral (validation does not block submission). + assert "PASSED" not in summary and "FAILED" not in summary + + +def test_summary_passed_when_clean(): + report = ValidationReport() + report.add_info("i", "just info", "check_a") + assert "All automated checks passed." in report.summary() + + +def test_raise_if_errors(): + report = ValidationReport() + report.add_error("x", "boom", "check_a") + with pytest.raises(SubmissionValidationError): + report.raise_if_errors() + # No error -> no raise. 
+ ValidationReport().raise_if_errors() + + +# --------------------------------------------------------------------------- # +# profile registry / generic routing +# --------------------------------------------------------------------------- # + + +def test_builtin_profiles_registered(): + profiles = available_profiles() + assert "quant_lfq" in profiles + assert "denovo" in profiles + assert get_profile("quant_lfq") is not None + assert "protein_ids" in get_profile("quant_lfq").check_names + + +def test_default_config_uses_quant_profile(): + assert ModuleValidationConfig().validation_profile == "quant_lfq" + + +def test_denovo_profile_skips_quant_checks(): + # A de novo standard df has none of the quant columns. + df = pd.DataFrame({"spectrum_id": [1, 2], "peptide_str": ["PEPTIDEK", "ELVISLIVESR"]}) + config = ModuleValidationConfig(validation_profile="denovo") + # software_name matches input_format so the shared run_consistency check passes. + report = validate_submission( + df, parameters=make_params(software_name="Casanovo"), config=config, input_format="Casanovo" + ) + assert report.passed + assert any(i.code == "denovo_validation_pending" for i in report.infos) + # No quant-specific errors were produced. + assert not any(i.code in {"protein_not_in_fasta", "charge_out_of_range"} for i in report.issues) + + +def test_unknown_profile_warns_and_runs_nothing(): + config = ModuleValidationConfig(validation_profile="does_not_exist") + report = validate_submission(make_standard_df(), parameters=make_params(), config=config) + assert report.passed # nothing ran, so no errors + assert any(i.code == "unknown_validation_profile" and i.severity == Severity.WARNING for i in report.issues) + + +def test_explicit_profile_overrides_config(): + # Config says quant_lfq, but the explicit profile arg wins. 
+ report = validate_submission( + make_standard_df(), parameters=make_params(), config=ModuleValidationConfig(), profile="denovo" + ) + assert any(i.code == "denovo_validation_pending" for i in report.infos) + + +def test_register_and_run_custom_profile(): + def my_check(ctx: ValidationContext): + r = ValidationReport() + if "MyColumn" not in ctx.standard_df.columns: + r.add_error("missing_my_column", "MyColumn is required", "my_check", field="MyColumn") + return r.issues + + profile = ValidationProfile(name="custom_test_profile", checks=[Check("my_check", my_check)]) + register_profile(profile) + try: + report = validate_submission(make_standard_df(), profile="custom_test_profile") + assert report.has_errors + assert any(i.code == "missing_my_column" for i in report.errors) + finally: + unregister_profile("custom_test_profile") + assert get_profile("custom_test_profile") is None + + +def test_register_duplicate_profile_raises(): + profile = ValidationProfile(name="dup_test_profile", checks=[]) + register_profile(profile) + try: + with pytest.raises(ValueError): + register_profile(ValidationProfile(name="dup_test_profile", checks=[])) + # overwrite=True succeeds. + register_profile(ValidationProfile(name="dup_test_profile", checks=[]), overwrite=True) + finally: + unregister_profile("dup_test_profile") + + +def test_profile_resolution_from_parse_settings(): + quant_cfg = ModuleValidationConfig.from_parse_settings(QEXACTIVE_SETTINGS_DIR, QEXACTIVE_MODULE_ID, "MaxQuant") + assert quant_cfg.validation_profile == "quant_lfq" + + denovo_dir = os.path.abspath( + os.path.join(HERE, "..", "proteobench", "io", "parsing", "io_parse_settings", "denovo", "DDA", "HCD") + ) + denovo_cfg = ModuleValidationConfig.from_parse_settings(denovo_dir, "denovo_DDA_HCD", "Casanovo") + assert denovo_cfg.validation_profile == "denovo" + + +def test_resolve_profile_infers_from_parser_class(): + # With no declared profile, resolution falls back to MODULE_TO_CLASS inference. 
+ assert _resolve_profile("quant_lfq_DDA_ion_QExactive", None) == "quant_lfq" + assert _resolve_profile("denovo_DDA_HCD", None) == "denovo" + # Unknown module_id falls back to the default profile. + assert _resolve_profile("does_not_exist", None) == "quant_lfq" + # An explicit declared profile always wins. + assert _resolve_profile("denovo_DDA_HCD", "custom_xyz") == "custom_xyz" + + +def test_non_string_profile_does_not_crash(): + # A malformed validation_profile must degrade gracefully, never raise. + config = ModuleValidationConfig(validation_profile=["not", "a", "string"]) + report = validate_submission(make_standard_df(), parameters=make_params(), config=config) + assert report.passed + assert any(i.code == "unknown_validation_profile" for i in report.issues) + + +# --------------------------------------------------------------------------- # +# lightweight integration with the real parser (MaxQuant + Sage) +# --------------------------------------------------------------------------- # + +REAL_TOOL_FILES = { + "MaxQuant": "MaxQuant_evidence_sample.txt", + "Sage": "sage_sample_input_lfq.tsv", +} + + +def _standard_df_for_tool(tool): + path = os.path.join(QEXACTIVE_DATA_DIR, REAL_TOOL_FILES[tool]) + if not os.path.isfile(path): + pytest.skip(f"Test data for {tool} not available: {path}") + input_df = load_input_file(path, tool) + parser = ParseSettingsBuilder( + parse_settings_dir=QEXACTIVE_SETTINGS_DIR, module_id=QEXACTIVE_MODULE_ID + ).build_parser(tool) + standard_df, _ = parser.convert_to_standard_format(input_df) + return standard_df + + +def _fasta_from_standard_df(df): + ids = set() + for cell in df["Proteins"].dropna().unique(): + for token in split_protein_groups(cell): + ids |= extract_identifiers(token) + return FastaReference.from_identifiers(ids) + + +@pytest.mark.parametrize("tool", ["MaxQuant", "Sage"]) +def test_integration_real_tool_matching_fasta_passes(tool): + df = _standard_df_for_tool(tool) + fasta = _fasta_from_standard_df(df) + config = 
ModuleValidationConfig.from_parse_settings(QEXACTIVE_SETTINGS_DIR, QEXACTIVE_MODULE_ID, tool) + issues = check_protein_ids(df, fasta, config) + assert not any(i.severity == Severity.ERROR for i in issues) + + +@pytest.mark.parametrize("tool", ["MaxQuant", "Sage"]) +def test_integration_real_tool_injected_unknown_protein_errors(tool): + df = _standard_df_for_tool(tool) + fasta = _fasta_from_standard_df(df) + df = df.copy() + df.iloc[0, df.columns.get_loc("Proteins")] = "sp|ZZZ999|NOTINFASTA_HUMAN" + config = ModuleValidationConfig.from_parse_settings(QEXACTIVE_SETTINGS_DIR, QEXACTIVE_MODULE_ID, tool) + issues = check_protein_ids(df, fasta, config) + errors = [i for i in issues if i.severity == Severity.ERROR] + assert any(i.code == "protein_not_in_fasta" for i in errors) + assert any("ZZZ999" in str(e) for i in errors for e in i.examples) diff --git a/webinterface/pages/base_pages/tabs/tab6_submit_results.py b/webinterface/pages/base_pages/tabs/tab6_submit_results.py index 2386892c7..9b96cd982 100644 --- a/webinterface/pages/base_pages/tabs/tab6_submit_results.py +++ b/webinterface/pages/base_pages/tabs/tab6_submit_results.py @@ -11,6 +11,7 @@ from proteobench.io.parsing.utils import add_maxquant_fixed_modifications from ..utils.inputs import generate_input_widget +from ..utils.validation_ui import render_validation_report, run_submission_validation def generate_submission_ui_elements(variables, user_input, parsesettingsbuilder=None) -> bool: @@ -182,6 +183,18 @@ def submit_to_repository( if not button_pressed: # if button_pressed is None return None + # Run automated submission checks (results vs parameters vs reference FASTA). + # These never block submission: the findings are shown to the submitter and + # included in the pull-request description for the reviewers. 
+ validation_report = run_submission_validation( + variables=variables, + ionmodule=ionmodule, + user_input=user_input, + params=params, + ) + render_validation_report(validation_report) + validation_summary = validation_report.summary() + # MaxQuant fixed modification handling if user_input["input_format"] == "MaxQuant": st.session_state[variables.result_perf] = add_maxquant_fixed_modifications( @@ -202,6 +215,7 @@ def submit_to_repository( params_from_file=params_from_file, params=params, submission_source=submission_source, + validation_summary=validation_summary, ) if pr_url: @@ -352,6 +366,7 @@ def create_pull_request( params_from_file: dict[str, Any], params: dataclass, submission_source: str = "unknown", + validation_summary: str = "", ) -> Optional[str]: """ Submit the pull request with the benchmark results and returns the PR URL. @@ -360,6 +375,9 @@ def create_pull_request( ---------- params : Any The parameters object. + validation_summary : str, optional + A Markdown summary of the submission-validation report (errors/warnings/info), + appended to the PR description for curator visibility. 
Returns ------- @@ -370,12 +388,16 @@ def create_pull_request( changed_params_str = compare_dictionaries(params_from_file, params.__dict__) + submission_comments = user_comments + "\n" + changed_params_str + if validation_summary: + submission_comments += "\n\n" + validation_summary + try: pr_url = ionmodule.clone_pr( st.session_state[variables.all_datapoints_submission], params, remote_git=variables.github_link_pr, - submission_comments=user_comments + "\n" + changed_params_str, + submission_comments=submission_comments, submission_source=submission_source, ) except Exception as e: diff --git a/webinterface/pages/base_pages/utils/validation_ui.py b/webinterface/pages/base_pages/utils/validation_ui.py new file mode 100644 index 000000000..b279d7fe9 --- /dev/null +++ b/webinterface/pages/base_pages/utils/validation_ui.py @@ -0,0 +1,296 @@ +""" +Streamlit glue for the submission-validation layer. + +This module bridges the framework-agnostic core validator +(:mod:`proteobench.validation`) and the Streamlit submission flow. It: + +* re-derives the standardized result DataFrame from the already-parsed input + DataFrame by reusing the existing parser (no duplicated tool logic); +* downloads and caches the module reference FASTA; +* runs :func:`proteobench.validation.validate_submission`; +* renders the resulting report in a curator- and user-friendly way. + +All network access and Streamlit calls live here, keeping the core validation +library free of UI and I/O dependencies. 
+""" + +from __future__ import annotations + +from typing import Any, Optional + +import pandas as pd +import streamlit as st + +from proteobench.io.parsing.parse_settings import ParseSettingsBuilder +from proteobench.validation import ( + FastaReference, + ModuleValidationConfig, + Severity, + ValidationReport, + validate_submission, +) + + +@st.cache_data(show_spinner="Downloading reference FASTA for validation ...") +def _load_fasta_reference(fasta_url: str, fasta_filename: Optional[str]) -> FastaReference: + """ + Download and parse the module reference FASTA (cached). + + Parameters + ---------- + fasta_url : str + URL of the reference FASTA / zip / gzip resource. + fasta_filename : str, optional + Preferred FASTA member name when the resource is an archive. + + Returns + ------- + FastaReference + Reference protein identifiers. + """ + return FastaReference.from_url(fasta_url, member_filename=fasta_filename) + + +def _build_standard_dataframe(ionmodule: Any, input_format: str, input_df: pd.DataFrame) -> pd.DataFrame: + """ + Re-derive the standardized result DataFrame by reusing the module parser. + + Parameters + ---------- + ionmodule : Any + The benchmarking module instance (provides ``parse_settings_dir`` and ``module_id``). + input_format : str + The selected software tool. + input_df : pandas.DataFrame + The raw parsed tool output (as stored in session state). + + Returns + ------- + pandas.DataFrame + The standardized result DataFrame. + """ + parser = ParseSettingsBuilder( + parse_settings_dir=ionmodule.parse_settings_dir, + module_id=ionmodule.module_id, + ).build_parser(input_format) + standard_df, _ = parser.convert_to_standard_format(input_df) + return standard_df + + +def _resolve_input_df(variables): + """ + Fetch the parsed input DataFrame from session state for validation. + + Parameters + ---------- + variables : Any + The module's ``Variables`` dataclass instance (session-state keys). 
def _build_config(
    ionmodule,
    input_format: str,
    report: "ValidationReport | None" = None,
) -> ModuleValidationConfig:
    """
    Build the module validation config, falling back to defaults on failure.

    Parameters
    ----------
    ionmodule : Any
        The benchmarking module instance; its ``parse_settings_dir`` and
        ``module_id`` attributes are forwarded to the config builder.
    input_format : str
        The selected software tool.
    report : ValidationReport, optional
        When provided, a ``config_unavailable`` warning is recorded on it if
        the module settings cannot be loaded, so the fallback to default
        settings is visible to the submitter and reviewers instead of being
        silently swallowed.

    Returns
    -------
    ModuleValidationConfig
        The resolved configuration, or a default-constructed one when the
        module settings could not be loaded.
    """
    try:
        return ModuleValidationConfig.from_parse_settings(
            parse_settings_dir=ionmodule.parse_settings_dir,
            module_id=ionmodule.module_id,
            input_format=input_format,
        )
    except Exception as exc:  # noqa: BLE001 - never block submission on a validation infra error
        # Surface the failure: without this, later messages produced from the
        # default config can hide that the settings were never loaded.
        if report is not None:
            report.add_warning(
                "config_unavailable",
                f"Could not load the validation settings for this module ({type(exc).__name__}: {exc}); "
                "default validation settings were used.",
                "input",
            )
        return ModuleValidationConfig()


def _acquire_fasta(config: ModuleValidationConfig, report: ValidationReport) -> "FastaReference | None":
    """
    Obtain the reference FASTA, degrading to a report message on any problem.

    Parameters
    ----------
    config : ModuleValidationConfig
        The module configuration (provides ``fasta_url`` / ``fasta_filename``).
    report : ValidationReport
        Report to which an info (no FASTA configured) or a warning (loading
        failed) is added when no FASTA is available.

    Returns
    -------
    FastaReference or None
        The reference, or ``None`` when not configured or not loadable.
    """
    if not config.fasta_url:
        report.add_info(
            "no_fasta_configured",
            "No reference FASTA is configured for this module ([reference_database] in "
            "module_settings.toml); protein-identifier validation was skipped.",
            "protein_ids",
        )
        return None
    try:
        return _load_fasta_reference(config.fasta_url, config.fasta_filename)
    except Exception as exc:  # noqa: BLE001 - never block submission on a validation infra error
        report.add_warning(
            "fasta_unavailable",
            f"Could not download or parse the reference FASTA ({type(exc).__name__}: {exc}); "
            "protein-identifier validation was skipped.",
            "protein_ids",
            field=config.fasta_url,
        )
        return None


def run_submission_validation(variables, ionmodule, user_input, params) -> ValidationReport:
    """
    Validate a submission and return the structured report.

    Designed to be fault-tolerant: any infrastructure problem (missing input,
    parser failure, settings-load failure, FASTA download failure) is
    converted into a warning so that validation never crashes the submission
    flow. Only genuine consistency problems produce errors.

    Parameters
    ----------
    variables : Any
        The module's ``Variables`` dataclass instance (session-state keys).
    ionmodule : Any
        The benchmarking module instance.
    user_input : dict
        The submission's user input (provides ``"input_format"``).
    params : Any
        The parsed/edited :class:`ProteoBenchParameters` to be submitted.

    Returns
    -------
    ValidationReport
        The aggregated validation report.
    """
    report = ValidationReport()
    input_format = user_input.get("input_format")

    input_df = _resolve_input_df(variables)
    if input_df is None:
        report.add_warning(
            "no_input_dataframe",
            "Could not run submission validation because the parsed result data was not available in the session.",
            "input",
        )
        return report

    # Re-derive the standardized DataFrame (reuses existing parsing; no duplication).
    try:
        standard_df = _build_standard_dataframe(ionmodule, input_format, input_df)
    except Exception as exc:  # noqa: BLE001 - never block submission on a validation infra error
        report.add_warning(
            "standardization_failed",
            f"Could not re-standardize the results for validation ({type(exc).__name__}: {exc}); "
            "protein/charge/length checks were skipped.",
            "input",
        )
        return report

    # Pass the report so a settings-load failure is recorded rather than hidden.
    config = _build_config(ionmodule, input_format, report)
    fasta = _acquire_fasta(config, report)

    try:
        core_report = validate_submission(
            standard_df,
            parameters=params,
            fasta=fasta,
            config=config,
            input_format=input_format,
        )
        report.extend(core_report.issues)
    except Exception as exc:  # noqa: BLE001 - never block submission on a validation infra error
        report.add_warning(
            "validation_failed",
            f"Submission validation could not be completed ({type(exc).__name__}: {exc}); "
            "no automated consistency checks were applied.",
            "input",
        )
    return report


def _render_issue(issue: "ValidationIssue") -> None:
    """
    Render a single validation issue with its details.

    The message is shown in a Streamlit box chosen by severity (error,
    warning, or info for anything else); expected/observed values and examples
    follow as a markdown bullet list when present.

    Parameters
    ----------
    issue : ValidationIssue
        The issue to render.
    """
    header = f"**{issue.message}**"
    if issue.severity == Severity.ERROR:
        st.error(header, icon="🚫")
    elif issue.severity == Severity.WARNING:
        st.warning(header, icon="⚠️")
    else:
        st.info(header, icon="ℹ️")

    details = []
    # ``is not None`` keeps falsy-but-meaningful values such as 0 or "".
    if issue.expected is not None:
        details.append(f"- Expected: `{issue.expected}`")
    if issue.observed is not None:
        details.append(f"- Observed: `{issue.observed}`")
    if issue.examples:
        shown = ", ".join(f"`{e}`" for e in issue.examples)
        details.append(f"- Examples: {shown}")
    if details:
        st.markdown("\n".join(details))


def render_validation_report(report: ValidationReport) -> None:
    """
    Render a full validation report in the Streamlit UI.

    The checks never block submission; the report is shown so the submitter can
    review the findings, which are also included in the pull-request description.
    Errors are rendered first, then warnings; info-level issues are placed in
    a collapsed expander.

    Parameters
    ----------
    report : ValidationReport
        The report to display.
    """
    n_flagged = len(report.errors) + len(report.warnings)
    n_info = len(report.infos)

    st.subheader("Submission checks")
    if n_flagged == 0:
        st.success("All automated submission checks passed.", icon="✅")
    else:
        st.info(
            f"We flagged {n_flagged} point(s) to review below. You can still submit your results, and "
            "these notes will be included in the pull request for the reviewers.",
            icon="📝",
        )

    for issue in report.errors:
        _render_issue(issue)
    for issue in report.warnings:
        _render_issue(issue)

    if report.infos:
        with st.expander(f"More details ({n_info})"):
            for issue in report.infos:
                _render_issue(issue)