From 6633f16d9c2babeb93e4ac3b1b5a21e4ee83b13b Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Mon, 25 May 2026 10:12:52 +0200 Subject: [PATCH 1/5] Update cluster utils with new aggregation modes and fragment filtering --- alphaquant/cluster/cluster_ions.py | 142 ++++++-- alphaquant/cluster/cluster_utils.py | 461 ++++++++++++++---------- alphaquant/cluster/ml_reorder.py | 22 +- alphaquant/cluster/outlier_filtering.py | 12 +- tests/unit_tests/test_clusterutils.py | 251 +++++++++---- 5 files changed, 587 insertions(+), 301 deletions(-) diff --git a/alphaquant/cluster/cluster_ions.py b/alphaquant/cluster/cluster_ions.py index 8767054b..9e24ee00 100644 --- a/alphaquant/cluster/cluster_ions.py +++ b/alphaquant/cluster/cluster_ions.py @@ -1,3 +1,28 @@ +"""Hierarchical proteoform clustering and differential-expression aggregation. + +This module builds a hierarchical tree for each protein (fragments → peptides +→ modified peptides → unmodified peptides → protein) and performs two +*independent* statistical procedures at each level: + +1. **Proteoform clustering** — pairwise double-differential tests assess + whether sibling ions share the same fold change (null hypothesis: "ions + have equal regulation"). The resulting similarity p-values are corrected + with Benjamini-Yekutieli (appropriate for the intrinsic dependencies among + pairwise comparisons) and then used for hierarchical clustering to separate + proteoforms. + +2. **Differential-expression aggregation** — the per-ion z-values from + individual differential-expression tests (null hypothesis: "no change + between conditions") are combined into parent-level z-values via Stouffer's + method. This aggregation propagates evidence of regulation up the tree. + +The two sets of p-values address fundamentally different questions and are +corrected separately. The Benjamini-Yekutieli correction in step 1 applies +*within* a single protein to the similarity matrix; the final protein-level +p-values produced by step 2 are later corrected across all proteins with +Benjamini-Hochberg (see ``diffquant_table.py``). +""" + import scipy.spatial.distance import scipy.cluster.hierarchy import alphaquant.cluster.cluster_utils as aqcluster_utils @@ -28,7 +53,7 @@ -def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, fragment_outlier_filtering=True): +def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fcdiff_cutoff_clustermerge, aggregation_mode="stouffer_decorrelation", cluster_threshold_ion_type=0.01): """Main entry point for hierarchical clustering and tree-based quantification of a protein. This function creates a hierarchical tree structure from fragment ions up to the protein level @@ -47,7 +72,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i fcfc_threshold: Fold-change difference threshold for clustering take_median_ion: If True, use median-centered ions for clustering fcdiff_cutoff_clustermerge: Fold-change threshold for merging similar clusters - fragment_outlier_filtering: Whether to filter outlier fragments when aggregating to peptides + aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES) Returns: anytree.Node: Root node of the hierarchical tree containing all statistics and clustering results @@ -63,7 +88,7 @@ def get_scored_clusterselected_ions(gene_name, diffions, normed_c1, normed_c2, i root_node = create_hierarchical_ion_grouping(gene_name, diffions) add_reduced_names_to_root(root_node) #LOGGER.info(anytree.RenderTree(root_node)) - root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering) + root_node_clust = cluster_along_specified_levels(root_node, name2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, aggregation_mode=aggregation_mode, cluster_threshold_ion_type=cluster_threshold_ion_type) level_sorted_nodes = [[node for node in children] for children in anytree.ZigZagGroupIter(root_node_clust)] level_sorted_nodes.reverse() #the base nodes are first @@ -114,7 +139,7 @@ def add_reduced_names_to_root(node): import pandas as pd -def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, fragment_outlier_filtering=True):#~60% of overall runtime +def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, take_median_ion, aggregation_mode="stouffer_decorrelation", cluster_threshold_ion_type=0.01):#~60% of overall runtime """Performs hierarchical clustering at each level of the tree from bottom to top. Starting from base ions (fragments/MS1), this function iterates through each level @@ -123,6 +148,31 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed pairwise for consistent fold-change differences, clustered hierarchically, and statistics are aggregated to parent nodes. + Important: this function interleaves two *independent* statistical procedures + that address different questions: + + 1. **Proteoform clustering** (``find_fold_change_clusters``): tests whether + pairs of sibling ions (e.g. two peptides of the same protein) have + *different* fold changes, i.e. whether they belong to different + proteoforms. The resulting pairwise p-values form a dependent similarity + matrix that is corrected for multiple testing with Benjamini-Yekutieli + (appropriate for dependent tests) *before* hierarchical clustering. + + 2. **Differential-expression aggregation** (``aggregate_node_properties``): + combines the per-ion z-values (derived from individual differential + expression tests) into a single parent-level z-value via Stouffer's + method. These z-values quantify *how much* each ion changes between + conditions and are unrelated to the proteoform-similarity p-values + from step 1. + + The Benjamini-Yekutieli correction in step 1 therefore precedes the + Stouffer aggregation in step 2 by design, because they operate on + different null hypotheses: step 1 asks "do these two peptides differ + from each other?" while step 2 asks "does this protein change between + conditions?". The final protein-level p-values produced in step 2 + are later corrected with Benjamini-Hochberg across all proteins (see + ``diffquant_table.py``). + Args: root_node: Root of the hierarchical tree (protein level) ionname2diffion: Dictionary mapping ion names to DifferentialIon objects @@ -134,7 +184,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed pval_threshold_basis: P-value threshold for clustering decisions fcfc_threshold: Fold-change threshold for clustering take_median_ion: Whether to use median-centered ions - fragment_outlier_filtering: Whether to filter fragment outliers + aggregation_mode: Strategy for combining child z-values (see cluster_utils.AGGREGATION_MODES) Returns: anytree.Node: The root node with all clustering annotations and aggregated statistics @@ -164,7 +214,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed if take_median_ion: grouped_mainclust_leafs = aqcluster_utils.select_median_fc_leafs(grouped_mainclust_leafs) diffions = aqcluster_utils.map_grouped_leafs_to_diffions(grouped_mainclust_leafs, ionname2diffion) #the diffions are the ions that are actually compared - childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold) #the clustering is performed on the child nodes + childnode2clust = find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, cluster_threshold_ion_type=cluster_threshold_ion_type) #the clustering is performed on the child nodes childnode2clust = merge_similar_clusters_if_applicable(childnode2clust, type_node, fcdiff_cutoff_clustermerge = FCDIFF_CUTOFF_CLUSTERMERGE) childnode2clust = aq_cluster_sorting.decide_cluster_order(childnode2clust) @@ -172,7 +222,7 @@ def cluster_along_specified_levels(root_node, ionname2diffion, normed_c1, normed aqcluster_utils.assign_clusterstats_to_type_node(type_node, childnode2clust) aqcluster_utils.annotate_mainclust_leaves(childnode2clust) aqcluster_utils.assign_cluster_number(type_node, childnode2clust) - aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, fragment_outlier_filtering=fragment_outlier_filtering) + aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, aggregation_mode=aggregation_mode) return root_node @@ -183,21 +233,46 @@ def get_childnode2clust_for_single_ion(type_node): return {type_node.children[0]: 0} -def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold): - """Compares the fold changes of the ions corresponding to the nodes that are compared and returns the set of ions with consistent fold changes. +def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis, fcfc_threshold, cluster_threshold_ion_type=0.01): + """Cluster sibling ions by the similarity of their fold changes (proteoform inference). + + For each pair of sibling ion groups, a *double-differential* test + (``evaluate_similarity``) computes a p-value for the null hypothesis that + their fold changes are equal (i.e. they belong to the same proteoform). + These pairwise p-values form a condensed similarity matrix that is + corrected for multiple testing with the Benjamini-Yekutieli procedure + (``get_multiple_testing_corrected_condensed_similarity_matrix``), which is + appropriate here because the pairwise comparisons are intrinsically + dependent. The corrected matrix is then converted to a distance matrix and + subjected to Ward hierarchical clustering. + + Note: the p-values computed and corrected here test *inter-ion similarity* + for proteoform grouping. They are entirely separate from the per-ion + differential-expression p-values that are later aggregated via Stouffer's + method in ``aggregate_node_properties``. Args: - diffions (list[list[ionnames]]): contains the sets of ions to be tested, for example [[fragion1_precursor1, fragion2_precursor1, fragion3_precursor1],[fragion1_precursor2],[fragion1_precursor3, fragion2_precursor3]]. The ions are assumed to be similar in type (e.g. fragment, precursor)! - normed_c1 (ConditionBackground): [description] - normed_c2 (ConditionBackground): [description] - ion2diffDist (dict(ion : SubtractedBackground)): [description] - p2z ([type]): [description] - deedpair2doublediffdist ([type]): [description] - fc_threshold (float, optional): [description]. Defaults to 0. - pval_threshold_basis (float, optional): the threshold at which to merge peptides at the gene level. Defaults to 0.01 + type_node: Parent node whose children are being clustered + diffions: List of lists of DifferentialIon objects, one sublist per + child node, e.g. ``[[fragion1_prec1, fragion2_prec1], [fragion1_prec2]]`` + normed_c1: ConditionBackgrounds for condition 1 + normed_c2: ConditionBackgrounds for condition 2 + ion2diffDist: Mapping of ion pairs to differential background distributions + p2z: Cache for p-value to z-value conversions + deedpair2doublediffdist: Cache for double-differential distributions + pval_threshold_basis: P-value threshold at the gene level for cutting the + clustering dendrogram; thresholds for lower levels are looked up in + ``LEVEL2PVALTHRESH`` + fcfc_threshold: Minimum fold-change difference below which two ion groups + are assumed similar without a formal test + cluster_threshold_ion_type: P-value threshold at the ion_type level + + Returns: + list[tuple[Node, int]]: Pairs of (child node, cluster index) sorted by + node name for reproducibility """ - pval_threshold_basis = get_pval_threshold_basis(type_node, pval_threshold_basis) + pval_threshold_basis = get_pval_threshold_basis(type_node, pval_threshold_basis, cluster_threshold_ion_type=cluster_threshold_ion_type) diffions_idxs = [[x] for x in range(len(diffions))] diffions_fcs = aqcluster_utils.get_fcs_ions(diffions) #mt_corrected_pval_thresh = pval_threshold_basis/len(diffions) @@ -216,26 +291,36 @@ def find_fold_change_clusters(type_node, diffions, normed_c1, normed_c2, ion2dif return childnode2clust -def get_pval_threshold_basis(type_node, pval_threshold_basis): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary +def get_pval_threshold_basis(type_node, pval_threshold_basis, cluster_threshold_ion_type=0.01): #the pval threshold is only set at the gene level, the rest of the levels are set as specified in the LEVEL2PVALTHRESH dictionary if type_node.level == "gene": return pval_threshold_basis - else: - return LEVEL2PVALTHRESH.get(type_node.level, 0.2) + if type_node.level == "ion_type": + return cluster_threshold_ion_type + return LEVEL2PVALTHRESH.get(type_node.level, 0.2) def get_multiple_testing_corrected_condensed_similarity_matrix(condensed_distance_matrix: np.array): - """ - condensed_distance_matrix contains all p-values of the pairwise comparisons of the ions. They are by definition dependent. + """Apply Benjamini-Yekutieli FDR correction to pairwise ion-similarity p-values. + + The condensed matrix contains p-values from *double-differential* tests + between all pairs of sibling ions (see ``evaluate_similarity``). Each + p-value tests the null hypothesis that two ions share the same fold + change. Because every ion appears in multiple pairwise comparisons, the + tests are intrinsically dependent, which is why the Benjamini-Yekutieli + (``fdr_by``) procedure is used instead of Benjamini-Hochberg. + + These corrected p-values are used exclusively for proteoform clustering + (deciding which ions belong together) and are unrelated to the per-ion + differential-expression p-values that are later aggregated via Stouffer's + method. Args: - condensed_distance_matrix (np.array): Condensed distance matrix containing p-values of pairwise comparisons. + condensed_distance_matrix: 1-D array of pairwise similarity p-values + in scipy condensed-matrix format. Returns: - np.array: Corrected condensed distance matrix. + np.array: Corrected p-values in the same condensed-matrix layout. """ - # Apply Benjamini-Yekutieli correction _, corrected_pvalues, _, _ = multitest.multipletests(condensed_distance_matrix, method='fdr_by') - - # Return the corrected condensed matrix return corrected_pvalues @@ -341,4 +426,3 @@ def exclude_node(node): node.is_included = False for descendant in node.descendants: descendant.is_included = False - diff --git a/alphaquant/cluster/cluster_utils.py b/alphaquant/cluster/cluster_utils.py index a5f1bf5c..90370ecf 100644 --- a/alphaquant/cluster/cluster_utils.py +++ b/alphaquant/cluster/cluster_utils.py @@ -5,7 +5,6 @@ import collections import alphaquant.config.variables as aqvariables from anytree import Node, LevelOrderGroupIter -import alphaquant.utils.diffquant_utils as aq_utils_diffquant import re import alphaquant.config.config as aqconfig @@ -18,14 +17,49 @@ LEVELS_UNIQUE = ["base","ion_type", "mod_seq_charge", "mod_seq", "seq", "gene"] TYPE2LEVEL = dict(zip(TYPES, LEVELS)) - -def aggregate_node_properties(node, only_use_mainclust, peptide_outlier_filtering=False, fragment_outlier_filtering=True): - """Aggregates statistical properties from child nodes to a parent node in the tree. - - This is the core function for propagating statistics up the hierarchical tree structure. - It combines z-values, fold changes, and quality metrics from child nodes (e.g., peptides) - into parent node (e.g., protein) statistics. The aggregation can optionally exclude - proteoforms (non-main clusters) and filter outlier children. +DEFAULT_AGGREGATION_MODE = "stouffer_decorrelation" +AGGREGATION_MODES = ( + "stouffer_decorrelation", + "mean_z", + "median_z", + "min_median_max_z", + "min_max_z", + "summed_z", +) +_LEGACY_AGGREGATION_MODES = ("stouffer_icc",) + +# Node types where alternative aggregation modes (mean_z, median_z, …) may +# be selected via the aggregation_mode parameter. For all other node types, +# Stouffer is always used. +_DEPENDENT_NODE_TYPES = {"frgion", "ms1_isotopes"} + +def node_is_excluded_from_aggregation(node): + """Return True for children removed by post-clustering correction steps.""" + return ( + getattr(node, "exclude_residual_decorrelation", False) + or getattr(node, "exclude_ptm_fragment_selection", False) + ) + + +def aggregate_node_properties(node, only_use_mainclust, peptide_outlier_filtering=False, aggregation_mode=DEFAULT_AGGREGATION_MODE): + """Aggregates differential-expression statistics from child nodes to a parent node. + + This is the core function for propagating statistics up the hierarchical tree + structure. It combines z-values, fold changes, and quality metrics from child + nodes (e.g. peptides) into parent node (e.g. protein) statistics using + Stouffer's Z-score method (``sum_and_re_scale_zvalues``). The aggregation can + optionally exclude proteoform variants (non-main clusters) and filter outlier + children. + + The z-values aggregated here originate from *per-ion differential-expression* + tests (null hypothesis: "no change between conditions for this ion"). They + are *not* the proteoform-similarity p-values computed in + ``find_fold_change_clusters``, which test a different null hypothesis + ("do two ions share the same fold change?") and are corrected separately + with Benjamini-Yekutieli *before* this function is called. In other words, + the Benjamini-Yekutieli correction applied during proteoform clustering and + the Stouffer aggregation performed here address independent statistical + questions and operate on different sets of p-values. Args: node: The parent node whose properties will be computed from its children @@ -33,8 +67,17 @@ def aggregate_node_properties(node, only_use_mainclust, peptide_outlier_filterin excluding proteoform variants peptide_outlier_filtering: If True and node is a protein, exclude peptides identified as statistical outliers (default: False) - fragment_outlier_filtering: If True and node is a peptide, exclude extreme - fragment ions before aggregation (default: True) + aggregation_mode: Strategy for combining child z-values at dependent levels + (frgion, ms1_isotopes). Higher levels always use Stouffer. + Can be a single string applied to all dependent levels, or a dict + mapping node types to modes (e.g. ``{"frgion": "stouffer_decorrelation", + "ms1_isotopes": "median_z"}``). Allowed mode strings: + "stouffer_decorrelation" - Stouffer's method used after residual decorrelation (default) + "mean_z" - arithmetic mean of z-values + "median_z" - median z-value + "min_median_max_z" - combine min, median, max z-values assuming independence + "min_max_z" - combine min, max z-values assuming independence (2-point summary) + "summed_z" - classic Stouffer assuming independence (rho=0, ignores ICC) Side effects: Sets node.z_val, node.p_val, node.fc, node.cv, node.min_intensity, @@ -42,11 +85,31 @@ def aggregate_node_properties(node, only_use_mainclust, peptide_outlier_filterin optionally node.ml_score based on aggregated child values. """ if only_use_mainclust: - childs = [x for x in node.children if x.is_included & (x.cluster ==0)] + childs = [ + x for x in node.children + if x.is_included & (x.cluster == 0) and not node_is_excluded_from_aggregation(x) + ] else: - childs = [x for x in node.children if x.is_included] + childs = [ + x for x in node.children + if x.is_included and not node_is_excluded_from_aggregation(x) + ] + + if len(childs) == 0: + if any(node_is_excluded_from_aggregation(x) for x in node.children): + # Residual decorrelation and PTM fragment selection together exhausted + # all eligible children. Keep the existing z-value so this node still + # contributes to higher levels — cascading the exclusion upward would + # double-penalise PTM data where both filters operate independently. + return + raise ValueError(f"Node {node.name!r} ({node.type}) has no eligible children to aggregate.") - childs_zfiltered = get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node, fragment_outlier_filtering) + childs_zfiltered = get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node) + + if len(childs_zfiltered) == 0: + if any(node_is_excluded_from_aggregation(x) for x in node.children): + return + raise ValueError(f"Node {node.name!r} ({node.type}) has no eligible children after filtering.") zvals = get_feature_numpy_array_from_nodes(nodes=childs_zfiltered, feature_name="z_val") @@ -64,9 +127,14 @@ def aggregate_node_properties(node, only_use_mainclust, peptide_outlier_filterin fraction_consistent = sum([x.fraction_consistent/len(node.children) for x in childs if x.cluster ==0]) - - - z_normed = sum_and_re_scale_zvalues(zvals) + rho = getattr(node, 'icc_correction', 0.0) + if node.type not in _DEPENDENT_NODE_TYPES: + effective_mode = DEFAULT_AGGREGATION_MODE + elif isinstance(aggregation_mode, dict): + effective_mode = aggregation_mode.get(node.type, DEFAULT_AGGREGATION_MODE) + else: + effective_mode = aggregation_mode + z_normed = combine_zvalues(zvals, rho=rho, mode=effective_mode) p_val = transform_znormed_to_pval(z_normed) p_val = set_bounds_for_p_if_too_extreme(p_val) @@ -125,7 +193,7 @@ def _select_peptides_around_median_z(peptide_nodes, max_peptides=31): return selected_peptides -def get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node, fragment_outlier_filtering=True): +def get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node): if peptide_outlier_filtering and node.type == "gene": filtered_childs = [x for x in childs if not x.is_outlier_peptide] # Additional restriction: if more than 31 peptides, keep only 31 closest to median z-value @@ -133,91 +201,67 @@ def get_selected_nodes_for_zvalcalc(childs, peptide_outlier_filtering, node, fra filtered_childs = _select_peptides_around_median_z(filtered_childs, max_peptides=31) return filtered_childs - elif fragment_outlier_filtering and node.type == "frgion": - return remove_outlier_fragion_childs(childs) - else: - return childs - + if node.type == "frgion": + filtered = childs + if aqvariables.ION_OUTLIER_MAD_THRESHOLD is not None: + filtered = _filter_ions_by_mad(filtered, aqvariables.ION_OUTLIER_MAD_THRESHOLD) + if aqvariables.MAX_N_FRAGMENTS is not None and len(filtered) > aqvariables.MAX_N_FRAGMENTS: + filtered = _select_peptides_around_median_z(filtered, max_peptides=aqvariables.MAX_N_FRAGMENTS) + if aqvariables.CLASSIC_FRAGMENT_OUTLIER_FILTERING: + filtered = remove_outlier_fragion_childs(filtered) + return filtered + return childs -def filter_fewpeps_per_protein(peptide_nodes): - peps_filtered = [] - pepnode2zval2numleaves = [] - for pepnode in peptide_nodes: - pepleaves = [x for x in pepnode.leaves if "seq" in getattr(x,"inclusion_levels", [])] - pepnode2zval2numleaves.append((pepnode, pepnode.z_val,len(pepleaves))) - pepnode2zval2numleaves = sorted(pepnode2zval2numleaves, key=lambda x : abs(x[1])) #sort with lowest absolute z-val (least significant) first +def _filter_ions_by_mad(nodes, threshold): + """Remove ion nodes whose z-value is a MAD-outlier among siblings. - return get_median_peptides(pepnode2zval2numleaves) - -def filter_outlier_peptides_old(peptide_nodes, fraction_highly_significant): - """ - Filters outlier peptides based on p-value significance. - - Checks if there's a minority of peptides (<40%) that has substantially more - significant p-values (at least a factor of 5) compared to the median. - Only starts checking if the median p-value is 0.05 or higher. - If this minority case exists, returns only the less significant half of peptides. + Requires at least 4 nodes to attempt filtering, and always retains + at least 2 nodes (the two closest to the median). Args: - peptide_nodes: List of peptide nodes with p_val attributes + nodes: List of child nodes with z_val attributes. + threshold: Number of scaled-MAD units beyond which a node is + considered an outlier (e.g. 3.0). Returns: - Filtered list of peptide nodes + Filtered list of nodes with outliers removed. """ - if len(peptide_nodes) < 4: - return peptide_nodes + if len(nodes) < 4: + return nodes - # Get p-values from peptide nodes - p_values = [node.p_val for node in peptide_nodes] - median_p_val = np.median(p_values) + z_vals = np.array([n.z_val for n in nodes]) + median_z = float(np.median(z_vals)) - # Only check for outliers if median p-value is 0.05 or higher - if median_p_val < 0.05: - return peptide_nodes + robust_std = _robust_std_estimate(z_vals, median_z) + if robust_std == 0: + return nodes - # Check for minority with substantially more significant p-values - threshold_p_val = median_p_val / 5.0 # at least 5x more significant (lower p-value) - highly_significant_nodes = [node for node in peptide_nodes if node.p_val <= threshold_p_val] - remaining_nodes = [node for node in peptide_nodes if node.p_val > threshold_p_val] + cutoff = threshold * robust_std + kept = [n for n in nodes if abs(n.z_val - median_z) <= cutoff] - # Check if this is a minority (<40%) - if len(highly_significant_nodes) / len(peptide_nodes) < 0.3: - return _filter_minority_highly_significant(highly_significant_nodes, remaining_nodes, fraction_highly_significant) + if len(kept) < 2: + kept = sorted(nodes, key=lambda n: abs(n.z_val - median_z))[:2] - return peptide_nodes + return kept -def _filter_minority_highly_significant_old(highly_significant_nodes, remaining_nodes, fraction_highly_significant): - """ - Handle filtering when highly significant nodes are a minority (<40%). +def _robust_std_estimate(values, median): + """Estimate std via scaled MAD, falling back to IQR if MAD is zero.""" + MAD_SCALE = 1.4826 # makes MAD consistent with std for normal data + IQR_SCALE = 1.349 # makes IQR consistent with std for normal data - Args: - highly_significant_nodes: Nodes with p-value <= threshold_p_val - remaining_nodes: All peptide nodes - threshold_p_val: The p-value threshold used to identify highly significant nodes - fraction_highly_significant: Global fraction of highly significant ions + mad = float(np.median(np.abs(values - median))) + if mad > 0: + return MAD_SCALE * mad - Returns: - Filtered list of peptide nodes to exclude for analysis - """ - # if len(highly_significant_nodes) == 1: - # return highly_significant_nodes+remaining_nodes - # Calculate how many highly significant nodes to exclude - num_to_exclude = int(len(highly_significant_nodes) * (fraction_highly_significant / 0.08)) - num_to_exclude_bounded = max(1, min(len(highly_significant_nodes)-1, num_to_exclude)) + q75, q25 = np.percentile(values, [75, 25]) + iqr = float(q75 - q25) + if iqr > 0: + return iqr / IQR_SCALE - # Sort by p-value (most significant first) and exclude the best ones - highly_significant_nodes_sorted = sorted(highly_significant_nodes, key=lambda x: x.p_val) - nodes_to_keep = highly_significant_nodes_sorted[num_to_exclude_bounded:] #keep the least significant ones - return nodes_to_keep + remaining_nodes + return 0.0 import math -def get_median_peptides(pepnode2zval2numleaves): #least significant peptides are sorted first - median_idx = math.floor(len(pepnode2zval2numleaves)/2) - if len(pepnode2zval2numleaves)<3: - return [x[0] for x in pepnode2zval2numleaves] - else: - return [x[0] for x in pepnode2zval2numleaves[:median_idx+1]] def remove_outlier_fragion_childs(childs): """Filters extreme fragment ions before aggregating to peptide level. @@ -237,15 +281,7 @@ def remove_outlier_fragion_childs(childs): list: Filtered subset of fragment ion nodes to use for aggregation """ zvals = get_feature_numpy_array_from_nodes(nodes=childs, feature_name="z_val") - if aqvariables.PTM_FRAGMENT_SELECTION: - sorted_idxs_zvals = np.argsort(np.abs(zvals)) - median_idx = math.floor(len(zvals)/2) - median_idx = 7 if median_idx > 7 else median_idx - if median_idx < len(sorted_idxs_zvals): - idxs_to_use = sorted_idxs_zvals[:median_idx+1] - else: - idxs_to_use = sorted_idxs_zvals - elif len(zvals) > 4: + if len(zvals) > 4: sorted_idxs_zvals = np.argsort(zvals) median_idx = math.floor(len(zvals)/2) idx_start = median_idx - 2 @@ -263,16 +299,153 @@ def remove_outlier_fragion_childs(childs): return [childs[idx] for idx in idxs_to_use] -def sum_and_re_scale_zvalues(zvals): +def apply_ptm_fragment_selection(protnodes, max_keep=8): + """Apply PTM low-|Z| fragment filtering after residual decorrelation. + + For every ``frgion`` parent, currently eligible base-ion children are sorted + by ``abs(z_val)``. The least extreme children through the median rank are + retained, capped by ``max_keep``. The default ``max_keep=8`` matches the + legacy PTM fragment-selection rule in ``remove_outlier_fragion_childs``. + + Returns: + tuple[int, int]: ``(children_dropped, parents_touched)``. + """ + try: + max_keep = int(max_keep) + except (TypeError, ValueError): + max_keep = 8 + max_keep = max(1, max_keep) + + n_dropped = 0 + n_touched = 0 + for protnode in protnodes: + for parent in anytree.PreOrderIter(protnode): + if getattr(parent, "type", None) != "frgion": + continue + eligible = [ + child for child in parent.children + if child.is_included and not node_is_excluded_from_aggregation(child) + ] + n_live = len(eligible) + if n_live <= 1: + continue + + keep_n = min((n_live // 2) + 1, max_keep, n_live) + if keep_n >= n_live: + continue + + zvals = get_feature_numpy_array_from_nodes( + nodes=eligible, feature_name="z_val") + order = np.argsort(np.abs(zvals)) + kept = {eligible[idx] for idx in order[:keep_n]} + dropped_here = 0 + for child in eligible: + keep = child in kept + child.exclude_ptm_fragment_selection = not keep + if not keep: + child.is_outlier_fragment = True + dropped_here += 1 + elif not getattr(child, "is_outlier_fragment", False): + child.is_outlier_fragment = False + if dropped_here: + n_dropped += dropped_here + n_touched += 1 + + return n_dropped, n_touched + + +def combine_zvalues(zvals, rho=0.0, mode=DEFAULT_AGGREGATION_MODE): + """Dispatch function that selects the z-value combination strategy. + + Args: + zvals: Array or list of z-values to combine + rho: Correlation design-effect parameter for Stouffer aggregation. + mode: One of AGGREGATION_MODES + + Returns: + float: Combined z-value on a standard normal scale + """ + if mode not in AGGREGATION_MODES and mode not in _LEGACY_AGGREGATION_MODES: + raise ValueError(f"Unknown aggregation mode: {mode!r}. Choose from {AGGREGATION_MODES}") + + if len(zvals) == 1: + return zvals[0] + + if mode in ("stouffer_decorrelation", "stouffer_icc"): + return sum_and_re_scale_zvalues(zvals, rho=rho) + elif mode == "mean_z": + return _combine_mean_z(zvals) + elif mode == "median_z": + return _combine_median_z(zvals) + elif mode == "min_median_max_z": + return _combine_min_median_max_z(zvals) + elif mode == "min_max_z": + return _combine_min_max_z(zvals) + elif mode == "summed_z": + return _combine_summed_z(zvals) + + +def _combine_mean_z(zvals): + """Arithmetic mean of z-values — treats children as a single effective measurement.""" + return float(np.mean(zvals)) + + +def _combine_median_z(zvals): + """Median z-value — robust to outlier children.""" + return float(np.median(zvals)) + + +def _combine_min_median_max_z(zvals): + """Pick min, median, max z-values and combine via Stouffer assuming independence. + + Provides a 3-point summary that captures the full spread of evidence. + For n <= 3, falls back to Stouffer on all values (the summary would be + the full set anyway). + """ + if len(zvals) <= 3: + return sum_and_re_scale_zvalues(zvals, rho=0.0) + z_min = float(np.min(zvals)) + z_med = float(np.median(zvals)) + z_max = float(np.max(zvals)) + return sum_and_re_scale_zvalues(np.array([z_min, z_med, z_max]), rho=0.0) + + +def _combine_min_max_z(zvals): + """Pick min and max z-values and combine via Stouffer assuming independence. + + A 2-point summary that captures the extreme spread of evidence. + For n <= 2, falls back to Stouffer on all values. + """ + if len(zvals) <= 2: + return sum_and_re_scale_zvalues(zvals, rho=0.0) + z_min = float(np.min(zvals)) + z_max = float(np.max(zvals)) + return sum_and_re_scale_zvalues(np.array([z_min, z_max]), rho=0.0) + + +def _combine_summed_z(zvals): + """Classic Stouffer combination assuming full independence (rho=0). + + Unlike the Stouffer modes, this ignores any estimated ICC correction + and always treats child z-values as independent. + """ + return sum_and_re_scale_zvalues(zvals, rho=0.0) + + +def sum_and_re_scale_zvalues(zvals, rho=0.0): """Combines multiple z-values into a single aggregated z-value using Stouffer's method. This implements Stouffer's Z-score method for meta-analysis: z-values are summed - and divided by sqrt(n) to account for the number of tests. The result is then - rescaled back to a standard normal distribution. This allows combining evidence - from multiple ions/peptides while maintaining proper statistical interpretation. + and divided by sqrt(n * DEFF) to account for both the number of tests and their + correlation. The design effect DEFF = 1 + (n-1) * rho corrects for the fact that + correlated z-values carry less independent information than n truly independent ones. + The result is then rescaled back to a standard normal distribution. Args: zvals: Array or list of z-values to combine + rho: Intraclass correlation (ICC) among the z-values. 0.0 assumes independence + (classic Stouffer), higher values produce more conservative (less significant) + combined z-values. Returns: float: Combined z-value following a standard normal distribution under the null @@ -280,8 +453,10 @@ def sum_and_re_scale_zvalues(zvals): if len(zvals) == 1: return zvals[0] # No aggregation needed for single values - avoids floating-point precision errors + n = len(zvals) + deff = 1.0 + (n - 1) * rho # design effect: inflated variance due to intra-group correlation z_sum = sum(zvals) - p_z = NormalDist(mu = 0, sigma = np.sqrt(len(zvals))).cdf(z_sum) + p_z = NormalDist(mu = 0, sigma = np.sqrt(n * deff)).cdf(z_sum) p_z = set_bounds_for_p_if_too_extreme(p_z) z_normed = NormalDist(mu = 0, sigma=1).inv_cdf(p_z) #this is just a re-scaling of the z-value to a standard normal distribution return z_normed @@ -306,64 +481,6 @@ def set_bounds_for_p_if_too_extreme(p_val): else: return p_val -def calc_fold_change_from_included_leaves_fcs(node): - included_leaves = obtain_all_included_leaves(node) - list_of_fcs = [x.fcs for x in included_leaves] - merged_fcs = np.concatenate(list_of_fcs) - return np.median(merged_fcs) - -def calc_weighted_fold_change_from_included_leaves_fcs(node): - included_leaves = obtain_all_included_leaves(node) - list_of_fcs = [x.fcs for x in included_leaves] - weights = [get_weight_of_leaf(x) for x in included_leaves] - weighted_median = calculate_weighted_median(weights, list_of_fcs) - return weighted_median - -def get_weight_of_leaf(leaf): - if hasattr(leaf, "ml_score_fragion"): - return 2**-leaf.ml_score_fragion - else: - return 1 - -def calculate_weighted_median(weights, fcs): - weighted_fcs = [(fc, weight) for weight, fc_list in zip(weights, fcs) for fc in fc_list] - sorted_weighted_fcs = sorted(weighted_fcs, key=lambda x: x[0]) - sorted_fcs, sorted_weights = zip(*sorted_weighted_fcs) - cumulative_weights = np.cumsum(sorted_weights) - total_weight = cumulative_weights[-1] - median_cutoff = total_weight / 2 - median_idx = np.where(cumulative_weights >= median_cutoff)[0][0] - weighted_median = sorted_fcs[median_idx] - return weighted_median - -def obtain_all_included_leaves(node): - list_of_included_leaves = [] - traverse_and_add_included_leaves(node, list_of_included_leaves) - return list_of_included_leaves - -def traverse_and_add_included_leaves(node, list_of_included_leaves, is_root=True): - """ - Recursively searches for leaves from the given node, where each node in the - path to the leaf has the 'is_included' attribute set to True, except for the initial node. - Fills up the list_of_included_leaves with the included leaves. - - Parameters: - node (anytree.Node): The node to start the search from. - list_of_included_leaves (list): The list to store the included leaves in. - is_root (bool): Indicates if the current node is the root node of the traversal. - """ - - if len(node.children) == 0: # if the node is a leaf - if is_root or (node.is_included and node.cluster == 0): - list_of_included_leaves.append(node) - return - - # If it's the root node or if the current node is included, then proceed to its children - if is_root or (node.is_included and node.cluster == 0): - for child in node.children: - # Recursive call with is_root set to False, as we are now dealing with child nodes - traverse_and_add_included_leaves(child, list_of_included_leaves, is_root=False) - def sum_ml_scores(ml_scores): abs_ml_scores = [abs(x) for x in ml_scores] return sum(abs_ml_scores) @@ -382,15 +499,6 @@ def get_grouped_mainclust_leafs(child_nodes): grouped_leafs.append(child_leaves_mainclust) return grouped_leafs -def select_highid_lowcv_leafs(grouped_leafs): - grouped_leafs_lowcv = [] - for leafs in grouped_leafs: - top_quantile_idx = math.ceil(len(leafs) * 0.2) - leafs_repsorted = sorted(leafs, key = lambda x : x.min_reps)[:top_quantile_idx] - leafs_repsorted_cvsorted = sorted(leafs_repsorted, key = lambda x : x.cv) - grouped_leafs_lowcv.append([leafs_repsorted_cvsorted[0]]) - return grouped_leafs_lowcv - def select_median_fc_leafs(grouped_leafs): grouped_leafs_medianfc = [] for leafs in grouped_leafs: @@ -566,22 +674,6 @@ def remove_unnecessary_attributes(node, attributes_to_remove): import os -def get_nodes_of_type(cond1, cond2, results_folder, node_type = 'mod_seq_charge'): - - tree_sn = aqutils.read_condpair_tree(cond1, cond2, results_folder=results_folder) - tree_sn.type = "asd" - return anytree.findall(tree_sn, filter_= lambda x : (x.type == node_type)) - - - -def get_levelnodes_from_nodeslist(nodeslist, level): - levelnodes = [] - for node in nodeslist: - precursors = anytree.findall(node, filter_= lambda x : (x.type == level)) - levelnodes.extend(precursors) - return levelnodes - - def find_node_parent_at_level(node, level): if node.type == level: return node @@ -609,13 +701,6 @@ def shorten_root_to_level(root, parent_level): -def get_parent2children_dict(tree, parent_level): - parent2children = {} - parent_nodes = anytree.search.findall(tree, filter_=lambda node: node.level == parent_level) - for parent_node in parent_nodes: - parent2children[parent_node.name] = [child.name for child in parent_node.children] - return parent2children - def get_parent2leaves_dict(protein): """Returns a dict that maps the parent node name to the names of the leaves of the parent node """ diff --git a/alphaquant/cluster/ml_reorder.py b/alphaquant/cluster/ml_reorder.py index 0a0d75de..f4546a96 100644 --- a/alphaquant/cluster/ml_reorder.py +++ b/alphaquant/cluster/ml_reorder.py @@ -3,7 +3,7 @@ import anytree -def update_nodes_w_ml_score(protnodes : list[anytree.Node]): +def update_nodes_w_ml_score(protnodes : list[anytree.Node], aggregation_mode="stouffer_decorrelation"): """ Update and re-order clusters within protein nodes based on ML scores. @@ -13,16 +13,17 @@ def update_nodes_w_ml_score(protnodes : list[anytree.Node]): Args: protnodes (list[anytree.Node]): A list of protein nodes to be processed. + aggregation_mode: Strategy for combining child z-values during re-aggregation. Returns: None """ for prot in protnodes: - _re_order_depending_on_ml_score(prot) + _re_order_depending_on_ml_score(prot, aggregation_mode=aggregation_mode) -def _re_order_depending_on_ml_score(protnode : anytree.Node): +def _re_order_depending_on_ml_score(protnode : anytree.Node, aggregation_mode="stouffer_decorrelation"): """ Reorder clusters in a protein node tree based on machine learning scores. @@ -38,6 +39,7 @@ def _re_order_depending_on_ml_score(protnode : anytree.Node): Args: protnode (anytree.Node): The protein node to be processed. + aggregation_mode: Strategy for combining child z-values during re-aggregation. Returns: None @@ -51,13 +53,18 @@ def _re_order_depending_on_ml_score(protnode : anytree.Node): if len(type_nodes)==0: continue for type_node in type_nodes: #go through the nodes, re-order the children. Propagate the values from the newly ordered children to the type node - child_nodes = type_node.children - had_ml_score = hasattr(child_nodes[0], 'ml_score') + child_nodes = [ + x for x in type_node.children + if x.is_included and not aqcluster_utils.node_is_excluded_from_aggregation(x) + ] + if len(child_nodes) == 0: + continue + had_ml_score = all(hasattr(child, 'ml_score') for child in child_nodes) if had_ml_score: clust2newclust = _get_clust2newclust(child_nodes) _re_assign_proteoform_stats(child_nodes, clust2newclust) _re_order_clusters_by_ml_score(child_nodes, clust2newclust) - aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False) + aqcluster_utils.aggregate_node_properties(type_node,only_use_mainclust=True, peptide_outlier_filtering=False, aggregation_mode=aggregation_mode) def _get_clust2newclust(nodes: list[anytree.Node]) -> dict[int, int]: @@ -162,6 +169,3 @@ def _re_order_clusters_by_ml_score(nodes : list[anytree.Node], clust2newclust : """ for node in nodes: node.cluster =clust2newclust.get(node.cluster) - - - diff --git a/alphaquant/cluster/outlier_filtering.py b/alphaquant/cluster/outlier_filtering.py index 7ac38025..d6e8f1d8 100644 --- a/alphaquant/cluster/outlier_filtering.py +++ b/alphaquant/cluster/outlier_filtering.py @@ -1,13 +1,14 @@ import alphaquant.cluster.cluster_utils as aqcluster_utils +import alphaquant.config.variables as aqvariables import anytree import numpy as np -def apply_peptide_outlier_filtering(protnodes: list[anytree.Node]): +def apply_peptide_outlier_filtering(protnodes: list[anytree.Node], aggregation_mode="stouffer_decorrelation"): regulation_score = calculate_regulation_score(protnodes) for protnode in protnodes: _determine_and_annotate_outlier_status_of_peptides(protnode, regulation_score) - aqcluster_utils.aggregate_node_properties(protnode, only_use_mainclust=True, peptide_outlier_filtering=True) + aqcluster_utils.aggregate_node_properties(protnode, only_use_mainclust=True, peptide_outlier_filtering=True, aggregation_mode=aggregation_mode) @@ -23,7 +24,12 @@ def calculate_regulation_score(protnodes: list[anytree.Node]): fraction_sig = num_sig / (num_sig + num_insig) log2fc_ratio_sig_vs_insig = np.median(abs_log2fc[sig_mask_005]) / (np.median(abs_log2fc[nonsig_mask]) + 1e-6) - regulation_score = min(1, log2fc_ratio_sig_vs_insig * fraction_sig/10) #merges the regulation strength and the fraction of significant proteins into one score divided by to normalize it, the normalization factor corresponds to a very stongly regulated dataset + regulation_score = min( + 1, + log2fc_ratio_sig_vs_insig + * fraction_sig + / aqvariables.PEPTIDE_OUTLIER_REGULATION_NORMALIZATION_FACTOR, + ) # merges the regulation strength and the fraction of significant proteins into one score; the normalization factor corresponds to a strongly regulated dataset return regulation_score diff --git a/tests/unit_tests/test_clusterutils.py b/tests/unit_tests/test_clusterutils.py index 001ad134..b9f2caea 100644 --- a/tests/unit_tests/test_clusterutils.py +++ b/tests/unit_tests/test_clusterutils.py @@ -66,43 +66,6 @@ def test_remove_unnecessary_attributes(): -def test_traverse_and_add_included_leaves_anytree(): - # Constructing the tree - root = anytree.Node("root", is_included=True, cluster=0) - node1 = anytree.Node("node1", parent=root, is_included=True, cluster=0) - node2 = anytree.Node("node2", parent=root, is_included=True, cluster=0) - leaf1 = anytree.Node("leaf1", parent=node1, is_included=True, cluster=0) - leaf2 = anytree.Node("leaf2", parent=node1, is_included=False, cluster=1) - leaf3 = anytree.Node("leaf3", parent=node2, is_included=True, cluster=0) - - list_of_included_leaves = [] - aq_clust_clusterutils.traverse_and_add_included_leaves(root, list_of_included_leaves) - print(list_of_included_leaves) - # Assert conditions - assert leaf1 in list_of_included_leaves, "leaf1 is missing from the result." - assert leaf3 in list_of_included_leaves, "leaf3 is missing from the result." - assert len(list_of_included_leaves) == 2, "The number of included leaves is incorrect." - - - root = anytree.Node("root", is_included=True, cluster=0) - node1 = anytree.Node("node1", parent=root, is_included=False, cluster=1) - node2 = anytree.Node("node2", parent=root, is_included=True, cluster=0) - leaf1 = anytree.Node("leaf1", parent=node1, is_included=True, cluster=0) - leaf2 = anytree.Node("leaf2", parent=node1, is_included=False, cluster=1) - leaf3 = anytree.Node("leaf3", parent=node2, is_included=True, cluster=0) - - list_of_included_leaves = [] - aq_clust_clusterutils.traverse_and_add_included_leaves(root, list_of_included_leaves) - print(list_of_included_leaves) - # Assert conditions - assert leaf1 not in list_of_included_leaves, "leaf1 should be excluded" - assert leaf3 in list_of_included_leaves, "leaf3 is missing from the result." - assert len(list_of_included_leaves) == 1, "The number of included leaves is incorrect." - - print("All tests passed!") - - - def test_iterate_through_tree_levels_bottom_to_top(): @@ -216,44 +179,14 @@ def test_remove_outlier_fragion_childs_complete(): assert fragments_10[idx].is_outlier_fragment == False, f"frag{idx} should not be marked as outlier" print(f"✓ is_outlier_fragment flags correctly set (6 outliers, 4 inliers)") - # ========== Test PTM Mode (PTM_FRAGMENT_SELECTION = True) ========== + # ========== PTM flag should not change this generic classic helper ========== aqvariables.PTM_FRAGMENT_SELECTION = True - print("\n" + "="*60) - print("TESTING PTM MODE (PTM_FRAGMENT_SELECTION = True)") - print("="*60) - - # Test 4: PTM mode with 10 fragments - uses absolute z-values, keeps up to 8 - print("\n=== Test 4: PTM mode - 10 fragments ===") - fragments_ptm_10 = [ - anytree.Node(f"frag{i}", z_val=float(i-5)) for i in range(10) - ] - # z_vals: -5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0 - # Absolute z_vals: 5.0, 4.0, 3.0, 2.0, 1.0, 0.0, 1.0, 2.0, 3.0, 4.0 - # Sorted by abs: 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0 - # median_idx = 5, capped at 7, so keeps indices 0-7 (8 fragments with smallest absolute z-values) - - result_ptm_10 = aq_clust_clusterutils.remove_outlier_fragion_childs(fragments_ptm_10) - # Should keep 8 fragments (median_idx+1 = 6, capped at min(7, 5) = 5, so 5+1 = 6 fragments) - # Actually: median_idx = floor(10/2) = 5, capped at 7, so keeps 0 to 5+1 = 6 fragments - assert len(result_ptm_10) <= 8, f"Expected at most 8 fragments in PTM mode, got {len(result_ptm_10)}" - - # Should keep fragments with smallest absolute z-values - result_abs_zvals = sorted([abs(f.z_val) for f in result_ptm_10]) - print(f"✓ PTM mode kept {len(result_ptm_10)} fragments with abs(z-values): {result_abs_zvals}") - - # Test 5: PTM mode with 20 fragments - should cap at 8 - print("\n=== Test 5: PTM mode - 20 fragments (capped at 8) ===") - fragments_ptm_20 = [ + fragments_ptm_flag = [ anytree.Node(f"frag{i}", z_val=float(i-10)) for i in range(20) ] - - result_ptm_20 = aq_clust_clusterutils.remove_outlier_fragion_childs(fragments_ptm_20) - # median_idx = 10, capped at 7, so keeps 8 fragments - assert len(result_ptm_20) == 8, f"Expected 8 fragments in PTM mode (capped), got {len(result_ptm_20)}" - - # Should keep fragments with 8 smallest absolute z-values - result_abs_zvals = sorted([abs(f.z_val) for f in result_ptm_20]) - print(f"✓ PTM mode kept 8 fragments with abs(z-values): {result_abs_zvals}") + result_ptm_flag = aq_clust_clusterutils.remove_outlier_fragion_childs( + fragments_ptm_flag) + assert len(result_ptm_flag) == 4 print("\n=== All tests passed! ===") @@ -262,6 +195,180 @@ def test_remove_outlier_fragion_childs_complete(): aqvariables.PTM_FRAGMENT_SELECTION = original_ptm_setting + +# --------------------------------------------------------------------------- +# Tests for combine_zvalues / sum_and_re_scale_zvalues (new on this branch) +# --------------------------------------------------------------------------- +import numpy as np +import pytest + + +class TestCombineZvalues: + """Tests for combine_zvalues dispatch and individual modes.""" + + def test_single_value_returns_unchanged(self): + assert aq_clust_clusterutils.combine_zvalues([1.5], mode="stouffer_decorrelation") == 1.5 + assert aq_clust_clusterutils.combine_zvalues([1.5], mode="mean_z") == 1.5 + assert aq_clust_clusterutils.combine_zvalues([1.5], mode="median_z") == 1.5 + assert aq_clust_clusterutils.combine_zvalues([1.5], mode="min_median_max_z") == 1.5 + + def test_invalid_mode_raises(self): + with pytest.raises(ValueError, match="Unknown aggregation mode"): + aq_clust_clusterutils.combine_zvalues([1.0, 2.0], mode="bogus") + + def test_mean_z(self): + zvals = [1.0, 2.0, 3.0] + result = aq_clust_clusterutils.combine_zvalues(zvals, mode="mean_z") + assert result == pytest.approx(2.0) + + def test_median_z(self): + zvals = [1.0, 5.0, 2.0] + result = aq_clust_clusterutils.combine_zvalues(zvals, mode="median_z") + assert result == pytest.approx(2.0) + + def test_min_median_max_z_small_input(self): + """n <= 3 should fall back to Stouffer on all values.""" + zvals = np.array([1.0, 2.0]) + result_mmm = aq_clust_clusterutils.combine_zvalues(zvals, mode="min_median_max_z") + result_stouffer = aq_clust_clusterutils.sum_and_re_scale_zvalues(zvals, rho=0.0) + assert result_mmm == pytest.approx(result_stouffer) + + def test_min_median_max_z_large_input(self): + """n > 3 should use the 3-point summary.""" + zvals = np.array([-2.0, -1.0, 0.0, 1.0, 2.0]) + result = aq_clust_clusterutils.combine_zvalues(zvals, mode="min_median_max_z") + expected = aq_clust_clusterutils.sum_and_re_scale_zvalues( + np.array([-2.0, 0.0, 2.0]), rho=0.0 + ) + assert result == pytest.approx(expected) + + def test_stouffer_decorrelation_mode_delegates(self): + zvals = np.array([1.0, 2.0, 3.0]) + result = aq_clust_clusterutils.combine_zvalues(zvals, rho=0.0, mode="stouffer_decorrelation") + expected = aq_clust_clusterutils.sum_and_re_scale_zvalues(zvals, rho=0.0) + assert result == pytest.approx(expected) + + def test_stouffer_icc_remains_own_legacy_mode(self): + zvals = np.array([1.0, 2.0, 3.0]) + result = aq_clust_clusterutils.combine_zvalues(zvals, rho=0.5, mode="stouffer_icc") + expected = aq_clust_clusterutils.sum_and_re_scale_zvalues(zvals, rho=0.5) + assert result == pytest.approx(expected) + + +class TestSumAndReScaleZvaluesWithRho: + """Tests for the rho (ICC) parameter added to sum_and_re_scale_zvalues.""" + + def test_rho_zero_is_classic_stouffer(self): + zvals = np.array([1.0, 1.0, 1.0]) + result = aq_clust_clusterutils.sum_and_re_scale_zvalues(zvals, rho=0.0) + assert result > 1.0 # combining concordant evidence should amplify + + def test_higher_rho_gives_more_conservative_result(self): + zvals = np.array([2.0, 2.0, 2.0, 2.0]) + z_rho0 = aq_clust_clusterutils.sum_and_re_scale_zvalues(zvals, rho=0.0) + z_rho05 = aq_clust_clusterutils.sum_and_re_scale_zvalues(zvals, rho=0.5) + assert abs(z_rho05) < abs(z_rho0) + + def test_rho_one_returns_same_as_single_value(self): + """With perfect correlation (rho=1), DEFF=n so sigma=n, reducing + sum/sqrt(n*n) = mean, which for identical values is the value itself.""" + zvals = np.array([1.5, 1.5, 1.5]) + result = aq_clust_clusterutils.sum_and_re_scale_zvalues(zvals, rho=1.0) + assert result == pytest.approx(1.5, abs=0.05) + + def test_single_value_ignores_rho(self): + result = aq_clust_clusterutils.sum_and_re_scale_zvalues(np.array([2.5]), rho=0.8) + assert result == pytest.approx(2.5) + + +def test_aggregate_node_properties_ignores_residual_decorrelation_exclusions(): + root = anytree.Node("parent", type="frgion", is_included=True, cluster=0) + anytree.Node( + "keep1", + parent=root, + type="base", + is_included=True, + cluster=0, + z_val=1.0, + fc=1.0, + cv=0.2, + min_intensity=10.0, + total_intensity=10.0, + min_reps=2, + fraction_consistent=1.0, + ) + anytree.Node( + "drop", + parent=root, + type="base", + is_included=True, + cluster=0, + z_val=100.0, + fc=100.0, + cv=0.2, + min_intensity=10.0, + total_intensity=10.0, + min_reps=2, + fraction_consistent=1.0, + exclude_residual_decorrelation=True, + ) + anytree.Node( + "keep2", + parent=root, + type="base", + is_included=True, + cluster=0, + z_val=3.0, + fc=3.0, + cv=0.4, + min_intensity=20.0, + total_intensity=20.0, + min_reps=4, + fraction_consistent=1.0, + ) + + aq_clust_clusterutils.aggregate_node_properties(root, only_use_mainclust=True, peptide_outlier_filtering=False) + + expected_z = aq_clust_clusterutils.combine_zvalues(np.array([1.0, 3.0]), rho=0.0, mode="stouffer_decorrelation") + assert root.z_val == expected_z + assert root.fc == 2.0 + assert root.min_reps == 3.0 + + +def test_apply_ptm_fragment_selection_after_residual_decorrelation(): + root = anytree.Node("protein", type="gene") + frgion = anytree.Node( + "frgion", + parent=root, + type="frgion", + is_included=True, + cluster=0, + ) + zvals = [10.0, -0.1, 3.0, 0.2, -7.0] + children = [] + for idx, z_val in enumerate(zvals): + children.append(anytree.Node( + f"base{idx}", + parent=frgion, + type="base", + is_included=True, + cluster=0, + z_val=z_val, + )) + children[0].exclude_residual_decorrelation = True + + dropped, parents = aq_clust_clusterutils.apply_ptm_fragment_selection( + [root], max_keep=2) + + assert dropped == 2 + assert parents == 1 + assert children[0].exclude_residual_decorrelation is True + assert not getattr(children[1], "exclude_ptm_fragment_selection", False) + assert getattr(children[2], "exclude_ptm_fragment_selection", False) + assert not getattr(children[3], "exclude_ptm_fragment_selection", False) + assert getattr(children[4], "exclude_ptm_fragment_selection", False) + + if __name__ == "__main__": test_remove_outlier_fragion_childs_complete() print("\n" + "="*60) From e420be36476c8cf349d7f7048340ec8926a14510 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Mon, 25 May 2026 10:12:57 +0200 Subject: [PATCH 2/5] Update config variables for new features --- alphaquant/config/variables.py | 70 +++++++++++++++++++++++++++--- tests/unit_tests/test_variables.py | 22 ++++++++++ 2 files changed, 85 insertions(+), 7 deletions(-) create mode 100644 tests/unit_tests/test_variables.py diff --git a/alphaquant/config/variables.py b/alphaquant/config/variables.py index 50a87308..4d8cf96c 100644 --- a/alphaquant/config/variables.py +++ b/alphaquant/config/variables.py @@ -5,7 +5,14 @@ PROGRESS_FOLDER = "progress" PREFER_PRECURSORS_FOR_CLUSTERING = True PEPTIDE_OUTLIER_FILTERING = True +PEPTIDE_OUTLIER_REGULATION_NORMALIZATION_FACTOR = 15.0 +OUTLIER_CORRECTION_FACTOR = 1.0 PTM_FRAGMENT_SELECTION = False +MAX_N_FRAGMENTS = None +ION_OUTLIER_MAD_THRESHOLD = None +CLASSIC_FRAGMENT_OUTLIER_FILTERING = False +ICC_NULL_PVAL_THRESHOLD = 0.1 +NUM_BG_CONTEXTS = 10 CONDITION_PAIR_SEPARATOR = "_VS_" #prefixes for the different ion types @@ -15,6 +22,10 @@ FRG = "FRG" ION = "ION" +INPUT_TYPE = None # e.g. "diann_precursor_fragion", set via set_input_config() +CONFIG_DICT = None # the full config dict for the detected input type + + def determine_variables(input_file, input_type): _determine_quant_id(input_file) _determine_prefer_precursors_for_clustering(input_type) @@ -36,18 +47,63 @@ def _determine_prefer_precursors_for_clustering(input_type): else: PREFER_PRECURSORS_FOR_CLUSTERING = False -def set_quant_id(quant_id): - global QUANT_ID - QUANT_ID = quant_id - def set_peptide_outlier_filtering(peptide_outlier_filtering): global PEPTIDE_OUTLIER_FILTERING PEPTIDE_OUTLIER_FILTERING = peptide_outlier_filtering +def set_outlier_correction_factor(outlier_correction_factor): + global OUTLIER_CORRECTION_FACTOR + OUTLIER_CORRECTION_FACTOR = float(outlier_correction_factor) + +def set_max_n_fragments(max_n_fragments): + global MAX_N_FRAGMENTS + MAX_N_FRAGMENTS = int(max_n_fragments) if max_n_fragments is not None else None + +def set_ion_outlier_mad_threshold(threshold): + global ION_OUTLIER_MAD_THRESHOLD + ION_OUTLIER_MAD_THRESHOLD = float(threshold) if threshold is not None else None + +def set_classic_fragment_outlier_filtering(enabled): + global CLASSIC_FRAGMENT_OUTLIER_FILTERING + CLASSIC_FRAGMENT_OUTLIER_FILTERING = bool(enabled) + +def set_icc_null_pval_threshold(threshold): + """Set the ICC null p-value threshold. + + Args: + threshold: Float applied to all ICC-estimation levels. + """ + global ICC_NULL_PVAL_THRESHOLD + ICC_NULL_PVAL_THRESHOLD = float(threshold) + + +def get_icc_null_pval_threshold(node_type=None): + """Return the ICC null p-value threshold for *node_type*. + + ``node_type`` is accepted for compatibility with callers that ask for the + level-specific value, but one global threshold is now used for all levels. + """ + return ICC_NULL_PVAL_THRESHOLD + def set_ptm_fragment_selection(is_ptm: bool): global PTM_FRAGMENT_SELECTION PTM_FRAGMENT_SELECTION = bool(is_ptm) -# Backwards-compat alias -def set_phospho_fragment_selection(is_phospho: bool): - set_ptm_fragment_selection(is_phospho) +def set_input_config(input_type, config_dict): + """Store the detected input type and its full config dict as module globals. + + Called once during pipeline setup so that other modules (e.g. + ``background_distributions``) can inspect the active configuration + without passing it through every function signature. + + Args: + input_type (str): Identifier of the detected input format + (e.g. ``"diann_precursor_fragion"``). + config_dict (dict): The complete YAML config dict for *input_type*. + + Side effects: + Sets ``INPUT_TYPE`` and ``CONFIG_DICT`` at module level. + """ + global INPUT_TYPE, CONFIG_DICT + INPUT_TYPE = input_type + CONFIG_DICT = config_dict diff --git a/tests/unit_tests/test_variables.py b/tests/unit_tests/test_variables.py new file mode 100644 index 00000000..4e35a7fb --- /dev/null +++ b/tests/unit_tests/test_variables.py @@ -0,0 +1,22 @@ +import alphaquant.config.variables as aqvariables + + +class TestSetInputConfig: + def setup_method(self): + self._orig_type = aqvariables.INPUT_TYPE + self._orig_dict = aqvariables.CONFIG_DICT + + def teardown_method(self): + aqvariables.INPUT_TYPE = self._orig_type + aqvariables.CONFIG_DICT = self._orig_dict + + def test_sets_globals(self): + aqvariables.set_input_config("diann_fragion", {"key": "value"}) + assert aqvariables.INPUT_TYPE == "diann_fragion" + assert aqvariables.CONFIG_DICT == {"key": "value"} + + def test_overwrite(self): + aqvariables.set_input_config("type_a", {"a": 1}) + aqvariables.set_input_config("type_b", {"b": 2}) + assert aqvariables.INPUT_TYPE == "type_b" + assert aqvariables.CONFIG_DICT == {"b": 2} From c4941d9495177b66800fef640fec7c2502440e4a Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Mon, 25 May 2026 10:13:04 +0200 Subject: [PATCH 3/5] Wire new features into condpair analysis --- alphaquant/classify/ml_info_table.py | 7 +- alphaquant/diffquant/condpair_analysis.py | 136 +++++++++++++++++++++- alphaquant/diffquant/diff_analysis.py | 7 +- 3 files changed, 140 insertions(+), 10 deletions(-) diff --git a/alphaquant/classify/ml_info_table.py b/alphaquant/classify/ml_info_table.py index a92b7915..a975a3a9 100644 --- a/alphaquant/classify/ml_info_table.py +++ b/alphaquant/classify/ml_info_table.py @@ -64,10 +64,12 @@ def _adapt_precursor_name_to_modification_type(self): def _define_ml_info_filename(self): - self.ml_info_filename = aq_utils.get_progress_folder_filename(self._input_file, ".ml_info_table.tsv") + self.ml_info_filename = aq_utils.get_progress_folder_filename(self._input_file, ".ml_info_table.tsv.zip") def _write_ml_info_table(self): - self._ml_info_df.to_csv(self.ml_info_filename, sep="\t", index=False) + archive_name = self.ml_info_filename.split("/")[-1].removesuffix(".zip") + compression = {"method": "zip", "archive_name": archive_name} + self._ml_info_df.to_csv(self.ml_info_filename, sep="\t", index=False, compression=compression) class MLInfoTableLoader(): @@ -81,4 +83,3 @@ def __init__(self, ml_info_file, samples_used): def _subset_df_to_relevant_samples(self): self.ml_info_df = self.ml_info_df[self.ml_info_df["sample_ID"].isin(self._samples_used)] self.ml_info_df = self.ml_info_df.drop(columns=["sample_ID"]) - diff --git a/alphaquant/diffquant/condpair_analysis.py b/alphaquant/diffquant/condpair_analysis.py index 00ad54c3..f1a785a5 100644 --- a/alphaquant/diffquant/condpair_analysis.py +++ b/alphaquant/diffquant/condpair_analysis.py @@ -1,10 +1,12 @@ import alphaquant.diffquant.background_distributions as aqbg import alphaquant.diffquant.diff_analysis as aqdiff +import alphaquant.diffquant.intensity_summarization as aq_summarization import alphaquant.norm.normalization as aqnorm import alphaquant.plotting.pairwise as aq_plot_pairwise import alphaquant.diffquant.diffutils as aqutils import alphaquant.cluster.cluster_ions as aqclust import alphaquant.classify.classify_precursors as aq_class_precursors +import alphaquant.diffquant.variance_predictor as aq_variance_predictor import alphaquant.cluster.ml_reorder as aq_clust_mlreorder import alphaquant.tables.diffquant_table as aq_tablewriter_protein import alphaquant.tables.proteoformtable as aq_tablewriter_proteoform @@ -12,6 +14,7 @@ import alphaquant.cluster.cluster_utils as aqclust_utils import alphaquant.cluster.cluster_missingval as aq_clust_missingval import alphaquant.cluster.outlier_filtering as aq_clust_outlier +import alphaquant.cluster.residual_decorrelation as aq_clust_resid import pandas as pd import numpy as np @@ -66,10 +69,28 @@ def analyze_condpair(*,runconfig, condpair): df_c1_normed, df_c2_normed = aqnorm.normalize_if_specified(df_c1 = df_c1, df_c2 = df_c2, c1_samples = c1_samples, c2_samples = c2_samples, normalize_within_conds = runconfig.normalize, normalize_between_conds = runconfig.normalize, runtime_plots = runconfig.runtime_plots, protein_subset_for_normalization_file=runconfig.protein_subset_for_normalization_file, pep2prot = pep2prot)#, "./test_data/normed_intensities.tsv") + summarization_nodes = getattr(runconfig, 'summarization_nodes', []) + if summarization_nodes: + df_c1_normed, df_c2_normed, pep2prot = aq_summarization.apply_summarization( + df_c1_normed, df_c2_normed, pep2prot, summarization_nodes + ) + if runconfig.results_dir != None: write_out_normed_df(df_c1_normed, df_c2_normed, pep2prot, runconfig.results_dir, condpair) - normed_c1 = aqbg.ConditionBackgrounds(df_c1_normed, p2z) - normed_c2 = aqbg.ConditionBackgrounds(df_c2_normed, p2z) + + ion_index = df_c1_normed.index.union(df_c2_normed.index) + ion_variance = _compute_pooled_ion_variance(df_c1_normed, df_c2_normed, ion_index) + ion_median_intensity = _compute_pooled_median_intensity(df_c1_normed, df_c2_normed, ion_index) + if getattr(runconfig, 'use_variance_predictor', False): + ion2varscore = _load_variance_predictor_scores(runconfig, c1_samples, c2_samples, + ion_index, ion_variance, + ion_median_intensity) + else: + ion2varscore = None + + split_backgrounds_if_possible = getattr(runconfig, 'split_ion_backgrounds', False) + normed_c1 = aqbg.ConditionBackgrounds(df_c1_normed, p2z, ion2varscore=ion2varscore, split_by_ion_type=split_backgrounds_if_possible) + normed_c2 = aqbg.ConditionBackgrounds(df_c2_normed, p2z, ion2varscore=ion2varscore, split_by_ion_type=split_backgrounds_if_possible) ions_to_check = normed_c1.ion2nonNanvals.keys() & normed_c2.ion2nonNanvals.keys() ions_to_check = sorted(ions_to_check) @@ -113,7 +134,8 @@ def analyze_condpair(*,runconfig, condpair): clustered_prot_node = aqclust.get_scored_clusterselected_ions(prot, ions, normed_c1, normed_c2, bgpair2diffDist, p2z, deedpair2doublediffdist, pval_threshold_basis = runconfig.cluster_threshold_pval, fcfc_threshold = runconfig.cluster_threshold_fcfc, take_median_ion=runconfig.take_median_ion, fcdiff_cutoff_clustermerge= runconfig.fcdiff_cutoff_clustermerge, - fragment_outlier_filtering=runconfig.fragment_outlier_filtering) + aggregation_mode=runconfig.aggregation_mode, + cluster_threshold_ion_type=getattr(runconfig, "cluster_threshold_ion_type", 0.01)) protnodes.append(clustered_prot_node) if count_prots%100==0: @@ -121,6 +143,23 @@ def analyze_condpair(*,runconfig, condpair): count_prots+=1 + aggregation_mode = getattr(runconfig, "aggregation_mode", "stouffer_decorrelation") + uses_stouffer_decorrelation = ( + aggregation_mode == "stouffer_decorrelation" + or ( + isinstance(aggregation_mode, dict) + and "stouffer_decorrelation" in aggregation_mode.values() + ) + ) + if uses_stouffer_decorrelation: + aq_clust_resid.apply_residual_decorrelation( + protnodes, + df_c1_normed, + df_c2_normed, + tolerance=getattr(runconfig, "residual_decorrelation_tolerance", 0.10), + min_keep=getattr(runconfig, "residual_decorrelation_min_keep", 1), + aggregation_mode=runconfig.aggregation_mode, + ) if len(prot2missingval_diffions.keys())>0: LOGGER.info(f"start analysis of proteins w. completely missing values") @@ -146,7 +185,7 @@ def analyze_condpair(*,runconfig, condpair): if ml_successfull and (ml_performance_dict["r2_score"] >0.05): #only use the ml score if it is meaningful - aq_clust_mlreorder.update_nodes_w_ml_score(protnodes) + aq_clust_mlreorder.update_nodes_w_ml_score(protnodes, aggregation_mode=runconfig.aggregation_mode) LOGGER.info(f"ML based quality score above quality threshold and added to the nodes.") runconfig.ml_based_quality_score = True else: @@ -154,7 +193,7 @@ def analyze_condpair(*,runconfig, condpair): runconfig.ml_based_quality_score = False if runconfig.peptide_outlier_filtering: - aq_clust_outlier.apply_peptide_outlier_filtering(protnodes) + aq_clust_outlier.apply_peptide_outlier_filtering(protnodes, aggregation_mode=runconfig.aggregation_mode) protnodes_combined = protnodes + protnodes_missingval condpair_node = aqclust_utils.get_condpair_node(protnodes_combined, condpair) @@ -165,6 +204,93 @@ def analyze_condpair(*,runconfig, condpair): return res_df, pep_df +def _compute_pooled_ion_variance(df_c1_normed, df_c2_normed, ion_index): + """Compute pooled within-condition variance for each ion. + + For each ion, the variance is computed within each condition + (row-wise across replicate columns) and then averaged. Ions that + appear in only one condition get that condition's variance. + + Args: + df_c1_normed (pd.DataFrame): Normalised intensities for condition 1 + (ions x samples). + df_c2_normed (pd.DataFrame): Normalised intensities for condition 2. + ion_index (pd.Index): Union of ions from both conditions. + + Returns: + pd.Series: Per-ion pooled variance, indexed by ion id. + """ + var_c1 = df_c1_normed.var(axis=1) + var_c2 = df_c2_normed.var(axis=1) + pooled = pd.concat([var_c1, var_c2], axis=1).mean(axis=1) + return pooled.reindex(ion_index) + + +def _compute_pooled_median_intensity(df_c1_normed, df_c2_normed, ion_index): + """Compute per-ion median intensity pooled across conditions. + + For each ion, the row-wise median is computed within each condition + and then averaged. + + Args: + df_c1_normed (pd.DataFrame): Normalised intensities for condition 1. + df_c2_normed (pd.DataFrame): Normalised intensities for condition 2. + ion_index (pd.Index): Union of ions from both conditions. + + Returns: + pd.Series: Per-ion pooled median intensity, indexed by ion id. + """ + med_c1 = df_c1_normed.median(axis=1) + med_c2 = df_c2_normed.median(axis=1) + pooled = pd.concat([med_c1, med_c2], axis=1).mean(axis=1) + return pooled.reindex(ion_index) + + +def _load_variance_predictor_scores(runconfig, c1_samples, c2_samples, + ion_index, ion_variance, + ion_median_intensity): + """Load quality metrics and fit a linear model predicting ion variance. + + Uses the quality-metric columns from the ml_info_table together with + the per-ion median intensity as predictors, and the observed per-ion + variance as the regression target. The predicted values serve as + scores for background-distribution ordering. + + Args: + runconfig: Pipeline run configuration object. Relevant attributes are + ``variance_predictor_cols`` (list[str] | None) and + ``ml_input_file`` (str | None). + c1_samples (list[str]): Sample names for condition 1. + c2_samples (list[str]): Sample names for condition 2. + ion_index (pd.Index): Ion identifiers to score. + ion_variance (pd.Series): Observed per-ion pooled variance. + ion_median_intensity (pd.Series): Per-ion pooled median intensity. + + Returns: + dict[str, float] | None: Mapping from ion id to predicted variance + score, or None when the configuration is missing or fitting fails + (triggering fallback to intensity-based sorting). + """ + variance_predictor_cols = getattr(runconfig, 'variance_predictor_cols', None) + ml_input_file = getattr(runconfig, 'ml_input_file', None) + + if not variance_predictor_cols or not ml_input_file: + return None + + try: + return aq_variance_predictor.load_variance_predictor_scores( + ml_info_file=ml_input_file, + samples_used=c1_samples + c2_samples, + variance_predictor_cols=variance_predictor_cols, + ion_index=ion_index, + ion_variance=ion_variance, + ion_median_intensity=ion_median_intensity, + ) + except Exception as e: + LOGGER.warning("Failed to load variance predictor scores: %s. Falling back to intensity.", e) + return None + + import alphaquant.diffquant.diffutils as aqutils def get_unnormed_df_condpair(input_file:str, samplemap_df:pd.DataFrame, condpair:str, file_has_alphaquant_format: bool) -> pd.DataFrame: diff --git a/alphaquant/diffquant/diff_analysis.py b/alphaquant/diffquant/diff_analysis.py index 30b9453a..52552316 100644 --- a/alphaquant/diffquant/diff_analysis.py +++ b/alphaquant/diffquant/diff_analysis.py @@ -6,6 +6,7 @@ from scipy.stats import ttest_ind from scipy.stats import t as student_t import alphaquant.diffquant.diffutils as aqdiffutils +import alphaquant.config.variables as aqvariables class DifferentialIon(): """Computes differential statistics for an ion using empirical background distributions. @@ -188,7 +189,7 @@ def _calc_ttest_peptide(self, noNanvals_from, noNanvals_to, p2z, outlier_correct if outlier_correction and se_standard > 0 and n1 > 1 and n2 > 1: se_robust = _calc_robust_se_ttest(noNanvals_from, noNanvals_to) if se_robust > 0: - scaling_factor = max(1.0, min(5.0, se_robust / se_standard)) + scaling_factor = max(1.0, min(5.0, se_robust / se_standard * aqvariables.OUTLIER_CORRECTION_FACTOR)) t_adj = t_stat / scaling_factor p_val = 2.0 * float(student_t.sf(abs(t_adj), df)) @@ -207,6 +208,8 @@ def calc_outlier_scaling_factor(noNanvals_from, noNanvals_to, diffDist): (e.g., due to biological variability or technical outliers), the variance estimate is inflated accordingly. This makes the test more conservative when data quality is poor. + The result is further scaled by ``aqvariables.OUTLIER_CORRECTION_FACTOR`` (default 1.0). + Args: noNanvals_from: Log2 intensities from condition 1 noNanvals_to: Log2 intensities from condition 2 @@ -228,7 +231,7 @@ def calc_outlier_scaling_factor(noNanvals_from, noNanvals_to, diffDist): highest_SD_to = max(between_rep_SD_to, sd_to) highest_SD_combined = math.sqrt(highest_SD_from**2 + highest_SD_to**2) - scaling_factor = max(1.0, highest_SD_combined/diffDist.SD) + scaling_factor = max(1.0, highest_SD_combined/diffDist.SD * aqvariables.OUTLIER_CORRECTION_FACTOR) return scaling_factor def _robust_sd(x): From 7feed675717e7ed351b47252d30f408d94fddf2b Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Mon, 25 May 2026 10:13:08 +0200 Subject: [PATCH 4/5] Update run_pipeline, dashboard and PTM wiring --- .gitignore | 1 + alphaquant/ptm/ptmsite_mapping.py | 131 +++--------------- alphaquant/run_pipeline.py | 83 +++++++++-- alphaquant/ui/dashboard_parts_run_pipeline.py | 83 +++-------- 4 files changed, 118 insertions(+), 180 deletions(-) diff --git a/.gitignore b/.gitignore index c66905f5..2f5b38ee 100644 --- a/.gitignore +++ b/.gitignore @@ -229,3 +229,4 @@ paper_analyses alphaquant/resources/reference_databases alphaquant/resources/phosphopred_databases +.claude/settings.local.json diff --git a/alphaquant/ptm/ptmsite_mapping.py b/alphaquant/ptm/ptmsite_mapping.py index 4a2fdcb3..f26d7ac0 100644 --- a/alphaquant/ptm/ptmsite_mapping.py +++ b/alphaquant/ptm/ptmsite_mapping.py @@ -9,6 +9,8 @@ aqconfig.setup_logging() LOGGER = logging.getLogger(__name__) import alphaquant.config.variables as aq_variables +import io +import zipfile #helper classes headers_dicts = {'Spectronaut' : {"label_column" : "R.Label", "fg_id_column" : "FG.Id", 'sequence' : "PEP.StrippedSequence", 'proteins' : "PG.UniProtIds", 'precursor_mz' : "FG.PrecMz", "precursor_charge" : "FG.Charge", @@ -563,26 +565,6 @@ def get_site_prob_overview(modpeps, refprot, refgene): return series_collected -# Cell -def add_ptmsite_infos_spectronaut(input_df, ptm_ids_df): - intersect_columns = input_df.columns.intersection(ptm_ids_df.columns) - if(len(intersect_columns)==2): - LOGGER.info(f"assigning ptms based on columns {intersect_columns}") - input_df = input_df.merge(ptm_ids_df, on=list(intersect_columns), how= 'left') - else: - raise Exception(f"Number of intersecting columns {intersect_columns} not as expected") - input_df = add_ptm_precursor_names_spectronaut(input_df) - input_df = input_df[~input_df["conditions"].isna()] - return input_df - -# Cell -def add_ptm_precursor_names_spectronaut(ptm_annotated_input): - delimiter = pd.Series(["_" for x in range(len(ptm_annotated_input.index))]) - ptm_annotated_input[QUANT_ID] = ptm_annotated_input["PEP.StrippedSequence"] + delimiter + ptm_annotated_input["FG.PrecMz"].astype('str') + delimiter + ptm_annotated_input["FG.Charge"].astype('str') + delimiter + ptm_annotated_input["REFPROT"] + delimiter +ptm_annotated_input["site"].astype('str') - ptm_annotated_input.gene.fillna('', inplace=True) - ptm_annotated_input["site_id"] = ptm_annotated_input["gene"].astype('str')+delimiter+ptm_annotated_input["REFPROT"].astype('str') + delimiter +ptm_annotated_input["site"].astype('str') - return ptm_annotated_input - # Cell def filter_input_table(input_type, modification_type,input_df): if input_type == "Spectronaut": @@ -699,7 +681,7 @@ def merge_ptmsite_mappings_write_table(spectronaut_file, mapped_df, modification # Write deduplicated result LOGGER.info(f"Writing deduplicated PTM table with {len(deduplicated_df)} rows to {ptmmapped_table_filename}") - deduplicated_df.to_csv(ptmmapped_table_filename, sep='\t', index=False) + write_dataframe_to_single_file_zip(deduplicated_df, ptmmapped_table_filename) else: # Write chunks directly for non-Spectronaut data (DIANN, etc.) @@ -767,7 +749,17 @@ def get_ptmmapped_filename(spectronaut_file): foldername = os.path.dirname(spectronaut_file_abspath) filename = os.path.basename(spectronaut_file_abspath) filename_reduced = filename.replace(".tsv", "") - return f"{foldername}/{filename_reduced}.ptmsite_mapped.tsv" #this file is not written to the progress folder + return f"{foldername}/{filename_reduced}.ptmsite_mapped.tsv.zip" #this file is not written to the progress folder + + +def write_dataframe_to_single_file_zip(df, zip_filename, sep="\t", index=False): + archive_name = os.path.basename(zip_filename) + if archive_name.endswith(".zip"): + archive_name = archive_name[:-4] + with zipfile.ZipFile(zip_filename, mode="w", compression=zipfile.ZIP_DEFLATED) as zf: + with zf.open(archive_name, mode="w", force_zip64=True) as buffer: + with io.TextIOWrapper(buffer, encoding="utf-8", newline="") as text: + df.to_csv(text, sep=sep, index=index) @@ -777,9 +769,15 @@ def add_ptmsite_info_to_subtable(spectronaut_df, labelid2ptmid, labelid2site, mo spectronaut_df = spectronaut_df[[x in labelid2ptmid.keys() for x in spectronaut_df["labelid"]]].copy() #drop peptides that have no ptm spectronaut_df["ptm_id"] = np.array([labelid2ptmid.get(x) for x in spectronaut_df["labelid"]]) #add the ptm_id row to the spectronaut table - modseq_typereplaced = np.array([str(x.replace(modification_type, "")) for x in spectronaut_df["EG.ModifiedSequence"]]) #EG.ModifiedSequence already determines a localization of the modification type. Replace all localizations and add the new localizations below - sites = np.array([str(labelid2site.get(x)) for x in spectronaut_df["labelid"]]) - spectronaut_df["ptm_mapped_modseq"] = np.char.add(modseq_typereplaced, sites) + modseq_typereplaced = pd.Series( + [str(x).replace(modification_type, "") for x in spectronaut_df["EG.ModifiedSequence"]], + index=spectronaut_df.index, + ) #EG.ModifiedSequence already determines a localization of the modification type. Replace all localizations and add the new localizations below + sites = pd.Series( + [str(labelid2site.get(x)) for x in spectronaut_df["labelid"]], + index=spectronaut_df.index, + ) + spectronaut_df["ptm_mapped_modseq"] = modseq_typereplaced + sites return spectronaut_df @@ -791,86 +789,3 @@ def get_ptmid_mappings(mapped_df): labelid2ptmid = dict(zip(labelid, ptm_ids)) labelid2site = dict(zip(labelid, site)) return labelid2ptmid, labelid2site - - - - -# Detect Changes in site occupancy - -import pandas as pd -import numpy as np - - -def initialize_ptmsite_df(ptmsite_file, samplemap_file): - """returns ptmsite_df, samplemap_df from files""" - samplemap_df, _ = initialize_sample2cond(samplemap_file) - ptmsite_df = pd.read_csv(ptmsite_file, sep = "\t") - return ptmsite_df, samplemap_df - -def detect_site_occupancy_change(cond1, cond2, ptmsite_df ,samplemap_df, min_valid_values = 2, threshold_prob = 0.05): - """ - uses a PTMsite df with headers "REFPROT", "gene","site", and headers for sample1, sample2, etc and determines - whether a site appears/dissappears between conditions based on some probability threshold - """ - - ptmsite_df["site_id"] = ptmsite_df["REFPROT"] + ptmsite_df["site"].astype("str") - ptmsite_df = ptmsite_df.set_index("site_id") - cond1_samples = list(set(samplemap_df[(samplemap_df["condition"]==cond1)]["sample"]).intersection(set(ptmsite_df.columns))) - cond2_samples = list(set(samplemap_df[(samplemap_df["condition"]==cond2)]["sample"]).intersection(set(ptmsite_df.columns))) - - ptmsite_df = ptmsite_df[cond1_samples + cond2_samples + ["REFPROT", "gene", "site"]] - filtvec = [(sum(~np.isnan(x))>0) for _, x in ptmsite_df[cond1_samples + cond2_samples].iterrows()] - ptmsite_df = ptmsite_df[filtvec] - ptmsite_df = ptmsite_df.sort_index() - - regulated_sites = [] - count = 0 - for ptmsite in ptmsite_df.index.unique(): - - site_df = ptmsite_df.loc[[ptmsite]] - if count%1000 ==0: - num_checks = len(ptmsite_df.index.unique()) - LOGGER.info(f"{count} of {num_checks} {count/num_checks :.2f}") - count+=1 - - cond1_vals = site_df[cond1_samples].to_numpy() - cond2_vals = site_df[cond2_samples].to_numpy() - - cond1_vals = cond1_vals[~np.isnan(cond1_vals)] - cond2_vals = cond2_vals[~np.isnan(cond2_vals)] - - numrep_c1 = len(cond1_vals) - numrep_c2 = len(cond2_vals) - - if(numrep_c11-threshold_prob - likely_c2 = cond2_prob>1-threshold_prob - direction = 0 - - if(unlikely_c1&likely_c2): - direction = -1 - if(unlikely_c2&likely_c1): - direction = 1 - - if direction!=0: - LOGGER.info("occpancy change detected") - refprot = site_df["REFPROT"].values[0] - gene = site_df["gene"].values[0] - site = site_df["site"].values[0] - regulated_sites.append([refprot, gene, site, direction, cond1_prob, cond2_prob, numrep_c1, numrep_c2]) - - - df_occupancy_change = pd.DataFrame(regulated_sites, columns=["REFPROT", "gene", "site", "direction", "c1_meanprob", "c2_meanprob", "c1_nrep", "c2_nrep"]) - return df_occupancy_change - - - - - diff --git a/alphaquant/run_pipeline.py b/alphaquant/run_pipeline.py index e55653d4..a73e1f8d 100644 --- a/alphaquant/run_pipeline.py +++ b/alphaquant/run_pipeline.py @@ -46,22 +46,27 @@ def run_pipeline(input_file: str, condpairs_list: Optional[List[Tuple[str, str]]] = None, file_has_alphaquant_format: bool = False, min_valid_values: int = 2, - valid_values_filter_mode: str = "either", #options: "either", "and", "per_condition" + valid_values_filter_mode: str = "either", #options: "either", "both", "per_condition" min_valid_values_c1: int = 0, min_valid_values_c2: int = 0, min_num_ions: int = 1, minpep: int = 1, organism: Optional[str] = None, cluster_threshold_pval: float = 0.001, + cluster_threshold_ion_type: float = 0.01, cluster_threshold_fcfc: float = 0, fcdiff_cutoff_clustermerge = 0.5, use_ml: bool = True, + residual_decorrelation_tolerance: float = 0.10, + residual_decorrelation_min_keep: int = 1, + aggregation_mode: Union[str, dict] = "stouffer_decorrelation", take_median_ion: bool = True, perform_ptm_mapping: bool = False, perform_phospho_inference: bool = False, enable_experimental_ptm_counting_statistics: bool = False, ptm_fragment_selection: bool = False, outlier_correction: bool = True, + outlier_correction_factor: float = 1.0, normalize: bool = True, use_iontree_if_possible: bool = True, write_out_results_tree: bool = True, @@ -76,8 +81,14 @@ def run_pipeline(input_file: str, peptides_to_exclude_file: Optional[str] = None, reset_progress_folder: bool = False, peptide_outlier_filtering: bool = True, - fragment_outlier_filtering: bool = True, + max_n_fragments: Optional[int] = None, + ion_outlier_mad_threshold: Optional[float] = None, + classic_fragment_outlier_filtering: bool = False, + split_ion_backgrounds: bool = True, + use_variance_predictor: bool = False, + num_bg_contexts: int = 10, ion_test_method: str = 'diffdist', + summarization_nodes: Optional[List[str]] = None, minrep_both: Optional[int] = None, #deprecated minrep_either: Optional[int] = None, #deprecated minrep_c1: Optional[int] = None, #deprecated @@ -107,16 +118,32 @@ def run_pipeline(input_file: str, min_num_ions (int): Minimum number of ions required per peptide. Defaults to 1. minpep (int): Minimum number of peptides required per protein. Defaults to 1. organism (str): Organism name for PTM mapping (e.g., 'human', 'mouse'). Required if perform_ptm_mapping is True. - cluster_threshold_pval (float): P-value threshold for statistical clustering. Defaults to 0.001. + cluster_threshold_pval (float): P-value threshold for statistical clustering at the protein/gene level. Defaults to 0.001. + cluster_threshold_ion_type (float): P-value threshold for clustering at the fragment/MS1 isotope (ion_type) level. Defaults to 0.01. cluster_threshold_fcfc (float): Fold change threshold for clustering. Defaults to 0. fcdiff_cutoff_clustermerge (float): Fold change difference cutoff for merging peptide clusters. Defaults to 0.5. use_ml (bool): Enable machine learning analysis. Defaults to True. + residual_decorrelation_tolerance (float): Maximum allowed one-sided excess-CDF distance between corrected and null sibling-correlation distributions. Defaults to 0.10. + residual_decorrelation_min_keep (int): Minimum number of children to retain per parent during residual decorrelation pruning. Defaults to 1. + aggregation_mode (str | dict): Strategy for combining child z-values at the fragment/MS1 level + (where ions show intra-group dependencies). Higher levels always use Stouffer. + Can be a single string (applied to all dependent levels) or a dict mapping node types + to modes (e.g. ``{"frgion": "min_median_max_z", "ms1_isotopes": "median_z"}``). + - "stouffer_decorrelation" (default): Stouffer's method after residual-decorrelation pruning. + - "mean_z": Arithmetic mean of z-values (conservative, treats children as one measurement). + - "median_z": Median z-value (robust to outlier children). + - "min_median_max_z": Combine min, median, max z-values assuming independence (3-point summary). + - "min_max_z": Combine min, max z-values assuming independence (2-point summary). + - "summed_z": Classic Stouffer assuming independence (rho=0, ignores ICC correction). take_median_ion (bool): Use median-centered fragment ions for peptide comparisons. Defaults to True. perform_ptm_mapping (bool): Enable PTM site mapping analysis. Defaults to False. perform_phospho_inference (bool): Enable phosphorylation-prone region annotation. Defaults to False. enable_experimental_ptm_counting_statistics (bool): Allow experimental PTM counting statistics with "either" mode or zero min_valid_values. Defaults to False. ptm_fragment_selection (bool): If True, enable PTM-oriented fragment selection in clustering. outlier_correction (bool): Enable outlier correction in differential testing. Defaults to True. + outlier_correction_factor (float): Multiplicative factor for the outlier correction scaling. + Values > 1.0 make the correction more aggressive (more conservative p-values), + values < 1.0 make it less aggressive. Only effective when outlier_correction is True. Defaults to 1.0. normalize (bool): Enable sample and condition normalization. Defaults to True. use_iontree_if_possible (bool): Use ion tree structure when available. Defaults to True. write_out_results_tree (bool): Write results in hierarchical tree format. Defaults to True. @@ -131,13 +158,42 @@ def run_pipeline(input_file: str, peptides_to_exclude_file (str): File listing peptides to exclude (e.g., shared between species). reset_progress_folder (bool): Clear and recreate the progress folder. Defaults to False. peptide_outlier_filtering (bool): Enable few peptides per protein filtering for statistical outlier correction. When True, filters outlier peptides based on significance distribution within the protein/gene. Defaults to True. - fragment_outlier_filtering (bool): Enable fragment outlier filtering when aggregating fragments to peptides. When True, removes extreme fragments before statistical aggregation. Defaults to True. + max_n_fragments (int or None): Maximum number of fragment ions to keep per peptide when aggregating. + When set, only the fragments with z-values closest to the median are retained; the rest are + discarded. None (default) means no limit. + ion_outlier_mad_threshold (float or None): MAD-based outlier threshold for base ion z-values. + When set, ions whose z-value deviates more than ``threshold * MAD`` from the sibling median + are removed before aggregation (requires >= 4 siblings; always keeps >= 2). None (default) + disables this filter. Typical values: 2.5 – 3.0. + classic_fragment_outlier_filtering (bool): Use the legacy fragment outlier filter that keeps only + the 4 most central fragment ions (by z-value) when a peptide has more than 4 fragments. + Applied after MAD / max_n_fragments filtering. Defaults to False. + split_ion_backgrounds (bool): Build separate empirical background distributions for fragment ions + and MS1 isotopes instead of pooling them together. Defaults to True. + use_variance_predictor (bool): Use a linear regression model to predict + ion variance from quality metrics (e.g. Cscore, ShapeQualityScore) for + sorting ions before background partitioning. When False, ions are sorted + by median intensity instead. Defaults to False. + num_bg_contexts (int): Number of overlapping windows used to partition + ions into empirical background distributions. Higher values create + more fine-grained intensity-dependent backgrounds. Defaults to 10. ion_test_method (str): Ion-level test to compute ion statistics. Options: - "diffdist" (default): Use empirical background distributions (DifferentialIon). - "ttest": Use Welch two-sample t-test (DifferentialIonTTest), p→z via cached fast inversion. + summarization_nodes (list[str]): Node types at which to sum child intensities + before differential analysis. For each specified node type the base-ion + leaves are collected and their linear intensities summed per replicate. + Ion types (frgion, ms1_isotopes, precursor) are never mixed. + Examples: ``["frgion"]`` sums fragments per precursor; + ``["frgion", "ms1_isotopes"]`` sums both; ``["mod_seq_charge"]`` sums + everything per precursor (split by ion type). Defaults to None (no + summarization). """ LOGGER.info("Starting AlphaQuant") + if summarization_nodes is None: + summarization_nodes = [] + ######################################################### # TODO: this backwards compatibility can be removed beginning of 2026 # to ensure backwards compatibility: in case the minrep paramters are set, we need to convert them to the min_valid_values and valid_values_filter_mode parameters @@ -168,16 +224,17 @@ def run_pipeline(input_file: str, if file_has_alphaquant_format: LOGGER.info("Input file is already in AlphaQuant format. Skipping reformatting.") input_file_reformat = input_file_original - # For pre-formatted files, use a generic input type that doesn't require specific columns input_type = input_type_to_use if input_type_to_use is not None else "generic_preformatted" annotation_file = None - use_ml = False # Disable ML for pre-formatted files - # Skip to the main analysis + use_ml = False + variance_predictor_cols = None else: create_progress_folder_if_applicable(input_file_original, reset_progress_folder) input_type, config_dict, _ = config_dict_loader.get_input_type_and_config_dict(input_file_original, input_type_to_use) annotation_file = load_annotation_file(input_file_original, input_type, annotation_columns) use_ml = check_if_table_supports_ml(config_dict) & use_ml + variance_predictor_cols = config_dict.get("variance_predictor_cols", None) + aqvariables.set_input_config(input_type, config_dict) if perform_ptm_mapping and not file_has_alphaquant_format: if modification_type is None: @@ -218,8 +275,13 @@ def run_pipeline(input_file: str, aqvariables.determine_variables(input_file_reformat, input_type) aqvariables.set_peptide_outlier_filtering(peptide_outlier_filtering) + aqvariables.set_outlier_correction_factor(outlier_correction_factor) + aqvariables.NUM_BG_CONTEXTS = num_bg_contexts # Configure PTM-specific fragment selection: enabled if either PTM mapping is performed or explicit flag is set aqvariables.set_ptm_fragment_selection(perform_ptm_mapping or ptm_fragment_selection) + aqvariables.set_max_n_fragments(max_n_fragments) + aqvariables.set_ion_outlier_mad_threshold(ion_outlier_mad_threshold) + aqvariables.set_classic_fragment_outlier_filtering(classic_fragment_outlier_filtering) #use runconfig object to store the parameters runconfig = ConfigOfRunPipeline(locals()) #all the parameters given into the function are transfered to the runconfig object! The runconfig is then used as the input for the run_analysis functions @@ -309,10 +371,14 @@ def check_if_table_supports_ml(config_dict): return is_longtable and ml_level_charge def load_ml_info_file(input_file, input_type, modification_type = None): - ml_info_filename = aq_utils.get_progress_folder_filename(input_file, f".ml_info_table.tsv") + ml_info_filename = aq_utils.get_progress_folder_filename(input_file, f".ml_info_table.tsv.zip") + old_ml_info_filename = aq_utils.get_progress_folder_filename(input_file, f".ml_info_table.tsv") if os.path.exists(ml_info_filename):#in case there already is a reformatted file, we don't need to reformat it again LOGGER.info(f"ML info file already exists. Using ML info file of type {input_type}") return ml_info_filename + elif os.path.exists(old_ml_info_filename): + LOGGER.info(f"Uncompressed ML info file already exists. Using ML info file of type {input_type}") + return old_ml_info_filename else: return aq_ml_info_table.MLInfoTableCreator(input_file, input_type, modification_type).ml_info_filename @@ -353,4 +419,3 @@ def run_analysis_multiprocess(condpair_combinations, runconfig, num_cores): aqcondpair.analyze_condpair(runconfig= runconfig, condpair = condpair) ,condpair_combinations) - diff --git a/alphaquant/ui/dashboard_parts_run_pipeline.py b/alphaquant/ui/dashboard_parts_run_pipeline.py index 608637f6..31b4d889 100644 --- a/alphaquant/ui/dashboard_parts_run_pipeline.py +++ b/alphaquant/ui/dashboard_parts_run_pipeline.py @@ -15,8 +15,6 @@ import alphaquant.run_pipeline as diffmgr import alphaquant.config.variables as aq_variables import alphaquant.ui.dashboad_parts_plots_basic as dashboad_parts_plots_basic -import alphaquant.ui.dashboard_parts_plots_proteoforms as dashboad_parts_plots_proteoforms -import alphaquant.ui.gui as gui import alphaquant.ui.gui_textfields as gui_textfields import alphaquant.utils.reader_utils as aq_reader_utils @@ -374,6 +372,14 @@ def _make_widgets(self): description='Fold change threshold for highlighting significant changes in the volcano plot' ) + self.aggregation_mode = pn.widgets.Select( + name='Z-value aggregation mode:', + options=['stouffer_decorrelation', 'mean_z', 'median_z', 'min_median_max_z'], + value='stouffer_decorrelation', + width=300, + description='Strategy for combining child z-values during tree propagation' + ) + self.condition_comparison_header = pn.pane.Markdown( "### Available Condition Comparisons", visible=True @@ -402,7 +408,7 @@ def _make_widgets(self): name='Enable machine learning', value=True, width=300 - ), + ), 'take_median_ion': pn.widgets.Checkbox( name='Use median-centered ions', value=True, @@ -447,6 +453,11 @@ def _make_widgets(self): name='Generate runtime plots', value=True, width=300 + ), + 'split_ion_backgrounds': pn.widgets.Checkbox( + name='Separate backgrounds by ion type', + value=True, + width=300 ), 'peptide_outlier_filtering': pn.widgets.Checkbox( name='Use few peptides per protein', @@ -466,6 +477,7 @@ def _make_widgets(self): 'write_out_results_tree': pn.pane.Markdown('Save detailed results in a tree structure'), 'use_multiprocessing': pn.pane.Markdown('Use multiple CPU cores to speed up processing (may use more memory)'), 'runtime_plots': pn.pane.Markdown('Create plots during analysis to visualize the process'), + 'split_ion_backgrounds': pn.pane.Markdown('Build separate empirical backgrounds for fragment ions and MS1 isotopes to reduce conservative bias'), 'peptide_outlier_filtering': pn.pane.Markdown('Filter outlier peptides based on significance for proteins with gene-level nodes'), } @@ -586,6 +598,9 @@ def create_checkbox_with_description(key, checkbox): self.minpep, self.cluster_threshold_pval, pn.layout.Divider(), + "### Aggregation", + self.aggregation_mode, + pn.layout.Divider(), "### Analysis Options", *checkbox_items, ), @@ -806,6 +821,7 @@ def _run_pipeline(self, *events): "min_valid_values_c2": self.min_valid_values_c2.value if self.valid_values_filter_mode.value == 'set min. valid values per condition' else None, # Add the switch values to the pipeline parameters 'use_ml': self.switches['use_ml'].value, + 'aggregation_mode': self.aggregation_mode.value, 'take_median_ion': self.switches['take_median_ion'].value, 'perform_ptm_mapping': self.switches['perform_ptm_mapping'].value, 'perform_phospho_inference': self.switches['perform_phospho_inference'].value, @@ -815,6 +831,7 @@ def _run_pipeline(self, *events): 'write_out_results_tree': self.switches['write_out_results_tree'].value, 'use_multiprocessing': self.switches['use_multiprocessing'].value, 'runtime_plots': self.switches['runtime_plots'].value, + 'split_ion_backgrounds': self.switches['split_ion_backgrounds'].value, 'peptide_outlier_filtering': self.switches['peptide_outlier_filtering'].value, } @@ -1470,63 +1487,3 @@ def _update_tabs(self, event=None): self.main_tabs[1] = ('Plotting', pn.pane.Markdown( f"### Visualization Error\n\n{error_msg}" )) - - -def build_dashboard(): - """Build the overall dashboard layout.""" - # Create state manager first - state_manager = gui.DashboardState() - - header = HeaderWidget( - title="AlphaQuant Dashboard", - img_folder_path="./assets", - github_url="https://github.com/" - ) - main_text = MainWidget( - description=( - "Welcome to our analysis dashboard. " - "Please load your data and run the pipeline." - ), - manual_path="path/to/manual.pdf" - ) - - # Create pipeline instance with state manager - pipeline = RunPipeline(state=state_manager) - pipeline_layout = pipeline.create() - - # Create plotting tabs with state manager - plotting_tab = dashboad_parts_plots_basic.PlottingTab(state=state_manager) - proteoform_tab = dashboad_parts_plots_proteoforms.ProteoformPlottingTab(state=state_manager) - - # Register subscribers - state_manager.register_subscriber(plotting_tab) - state_manager.register_subscriber(proteoform_tab) - - # Create tabs - all_tabs = pn.Tabs( - ('Pipeline', pipeline_layout), - ('Single Comparison', plotting_tab.panel()), - ('Plotting', proteoform_tab.panel()), - dynamic=True, - tabs_location='above', - sizing_mode='stretch_width' - ) - - # Main layout - main_layout = pn.Column( - header.create(), - pn.layout.Divider(), - main_text.create(), - all_tabs, - sizing_mode='stretch_width' - ) - - template = pn.template.FastListTemplate( - title="AlphaQuant Analysis", - sidebar=[], - main=[main_layout], - theme='dark', - main_max_width="1200px", - main_layout="width" - ) - return template From b01809801eb400b46b8d6419ba2d398b2aeafef0 Mon Sep 17 00:00:00 2001 From: ammarcsj <70114795+ammarcsj@users.noreply.github.com> Date: Mon, 25 May 2026 10:43:13 +0200 Subject: [PATCH 5/5] Add .claude/settings.local.json to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2f5b38ee..8e560bbe 100644 --- a/.gitignore +++ b/.gitignore @@ -230,3 +230,4 @@ paper_analyses alphaquant/resources/reference_databases alphaquant/resources/phosphopred_databases .claude/settings.local.json +.claude/settings.local.json