diff --git a/.RData b/.RData new file mode 100644 index 00000000..d0196493 Binary files /dev/null and b/.RData differ diff --git a/.Rhistory b/.Rhistory new file mode 100644 index 00000000..54ab75ab --- /dev/null +++ b/.Rhistory @@ -0,0 +1,2 @@ +devtools::install_github(repo = "https://github.com/livnatje/DIALOGUE") +q() diff --git a/pertpy/tools/_dialogue.py b/pertpy/tools/_dialogue.py index df0c0719..35debef6 100644 --- a/pertpy/tools/_dialogue.py +++ b/pertpy/tools/_dialogue.py @@ -55,7 +55,7 @@ def __init__(self, sample_id: str, celltype_key: str, n_counts_key: str, n_mpcs: self.n_mcps = n_mpcs def _get_pseudobulks( - self, adata: AnnData, groupby: str, strategy: Literal["median", "mean"] = "median" + self, adata: AnnData, groupby: str, strategy: Literal["median", "mean"] = "mean" ) -> pd.DataFrame: """Return cell-averaged data by groupby. @@ -83,45 +83,73 @@ def _get_pseudobulks( return pseudobulk def _pseudobulk_feature_space( - self, adata: AnnData, groupby: str, n_components: int = 50, feature_space_key: str = "X_pca" + self, + adata: AnnData, + groupby: str, + n_components: int = 50, + feature_space_key: str = "X_pca", + agg_func=np.median, # default to np.median; user can supply np.mean, etc. ) -> pd.DataFrame: - """Return Cell-averaged components from a passed feature space. - - TODO: consider merging with `get_pseudobulks` - TODO: DIALOGUE recommends running PCA on each cell type separately before running PMD - this should be implemented as an option here. + """Return cell-averaged components from a passed feature space using a custom aggregation function. Args: - groupby: The key to groupby for pseudobulks. - n_components: The number of components to use. - feature_key: The key in adata.obsm for the feature space (e.g., "X_pca", "X_umap"). + groupby: The key to group by for pseudobulks (e.g., a sample identifier). + n_components: The number of components (features) to use. + feature_space_key: The key in adata.obsm for the feature space (e.g., "X_pca", "X_umap"). + agg_func: The aggregation function to use (e.g., np.median, np.mean). It should accept an `axis` argument. Returns: - A pseudobulk DataFrame of the averaged components. + A pseudobulk DataFrame of the aggregated components, with samples as rows and components as columns. """ aggr = {} + # Loop over each category in the specified groupby column (assumed to be categorical) for category in adata.obs.loc[:, groupby].cat.categories: temp = adata.obs.loc[:, groupby] == category - aggr[category] = adata[temp].obsm[feature_space_key][:, :n_components].mean(axis=0) - aggr = pd.DataFrame(aggr) - return aggr - - def _scale_data(self, pseudobulks: pd.DataFrame, normalize: bool = True) -> np.ndarray: - """Row-wise mean center and scale by the standard deviation. + # Apply the user-provided aggregation function along axis 0 (averaging across cells) + aggr[category] = agg_func(adata[temp].obsm[feature_space_key][:, :n_components], axis=0) + # Create a DataFrame; keys become columns + aggr_df = pd.DataFrame(aggr) + # Transpose so that rows correspond to samples and columns to features + return aggr_df.T + + def _scale_data(self, pseudobulks: pd.DataFrame, normalize: bool = True, cap: float = 0.01) -> np.ndarray: + """Row-wise mean center and scale by the standard deviation, + and then cap extreme values based on quantiles. + + This mimics the following R function (excluding row subsetting): + + f <- function(X1){ + if(param$center.flag){ + X1 <- center.matrix(X1, dim = 2, sd.flag = TRUE) + X1 <- cap.mat(X1, cap = 0.01, MARGIN = 2) + } + X1 <- X1[samplesU, ] + return(X1) + } Args: - pseudobulks: The pseudobulk PCA components. - normalize: Whether to mimic DIALOGUE behavior or not. + pseudobulks: The pseudobulk PCA components as a DataFrame (samples as rows, features as columns). + normalize: Whether to perform centering, scaling, and capping. + cap: The quantile threshold for capping. For example, cap=0.01 means that for each column, values + above the 99th percentile are set to the 99th percentile, and values below the 1st percentile are set to the 1st percentile. Returns: - The scaled count matrix. + The processed (scaled and capped) matrix as a NumPy array. """ - # TODO: the `scale` function we implemented to match the R `scale` fn should already contain this functionality. - # DIALOGUE doesn't scale the data before passing to multicca, unlike what is recommended by sparsecca. - # However, performing this scaling _does_ increase overall correlation of the end result if normalize: - return pseudobulks.to_numpy() + # Center and scale (column-wise: subtract the column mean and divide by the column std) + scaled = (pseudobulks - pseudobulks.mean()) / pseudobulks.std() + + # Apply quantile-based capping column-wise. + capped = scaled.copy() + for col in scaled.columns: + lower = scaled[col].quantile(cap) # lower quantile (e.g., 1st percentile) + upper = scaled[col].quantile(1 - cap) # upper quantile (e.g., 99th percentile) + capped[col] = scaled[col].clip(lower=lower, upper=upper) + + return capped.to_numpy() else: - return ((pseudobulks - pseudobulks.mean()) / pseudobulks.std()).to_numpy() + return pseudobulks.to_numpy() def _concat_adata_mcp_scores( self, adata: AnnData, ct_subs: dict[str, AnnData], mcp_scores: dict[str, np.ndarray], celltype_key: str @@ -560,31 +588,61 @@ def _load( ct_order: list[str], agg_feature: bool = True, normalize: bool = True, + subset_common: bool = True, # new optional parameter ) -> tuple[list, dict]: - """Separates cell into AnnDatas by celltype_key and creates the multifactor PMD input. + """Separates cells into AnnDatas by celltype_key and creates the multifactor PMD input. Mimics DIALOGUE's `make.cell.types` and the pre-processing that occurs in DIALOGUE1. Args: - adata: AnnData object generate celltype objects for - ct_order: The order of cell types + adata: AnnData object to generate celltype objects for. + ct_order: The order of cell types. agg_feature: Whether to aggregate pseudobulks with some embeddings or not. normalize: Whether to mimic DIALOGUE behavior or not. + subset_common: If True, restrict output to common samples across cell types. Returns: - A celltype_label:array dictionary. + A tuple with: + - mcca_in: A list of pseudobulk matrices (one per cell type), with rows corresponding to sample IDs (if subset_common is True). + - ct_subs: A dictionary mapping each cell type to its corresponding AnnData subset (restricted to common samples if subset_common is True). """ + # 1. Split the AnnData into cell-type–specific subsets. ct_subs = {ct: adata[adata.obs[self.celltype_key] == ct].copy() for ct in ct_order} + + # 2. Choose the aggregation function based on the flag. fn = self._pseudobulk_feature_space if agg_feature else self._get_pseudobulks + + # 3. Compute pseudobulk features for each cell type. + # Here, fn should return a DataFrame with sample IDs as the row index. ct_aggr = {ct: fn(ad, self.sample_id) for ct, ad in ct_subs.items()} # type: ignore - # TODO: implement check (as in https://github.com/livnatje/DIALOGUE/blob/55da9be0a9bf2fcd360d9e11f63e30d041ec4318/R/DIALOGUE.main.R#L114-L119) - # that there are at least 5 share samples here + # 4. Apply scaling/normalization to the aggregated data. + # We wrap the output back in a DataFrame to preserve the sample IDs. + ct_scaled = { + ct: pd.DataFrame(self._scale_data(df, normalize=normalize), index=df.index, columns=df.columns) + for ct, df in ct_aggr.items() + } - # TODO: https://github.com/livnatje/DIALOGUE/blob/55da9be0a9bf2fcd360d9e11f63e30d041ec4318/R/DIALOGUE.main.R#L121-L131 - ct_preprocess = {ct: self._scale_data(ad, normalize=normalize).T for ct, ad in ct_aggr.items()} + if subset_common: + # 5. Determine the set of common samples across all cell types (using the scaled data). + common_samples = set(ct_scaled[ct_order[0]].index) + for ct in ct_order[1:]: + common_samples = common_samples.intersection(set(ct_scaled[ct].index)) + common_samples_sorted = sorted(common_samples) - mcca_in = [ct_preprocess[ct] for ct in ct_order] + # Check if there are at least 5 common samples. + if len(common_samples_sorted) < 5: + raise ValueError("Cannot run DIALOGUE with less than 5 common samples across cell types.") + + # 6. Subset each scaled pseudobulk DataFrame to only the common samples. + ct_scaled = {ct: df.loc[common_samples_sorted] for ct, df in ct_scaled.items()} + + # 7. Also, restrict each cell-type AnnData to cells belonging to one of the common samples. + for ct in ct_subs: + ct_subs[ct] = ct_subs[ct][ct_subs[ct].obs[self.sample_id].isin(common_samples_sorted)].copy() + + # 8. Order the preprocessed pseudobulk matrices as a list in the order specified by ct_order. + mcca_in = [ct_scaled[ct] for ct in ct_order] return mcca_in, ct_subs @@ -633,6 +691,8 @@ def calculate_multifactor_PMD( mcca_in, ct_subs = self._load(adata, ct_order=cell_types, agg_feature=agg_feature, normalize=normalize) + mcca_in = [df.to_numpy() if hasattr(df, "to_numpy") else df for df in mcca_in] + n_samples = mcca_in[0].shape[1] if penalties is None: try: