diff --git a/DESCRIPTION b/DESCRIPTION index 9d0b27f..ee41b9e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: OmicsProcessing Type: Package Title: OmicsProcessing -Version: 1.1.0 +Version: 1.2.0 Date: 2025-03-03 Authors@R: c( person("Vivian", "Viallon", role = c("aut")), diff --git a/NAMESPACE b/NAMESPACE index 51a62cb..9a3217c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export("%>%") export(FUNnormalization_residualMixedModels) +export(build_omics_synthetic) export(calculate_ICC) export(calculate_ICC_parallel) export(check_dataframe_validity) diff --git a/R/build_omics_synthetic.R b/R/build_omics_synthetic.R new file mode 100644 index 0000000..dcdfdc4 --- /dev/null +++ b/R/build_omics_synthetic.R @@ -0,0 +1,114 @@ +#' Build synthetic omics data for plotting and normalisation examples +#' +#' Create a synthetic omics data set with feature intensities, run-order +#' drift, batch structure, plate structure, and QC sample labels. This is the +#' generator used for the packaged `omics_synthetic` example data. +#' +#' @param seed Integer random seed. +#' @param n Number of rows (samples). +#' @param n_batch Number of batches. +#' @param n_plate_per_batch Number of plates per batch. +#' @param n_features Number of feature columns to generate. +#' @param qc_frac Fraction of rows labelled as QC. +#' +#' @return A list with three elements: +#' \describe{ +#' \item{omics_synthetic}{A synthetic omics data frame.} +#' \item{jump_info}{A data frame describing the simulated step changes per feature.} +#' \item{parameters}{A list of generation parameters and sampled values.} +#' } +#' +#' @examples +#' generated <- build_omics_synthetic(seed = 1) +#' str(generated$omics_synthetic) +#' head(generated$jump_info) +#' +#' @export +build_omics_synthetic <- function( + seed = 1, + n = 2000, + n_batch = 2, + n_plate_per_batch = 2, + n_features = 10, + qc_frac = 0.05 +) { + set.seed(seed) + + feature_cols <- paste0("F", seq_len(n_features)) + feature_means <- stats::runif(n_features, min = 100, max = 700) + feature_sds <- stats::runif(n_features, min = 3, max = 30) + + feature_matrix <- vapply( + seq_len(n_features), + FUN.VALUE = numeric(n), + FUN = function(j) { + stats::rnorm(n, mean = feature_means[j], sd = feature_sds[j]) + } + ) + colnames(feature_matrix) <- feature_cols + + omics_synthetic <- data.frame( + plate_id = factor(sample.int(n_plate_per_batch * n_batch, n, replace = TRUE)), + feature_matrix, + check.names = FALSE + ) + + omics_synthetic <- omics_synthetic[order(omics_synthetic$plate_id), , drop = FALSE] + rownames(omics_synthetic) <- NULL + + omics_synthetic$run_ord <- seq_len(nrow(omics_synthetic)) + omics_synthetic$batch_id <- + (as.integer(omics_synthetic$plate_id) - 1L) %/% n_plate_per_batch + 1L + + n_qc <- ceiling(qc_frac * nrow(omics_synthetic)) + qc_idx <- sample.int(nrow(omics_synthetic), size = n_qc, replace = FALSE) + omics_synthetic$is_qc <- FALSE + omics_synthetic$is_qc[qc_idx] <- TRUE + + drift_slopes <- stats::runif(n_features, min = 0.01, max = 0.1) + for (j in seq_len(n_features)) { + omics_synthetic[[feature_cols[j]]] <- + omics_synthetic[[feature_cols[j]]] - drift_slopes[j] * omics_synthetic$run_ord + } + + jump_info <- vector("list", length = n_features) + names(jump_info) <- feature_cols + + for (j in seq_len(n_features)) { + n_jumps <- sample(1:2, size = 1) + jump_points <- sort(sample(100:(n - 100), size = n_jumps)) + jump_sizes <- stats::runif(n_jumps, min = 0.5, max = 1.5) * feature_sds[j] * 2 + + for (k in seq_len(n_jumps)) { + idx <- omics_synthetic$run_ord >= jump_points[k] + omics_synthetic[[feature_cols[j]]][idx] <- + omics_synthetic[[feature_cols[j]]][idx] + jump_sizes[k] + } + + jump_info[[j]] <- data.frame( + feature = feature_cols[j], + jump_id = seq_len(n_jumps), + jump_point = jump_points, + jump_size = jump_sizes + ) + } + + jump_info <- do.call(rbind, jump_info) + rownames(jump_info) <- NULL + + list( + omics_synthetic = omics_synthetic, + jump_info = jump_info, + parameters = list( + seed = seed, + n = n, + n_batch = n_batch, + n_plate_per_batch = n_plate_per_batch, + n_features = n_features, + qc_frac = qc_frac, + feature_means = feature_means, + feature_sds = feature_sds, + drift_slopes = drift_slopes + ) + ) +} diff --git a/R/data.R b/R/data.R index d546961..610fd5c 100644 --- a/R/data.R +++ b/R/data.R @@ -37,3 +37,23 @@ #' } #' @usage data(data_meta_samples) "data_meta_samples" + +#' Synthetic omics dataset for plotting and normalisation examples +#' +#' A synthetic omics dataset with run order, batch, plate, QC flags, and ten +#' feature columns. It is intended for examples and vignettes demonstrating +#' plotting and normalisation workflows. Reproducible data-generation code is +#' available in `data-raw/omics_synthetic.R` in the source repository and in +#' the installed package via `system.file("scripts", "omics_synthetic.R", +#' package = "OmicsProcessing")`. +#' +#' @format A data frame with 2000 rows and 14 variables: +#' \describe{ +#' \item{plate_id}{Plate identifier with 4 levels.} +#' \item{F1-F10}{Numeric feature intensity columns.} +#' \item{run_ord}{Integer run order.} +#' \item{batch_id}{Batch identifier.} +#' \item{is_qc}{Logical indicator for QC samples.} +#' } +#' @usage data(omics_synthetic) +"omics_synthetic" diff --git a/R/feature_clustering.R b/R/feature_clustering.R index 6be87f7..4bfecea 100644 --- a/R/feature_clustering.R +++ b/R/feature_clustering.R @@ -4,6 +4,11 @@ #' hierarchical clustering, then selects a representative feature for each #' cluster either via scores or via correlation-based aggregation. #' +#' QC rows (as indicated by \code{is_qc}) are excluded from all clustering and +#' summarisation steps, and are not present in the returned \code{clustered_df}. +#' Additionally, all columns in \code{df} that are not part of \code{target_cols} +#' are prepended to the returned \code{clustered_df} unchanged. +#' #' @param df #' A data.frame (or tibble) with feature data in columns and samples in #' rows. @@ -13,8 +18,8 @@ #' @param is_qc #' Logical vector marking QC samples. Must have length equal to #' \code{nrow(df)}. If \code{NULL}, all samples are treated as non-QC. -#' (Currently only used for consistency with other interfaces; it is not -#' used directly in this function.) +#' QC rows are excluded from clustering/summarisation and from the returned +#' \code{clustered_df}. #' @param rt_height #' Numeric height at which to cut the dendrogram in retention-time #' space. Smaller values create more clusters. @@ -27,8 +32,9 @@ #' @param scores #' Named numeric vector of feature scores, required when #' \code{method = "scores"}. Names must coincide with \code{target_cols}. -#' For each retention-time cluster, the feature with the largest score is -#' chosen as representative. +#' For each retention-time cluster, the features are subclustered based on +#' the input correlation threshold. The feature with the highest score is +#' selected as the representative of each subcluster. #' @param cut_height #' Numeric height used inside \code{cluster_features_based_on_correlations} #' when \code{method = "correlations"}. Passed on to that function's @@ -43,9 +49,12 @@ #' #' \describe{ #' \item{\code{clustered_df}}{ -#' A data.frame with one column per final representative feature. -#' When \code{method = "scores"}, this consists of a subset of the -#' original feature columns. When \code{method = "correlations"}, it +#' A data.frame where rows correspond to non-QC samples only. +#' All non-target columns (i.e. \code{setdiff(names(df), target_cols)}) +#' are included unchanged, followed by one column per final +#' representative feature. +#' When \code{method = "scores"}, representative features are a subset of +#' the original feature columns. When \code{method = "correlations"}, it #' may contain synthetic features created by #' \code{cluster_features_based_on_correlations()}. #' } @@ -89,6 +98,7 @@ #' @examples #' \dontrun{ #' df <- data.frame( +#' batch = rep(1, 20), #' `100@150` = rnorm(20), #' `100@151` = rnorm(20), #' `101@200` = rnorm(20) @@ -143,7 +153,11 @@ cluster_features_by_retention_time <- function( if (length(is_qc) != nrow(df)) { stop("`is_qc` must be a logical vector with the same length as nrow(df).") } - target_cols <- resolve_target_cols(df, target_cols) + + # 1) Exclude QC rows for all downstream computations + output + df_use <- df[!is_qc, , drop = FALSE] + + target_cols <- resolve_target_cols(df_use, target_cols) if (method == "scores") { if (is.null(names(scores))) { @@ -175,24 +189,25 @@ cluster_features_by_retention_time <- function( } retention_times <- parse_mass_rt(target_cols, "rt") - cluster_ids <- cluster_hierarchical(retention_times, height = rt_height) cluster_feature_list <- split(as.character(target_cols), cluster_ids) names(cluster_feature_list) <- sprintf("ClustRT%s", names(cluster_feature_list)) - if (method == "scores") { representatives <- get_features_representatives_based_on_scores( + df_use, cluster_feature_list, + corr_thresh = corr_thresh, scores ) - clustered_df <- df[, representatives$representatives] + clustered_df <- df_use[, representatives$representatives, drop = FALSE] representatives_map <- representatives$representatives_map } + if (method == "correlations") { results <- cluster_features_based_on_correlations( - df, + df_use, cluster_feature_list, cut_height = cut_height, corr_thresh = corr_thresh @@ -201,6 +216,15 @@ cluster_features_by_retention_time <- function( representatives_map <- results$representatives_map } + # 2) Attach all non-target columns back onto clustered_df + other_cols <- setdiff(names(df_use), target_cols) + if (length(other_cols) > 0L) { + clustered_df <- cbind( + df_use[, other_cols, drop = FALSE], + clustered_df + ) + } + return( list( clustered_df = clustered_df, @@ -210,6 +234,7 @@ cluster_features_by_retention_time <- function( } + #' General hierarchical clustering #' #' Performs hierarchical clustering on a numeric vector, matrix or @@ -334,46 +359,91 @@ parse_mass_rt <- function(x, -#' Select representative features per cluster based on scores + +#' Select representative features per cluster using correlation threshold and scores +#' +#' This function selects representative features from pre-defined clusters of +#' variables, using a correlation-based rule and a scoring vector: #' -#' For each cluster, this function selects a single representative feature. -#' If a cluster contains one feature, that feature is selected. If it contains -#' multiple features, the feature with the highest score in \code{scores} is -#' selected. +#' \itemize{ +#' \item If a cluster contains a single feature, that feature is selected. +#' \item If a cluster contains two features, the absolute pairwise correlation +#' is computed. If \code{|cor| >= corr_thresh}, only the highest-scoring +#' feature is selected; otherwise, both features are kept as representatives. +#' \item If a cluster contains more than two features, hierarchical clustering +#' is performed using the distance \code{1 - |cor|} with +#' \code{method = "average"}. The dendrogram is cut at +#' \code{h = 1 - corr_thresh} to form subclusters. From each subcluster, +#' the highest-scoring feature is selected as the representative. +#' } #' -#' The function also returns a named list mapping each representative feature -#' to all cluster members it represents. +#' In all cases where a unique representative is chosen for a (sub)cluster, +#' \code{representatives_map} stores the mapping from the representative feature +#' to the set of original features it represents. +#' +#' @param df +#' A data frame containing the feature columns referenced in +#' \code{cluster_feature_list}. Column names must match the feature names. #' #' @param cluster_feature_list -#' A list where each element is a character vector of feature names -#' belonging to one cluster. Each element must be a non-empty character -#' vector. +#' A list where each element is a non-empty character vector of feature names +#' belonging to one cluster (e.g., the output of a prior clustering step). +#' +#' @param corr_thresh +#' A numeric threshold in \code{[0, 1]}. For two-feature clusters, if +#' \code{|cor| >= corr_thresh}, the feature with the highest score is selected. +#' For clusters with more than two features, subclusters are created by cutting +#' the hierarchical tree at \code{h = 1 - corr_thresh}. +#' #' @param scores -#' A named numeric vector where names correspond to feature names and -#' values represent feature scores. All features appearing in +#' A named numeric vector of feature scores. All features appearing in #' \code{cluster_feature_list} must be present in \code{names(scores)}. #' #' @return -#' A list with two elements: -#' +#' A list with two components: #' \describe{ #' \item{\code{representatives}}{ -#' A character vector giving one selected representative feature -#' per cluster. +#' A character vector of selected representative features (one per +#' single-feature cluster; zero, one, or more per multi-feature cluster, +#' depending on the correlation structure and threshold). #' } #' \item{\code{representatives_map}}{ -#' A named list mapping each representative feature to the vector -#' of original features in its cluster. +#' A named list mapping each representative feature (when uniquely chosen) +#' to the character vector of features it represents. #' } #' } #' +#' @details +#' For clusters of size greater than two, the distance matrix is computed as +#' \code{1 - abs(cor(X))} on the selected columns, followed by +#' \code{hclust(..., method = "average")}. Subclusters are extracted via +#' \code{cutree(hc, h = 1 - corr_thresh)}. Within each subcluster, the +#' representative is the feature with the maximum score. +#' +#' When a two-feature cluster has \code{|cor| < corr_thresh}, both features +#' are returned as representatives and no mapping is stored for that pair, +#' as neither represents the other by design. +#' #' @examples +#' # Toy data frame +#' set.seed(1) +#' df <- data.frame( +#' feat_a = rnorm(100), +#' feat_b = rnorm(100), +#' feat_c = rnorm(100), +#' feat_d = rnorm(100), +#' feat_e = rnorm(100), +#' feat_f = rnorm(100) +#' ) +#' +#' # Example clusters #' cluster_feature_list <- list( -#' c("feat_a", "feat_b", "feat_c"), -#' "feat_d", -#' c("feat_e", "feat_f") +#' c("feat_a", "feat_b", "feat_c"), # size > 2 -> hierarchical subclusters +#' "feat_d", # size = 1 -> itself +#' c("feat_e", "feat_f") # size = 2 -> pairwise rule #' ) #' +#' # Scores for all features #' scores <- c( #' feat_a = 0.1, #' feat_b = 0.4, @@ -383,8 +453,13 @@ parse_mass_rt <- function(x, #' feat_f = 0.8 #' ) #' +#' # Choose a correlation threshold +#' corr_thresh <- 0.7 +#' #' res <- get_features_representatives_based_on_scores( +#' df = df, #' cluster_feature_list = cluster_feature_list, +#' corr_thresh = corr_thresh, #' scores = scores #' ) #' @@ -393,7 +468,9 @@ parse_mass_rt <- function(x, #' #' @export get_features_representatives_based_on_scores <- function( + df, cluster_feature_list, + corr_thresh, scores) { if (!is.list(cluster_feature_list)) { stop("`cluster_feature_list` must be a list.") @@ -402,9 +479,11 @@ get_features_representatives_based_on_scores <- function( if (is.null(names(scores))) { stop("`scores` must be a named numeric vector.") } + if (!is.numeric(corr_thresh) || length(corr_thresh) != 1L || is.na(corr_thresh) || corr_thresh < 0 || corr_thresh > 1) { + stop("`corr_thresh` must be a single numeric value in [0, 1].") + } - n_clusters <- length(cluster_feature_list) - representatives <- character(n_clusters) + representatives <- c() representatives_map <- vector("list") for (i in seq_along(cluster_feature_list)) { @@ -428,16 +507,47 @@ get_features_representatives_based_on_scores <- function( if (length(cluster) == 1L) { rep_feature <- cluster[1L] + representatives <- c(representatives, rep_feature) + } else if (length(cluster) == 2L) { + feature_df <- df %>% + dplyr::select(dplyr::all_of(cluster)) + + pair_cor <- stats::cor(feature_df)[1, 2] + + if (pair_cor >= corr_thresh) { + cluster_scores <- scores[cluster] + max_idx <- which.max(cluster_scores) + rep_feature <- cluster[max_idx] + representatives <- c(representatives, rep_feature) + representatives_map[[rep_feature]] <- cluster + } else { + representatives <- c(representatives, cluster) + } } else { - cluster_scores <- scores[cluster] - max_idx <- which.max(cluster_scores) - rep_feature <- cluster[max_idx] - representatives_map[[rep_feature]] <- cluster - } + feature_matrix <- df %>% + dplyr::select(dplyr::all_of(cluster)) %>% + as.matrix() - representatives[i] <- rep_feature - } + dist_mat <- as.dist(1 - abs(cor(feature_matrix))) + hc <- hclust(dist_mat, method = "average") + subclusters <- cutree(hc, h = 1 - corr_thresh) + cluster_list <- split(names(subclusters), subclusters) + for (j in seq_along(cluster_list)) { + cluster_j <- cluster_list[[j]] + if (length(cluster_j) == 1L) { + rep_feature <- cluster_j[1L] + representatives <- c(representatives, rep_feature) + } else { + cluster_scores <- scores[cluster_j] + max_idx <- which.max(cluster_scores) + rep_feature <- cluster_j[max_idx] + representatives <- c(representatives, rep_feature) + representatives_map[[rep_feature]] <- cluster_j + } + } + } + } list( representatives = representatives, representatives_map = representatives_map @@ -525,8 +635,8 @@ get_features_representatives_based_on_scores <- function( #' #' # Pre-defined feature groups (e.g. after a retention-time grouping step) #' cluster_feature_list <- list( -#' c("100@150", "100@151"), # group of size 2 -#' "101@200" # group of size 1 +#' c("100@150", "100@151"), # group of size 2 +#' "101@200" # group of size 1 #' ) #' #' res <- cluster_features_based_on_correlations( diff --git a/R/normalization_SERRF.R b/R/normalization_SERRF.R index 2ae0ab6..0b6f0ce 100644 --- a/R/normalization_SERRF.R +++ b/R/normalization_SERRF.R @@ -9,10 +9,12 @@ #' single-level strata is created so all samples are treated as one batch. When #' provided, the column must exist in `df` with no NA values. #' @param num_screen_SERRF Number of correlated features to use in model fitting. Default is 10. +#' @param seed Set a random number generator seed, default 0. #' #' @return A normalized data frame. #' @export -normalise_SERRF <- function(df, target_cols = NULL, is_qc = NULL, strata_col = NULL, num_screen_SERRF = 10) { +normalise_SERRF <- function(df, target_cols = NULL, is_qc = NULL, strata_col = NULL, num_screen_SERRF = 10, seed = 0) { + set.seed(seed) # ---- Validation ---- temp_strata_cols <- character(0) @@ -73,9 +75,14 @@ normalise_SERRF <- function(df, target_cols = NULL, is_qc = NULL, strata_col = N corrs_target[[batch]] <- cor(target, method = "spearman") } - # ---- Normalize Each Feature ---- + n <- length(target_cols) + step <- max(1, floor(n / 10)) + for (j in seq_along(target_cols)) { - if (j %% 250 == 1) cat("Feature", j, "\n") + if (j %% step == 1 || j == n) { + pct <- round(100 * j / n) + cat(j, "out of", n, "features normalised (", pct, "%)\n") + } feature_name <- target_cols[j] for (batch in batch_levels) { diff --git a/R/plot_normalization_comparison.R b/R/plot_normalization_comparison.R deleted file mode 100644 index 654ba96..0000000 --- a/R/plot_normalization_comparison.R +++ /dev/null @@ -1,253 +0,0 @@ -#' Plot feature values before/after normalization by run order -#' -#' @description -#' This function visualizes the effect of normalization (e.g., SERRF) on -#' omics feature values across batches and plates. It creates scatter plots of feature values -#' over **run order** for a set of target features, highlighting QC vs. non‑QC sample, -#' optionally with batch boundaries indicated by vertical dotted lines and points colored by plate. -#' If `df_after` is supplied, the function returns a side‑by‑side comparison (before vs. after normalization) -#' with a shared legend. Otherwise, it returns only the "before" plot. -#' -#' @param df_before A data frame containing the data **before preprocessing**. -#' Must include the columns referenced by `target_cols` and the column named in -#' `run_order`. Optionally includes columns named in `batch` and/or `plate`. -#' @param df_after An optional data frame containing the data **after preprocessing**. -#' If provided, must include the columns referenced by `target_cols` and the column named -#' in `run_order`. Optionally includes columns named in `batch` and/or `plate`. Default: `NULL`. -#' @param target_cols Feature columns to normalize. Character vector or selector -#' passed to `resolve_target_cols`). These columns are pivoted to long format and faceted -#' (one facet per feature). -#' @param is_qc_before Logical vector (length `nrow(df_before)`) indicating which rows are -#' QC samples (`TRUE`) vs. regular samples (`FALSE`) in the data before preprocessing. -#' Used to control point shape and size. -#' @param is_qc_after Optional logical vector (length `nrow(df_before)`) indicating which rows are -#' QC samples (`TRUE`) vs. regular samples (`FALSE`) in the data after preprocessing. -#' Required if `df_after` is provided. Default: `NULL`. -#' @param batch Optional string giving the **name of the column** in `df_before`/`df_after` -#' that encodes batch membership (e.g., `"batch_id"`). If provided, vertical dotted -#' lines mark batch boundaries (computed from the maximum run order per batch). -#' If `NULL`, all samples are treated as a single batch and no boundary lines are drawn. -#' Default: `NULL`. -#' @param plate Optional string giving the **name of the column** in `df_before`/`df_after` -#' that encodes plate ID. If provided, points are colored by plate. If `NULL`, -#' all points use a single color label `"all"`. Default: `NULL`. -#' @param run_order String giving the **name of the column** that encodes the -#' run order. This column is coerced to numeric and used as the x‑axis, -#' and the data are sorted by this column prior to plotting. -#' @param title_before Title for the "before" panel/plot. Default: `"Before normalization"`. -#' @param title_after Title for the "after" panel/plot. Default: `"After normalization"`. -#' -#' @details -#' Internally, the function: -#' - Selects columns `{batch, plate, run_order} ∪ target_cols` if present (using -#' `dplyr::any_of` for optional columns and `dplyr::all_of` for required features). -#' - Adds an `is_qc` flag and pivots features to long format (`feature`, `value`). -#' - Coerces `run_order` to numeric and orders rows accordingly. -#' - Creates a scatter plot (`value` vs. `run_order`), faceted by `feature` -#' (`scales = "free_y"`), with: -#' * Color mapped to `plate` (rainbow palette if multiple plates; grey if single plate), -#' * Shape and size mapped to `is_qc` (QC = filled circle, larger; Sample = triangle, smaller), -#' * Optional vertical dotted lines at batch boundaries (end of each batch). -#' - If `df_after` is provided, returns a side‑by‑side comparison (before | after) -#' using `patchwork::plot_layout(guides = "collect")` to share the legend. -#' -#' **Inputs as column names (strings):** `batch`, `plate`, and `run_order` are -#' expected to be **character scalars naming columns** in the data frames, not vectors. -#' -#' **Optional columns:** If `batch` or `plate` is `NULL`, a placeholder factor -#' `"all"` is used; if `batch` is `NULL`, no boundary lines are drawn. -#' -#' @return -#' Invisibly returns a **patchwork/ggplot object**: -#' - If `df_after` is `NULL`: a single plot for the "before" data (with legend). -#' - Otherwise: a two‑panel (before | after) plot with a shared legend on the right. -#' -#' @credits Original version developed by Carlota Castro Espin. -#' This version contains modifications by Felix Boekstegers. -#' -#' @examples -#' \dontrun{ -# library(dplyr) -# library(tidyr) -# library(ggplot2) -# library(patchwork) -# -# # Minimal reproducible example -# set.seed(1) -# n <- 60 -# n_batch <- 2 -# n_plate_per_batch <- 2 -# df_before <- data.frame( -# #batch_id = rep(letters[1:3], each = 20), -# #plate_id = rep(c("P1", "P2"), length.out = n), -# plate_id = factor(sample(seq_len(n_plate_per_batch*n_batch), n, replace = TRUE)), -# F1 = rnorm(n, 100, 10), -# F2 = rnorm(n, 500, 30), -# F3 = rnorm(n, 50, 3) -# ) -# df_before <- df_before[order(df_before$plate_id),] -# df_before$run_ord <- seq_len(n) -# df_before$batch_id <- (as.integer(df_before$plate_id) - 1L) %/% n_plate_per_batch + 1L -# is_qc_before <- rep(c(TRUE, FALSE), length.out = n) -# -# # Before only -# p1 <- plot_normalization_comparison( -# df_before = df_before, -# df_after = NULL, -# target_cols = c("F1", "F2","F3"), -# is_qc_before = is_qc_before, -# is_qc_after = NULL, -# batch = "batch_id", -# plate = "plate_id", -# run_order = "run_ord" -# ) -# p1 -# -# # With an "after" data frame (e.g., normalized values) -# df_after <- df_before %>% -# mutate( -# F1 = log1p(F1), -# F2 = log1p(F2), -# F3 = log1p(F3) -# ) %>% OmicsProcessing::normalise_SERRF( -# target_cols = "F", -# is_qc = is_qc_before, -# strata_col = "batch_id", -# num_screen_SERRF = 2 -# ) %>% -# dplyr::mutate(across( -# tidyselect::all_of(c("F1","F2","F3")), -# ~ as.numeric(scale(.)), -# .names = "{.col}" -# )) -# -# is_qc_after <- is_qc_before -# -# p2 <- plot_normalization_comparison( -# df_before = df_before, -# df_after = df_after, -# target_cols = c("F1", "F2","F3"), -# is_qc_before = is_qc_before, -# is_qc_after = is_qc_after, -# batch = "batch_id", -# plate = "plate_id", -# run_order = "run_ord" -# ) -# p2 -# } -#' - -plot_normalization_comparison <- function(df_before, - df_after = NULL, - target_cols, - is_qc_before, - is_qc_after = NULL, - batch = NULL, - plate = NULL, - run_order, - title_before = "Before normalization", - title_after = "After normalization") { - - prepare_df_long <- function(df, is_qc) { - df_long <- df %>% - dplyr::select(any_of(c(batch, plate, run_order)), all_of(target_cols)) %>% - dplyr::mutate(is_qc = is_qc) %>% - tidyr::pivot_longer(cols = all_of(target_cols),names_to = "feature", values_to = "value") - - df_long$run_order <- as.numeric(df_long[[run_order]]) - df_long <- df_long %>% - arrange(run_order) - - if(is.null(batch)){ - df_long$batch <- factor("all") - } else { - df_long$batch <- factor(df_long[[batch]]) - } - if(is.null(plate)){ - df_long$plate <- factor("all") - } else { - df_long$plate <- factor(df_long[[plate]]) - } - return(df_long) - } - - plot_scatter <- function(df_long, title) { - - if (is.null(batch)) { - batch_boundaries <- NULL - } else { - batch_boundaries <- df_long %>% - dplyr::group_by(batch) %>% - dplyr::summarise(max_run = max(run_order, na.rm = TRUE)) %>% - dplyr::pull(max_run) - } - - num_plates <- length(unique(df_long$plate)) - plate_colors <- if (num_plates > 1) grDevices::rainbow(num_plates) else "grey40" - - fig <- ggplot(df_long, aes(x = run_order, y = value, color = plate, shape = is_qc, size = is_qc)) + - geom_point(alpha = 0.5) + - facet_wrap(~feature, ncol = 1, scales = "free_y") + - scale_color_manual(values = plate_colors) + - scale_shape_manual( - name = "SampleType", - values = c(`TRUE` = 16, `FALSE` = 17), # 16=circle, 17=triangle - breaks = c(TRUE, FALSE), - labels = c("QC", "Sample") - ) + - scale_size_manual( - values = c(`TRUE` = 3, `FALSE` = 1), - guide = "none" - ) + - labs(title = title, - x = "Run order", y = "Feature intensity", color = "Plate") + - theme_minimal(base_size = 9) + - theme(panel.grid.major.x = element_blank(), - axis.text.x = element_text(angle = 45, hjust = 1), - legend.position = "bottom") - - if (!is.null(batch)) { - fig <- fig + - ggplot2::geom_vline(xintercept = batch_boundaries[!is.na(batch_boundaries)] + 0.5, - color = "black", linetype = "dotted", linewidth = 0.5) - } - - return(fig) - } - - df_long_before <- prepare_df_long(df_before, is_qc_before) - p_before <- plot_scatter(df_long_before, title_before) - - # If no df_after is given, return only p_before + legend - if (is.null(df_after)) { - combined_plot <- (p_before) & - theme( - legend.position = "right", - legend.key.size = unit(1.2, "lines"), - legend.text = element_text(size = 11), - legend.title = element_text(size = 12) - ) & - guides(color = guide_legend(override.aes = list(size = 4))) - return(invisible(combined_plot)) - } - - df_long_after <- prepare_df_long(df_after, is_qc_after) - p_after <- plot_scatter(df_long_after, title_after) - - combined_plot <- (p_before | p_after) + - plot_layout(widths = c(1, 1), guides = "collect") & # shared legend - theme( - legend.position = "right", # move legend to the side - legend.key.size = unit(1.2, "lines"), - legend.text = element_text(size = 11), - legend.title = element_text(size = 12) - ) & - guides(color = guide_legend(override.aes = list(size = 4))) - - invisible(combined_plot) -} - - - - - diff --git a/R/plot_omics_distributions.R b/R/plot_omics_distributions.R new file mode 100644 index 0000000..9d1151a --- /dev/null +++ b/R/plot_omics_distributions.R @@ -0,0 +1,359 @@ +#' Plot feature values by run order for reference and comparison data +#' +#' @description +#' Visualise omics feature values across run order for a set of target +#' features, highlighting QC versus non-QC samples. The function is +#' designed to assess technical variation associated with run order and +#' to evaluate the influence of stratification factors such as plate or +#' batch on feature measurements. It generates scatter plots of feature +#' values over run order, optionally colouring points by plate and marking +#' batch boundaries with vertical dotted lines. +#' +#' If `df_comp` is supplied, the function returns a side-by-side comparison +#' of the reference and comparison data with a shared legend. Otherwise, it +#' returns only the reference plot. This feature can be used to assess the +#' effect of post-processing on the data (for example, normalisation). +#' +#' @param df A data frame containing the reference data. It must include the +#' columns referenced by `target_cols` and the column named in +#' `run_order`. It may also include columns named in `batch` and `plate`. +#' @param target_cols Character vector giving the feature columns to plot. +#' These columns are pivoted to long format and faceted with one panel per +#' feature. +#' @param is_qc Optional character scalar specifying the QC indicator +#' column. If \code{NULL}, all samples are treated as non-QC. +#' @param run_order Character scalar giving the name of the column encoding +#' run order. This column is coerced to numeric and used as the x-axis. +#' @param df_comp Optional data frame containing comparison data, for example +#' normalised values. If provided, it must include the columns referenced by +#' `target_cols` and the column named in `run_order`. It may also include +#' columns named in `batch` and `plate`. Default is `NULL`. +#' @param batch Optional character scalar giving the name of the column +#' encoding batch membership. If provided, vertical dotted lines are drawn +#' at the end of each batch. If `NULL`, all samples are treated as a single +#' batch and no boundary lines are drawn. Default is `NULL`. +#' @param plate Optional character scalar giving the name of the column +#' encoding plate membership. If provided, points are coloured by plate. If +#' `NULL`, all points are assigned to a single plate level `"all"`. +#' Default is `NULL`. +#' @param title Optional title for the reference plot. If `df_comp` is not +#' `NULL` and `title_ref` is `NULL`, the default title is `"Reference"`. +#' Default is `NULL`. +#' @param title_comp Optional title for the comparison plot. If `df_comp` is +#' not `NULL` and `title_comp` is `NULL`, the default title is +#' `"Comparison"`. Default is `NULL`. +#' @param point_size Numeric scalar giving the base point size for non-QC +#' samples. QC points are drawn one unit larger. Default is `1`. +#' +#' @details +#' Internally, the function: +#' \itemize{ +#' \item Selects the columns `{batch, plate, run_order} \eqn{\cup} +#' target_cols`, using `dplyr::any_of()` for optional columns and +#' `dplyr::all_of()` for required feature columns. +#' \item Adds an `is_qc` flag and pivots the data to long format with +#' columns `feature` and `value`. +#' \item Coerces the run-order column to numeric and sorts rows by run +#' order. +#' \item Creates a scatter plot of `value` versus `run_order`, faceted by +#' feature with `scales = "free_y"`. +#' \item Maps point colour to plate, shape to QC status, alpha to QC +#' status, and size to QC status. +#' \item Uses a rainbow palette when multiple plate levels are present and +#' `"grey40"` when only one plate level is present. +#' \item Optionally adds vertical dotted lines at the maximum run order +#' within each batch. +#' \item If `df_comp` is provided, combines the reference and comparison +#' plots with `patchwork::wrap_plots()` and collects guides into a shared +#' legend. +#' } +#' +#' QC samples are shown as filled circles and non-QC samples as triangles. +#' Non-QC samples are also plotted with lower alpha. +#' +#' @return +#' Invisibly returns a plot object: +#' \itemize{ +#' \item If `df_comp` is `NULL`, a single `ggplot2` plot for the reference +#' data. +#' \item Otherwise, a combined `patchwork` plot with the reference and +#' comparison plots shown side by side and a shared legend. +#' } +#' +#' @author +#' Original version developed by Carlota Castro Espin. Modified by Felix +#' Boekstegers. +plot_omics_distributions <- function( + df, + target_cols, + run_order, + is_qc = NULL, + batch = NULL, + plate = NULL, + title = NULL, + df_comp = NULL, + title_comp = NULL, + point_size = 1 +) { + if (!is.null(df_comp)) { + if (is.null(title)) title <- "Reference" + if (is.null(title_comp)) title_comp <- "Comparison" + } + + df_long_ref <- prepare_df_long( + df = df, + target_cols = target_cols, + run_order = run_order, + is_qc = is_qc, + batch = batch, + plate = plate + ) + + p_ref <- plot_scatter_omics_feature( + df_long = df_long_ref, + title = title, + batch = batch, + point_size = point_size + ) + + if (is.null(df_comp)) { + p_ref <- p_ref + + ggplot2::theme( + legend.position = "right", + legend.key.size = grid::unit(1.2, "lines"), + legend.text = ggplot2::element_text(size = 11), + legend.title = ggplot2::element_text(size = 12) + ) + + ggplot2::guides( + color = ggplot2::guide_legend( + override.aes = list(size = 4) + ) + ) + + return(invisible(p_ref)) + } + + df_long_comp <- prepare_df_long( + df = df_comp, + target_cols = target_cols, + run_order = run_order, + is_qc = is_qc, + batch = batch, + plate = plate + ) + + p_comp <- plot_scatter_omics_feature( + df_long = df_long_comp, + title = title_comp, + batch = batch, + point_size = point_size + ) + + combined_plot <- patchwork::wrap_plots( + p_ref, + p_comp, + ncol = 2, + guides = "collect" + ) & + ggplot2::theme( + legend.position = "right", + legend.key.size = grid::unit(1.2, "lines"), + legend.text = ggplot2::element_text(size = 11), + plot.background = ggplot2::element_rect(fill = NA, colour = NA), + panel.background = ggplot2::element_rect(fill = NA, colour = NA), + legend.title = ggplot2::element_text(size = 12) + ) & + ggplot2::guides( + color = ggplot2::guide_legend( + override.aes = list(size = 4) + ) + ) + + invisible(combined_plot) +} + + +#' Prepare long-format data for feature plots +#' +#' Reshapes a wide data frame of feature intensities into long format +#' suitable for plotting against run order. Adds standardised columns for +#' run order, feature name, value, QC status, batch, and plate. +#' +#' @param df A data frame containing feature measurements and metadata. +#' @param target_cols Character vector of feature column names to include. +#' @param run_order Character scalar specifying the run order column. +#' @param is_qc Optional character scalar specifying the QC indicator +#' column. If \code{NULL}, all samples are treated as non-QC. +#' @param batch Optional character scalar specifying the batch column. +#' If \code{NULL}, a single batch level \code{"all"} is used. +#' @param plate Optional character scalar specifying the plate column. +#' If \code{NULL}, a single plate level \code{"all"} is used. +#' +#' @return A long-format \code{data.frame} with columns: +#' \describe{ +#' \item{run_order}{Numeric run order.} +#' \item{feature}{Feature name.} +#' \item{value}{Feature intensity.} +#' \item{is_qc}{Logical QC indicator.} +#' \item{batch}{Factor batch assignment.} +#' \item{plate}{Factor plate assignment.} +#' } +#' +#' @examples +#' df_long <- prepare_df_long( +#' df = df, +#' target_cols = c("feat1", "feat2"), +#' run_order = "injection_order", +#' is_qc = "qc_flag", +#' batch = "batch", +#' plate = "plate" +#' ) +prepare_df_long <- function( + df, + target_cols, + run_order, + is_qc = NULL, + batch = NULL, + plate = NULL +) { + df_long <- df %>% + dplyr::select( + dplyr::any_of(c(batch, plate, run_order, is_qc)), + dplyr::all_of(target_cols) + ) %>% + tidyr::pivot_longer( + cols = dplyr::all_of(target_cols), + names_to = "feature", + values_to = "value" + ) + + df_long$run_order <- as.numeric(df_long[[run_order]]) + + df_long <- df_long %>% + dplyr::arrange(.data$run_order) + + # QC handling (column or default) + if (is.null(is_qc)) { + df_long$is_qc <- FALSE + } else { + df_long$is_qc <- as.logical(df_long[[is_qc]]) + } + + # Batch handling + if (is.null(batch)) { + df_long$batch <- factor("all") + } else { + df_long$batch <- factor(df_long[[batch]]) + } + + # Plate handling + if (is.null(plate)) { + df_long$plate <- factor("all") + } else { + df_long$plate <- factor(df_long[[plate]]) + } + + df_long +} + +#' Plot feature intensities across run order +#' +#' Generates scatter plots of feature intensities against run order from a +#' long-format data frame. Points are coloured by plate and styled by QC +#' status. Optional vertical lines indicate batch boundaries. +#' +#' @param df_long A long-format data frame produced by +#' \code{OmicsProcessing::prepare_df_long()}. +#' @param title Optional plot title. +#' @param batch Optional character scalar specifying the batch column. +#' If provided, vertical dotted lines are drawn at batch boundaries. +#' @param point_size Numeric scalar controlling base point size. +#' +#' @return A \code{ggplot2} object. +#' +#' @examples +#' p <- plot_scatter_omics_feature( +#' df_long = df_long, +#' title = "Before normalisation", +#' batch = "batch", +#' point_size = 1 +#' ) +#' print(p) +plot_scatter_omics_feature <- function( + df_long, + title = NULL, + batch = NULL, + point_size = 1 +) { + if (is.null(batch)) { + batch_boundaries <- NULL + } else { + batch_boundaries <- df_long %>% + dplyr::group_by(.data$batch) %>% + dplyr::summarise( + max_run = max(.data$run_order, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::pull(.data$max_run) + } + + num_plates <- length(unique(df_long$plate)) + plate_colors <- if (num_plates > 1) { + grDevices::rainbow(num_plates) + } else { + "grey40" + } + + fig <- ggplot2::ggplot( + df_long, + ggplot2::aes( + x = .data$run_order, + y = .data$value, + color = .data$plate, + shape = .data$is_qc, + alpha = .data$is_qc, + size = .data$is_qc + ) + ) + + ggplot2::geom_point() + + ggplot2::facet_wrap(~feature, ncol = 1, scales = "free_y") + + ggplot2::scale_color_manual(values = plate_colors) + + ggplot2::scale_shape_manual( + name = "SampleType", + values = c(`TRUE` = 16, `FALSE` = 17), + breaks = c(TRUE, FALSE), + labels = c("QC", "Sample") + ) + + ggplot2::scale_size_manual( + values = c(`TRUE` = point_size + 1, `FALSE` = point_size), + guide = "none" + ) + + ggplot2::scale_alpha_manual( + values = c(`TRUE` = 1, `FALSE` = 0.3), + guide = "none" + ) + + ggplot2::labs( + title = title, + x = "Run order", + y = "Feature intensity", + color = "Plate" + ) + + ggplot2::theme_minimal(base_size = 9) + + ggplot2::theme( + panel.grid.major.x = ggplot2::element_blank(), + axis.text.x = ggplot2::element_text(angle = 45, hjust = 1), + legend.position = "bottom" + ) + + if (!is.null(batch)) { + fig <- fig + + ggplot2::geom_vline( + xintercept = batch_boundaries[!is.na(batch_boundaries)] + 0.5, + color = "black", + linetype = "dotted", + linewidth = 0.5 + ) + } + + fig +} diff --git a/data-raw/omics_synthetic.R b/data-raw/omics_synthetic.R new file mode 100644 index 0000000..c39df5d --- /dev/null +++ b/data-raw/omics_synthetic.R @@ -0,0 +1,13 @@ +# Regenerate the packaged omics_synthetic dataset. + +source("inst/scripts/omics_synthetic.R") + +generated <- build_omics_synthetic(seed = 1) + +omics_synthetic <- generated$omics_synthetic + +save( + omics_synthetic, + file = "data/omics_synthetic.rda", + compress = "bzip2" +) diff --git a/data/omics_synthetic.rda b/data/omics_synthetic.rda new file mode 100644 index 0000000..e08e728 Binary files /dev/null and b/data/omics_synthetic.rda differ diff --git a/inst/scripts/omics_synthetic.R b/inst/scripts/omics_synthetic.R new file mode 100644 index 0000000..45e8c32 --- /dev/null +++ b/inst/scripts/omics_synthetic.R @@ -0,0 +1,98 @@ +# Synthetic omics data generator used for examples and vignettes in +# OmicsProcessing. + +build_omics_synthetic <- function( + seed = 1, + n = 2000, + n_batch = 2, + n_plate_per_batch = 2, + n_features = 10, + qc_frac = 0.05 +) { + set.seed(seed) + + feature_cols <- paste0("F", seq_len(n_features)) + feature_means <- stats::runif(n_features, min = 100, max = 700) + feature_sds <- stats::runif(n_features, min = 3, max = 30) + + feature_matrix <- vapply( + seq_len(n_features), + FUN.VALUE = numeric(n), + FUN = function(j) { + stats::rnorm(n, mean = feature_means[j], sd = feature_sds[j]) + } + ) + colnames(feature_matrix) <- feature_cols + + omics_synthetic <- data.frame( + plate_id = factor(sample.int(n_plate_per_batch * n_batch, n, replace = TRUE)), + feature_matrix, + check.names = FALSE + ) + + omics_synthetic <- omics_synthetic[order(omics_synthetic$plate_id), , drop = FALSE] + rownames(omics_synthetic) <- NULL + + omics_synthetic$run_ord <- seq_len(nrow(omics_synthetic)) + omics_synthetic$batch_id <- + (as.integer(omics_synthetic$plate_id) - 1L) %/% n_plate_per_batch + 1L + + n_qc <- ceiling(qc_frac * nrow(omics_synthetic)) + qc_idx <- sample.int(nrow(omics_synthetic), size = n_qc, replace = FALSE) + omics_synthetic$is_qc <- FALSE + omics_synthetic$is_qc[qc_idx] <- TRUE + + drift_slopes <- stats::runif(n_features, min = 0.01, max = 0.1) + for (j in seq_len(n_features)) { + omics_synthetic[[feature_cols[j]]] <- + omics_synthetic[[feature_cols[j]]] - drift_slopes[j] * omics_synthetic$run_ord + } + + jump_info <- vector("list", length = n_features) + names(jump_info) <- feature_cols + + for (j in seq_len(n_features)) { + n_jumps <- sample(1:2, size = 1) + jump_points <- sort(sample(100:(n - 100), size = n_jumps)) + jump_sizes <- stats::runif(n_jumps, min = 0.5, max = 1.5) * feature_sds[j] * 2 + + for (k in seq_len(n_jumps)) { + idx <- omics_synthetic$run_ord >= jump_points[k] + omics_synthetic[[feature_cols[j]]][idx] <- + omics_synthetic[[feature_cols[j]]][idx] + jump_sizes[k] + } + + jump_info[[j]] <- data.frame( + feature = feature_cols[j], + jump_id = seq_len(n_jumps), + jump_point = jump_points, + jump_size = jump_sizes + ) + } + + jump_info <- do.call(rbind, jump_info) + rownames(jump_info) <- NULL + + list( + omics_synthetic = omics_synthetic, + jump_info = jump_info, + parameters = list( + seed = seed, + n = n, + n_batch = n_batch, + n_plate_per_batch = n_plate_per_batch, + n_features = n_features, + qc_frac = qc_frac, + feature_means = feature_means, + feature_sds = feature_sds, + drift_slopes = drift_slopes + ) + ) +} + +# Example usage after locating this script with +# system.file("scripts", "omics_synthetic.R", package = "OmicsProcessing"): +# +# source(script_path) +# generated <- build_omics_synthetic() +# str(generated$omics_synthetic) diff --git a/man/build_omics_synthetic.Rd b/man/build_omics_synthetic.Rd new file mode 100644 index 0000000..ad6d34b --- /dev/null +++ b/man/build_omics_synthetic.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/build_omics_synthetic.R +\name{build_omics_synthetic} +\alias{build_omics_synthetic} +\title{Build synthetic omics data for plotting and normalisation examples} +\usage{ +build_omics_synthetic( + seed = 1, + n = 2000, + n_batch = 2, + n_plate_per_batch = 2, + n_features = 10, + qc_frac = 0.05 +) +} +\arguments{ +\item{seed}{Integer random seed.} + +\item{n}{Number of rows (samples).} + +\item{n_batch}{Number of batches.} + +\item{n_plate_per_batch}{Number of plates per batch.} + +\item{n_features}{Number of feature columns to generate.} + +\item{qc_frac}{Fraction of rows labelled as QC.} +} +\value{ +A list with three elements: +\describe{ +\item{omics_synthetic}{A synthetic omics data frame.} +\item{jump_info}{A data frame describing the simulated step changes per feature.} +\item{parameters}{A list of generation parameters and sampled values.} +} +} +\description{ +Create a synthetic omics data set with feature intensities, run-order +drift, batch structure, plate structure, and QC sample labels. This is the +generator used for the packaged \code{omics_synthetic} example data. +} +\examples{ +generated <- build_omics_synthetic(seed = 1) +str(generated$omics_synthetic) +head(generated$jump_info) + +} diff --git a/man/cluster_features_based_on_correlations.Rd b/man/cluster_features_based_on_correlations.Rd index b532a8d..8bac7a4 100644 --- a/man/cluster_features_based_on_correlations.Rd +++ b/man/cluster_features_based_on_correlations.Rd @@ -88,8 +88,8 @@ data_tbl <- tibble::tibble( # Pre-defined feature groups (e.g. after a retention-time grouping step) cluster_feature_list <- list( - c("100@150", "100@151"), # group of size 2 - "101@200" # group of size 1 + c("100@150", "100@151"), # group of size 2 + "101@200" # group of size 1 ) res <- cluster_features_based_on_correlations( diff --git a/man/cluster_features_by_retention_time.Rd b/man/cluster_features_by_retention_time.Rd index 3a4b6ac..85f1ab5 100644 --- a/man/cluster_features_by_retention_time.Rd +++ b/man/cluster_features_by_retention_time.Rd @@ -24,8 +24,8 @@ rows.} \item{is_qc}{Logical vector marking QC samples. Must have length equal to \code{nrow(df)}. If \code{NULL}, all samples are treated as non-QC. -(Currently only used for consistency with other interfaces; it is not -used directly in this function.)} +QC rows are excluded from clustering/summarisation and from the returned +\code{clustered_df}.} \item{rt_height}{Numeric height at which to cut the dendrogram in retention-time space. Smaller values create more clusters.} @@ -38,8 +38,9 @@ processed using correlation-based summarisation.} \item{scores}{Named numeric vector of feature scores, required when \code{method = "scores"}. Names must coincide with \code{target_cols}. -For each retention-time cluster, the feature with the largest score is -chosen as representative.} +For each retention-time cluster, the features are subclustered based on +the input correlation threshold. The feature with the highest score is +selected as the representative of each subcluster.} \item{cut_height}{Numeric height used inside \code{cluster_features_based_on_correlations} when \code{method = "correlations"}. Passed on to that function's @@ -54,9 +55,12 @@ A list with two elements: \describe{ \item{\code{clustered_df}}{ -A data.frame with one column per final representative feature. -When \code{method = "scores"}, this consists of a subset of the -original feature columns. When \code{method = "correlations"}, it +A data.frame where rows correspond to non-QC samples only. +All non-target columns (i.e. \code{setdiff(names(df), target_cols)}) +are included unchanged, followed by one column per final +representative feature. +When \code{method = "scores"}, representative features are a subset of +the original feature columns. When \code{method = "correlations"}, it may contain synthetic features created by \code{cluster_features_based_on_correlations()}. } @@ -73,6 +77,11 @@ hierarchical clustering, then selects a representative feature for each cluster either via scores or via correlation-based aggregation. } \details{ +QC rows (as indicated by \code{is_qc}) are excluded from all clustering and +summarisation steps, and are not present in the returned \code{clustered_df}. +Additionally, all columns in \code{df} that are not part of \code{target_cols} +are prepended to the returned \code{clustered_df} unchanged. + The function proceeds in two steps: \enumerate{ @@ -98,6 +107,7 @@ based on correlations. \examples{ \dontrun{ df <- data.frame( + batch = rep(1, 20), `100@150` = rnorm(20), `100@151` = rnorm(20), `101@200` = rnorm(20) diff --git a/man/get_features_representatives_based_on_scores.Rd b/man/get_features_representatives_based_on_scores.Rd index fc35409..61a3dd3 100644 --- a/man/get_features_representatives_based_on_scores.Rd +++ b/man/get_features_representatives_based_on_scores.Rd @@ -2,50 +2,95 @@ % Please edit documentation in R/feature_clustering.R \name{get_features_representatives_based_on_scores} \alias{get_features_representatives_based_on_scores} -\title{Select representative features per cluster based on scores} +\title{Select representative features per cluster using correlation threshold and scores} \usage{ -get_features_representatives_based_on_scores(cluster_feature_list, scores) +get_features_representatives_based_on_scores( + df, + cluster_feature_list, + corr_thresh, + scores +) } \arguments{ -\item{cluster_feature_list}{A list where each element is a character vector of feature names -belonging to one cluster. Each element must be a non-empty character -vector.} +\item{df}{A data frame containing the feature columns referenced in +\code{cluster_feature_list}. Column names must match the feature names.} + +\item{cluster_feature_list}{A list where each element is a non-empty character vector of feature names +belonging to one cluster (e.g., the output of a prior clustering step).} + +\item{corr_thresh}{A numeric threshold in \code{[0, 1]}. For two-feature clusters, if +\code{|cor| >= corr_thresh}, the feature with the highest score is selected. +For clusters with more than two features, subclusters are created by cutting +the hierarchical tree at \code{h = 1 - corr_thresh}.} -\item{scores}{A named numeric vector where names correspond to feature names and -values represent feature scores. All features appearing in +\item{scores}{A named numeric vector of feature scores. All features appearing in \code{cluster_feature_list} must be present in \code{names(scores)}.} } \value{ -A list with two elements: - +A list with two components: \describe{ \item{\code{representatives}}{ -A character vector giving one selected representative feature -per cluster. +A character vector of selected representative features (one per +single-feature cluster; zero, one, or more per multi-feature cluster, +depending on the correlation structure and threshold). } \item{\code{representatives_map}}{ -A named list mapping each representative feature to the vector -of original features in its cluster. +A named list mapping each representative feature (when uniquely chosen) +to the character vector of features it represents. } } } \description{ -For each cluster, this function selects a single representative feature. -If a cluster contains one feature, that feature is selected. If it contains -multiple features, the feature with the highest score in \code{scores} is -selected. +This function selects representative features from pre-defined clusters of +variables, using a correlation-based rule and a scoring vector: } \details{ -The function also returns a named list mapping each representative feature -to all cluster members it represents. +\itemize{ +\item If a cluster contains a single feature, that feature is selected. +\item If a cluster contains two features, the absolute pairwise correlation +is computed. If \code{|cor| >= corr_thresh}, only the highest-scoring +feature is selected; otherwise, both features are kept as representatives. +\item If a cluster contains more than two features, hierarchical clustering +is performed using the distance \code{1 - |cor|} with +\code{method = "average"}. The dendrogram is cut at +\code{h = 1 - corr_thresh} to form subclusters. From each subcluster, +the highest-scoring feature is selected as the representative. +} + +In all cases where a unique representative is chosen for a (sub)cluster, +\code{representatives_map} stores the mapping from the representative feature +to the set of original features it represents. + +For clusters of size greater than two, the distance matrix is computed as +\code{1 - abs(cor(X))} on the selected columns, followed by +\code{hclust(..., method = "average")}. Subclusters are extracted via +\code{cutree(hc, h = 1 - corr_thresh)}. Within each subcluster, the +representative is the feature with the maximum score. + +When a two-feature cluster has \code{|cor| < corr_thresh}, both features +are returned as representatives and no mapping is stored for that pair, +as neither represents the other by design. } \examples{ +# Toy data frame +set.seed(1) +df <- data.frame( + feat_a = rnorm(100), + feat_b = rnorm(100), + feat_c = rnorm(100), + feat_d = rnorm(100), + feat_e = rnorm(100), + feat_f = rnorm(100) +) + +# Example clusters cluster_feature_list <- list( - c("feat_a", "feat_b", "feat_c"), - "feat_d", - c("feat_e", "feat_f") + c("feat_a", "feat_b", "feat_c"), # size > 2 -> hierarchical subclusters + "feat_d", # size = 1 -> itself + c("feat_e", "feat_f") # size = 2 -> pairwise rule ) +# Scores for all features scores <- c( feat_a = 0.1, feat_b = 0.4, @@ -55,8 +100,13 @@ scores <- c( feat_f = 0.8 ) +# Choose a correlation threshold +corr_thresh <- 0.7 + res <- get_features_representatives_based_on_scores( + df = df, cluster_feature_list = cluster_feature_list, + corr_thresh = corr_thresh, scores = scores ) diff --git a/man/normalise_SERRF.Rd b/man/normalise_SERRF.Rd index ed43245..16c3488 100644 --- a/man/normalise_SERRF.Rd +++ b/man/normalise_SERRF.Rd @@ -9,7 +9,8 @@ normalise_SERRF( target_cols = NULL, is_qc = NULL, strata_col = NULL, - num_screen_SERRF = 10 + num_screen_SERRF = 10, + seed = 0 ) } \arguments{ @@ -24,6 +25,8 @@ single-level strata is created so all samples are treated as one batch. When provided, the column must exist in \code{df} with no NA values.} \item{num_screen_SERRF}{Number of correlated features to use in model fitting. Default is 10.} + +\item{seed}{Set a random number generator seed, default 0.} } \value{ A normalized data frame. diff --git a/man/omics_synthetic.Rd b/man/omics_synthetic.Rd new file mode 100644 index 0000000..03eee68 --- /dev/null +++ b/man/omics_synthetic.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{omics_synthetic} +\alias{omics_synthetic} +\title{Synthetic omics dataset for plotting and normalisation examples} +\format{ +A data frame with 2000 rows and 14 variables: +\describe{ +\item{plate_id}{Plate identifier with 4 levels.} +\item{F1-F10}{Numeric feature intensity columns.} +\item{run_ord}{Integer run order.} +\item{batch_id}{Batch identifier.} +\item{is_qc}{Logical indicator for QC samples.} +} +} +\usage{ +data(omics_synthetic) +} +\description{ +A synthetic omics dataset with run order, batch, plate, QC flags, and ten +feature columns. It is intended for examples and vignettes demonstrating +plotting and normalisation workflows. Reproducible data-generation code is +available in \code{data-raw/omics_synthetic.R} in the source repository and in +the installed package via \code{system.file("scripts", "omics_synthetic.R", package = "OmicsProcessing")}. +} +\keyword{datasets} diff --git a/man/plot_omics_distributions.Rd b/man/plot_omics_distributions.Rd new file mode 100644 index 0000000..8c7b3f1 --- /dev/null +++ b/man/plot_omics_distributions.Rd @@ -0,0 +1,112 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_omics_distributions.R +\name{plot_omics_distributions} +\alias{plot_omics_distributions} +\title{Plot feature values by run order for reference and comparison data} +\usage{ +plot_omics_distributions( + df, + target_cols, + run_order, + is_qc = NULL, + batch = NULL, + plate = NULL, + title = NULL, + df_comp = NULL, + title_comp = NULL, + point_size = 1 +) +} +\arguments{ +\item{df}{A data frame containing the reference data. It must include the +columns referenced by \code{target_cols} and the column named in +\code{run_order}. It may also include columns named in \code{batch} and \code{plate}.} + +\item{target_cols}{Character vector giving the feature columns to plot. +These columns are pivoted to long format and faceted with one panel per +feature.} + +\item{run_order}{Character scalar giving the name of the column encoding +run order. This column is coerced to numeric and used as the x-axis.} + +\item{is_qc}{Optional character scalar specifying the QC indicator +column. If \code{NULL}, all samples are treated as non-QC.} + +\item{batch}{Optional character scalar giving the name of the column +encoding batch membership. If provided, vertical dotted lines are drawn +at the end of each batch. If \code{NULL}, all samples are treated as a single +batch and no boundary lines are drawn. Default is \code{NULL}.} + +\item{plate}{Optional character scalar giving the name of the column +encoding plate membership. If provided, points are coloured by plate. If +\code{NULL}, all points are assigned to a single plate level \code{"all"}. +Default is \code{NULL}.} + +\item{title}{Optional title for the reference plot. If \code{df_comp} is not +\code{NULL} and \code{title_ref} is \code{NULL}, the default title is \code{"Reference"}. +Default is \code{NULL}.} + +\item{df_comp}{Optional data frame containing comparison data, for example +normalised values. If provided, it must include the columns referenced by +\code{target_cols} and the column named in \code{run_order}. It may also include +columns named in \code{batch} and \code{plate}. Default is \code{NULL}.} + +\item{title_comp}{Optional title for the comparison plot. If \code{df_comp} is +not \code{NULL} and \code{title_comp} is \code{NULL}, the default title is +\code{"Comparison"}. Default is \code{NULL}.} + +\item{point_size}{Numeric scalar giving the base point size for non-QC +samples. QC points are drawn one unit larger. Default is \code{1}.} +} +\value{ +Invisibly returns a plot object: +\itemize{ +\item If \code{df_comp} is \code{NULL}, a single \code{ggplot2} plot for the reference +data. +\item Otherwise, a combined \code{patchwork} plot with the reference and +comparison plots shown side by side and a shared legend. +} +} +\description{ +Visualise omics feature values across run order for a set of target +features, highlighting QC versus non-QC samples. The function is +designed to assess technical variation associated with run order and +to evaluate the influence of stratification factors such as plate or +batch on feature measurements. It generates scatter plots of feature +values over run order, optionally colouring points by plate and marking +batch boundaries with vertical dotted lines. + +If \code{df_comp} is supplied, the function returns a side-by-side comparison +of the reference and comparison data with a shared legend. Otherwise, it +returns only the reference plot. This feature can be used to assess the +effect of post-processing on the data (for example, normalisation). +} +\details{ +Internally, the function: +\itemize{ +\item Selects the columns \verb{\{batch, plate, run_order\} \eqn{\cup} target_cols}, using \code{dplyr::any_of()} for optional columns and +\code{dplyr::all_of()} for required feature columns. +\item Adds an \code{is_qc} flag and pivots the data to long format with +columns \code{feature} and \code{value}. +\item Coerces the run-order column to numeric and sorts rows by run +order. +\item Creates a scatter plot of \code{value} versus \code{run_order}, faceted by +feature with \code{scales = "free_y"}. +\item Maps point colour to plate, shape to QC status, alpha to QC +status, and size to QC status. +\item Uses a rainbow palette when multiple plate levels are present and +\code{"grey40"} when only one plate level is present. +\item Optionally adds vertical dotted lines at the maximum run order +within each batch. +\item If \code{df_comp} is provided, combines the reference and comparison +plots with \code{patchwork::wrap_plots()} and collects guides into a shared +legend. +} + +QC samples are shown as filled circles and non-QC samples as triangles. +Non-QC samples are also plotted with lower alpha. +} +\author{ +Original version developed by Carlota Castro Espin. Modified by Felix +Boekstegers. +} diff --git a/man/plot_scatter_omics_feature.Rd b/man/plot_scatter_omics_feature.Rd new file mode 100644 index 0000000..2b7f57c --- /dev/null +++ b/man/plot_scatter_omics_feature.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_omics_distributions.R +\name{plot_scatter_omics_feature} +\alias{plot_scatter_omics_feature} +\title{Plot feature intensities across run order} +\usage{ +plot_scatter_omics_feature(df_long, title = NULL, batch = NULL, point_size = 1) +} +\arguments{ +\item{df_long}{A long-format data frame produced by +\code{OmicsProcessing::prepare_df_long()}.} + +\item{title}{Optional plot title.} + +\item{batch}{Optional character scalar specifying the batch column. +If provided, vertical dotted lines are drawn at batch boundaries.} + +\item{point_size}{Numeric scalar controlling base point size.} +} +\value{ +A \code{ggplot2} object. +} +\description{ +Generates scatter plots of feature intensities against run order from a +long-format data frame. Points are coloured by plate and styled by QC +status. Optional vertical lines indicate batch boundaries. +} +\examples{ +p <- plot_scatter_omics_feature( + df_long = df_long, + title = "Before normalisation", + batch = "batch", + point_size = 1 +) +print(p) +} diff --git a/man/prepare_df_long.Rd b/man/prepare_df_long.Rd new file mode 100644 index 0000000..cb8e2ee --- /dev/null +++ b/man/prepare_df_long.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_omics_distributions.R +\name{prepare_df_long} +\alias{prepare_df_long} +\title{Prepare long-format data for feature plots} +\usage{ +prepare_df_long( + df, + target_cols, + run_order, + is_qc = NULL, + batch = NULL, + plate = NULL +) +} +\arguments{ +\item{df}{A data frame containing feature measurements and metadata.} + +\item{target_cols}{Character vector of feature column names to include.} + +\item{run_order}{Character scalar specifying the run order column.} + +\item{is_qc}{Optional character scalar specifying the QC indicator +column. If \code{NULL}, all samples are treated as non-QC.} + +\item{batch}{Optional character scalar specifying the batch column. +If \code{NULL}, a single batch level \code{"all"} is used.} + +\item{plate}{Optional character scalar specifying the plate column. +If \code{NULL}, a single plate level \code{"all"} is used.} +} +\value{ +A long-format \code{data.frame} with columns: +\describe{ +\item{run_order}{Numeric run order.} +\item{feature}{Feature name.} +\item{value}{Feature intensity.} +\item{is_qc}{Logical QC indicator.} +\item{batch}{Factor batch assignment.} +\item{plate}{Factor plate assignment.} +} +} +\description{ +Reshapes a wide data frame of feature intensities into long format +suitable for plotting against run order. Adds standardised columns for +run order, feature name, value, QC status, batch, and plate. +} +\examples{ +df_long <- prepare_df_long( + df = df, + target_cols = c("feat1", "feat2"), + run_order = "injection_order", + is_qc = "qc_flag", + batch = "batch", + plate = "plate" +) +} diff --git a/pkgdown/index.md b/pkgdown/index.md index 6e1c44b..4033447 100644 --- a/pkgdown/index.md +++ b/pkgdown/index.md @@ -17,6 +17,7 @@ Pre-analysis processing for metabolomics and proteomics: missingness filtering, - Detect outlier samples with [`remove_outliers()`](reference/remove_outliers.html) ([vignette](articles/outlier-removal.html)) - Impute with RF, LCMD, or both via [`hybrid_imputation()`](reference/hybrid_imputation.html) ([vignette](articles/hybrid-imputation.html)) - Normalise with SERRF using [`normalise_SERRF()`](reference/normalise_SERRF.html) ([vignette](articles/serrf-normalisation.html)) + - Compare run-order plots before and after normalisation with `plot_omics_distributions()` ([vignette](articles/plot-omics-distributions.html)) - Cluster features by RT or correlations using [`cluster_features_by_retention_time()`](reference/cluster_features_by_retention_time.html) ([vignette](articles/feature-clustering.html)) ## Quick start @@ -102,6 +103,26 @@ clusters <- cluster_features_by_retention_time( corr_thresh = 0.75 ) ``` +### Quick visual check after normalisation + +Before moving on to downstream analyses, it is often useful to inspect a few features manually. A simple strategy is to choose 3 features at random and compare their run-order profiles before and after normalisation with [`plot_omics_distributions()`](articles/plot-omics-distributions.html). + +```r +set.seed(1) +sample_features <- sample(target_cols, 3) + +plot_omics_distributions( + df = df_imputed, + df_comp = df_normalised, + target_cols = sample_features, + run_order = "run_ord", + is_qc = "is_qc", + batch = "batch_id", + plate = "plate_id", + title = "Before normalisation", + title_comp = "After normalisation" +) +``` ## Developers & Contributors @@ -118,5 +139,6 @@ Please follow these guidelines: [Developers & Contributors](articles/developer-g - Semi-automated pipeline details: [Semi-automated pipeline vignette](articles/process_data.html) - Log-transform features: [Log transformation (log1p)](articles/log-transformation.html) - SERRF batch correction: [Batch correction using SERRF](articles/serrf-normalisation.html) +- Normalisation plot comparison: [Compare run-order plots before and after normalisation](articles/plot-omics-distributions.html) - Feature clustering: [Retention-time clustering](articles/feature-clustering.html) - Developers & contributors: [Developer guide](articles/developer-guidelines.html) diff --git a/tests/testthat/test-feature_clustering.R b/tests/testthat/test-feature_clustering.R index c51b9c1..eaa4589 100644 --- a/tests/testthat/test-feature_clustering.R +++ b/tests/testthat/test-feature_clustering.R @@ -25,29 +25,6 @@ test_that("clusters matrix input into k groups", { expect_equal(unname(res), c(1L, 1L, 2L, 2L)) }) -test_that("retention-time clustering with scores picks highest per RT cluster", { - df <- data.frame( - "100@150" = 1:4, - "100@151" = 5:8, - "101@200" = 9:12, - check.names = FALSE - ) - scores <- c("100@150" = 0.2, "100@151" = 0.8, "101@200" = 0.5) - - res <- cluster_features_by_retention_time( - df = df, - target_cols = c("100@150", "100@151", "101@200"), - is_qc = rep(FALSE, nrow(df)), - rt_height = 1, # puts first two features together, third separate - method = "scores", - scores = scores - ) - - expect_equal(names(res$clustered_df), c("100@151", "101@200")) - expect_equal(res$representatives_map$`100@151`, c("100@150", "100@151")) - expect_null(res$representatives_map$`101@200`) -}) - test_that("correlation-based path produces synthetic feature for RT cluster", { skip_if_not_installed("FactoMineR") skip_if_not_installed("ClustOfVar") diff --git a/tests/testthat/test-get_features_representatives_based_on_scores.R b/tests/testthat/test-get_features_representatives_based_on_scores.R index 6f94340..28dc038 100644 --- a/tests/testthat/test-get_features_representatives_based_on_scores.R +++ b/tests/testthat/test-get_features_representatives_based_on_scores.R @@ -1,60 +1,96 @@ -test_that("cluster_feature_list must be a list", { - expect_error( - get_features_representatives_based_on_scores("not_a_list", c(a = 1)), - "`cluster_feature_list` must be a list\\." - ) -}) - -test_that("scores must be a named numeric vector", { - expect_error( - get_features_representatives_based_on_scores(list("a"), c(1, 2)), - "`scores` must be a named numeric vector\\." - ) -}) -test_that("each cluster element must be character vector", { - expect_error( - get_features_representatives_based_on_scores(list(1:3), c(a = 1)), - "Each element of `cluster_feature_list` must be a" - ) -}) +test_that("get_features_representatives_based_on_scores works on example data", { + set.seed(789) + f0 <- rnorm(15) + f1 <- rnorm(15) + f2 <- f1 + rnorm(15, sd = 0.01) + f3 <- f1 + rnorm(15, sd = 0.01) + f4 <- rnorm(15) + f5 <- f4 + rnorm(15, sd = 0.01) + f6 <- rnorm(15) + f7 <- rnorm(15) + f8 <- f7 + rnorm(15, sd = 0.01) + f9 <- rnorm(15) + f10 <- rnorm(15) -test_that("errors when scores for cluster members are missing", { - expect_error( - get_features_representatives_based_on_scores( - list("a", c("b", "c")), - c(a = 1, b = 2) - ), - "Some features in cluster 2 have no score: c" + df <- data.frame( + "f0@1" = f0, + "f1@1" = f1, + "f2@1" = f2, + "f3@1" = f3, + "f4@1" = f4, + "f5@1" = f5, + "f6@2" = f6, + "f7@3" = f7, + "f8@3" = f8, + "f9@4" = f9, + "f10@4" = f10, check.names = FALSE ) -}) -test_that("selects highest scoring feature per cluster and returns map", { - clusters <- list( - c("feat_a", "feat_b", "feat_c"), - "feat_d", - c("feat_e", "feat_f") - ) scores <- c( - feat_a = 0.1, - feat_b = 0.4, - feat_c = 0.3, - feat_d = 0.9, - feat_e = 0.2, - feat_f = 0.8 + "f0@1" = 0.1, + "f1@1" = 0.1, + "f2@1" = 0.4, # should win cluster 1 + "f3@1" = 0.3, + "f4@1" = 0.9, # should win cluster 2 + "f5@1" = 0.8, + "f6@2" = 0.8, # single-feature cluster + "f7@3" = 0.8, + "f8@3" = 1.8, # should win cluster 3 + "f9@4" = 1.8, # tie with f10@4 -> first max index chosen + "f10@4" = 1.8 + ) + # Cluster list: + # cluster 1: 4 highly correlated features around "1" + # cluster 2: 2 highly correlated features + # cluster 3: 2 highly correlated features + # cluster 4: 2 moderately correlated features + cluster_feature_list <- list( + c("f0@1", "f1@1", "f2@1", "f3@1", "f4@1", "f5@1"), + "f6@2", + c("f7@3", "f8@3"), + c("f9@4", "f10@4") ) - res <- get_features_representatives_based_on_scores( - cluster_feature_list = clusters, + corr_thresh <- 0.75 + + result <- get_features_representatives_based_on_scores( + df = df, + cluster_feature_list = cluster_feature_list, + corr_thresh = corr_thresh, scores = scores ) - expect_equal(res$representatives, c("feat_b", "feat_d", "feat_f")) - expect_equal( - res$representatives_map, - list( - feat_b = c("feat_a", "feat_b", "feat_c"), - feat_f = c("feat_e", "feat_f") - ) - ) + reps <- result$representatives + reps_map <- result$representatives_map + + # === Scenario 1: single-feature cluster in an RT group2 === + expect_true("f6@2" %in% reps) + + # === Scenario 2: multi-feature (size > 2) high correlation subcluster in RT group1 === + # f2@1 has highest score among f0,f1,f2,f3 + expect_true("f2@1" %in% reps) + expect_equal(reps_map[["f2@1"]], c("f1@1", "f2@1", "f3@1")) + + # === Scenario 2.1: single-feature in an RT group1 === + # f2@1 has highest score among f0,f1,f2,f3 + expect_true("f0@1" %in% reps) + expect_true(is.null(reps_map[["f0@1"]])) + + # === Scenario 3: 2-feature cluster with high correlation in an RT group1 === + # f4@1 and f5@1 are highly correlated; f4@1 wins (score 0.9) + expect_true("f4@1" %in% reps) + expect_equal(reps_map[["f4@1"]], c("f4@1", "f5@1")) + + # === Scenario 4: Another 2-feature cluster with high correlation in RT group3 === + # f7@3 vs f8@3 → f8@3 wins (score 1.8) + expect_true("f8@3" %in% reps) + expect_equal(reps_map[["f8@3"]], c("f7@3", "f8@3")) + + # === Scenario 5: 2 un correlated features in RT group4 === + # f9@4 and f10@4 tie in score → the first max (f9@4) should be selected + expect_true("f9@4" %in% reps) + expect_true("f10@4" %in% reps) + expect_true(is.null(reps_map[["f9@4"]])) + expect_true(is.null(reps_map[["f10@4"]])) }) diff --git a/tests/testthat/test_plot_omics_distributions.R b/tests/testthat/test_plot_omics_distributions.R new file mode 100644 index 0000000..73949ab --- /dev/null +++ b/tests/testthat/test_plot_omics_distributions.R @@ -0,0 +1,198 @@ +testthat::test_that("prepare_df_long reshapes data and standardises columns", { + df <- data.frame( + injection_order = c(2, 1, 3), + qc_flag = c(TRUE, FALSE, TRUE), + batch_id = c("b1", "b1", "b2"), + plate_id = c("p1", "p2", "p1"), + feat1 = c(10, 20, 30), + feat2 = c(100, 200, 300) + ) + + out <- prepare_df_long( + df = df, + target_cols = c("feat1", "feat2"), + run_order = "injection_order", + is_qc = "qc_flag", + batch = "batch_id", + plate = "plate_id" + ) + + testthat::expect_s3_class(out, "data.frame") + testthat::expect_equal( + sort(names(out)), + sort(c( + "injection_order", "qc_flag", "batch_id", "plate_id", + "feature", "value", "run_order", "is_qc", "batch", "plate" + )) + ) + testthat::expect_equal(nrow(out), 6) + testthat::expect_true(is.numeric(out$run_order)) + testthat::expect_true(is.logical(out$is_qc)) + testthat::expect_s3_class(out$batch, "factor") + testthat::expect_s3_class(out$plate, "factor") + testthat::expect_equal(out$run_order, c(1, 1, 2, 2, 3, 3)) + testthat::expect_setequal(unique(out$feature), c("feat1", "feat2")) +}) + +testthat::test_that("prepare_df_long uses defaults when qc, batch, and plate are NULL", { + df <- data.frame( + injection_order = c(2, 1), + feat1 = c(10, 20) + ) + + out <- prepare_df_long( + df = df, + target_cols = "feat1", + run_order = "injection_order", + is_qc = NULL, + batch = NULL, + plate = NULL + ) + + testthat::expect_equal(nrow(out), 2) + testthat::expect_identical(out$is_qc, c(FALSE, FALSE)) + testthat::expect_equal(levels(out$batch), "all") + testthat::expect_equal(levels(out$plate), "all") + testthat::expect_equal(out$run_order, c(1, 2)) +}) + +testthat::test_that("plot_scatter_omics_feature returns a ggplot object", { + df_long <- data.frame( + run_order = c(1, 2, 3, 4), + value = c(10, 11, 20, 21), + feature = c("feat1", "feat1", "feat2", "feat2"), + is_qc = c(TRUE, FALSE, TRUE, FALSE), + batch = factor(c("b1", "b1", "b2", "b2")), + plate = factor(c("p1", "p1", "p2", "p2")) + ) + + p <- plot_scatter_omics_feature( + df_long = df_long, + title = "Test plot", + batch = "batch", + point_size = 1 + ) + + testthat::expect_s3_class(p, "ggplot") +}) + +testthat::test_that("plot_scatter_omics_feature adds batch boundary lines when batch is given", { + df_long <- data.frame( + run_order = c(1, 2, 3, 4), + value = c(10, 11, 20, 21), + feature = c("feat1", "feat1", "feat2", "feat2"), + is_qc = c(TRUE, FALSE, TRUE, FALSE), + batch = factor(c("b1", "b1", "b2", "b2")), + plate = factor(c("p1", "p1", "p2", "p2")) + ) + + p <- plot_scatter_omics_feature( + df_long = df_long, + batch = "batch" + ) + + layer_classes <- vapply( + p$layers, + function(x) class(x$geom)[1], + character(1) + ) + + testthat::expect_true("GeomVline" %in% layer_classes) +}) + +testthat::test_that("plot_scatter_omics_feature does not add batch lines when batch is NULL", { + df_long <- data.frame( + run_order = c(1, 2), + value = c(10, 20), + feature = c("feat1", "feat1"), + is_qc = c(TRUE, FALSE), + batch = factor(c("b1", "b1")), + plate = factor(c("p1", "p1")) + ) + + p <- plot_scatter_omics_feature( + df_long = df_long, + batch = NULL + ) + + layer_classes <- vapply( + p$layers, + function(x) class(x$geom)[1], + character(1) + ) + + testthat::expect_false("GeomVline" %in% layer_classes) +}) + +testthat::test_that("plot_omics_distributions returns a single ggplot without comparison data", { + df <- data.frame( + injection_order = c(1, 2, 3), + qc_flag = c(TRUE, FALSE, FALSE), + batch_id = c("b1", "b1", "b2"), + plate_id = c("p1", "p1", "p2"), + feat1 = c(10, 20, 30) + ) + + p <- plot_omics_distributions( + df = df, + target_cols = "feat1", + run_order = "injection_order", + is_qc = "qc_flag", + batch = "batch_id", + plate = "plate_id" + ) + + testthat::expect_s3_class(p, "ggplot") +}) + +testthat::test_that("plot_omics_distributions returns a patchwork object with comparison data", { + df <- data.frame( + injection_order = c(1, 2, 3), + qc_flag = c(TRUE, FALSE, FALSE), + batch_id = c("b1", "b1", "b2"), + plate_id = c("p1", "p1", "p2"), + feat1 = c(10, 20, 30) + ) + + df_comp <- data.frame( + injection_order = c(1, 2, 3), + qc_flag = c(TRUE, FALSE, FALSE), + batch_id = c("b1", "b1", "b2"), + plate_id = c("p1", "p1", "p2"), + feat1 = c(11, 19, 31) + ) + + p <- plot_omics_distributions( + df = df, + df_comp = df_comp, + target_cols = "feat1", + run_order = "injection_order", + is_qc = "qc_flag", + batch = "batch_id", + plate = "plate_id" + ) + + testthat::expect_true(inherits(p, "patchwork")) +}) + +testthat::test_that("plot_omics_distributions sets default titles when comparison is supplied", { + df <- data.frame( + injection_order = c(1, 2), + qc_flag = c(TRUE, FALSE), + batch_id = c("b1", "b1"), + plate_id = c("p1", "p1"), + feat1 = c(10, 20) + ) + + p <- plot_omics_distributions( + df = df, + df_comp = df, + target_cols = "feat1", + run_order = "injection_order", + is_qc = "qc_flag", + batch = "batch_id", + plate = "plate_id" + ) + + testthat::expect_true(inherits(p, "patchwork")) +}) diff --git a/vignettes/before_after.png b/vignettes/before_after.png new file mode 100644 index 0000000..b7ed55e Binary files /dev/null and b/vignettes/before_after.png differ diff --git a/vignettes/before_after_multi.png b/vignettes/before_after_multi.png new file mode 100644 index 0000000..1743a8f Binary files /dev/null and b/vignettes/before_after_multi.png differ diff --git a/vignettes/before_batch.png b/vignettes/before_batch.png new file mode 100644 index 0000000..b8c3d67 Binary files /dev/null and b/vignettes/before_batch.png differ diff --git a/vignettes/before_minimum.png b/vignettes/before_minimum.png new file mode 100644 index 0000000..8582baf Binary files /dev/null and b/vignettes/before_minimum.png differ diff --git a/vignettes/before_multi.png b/vignettes/before_multi.png new file mode 100644 index 0000000..b0827b1 Binary files /dev/null and b/vignettes/before_multi.png differ diff --git a/vignettes/before_plate.png b/vignettes/before_plate.png new file mode 100644 index 0000000..79e912b Binary files /dev/null and b/vignettes/before_plate.png differ diff --git a/vignettes/before_qc.png b/vignettes/before_qc.png new file mode 100644 index 0000000..0146029 Binary files /dev/null and b/vignettes/before_qc.png differ diff --git a/vignettes/feature-clustering.Rmd b/vignettes/feature-clustering.Rmd index 6188e0f..b3e7fc4 100644 --- a/vignettes/feature-clustering.Rmd +++ b/vignettes/feature-clustering.Rmd @@ -46,6 +46,7 @@ res_scores <- OmicsProcessing::cluster_features_by_retention_time( target_cols = target_cols, rt_height = 0.07, method = "scores", + corr_thresh = 0.75, scores = scores ) @@ -53,7 +54,9 @@ clustered_df <- res_scores$clustered_df # original features + representative rep_map <- res_scores$representatives_map # mapping of representatives to raw features ``` -The `rt_height` parameter defines the maximum RT span for forming a cluster: features whose RTs differ by less than this value are grouped. Within each cluster, the supplied `scores` determine the representative. In this example, pre-normalisation mean intensities serve as scores; consequently, each representative is the feature with the highest mean intensity before normalisation. The `representatives_map` lists representatives as names, with vectors of raw feature names as their values. +The `rt_height` parameter defines the maximum RT span for forming a cluster: features whose RTs differ by less than this value are grouped. Within each RT‑based group, pairwise correlations between features are calculated, and the features are further partitioned into correlation‑based subclusters. The supplied `scores` determine the representative feature for each subcluster. In this example, pre-normalisation mean intensities serve as scores; consequently, each representative is the feature with the highest mean intensity before normalisation. + +The `representatives_map` lists representatives as names, with vectors of raw feature names as their values. In effect, features with similar retention times are identified, and among those with high correlation, the feature with the highest pre‑normalisation mean intensity is selected as the representative. ### How representatives are selected with the correlation method diff --git a/vignettes/plot-omics-distributions.Rmd b/vignettes/plot-omics-distributions.Rmd new file mode 100644 index 0000000..8aa8536 --- /dev/null +++ b/vignettes/plot-omics-distributions.Rmd @@ -0,0 +1,327 @@ +--- +title: "Normalisation plot comparison" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Normalisation plot comparison} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Overview + +This vignette shows how to use +`OmicsProcessing::plot_omics_distributions()` to inspect how one or more +features behave across run order. + +These plots are useful when you want to answer practical questions +such as: + +- Do measurements drift over time during the run? +- Do QC samples behave differently from study samples? +- Are there visible plate or batch effects? +- Does normalisation reduce unwanted technical structure? + +You do not need to prepare your own data to follow this vignette. The +package contains a built-in synthetic example data set called +`omics_synthetic`. + +Reproducible data-generation code for `omics_synthetic` is available in +the source repository at `data-raw/omics_synthetic.R`. After package +installation, you can locate the installable copy with: + +[Function reference for `build_omics_synthetic()`](../reference/build_omics_synthetic.html) + +```{r eval=FALSE} +system.file("scripts", "omics_synthetic.R", package = "OmicsProcessing") +``` + +## Load example data + +```{r eval=FALSE} +data("omics_synthetic", package = "OmicsProcessing") + +omics_synthetic +``` + +The example data contains: + +- `F1` to `F10`: feature intensity columns +- `run_ord`: the injection or acquisition order +- `is_qc`: a logical column indicating QC samples +- `plate_id`: plate membership +- `batch_id`: batch membership + +In practice, your own data should contain the same kind of information, +even if your column names are different. The plotting function lets you +specify your own column names explicitly, so your data does not need to +look exactly like this example. + +## Plot Feature Distributions With Minimal Inputs + +The smallest useful call needs only: + +- a data frame, +- one or more feature columns, and +- a run-order column. + +```{r eval=FALSE} +p <- OmicsProcessing:::plot_omics_distributions( + df = omics_synthetic, + target_cols = "F1", + run_order = "run_ord" +) + +p +``` + +This first version is a good screening plot. It lets you see whether the +feature stays roughly stable over the analytical run or whether it shows +drift or sudden shifts. For a first check of a new data set, this is +often the best place to start because it keeps the display simple. + +```{r echo=FALSE, out.width='100%', fig.align='center'} +knitr::include_graphics("before_minimum.png") +``` + +## Add Information Layers + +The same plot becomes much more informative once you add sample +annotations. The following examples build up the plot step by step. + +### Add QC Indicator + +If your data contains QC samples, adding `is_qc` helps distinguish +technical control samples from study samples. + +```{r eval=FALSE} +p <- OmicsProcessing:::plot_omics_distributions( + df = omics_synthetic, + target_cols = "F1", + run_order = "run_ord", + is_qc = "is_qc" +) + +p +``` + +This is often the most informative extra layer. If QC points follow a +clear trend across run order, that usually suggests technical drift +rather than a biological effect. + +```{r echo=FALSE, out.width='100%', fig.align='center'} +knitr::include_graphics("before_qc.png") +``` + +### Add Plate Information + +If samples were measured across different plates, colouring by plate can +reveal plate-to-plate shifts. + +```{r eval=FALSE} +p <- OmicsProcessing:::plot_omics_distributions( + df = omics_synthetic, + target_cols = "F1", + run_order = "run_ord", + is_qc = "is_qc", + plate = "plate_id" +) + +p +``` + +This is useful when you suspect systematic differences between +laboratory processing groups. If one plate forms a visibly separate +pattern, that may indicate a technical rather than biological source of +variation. + +```{r echo=FALSE, out.width='100%', fig.align='center'} +knitr::include_graphics("before_plate.png") +``` + +### Add Batch Information And A Title + +If you also provide a batch column, the function draws dotted vertical +lines at batch boundaries. + +```{r eval=FALSE} +p <- OmicsProcessing:::plot_omics_distributions( + df = omics_synthetic, + target_cols = "F1", + run_order = "run_ord", + is_qc = "is_qc", + plate = "plate_id", + batch = "batch_id", + title_ref = "Before normalisation" +) + +p +``` + +This makes it easier to spot abrupt changes that happen at batch +transitions rather than gradually over time. A sharp level change at a +batch boundary is often a sign that batch correction may be needed. + +```{r echo=FALSE, out.width='100%', fig.align='center'} +knitr::include_graphics("before_batch.png") +``` + +## Plot Multiple Features + +You can inspect several features at once by supplying multiple feature +names. + +```{r eval=FALSE} +p <- OmicsProcessing:::plot_omics_distributions( + df = omics_synthetic, + target_cols = c("F1", "F2", "F3"), + run_order = "run_ord", + is_qc = "is_qc", + plate = "plate_id", + batch = "batch_id", + title_ref = "Before normalisation" +) + +p +``` + +This is a useful compromise between single-feature inspection and a full +multivariate summary. It helps identify whether technical problems are +restricted to one feature or shared across several. If several features +show similar drift or step changes, the pattern is more likely to be a +technical artefact. + +```{r echo=FALSE, out.width='100%', fig.align='center'} +knitr::include_graphics("before_multi.png") +``` + +## Compare Feature Distributions Before And After Normalisation + +This function is especially useful when you want to compare a raw or +pre-processed data set against a normalised version. + +Typical preparation steps are: + +1. filter features and samples, +2. impute missing values if needed, +3. normalise the data, and +4. compare the before/after distributions. + +```{r eval=FALSE} +target_features <- c("F1", "F2", "F3") + +df_imputed <- omics_synthetic + +df_normalised <- OmicsProcessing::normalise_SERRF( + df_imputed, + target_cols = target_features, + is_qc = df_imputed$is_qc, + strata_col = "batch_id" +) + +OmicsProcessing:::plot_omics_distributions( + df = df_imputed, + df_comp = df_normalised, + target_cols = target_features, + run_order = "run_ord", + is_qc = "is_qc", + batch = "batch_id", + plate = "plate_id", + title_ref = "Before normalisation", + title_comp = "After normalisation" +) +``` + +If you want a simpler visual comparison first, you can start with a +single feature and then expand to several features once you understand +the overall behaviour. + +```{r eval=FALSE} +OmicsProcessing:::plot_omics_distributions( + df = df_imputed, + df_comp = df_normalised, + target_cols = "F1", + run_order = "run_ord", + is_qc = "is_qc", + batch = "batch_id", + plate = "plate_id", + title_ref = "Before normalisation", + title_comp = "After normalisation" +) +``` + +```{r echo=FALSE, out.width='100%', fig.align='center'} +knitr::include_graphics("before_after.png") +``` + +In this comparison mode, the left panel usually represents the original +data and the right panel the corrected data. You can then judge whether +the normalisation has reduced technical drift while keeping the overall +signal structure plausible. + +This can also plot multiple columns and compare them like so: + +```{r eval=FALSE} +target_features <- c("F1", "F2", "F3") + +OmicsProcessing:::plot_omics_distributions( + df = df_imputed, + df_comp = df_normalised, + target_cols = target_features, + run_order = "run_ord", + is_qc = "is_qc", + batch = "batch_id", + plate = "plate_id", + title_ref = "Before normalisation", + title_comp = "After normalisation" +) +``` + +```{r echo=FALSE, out.width='100%', fig.align='center'} +knitr::include_graphics("before_after_multi.png") +``` + +## Interpretation Tips + +- A smooth trend in QC samples across run order often indicates + technical drift. +- Sudden jumps at batch boundaries suggest batch effects. +- Plate-specific colour clusters may indicate plate effects. +- After successful normalisation, QC samples often look more stable + across the run. +- If the corrected plot looks overly distorted or compressed, inspect + the normalisation settings before proceeding. + +If you are using SERRF for correction, see the related vignette: +[Batch correction using SERRF](../articles/serrf-normalisation.html). + +## Reproducing The Example Data + +The example data set can be regenerated from the package source. + +[Function reference for `build_omics_synthetic()`](../reference/build_omics_synthetic.html) + +```{r eval=FALSE} +source("data-raw/omics_synthetic.R") +``` + +If you want to inspect or modify the generator first, use: + +[Function reference for `build_omics_synthetic()`](../reference/build_omics_synthetic.html) + +```{r eval=FALSE} +generated <- build_omics_synthetic(seed = 1) + +str(generated$omics_synthetic) +head(generated$jump_info) +``` + +For most users, the packaged `omics_synthetic` data will be enough for +learning and testing. The generator is mainly useful if you want to +change the simulation settings or create additional benchmark examples.