Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: OmicsProcessing
Type: Package
Title: OmicsProcessing
Version: 1.1.3
Version: 1.1.4
Date: 2025-03-03
Authors@R: c(
person("Vivian", "Viallon", role = c("aut")),
Expand Down
50 changes: 36 additions & 14 deletions R/feature_clustering.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
#' hierarchical clustering, then selects a representative feature for each
#' cluster either via scores or via correlation-based aggregation.
#'
#' QC rows (as indicated by \code{is_qc}) are excluded from all clustering and
#' summarisation steps, and are not present in the returned \code{clustered_df}.
#' Additionally, all columns in \code{df} that are not part of \code{target_cols}
#' are prepended to the returned \code{clustered_df} unchanged.
#'
#' @param df
#' A data.frame (or tibble) with feature data in columns and samples in
#' rows.
Expand All @@ -13,8 +18,8 @@
#' @param is_qc
#' Logical vector marking QC samples. Must have length equal to
#' \code{nrow(df)}. If \code{NULL}, all samples are treated as non-QC.
#' (Currently only used for consistency with other interfaces; it is not
#' used directly in this function.)
#' QC rows are excluded from clustering/summarisation and from the returned
#' \code{clustered_df}.
#' @param rt_height
#' Numeric height at which to cut the dendrogram in retention-time
#' space. Smaller values create more clusters.
Expand All @@ -27,9 +32,9 @@
#' @param scores
#' Named numeric vector of feature scores, required when
#' \code{method = "scores"}. Names must coincide with \code{target_cols}.
#' For each retention-time cluster, the feature are subclusterd based on the
#' input correlation threshold. The feature with the highest score is selected
#' as the representative of each subcluster.
#' For each retention-time cluster, the features are subclustered based on
#' the input correlation threshold. The feature with the highest score is
#' selected as the representative of each subcluster.
#' @param cut_height
#' Numeric height used inside \code{cluster_features_based_on_correlations}
#' when \code{method = "correlations"}. Passed on to that function's
Expand All @@ -44,9 +49,12 @@
#'
#' \describe{
#' \item{\code{clustered_df}}{
#' A data.frame with one column per final representative feature.
#' When \code{method = "scores"}, this consists of a subset of the
#' original feature columns. When \code{method = "correlations"}, it
#' A data.frame where rows correspond to non-QC samples only.
#' All non-target columns (i.e. \code{setdiff(names(df), target_cols)})
#' are included unchanged, followed by one column per final
#' representative feature.
#' When \code{method = "scores"}, representative features are a subset of
#' the original feature columns. When \code{method = "correlations"}, it
#' may contain synthetic features created by
#' \code{cluster_features_based_on_correlations()}.
#' }
Expand Down Expand Up @@ -90,6 +98,7 @@
#' @examples
#' \dontrun{
#' df <- data.frame(
#' batch = rep(1, 20),
#' `100@150` = rnorm(20),
#' `100@151` = rnorm(20),
#' `101@200` = rnorm(20)
Expand Down Expand Up @@ -144,7 +153,11 @@ cluster_features_by_retention_time <- function(
if (length(is_qc) != nrow(df)) {
stop("`is_qc` must be a logical vector with the same length as nrow(df).")
}
target_cols <- resolve_target_cols(df, target_cols)

# 1) Exclude QC rows for all downstream computations + output
df_use <- df[!is_qc, , drop = FALSE]

target_cols <- resolve_target_cols(df_use, target_cols)

if (method == "scores") {
if (is.null(names(scores))) {
Expand Down Expand Up @@ -176,26 +189,25 @@ cluster_features_by_retention_time <- function(
}

retention_times <- parse_mass_rt(target_cols, "rt")

cluster_ids <- cluster_hierarchical(retention_times, height = rt_height)

cluster_feature_list <- split(as.character(target_cols), cluster_ids)
names(cluster_feature_list) <- sprintf("ClustRT%s", names(cluster_feature_list))


if (method == "scores") {
representatives <- get_features_representatives_based_on_scores(
df,
df_use,
cluster_feature_list,
corr_thresh = corr_thresh,
scores
)
clustered_df <- df[, representatives$representatives]
clustered_df <- df_use[, representatives$representatives, drop = FALSE]
representatives_map <- representatives$representatives_map
}

if (method == "correlations") {
results <- cluster_features_based_on_correlations(
df,
df_use,
cluster_feature_list,
cut_height = cut_height,
corr_thresh = corr_thresh
Expand All @@ -204,6 +216,15 @@ cluster_features_by_retention_time <- function(
representatives_map <- results$representatives_map
}

# 2) Attach all non-target columns back onto clustered_df
other_cols <- setdiff(names(df_use), target_cols)
if (length(other_cols) > 0L) {
clustered_df <- cbind(
df_use[, other_cols, drop = FALSE],
clustered_df
)
}

return(
list(
clustered_df = clustered_df,
Expand All @@ -213,6 +234,7 @@ cluster_features_by_retention_time <- function(
}



#' General hierarchical clustering
#'
#' Performs hierarchical clustering on a numeric vector, matrix or
Expand Down
25 changes: 17 additions & 8 deletions man/cluster_features_by_retention_time.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.