Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: OmicsProcessing
Type: Package
Title: OmicsProcessing
Version: 1.1.4
Version: 1.2.0
Date: 2025-03-03
Authors@R: c(
person("Vivian", "Viallon", role = c("aut")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export("%>%")
export(FUNnormalization_residualMixedModels)
export(build_omics_synthetic)
export(calculate_ICC)
export(calculate_ICC_parallel)
export(check_dataframe_validity)
Expand Down
114 changes: 114 additions & 0 deletions R/build_omics_synthetic.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#' Build synthetic omics data for plotting and normalisation examples
#'
#' Create a synthetic omics data set with feature intensities, run-order
#' drift, batch structure, plate structure, and QC sample labels. This is the
#' generator used for the packaged `omics_synthetic` example data.
#'
#' The data are generated in the following order:
#' \enumerate{
#'   \item Each feature is drawn from a normal distribution whose mean is
#'     sampled uniformly from 100 to 700 and whose standard deviation is sampled
#'     uniformly from 3 to 30.
#'   \item Every row is assigned at random to one of
#'     `n_batch * n_plate_per_batch` plates; rows are then sorted by plate and
#'     `run_ord` is the resulting row position. Consecutive groups of
#'     `n_plate_per_batch` plates form a batch.
#'   \item `ceiling(qc_frac * n)` rows are flagged as QC at random.
#'   \item A feature-specific linear drift (slope sampled uniformly from 0.01
#'     to 0.1 per run) is subtracted from each feature.
#'   \item One or two upward step changes are added to each feature, each
#'     between one and three feature standard deviations in size.
#' }
#'
#' The function calls [set.seed()] and therefore resets the state of the
#' global random number generator.
#'
#' @param seed Integer random seed.
#' @param n Number of rows (samples). A single whole number of at least 1.
#' @param n_batch Number of batches. A single whole number of at least 1.
#' @param n_plate_per_batch Number of plates per batch. A single whole number
#'   of at least 1.
#' @param n_features Number of feature columns to generate. A single whole
#'   number of at least 1.
#' @param qc_frac Fraction of rows labelled as QC, between 0 and 1.
#'
#' @return A list with three elements:
#' \describe{
#'   \item{omics_synthetic}{A synthetic omics data frame with columns
#'     `plate_id` (factor with one level per plate), `F1`, ..., `F<n_features>`,
#'     `run_ord`, `batch_id` and `is_qc`.}
#'   \item{jump_info}{A data frame describing the simulated step changes per
#'     feature (`feature`, `jump_id`, `jump_point`, `jump_size`).}
#'   \item{parameters}{A list of generation parameters and sampled values.}
#' }
#'
#' @examples
#' generated <- build_omics_synthetic(seed = 1)
#' str(generated$omics_synthetic)
#' head(generated$jump_info)
#'
#' @export
build_omics_synthetic <- function(
  seed = 1,
  n = 2000,
  n_batch = 2,
  n_plate_per_batch = 2,
  n_features = 10,
  qc_frac = 0.05
) {
  # ---- Validate arguments ----
  is_count <- function(x) {
    is.numeric(x) && length(x) == 1L && is.finite(x) && x >= 1 &&
      x == floor(x)
  }
  count_args <- list(
    n = n,
    n_batch = n_batch,
    n_plate_per_batch = n_plate_per_batch,
    n_features = n_features
  )
  for (arg_name in names(count_args)) {
    if (!is_count(count_args[[arg_name]])) {
      stop(
        "`", arg_name, "` must be a single whole number >= 1.",
        call. = FALSE
      )
    }
  }
  if (!is.numeric(qc_frac) || length(qc_frac) != 1L || is.na(qc_frac) ||
    qc_frac < 0 || qc_frac > 1) {
    stop("`qc_frac` must be a single number between 0 and 1.", call. = FALSE)
  }

  set.seed(seed)

  # ---- Baseline feature intensities ----
  feature_cols <- paste0("F", seq_len(n_features))
  feature_means <- stats::runif(n_features, min = 100, max = 700)
  feature_sds <- stats::runif(n_features, min = 3, max = 30)

  feature_matrix <- vapply(
    seq_len(n_features),
    FUN.VALUE = numeric(n),
    FUN = function(j) {
      stats::rnorm(n, mean = feature_means[j], sd = feature_sds[j])
    }
  )
  # vapply() returns a plain vector when n == 1; force an n x n_features matrix.
  feature_matrix <- matrix(feature_matrix, nrow = n, ncol = n_features)
  colnames(feature_matrix) <- feature_cols

  # ---- Plate, run order and batch structure ----
  n_plates <- n_plate_per_batch * n_batch
  omics_synthetic <- data.frame(
    # Explicit levels keep the level codes equal to the plate numbers even
    # when a plate receives no samples, so the batch arithmetic below holds.
    plate_id = factor(
      sample.int(n_plates, n, replace = TRUE),
      levels = seq_len(n_plates)
    ),
    feature_matrix,
    check.names = FALSE
  )

  # Sorting by plate makes each plate (and batch) contiguous in run order.
  omics_synthetic <-
    omics_synthetic[order(omics_synthetic$plate_id), , drop = FALSE]
  rownames(omics_synthetic) <- NULL

  omics_synthetic$run_ord <- seq_len(nrow(omics_synthetic))
  omics_synthetic$batch_id <-
    (as.integer(omics_synthetic$plate_id) - 1L) %/% n_plate_per_batch + 1L

  # ---- QC labels ----
  n_qc <- ceiling(qc_frac * nrow(omics_synthetic))
  qc_idx <- sample.int(nrow(omics_synthetic), size = n_qc, replace = FALSE)
  omics_synthetic$is_qc <- FALSE
  omics_synthetic$is_qc[qc_idx] <- TRUE

  # ---- Linear run-order drift (downward) ----
  drift_slopes <- stats::runif(n_features, min = 0.01, max = 0.1)
  for (j in seq_len(n_features)) {
    omics_synthetic[[feature_cols[j]]] <-
      omics_synthetic[[feature_cols[j]]] -
      drift_slopes[j] * omics_synthetic$run_ord
  }

  # ---- Step changes ----
  # Keep jumps away from both ends of the run: 100 rows when the data are long
  # enough, otherwise as far as the data allow. For n >= 201 the candidate set
  # is exactly 100:(n - 100).
  margin <- min(100, max(1, (n - 1) %/% 2))
  jump_candidates <- margin:max(margin, n - margin)

  jump_info <- vector("list", length = n_features)
  names(jump_info) <- feature_cols

  for (j in seq_len(n_features)) {
    # Cannot place more distinct jumps than there are candidate positions.
    n_jumps <- min(sample(1:2, size = 1), length(jump_candidates))
    # Index via sample.int() so a single candidate is never misread by
    # sample() as "draw from 1:x".
    jump_points <- sort(
      jump_candidates[sample.int(length(jump_candidates), size = n_jumps)]
    )
    jump_sizes <-
      stats::runif(n_jumps, min = 0.5, max = 1.5) * feature_sds[j] * 2

    for (k in seq_len(n_jumps)) {
      idx <- omics_synthetic$run_ord >= jump_points[k]
      omics_synthetic[[feature_cols[j]]][idx] <-
        omics_synthetic[[feature_cols[j]]][idx] + jump_sizes[k]
    }

    jump_info[[j]] <- data.frame(
      feature = feature_cols[j],
      jump_id = seq_len(n_jumps),
      jump_point = jump_points,
      jump_size = jump_sizes
    )
  }

  jump_info <- do.call(rbind, jump_info)
  rownames(jump_info) <- NULL

  list(
    omics_synthetic = omics_synthetic,
    jump_info = jump_info,
    parameters = list(
      seed = seed,
      n = n,
      n_batch = n_batch,
      n_plate_per_batch = n_plate_per_batch,
      n_features = n_features,
      qc_frac = qc_frac,
      feature_means = feature_means,
      feature_sds = feature_sds,
      drift_slopes = drift_slopes
    )
  )
}
20 changes: 20 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,23 @@
#' }
#' @usage data(data_meta_samples)
"data_meta_samples"

#' Synthetic omics dataset for plotting and normalisation examples
#'
#' A synthetic omics dataset with run order, batch, plate, QC flags, and ten
#' feature columns. It is intended for examples and vignettes demonstrating
#' plotting and normalisation workflows. The data are produced by
#' `build_omics_synthetic()`, which simulates feature intensities with
#' run-order drift and step changes. Reproducible data-generation code is
#' available in `data-raw/omics_synthetic.R` in the source repository and in
#' the installed package via `system.file("scripts", "omics_synthetic.R",
#' package = "OmicsProcessing")`.
#'
#' @format A data frame with 2000 rows and 14 variables:
#' \describe{
#'   \item{plate_id}{Plate identifier, a factor with 4 levels.}
#'   \item{F1-F10}{Numeric feature intensity columns.}
#'   \item{run_ord}{Integer run order; rows are sorted by plate, so run order
#'     increases with the row position.}
#'   \item{batch_id}{Batch identifier; consecutive plates are grouped into
#'     batches.}
#'   \item{is_qc}{Logical indicator for QC samples.}
#' }
#' @usage data(omics_synthetic)
"omics_synthetic"
9 changes: 7 additions & 2 deletions R/normalization_SERRF.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,14 @@ normalise_SERRF <- function(df, target_cols = NULL, is_qc = NULL, strata_col = N
corrs_target[[batch]] <- cor(target, method = "spearman")
}

# ---- Normalize Each Feature ----
n <- length(target_cols)
step <- max(1, floor(n / 10))

for (j in seq_along(target_cols)) {
if (j %% 250 == 1) cat(j, "out of ", length(target_cols), " features normalised\n")
if (j %% step == 1 || j == n) {
pct <- round(100 * j / n)
cat(j, "out of", n, "features normalised (", pct, "%)\n")
}
feature_name <- target_cols[j]

for (batch in batch_levels) {
Expand Down
Loading
Loading