IARCBiostat · afarnudi · Apr 7, 2026 · Mar 16, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: OmicsProcessing
 Type: Package
 Title: OmicsProcessing
-Version: 1.1.4
+Version: 1.2.0
 Date: 2025-03-03
 Authors@R: c(
     person("Vivian", "Viallon", role = c("aut")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export("%>%")
 export(FUNnormalization_residualMixedModels)
+export(build_omics_synthetic)
 export(calculate_ICC)
 export(calculate_ICC_parallel)
 export(check_dataframe_validity)

diff --git a/R/build_omics_synthetic.R b/R/build_omics_synthetic.R
@@ -0,0 +1,139 @@
+#' Build synthetic omics data for plotting and normalisation examples
+#'
+#' Create a synthetic omics data set with feature intensities, run-order
+#' drift, batch structure, plate structure, and QC sample labels. This is the
+#' generator used for the packaged `omics_synthetic` example data. Note that
+#' it calls `set.seed(seed)` and therefore resets the session's RNG state.
+#'
+#' @param seed Integer random seed.
+#' @param n Number of rows (samples); a whole number of at least 3.
+#' @param n_batch Number of batches.
+#' @param n_plate_per_batch Number of plates per batch.
+#' @param n_features Number of feature columns to generate.
+#' @param qc_frac Fraction of rows labelled as QC, between 0 and 1.
+#'
+#' @return A list with three elements:
+#' \describe{
+#'   \item{omics_synthetic}{A synthetic omics data frame.}
+#'   \item{jump_info}{A data frame describing the simulated step changes per feature.}
+#'   \item{parameters}{A list of generation parameters and sampled values.}
+#' }
+#'
+#' @examples
+#' generated <- build_omics_synthetic(seed = 1)
+#' str(generated$omics_synthetic)
+#' head(generated$jump_info)
+#'
+#' @export
+build_omics_synthetic <- function(
+  seed = 1,
+  n = 2000,
+  n_batch = 2,
+  n_plate_per_batch = 2,
+  n_features = 10,
+  qc_frac = 0.05
+) {
+  check_count <- function(x, name, min = 1) {
+    ok <- is.numeric(x) && length(x) == 1L && is.finite(x) &&
+      x >= min && x == round(x)
+    if (!ok) stop("`", name, "` must be a whole number >= ", min, call. = FALSE)
+  }
+  check_count(n, "n", min = 3)
+  check_count(n_batch, "n_batch")
+  check_count(n_plate_per_batch, "n_plate_per_batch")
+  check_count(n_features, "n_features")
+  if (!is.numeric(qc_frac) || length(qc_frac) != 1L || is.na(qc_frac) ||
+    qc_frac < 0 || qc_frac > 1) {
+    stop("`qc_frac` must be a single number between 0 and 1", call. = FALSE)
+  }
+
+  set.seed(seed)
+
+  feature_cols <- paste0("F", seq_len(n_features))
+  feature_means <- stats::runif(n_features, min = 100, max = 700)
+  feature_sds <- stats::runif(n_features, min = 3, max = 30)
+
+  feature_matrix <- vapply(
+    seq_len(n_features),
+    FUN.VALUE = numeric(n),
+    FUN = function(j) {
+      stats::rnorm(n, mean = feature_means[j], sd = feature_sds[j])
+    }
+  )
+  colnames(feature_matrix) <- feature_cols
+
+  # Fix the factor levels so a plate that is never drawn (small `n`) keeps its
+  # own code; batch_id below relies on the integer codes being plate numbers.
+  n_plates <- n_plate_per_batch * n_batch
+  omics_synthetic <- data.frame(
+    plate_id = factor(
+      sample.int(n_plates, n, replace = TRUE),
+      levels = seq_len(n_plates)
+    ),
+    feature_matrix,
+    check.names = FALSE
+  )
+
+  omics_synthetic <- omics_synthetic[order(omics_synthetic$plate_id), , drop = FALSE]
+  rownames(omics_synthetic) <- NULL
+
+  omics_synthetic$run_ord <- seq_len(n)
+  omics_synthetic$batch_id <-
+    (as.integer(omics_synthetic$plate_id) - 1L) %/% n_plate_per_batch + 1L
+
+  qc_idx <- sample.int(n, size = ceiling(qc_frac * n), replace = FALSE)
+  omics_synthetic$is_qc <- seq_len(n) %in% qc_idx
+
+  drift_slopes <- stats::runif(n_features, min = 0.01, max = 0.1)
+  for (j in seq_len(n_features)) {
+    omics_synthetic[[feature_cols[j]]] <-
+      omics_synthetic[[feature_cols[j]]] - drift_slopes[j] * omics_synthetic$run_ord
+  }
+
+  # Jump points stay `margin` runs away from both ends (100 when n >= 201); the
+  # margin shrinks for short runs so points lie in 1..n with >= 2 candidates.
+  margin <- min(100, (n - 1) %/% 2)
+  jump_candidates <- margin:(n - margin)
+  jump_info <- vector("list", length = n_features)
+  names(jump_info) <- feature_cols
+
+  for (j in seq_len(n_features)) {
+    n_jumps <- sample(1:2, size = 1)
+    # sample.int() on positions: sample() would read a length-one vector as 1:x.
+    picked <- sample.int(length(jump_candidates), size = n_jumps)
+    jump_points <- sort(jump_candidates[picked])
+    jump_sizes <- stats::runif(n_jumps, min = 0.5, max = 1.5) * feature_sds[j] * 2
+
+    for (k in seq_len(n_jumps)) {
+      idx <- omics_synthetic$run_ord >= jump_points[k]
+      omics_synthetic[[feature_cols[j]]][idx] <-
+        omics_synthetic[[feature_cols[j]]][idx] + jump_sizes[k]
+    }
+
+    jump_info[[j]] <- data.frame(
+      feature = feature_cols[j],
+      jump_id = seq_len(n_jumps),
+      jump_point = jump_points,
+      jump_size = jump_sizes
+    )
+  }
+
+  jump_info <- do.call(rbind, jump_info)
+  rownames(jump_info) <- NULL
+
+  list(
+    omics_synthetic = omics_synthetic,
+    jump_info = jump_info,
+    parameters = list(
+      seed = seed,
+      n = n,
+      n_batch = n_batch,
+      n_plate_per_batch = n_plate_per_batch,
+      n_features = n_features,
+      qc_frac = qc_frac,
+      feature_means = feature_means,
+      feature_sds = feature_sds,
+      drift_slopes = drift_slopes
+    )
+  )
+}
diff --git a/R/data.R b/R/data.R
@@ -37,3 +37,23 @@
 #' }
 #' @usage data(data_meta_samples)
 "data_meta_samples"
+
+#' Synthetic omics dataset for plotting and normalisation examples
+#'
+#' A synthetic omics dataset with run order, batch, plate, QC flags, and ten
+#' feature columns, generated by \code{\link{build_omics_synthetic}} for
+#' examples and vignettes on plotting and normalisation workflows. The
+#' reproducible generation script is `data-raw/omics_synthetic.R` in the source
+#' repository; the installed package exposes it via `system.file("scripts",
+#' "omics_synthetic.R", package = "OmicsProcessing")`.
+#'
+#' @format A data frame with 2000 rows and 14 variables:
+#' \describe{
+#'   \item{plate_id}{Plate identifier, a factor with 4 levels.}
+#'   \item{F1-F10}{Numeric feature intensity columns.}
+#'   \item{run_ord}{Integer run order.}
+#'   \item{batch_id}{Batch identifier.}
+#'   \item{is_qc}{Logical indicator for QC samples.}
+#' }
+#' @usage data(omics_synthetic)
+"omics_synthetic"
diff --git a/R/normalization_SERRF.R b/R/normalization_SERRF.R
@@ -75,9 +75,14 @@ normalise_SERRF <- function(df, target_cols = NULL, is_qc = NULL, strata_col = N
         corrs_target[[batch]] <- cor(target, method = "spearman")
     }
 
-    # ---- Normalize Each Feature ----
+    # ---- Normalize Each Feature ----
+    n_target_cols <- length(target_cols)
+    progress_step <- max(1, floor(n_target_cols / 10))
     for (j in seq_along(target_cols)) {
-        if (j %% 250 == 1) cat(j, "out of ", length(target_cols), " features normalised\n")
+        # Report about every 10%; (j - 1) %% step still fires when step is 1, where j %% 1 == 1 never holds.
+        if ((j - 1) %% progress_step == 0 || j == n_target_cols) {
+            cat(sprintf("%d out of %d features normalised (%.0f%%)\n", j, n_target_cols, 100 * j / n_target_cols))
+        }
         feature_name <- target_cols[j]
 
         for (batch in batch_levels) {