spang-lab
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 0 deletions b/‎.gitignore‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 21 additions & 0 deletions b/‎DESCRIPTION‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 7 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎R/build_buckets.R‎
Lines changed: 48 additions & 0 deletions b/‎R/build_buckets.R‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎R/check_inupts.R‎
Lines changed: 181 additions & 0 deletions b/‎R/check_inupts.R‎
Lines changed: 181 additions & 0 deletions
diff --git a/‎R/correlation_c.R‎
Lines changed: 71 additions & 0 deletions b/‎R/correlation_c.R‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎R/data.R‎
Lines changed: 17 additions & 0 deletions b/‎R/data.R‎
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
@@ -0,0 +1,21 @@
+Package: harp
+Type: Package
+Title: Estimates reference profiles and cellular abundance for deconvolution of bulk transcriptomic data
+Version: 0.1.0
+Authors@R: c(
+    person(given = "Zahra", family = "Nozari", role = c("aut", "cre"), email = "[email protected]"),
+    person(given = "Paul", family = "Hüttl", role = c("ctb"), email = "[email protected]"))
+Description: HARP is a computational method that improves tissue deconvolution accuracy by addressing both biological and technical inconsistencies in the data. The method harmonizes discrepancies between experimentally measured tissue compositions, deconvolution results, and reconstructed bulk profiles. This process leads to more reliable cell type proportion estimates for bulk tissue samples. 
+License: will be determined
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.3.2
+Depends: 
+    stats,
+    ggplot2,
+    reshape2,
+    stringr,
+    DTD,
+    scales,
+    matlib,
+    R (>= 3.5)
@@ -0,0 +1,7 @@
+# Generated by roxygen2: do not edit by hand
+
+export(estimated_c_correlation)
+export(harp_deconvolution_model)
+export(harp_pipeline)
+import(DTD)
+importFrom(DTD,estimate_c)
@@ -0,0 +1,48 @@
+#' build_buckets
+#'
+#' @description Builds the buckets for n-fold cross validation: every column
+#' (sample) of the training data is assigned at random to one of `n_folds`
+#' buckets, with bucket sizes differing by at most one.
+#'
+#' @param train_matrix numeric matrix, training data (one sample per column)
+#' @param n_folds integer, number of buckets to build; must be at least 2 and
+#'   small enough that every bucket holds at least 3 samples
+#' @return integer vector with one bucket index per column of `train_matrix`,
+#'   named by `colnames(train_matrix)`
+#' @noRd
+build_buckets <- function(train_matrix, n_folds = 5) {
+  if (!is.matrix(train_matrix)) {
+    stop("train_matrix must be a matrix")
+  }
+
+  n_cols <- ncol(train_matrix)
+
+  # Only the type and the lower bound are validated here. The upper bound is
+  # checked separately below so that the user gets the explanatory message
+  # instead of a generic "is above maximal value" error.
+  test_number(
+    value = n_folds,
+    validation.source = c("build_buckets", "n_folds"),
+    min = 2, # minimum 2 folds for cross-validation
+    max = Inf,
+    integer_only = TRUE # n_folds must be an integer
+  )
+
+  # Check if it's possible to have each bucket with at least 3 samples
+  if (n_cols < n_folds * 3) {
+    stop("Number of columns in train_matrix is too small to ensure a minimum of 3 samples per bucket.
+    You can either increase the sample size of the data or use a smaller number of folds")
+  }
+
+  # Every fold gets the same base number of columns ...
+  columns_per_fold <- n_cols %/% n_folds
+  extra_columns <- n_cols %% n_folds
+  fold_ids <- seq_len(n_folds)
+  fold_indices <- rep(fold_ids, each = columns_per_fold)
+
+  # ... and the remainder is spread over distinct, randomly chosen folds
+  # (extra_columns < n_folds, so sampling without replacement is safe).
+  if (extra_columns > 0) {
+    fold_indices <- c(fold_indices, sample(fold_ids, extra_columns))
+  }
+
+  # Shuffle so that fold membership does not follow the column order
+  index_buckets <- sample(fold_indices)
+  names(index_buckets) <- colnames(train_matrix)
+
+  index_buckets
+}
@@ -0,0 +1,181 @@
+#' check_input_data
+#'
+#' Safety check for functions in harp.
+#' This function checks the input data for correctness, ensuring that the matrices for the cell counts (`C`),
+#' the bulk expression profile (`Y`), and the cell reference profile (`X_sc`) fulfill the required structural
+#' and naming standards. It selects the mutual genes of `Y` and `X_sc` and reorders `Y` and `X_sc`
+#' so that their samples and cell types follow the order used in `C`.
+#' @param C  matrix of cell counts, with row names as cell types and column names as samples.
+#' @param Y  matrix representing the bulk expression profile, with row names as genes and column names as samples.
+#' @param X_sc  matrix representing the reference profiles of cell types, with row names as genes and column names as cell types.
+#'
+#' @return list with elements `C`, `Y` and `X_sc`; `Y` and `X_sc` are restricted to the mutual genes and
+#'   reordered to match `C`, and are always returned as matrices
+#' @noRd
+check_input_data <- function(C, Y, X_sc) {
+    message("Starting validation of the input data...")
+
+    # Check if C is a matrix
+    if (!is.matrix(C)) {
+        stop("In check_input: C (cell counts) is not a matrix")
+    }
+
+    # Check if Y is a matrix
+    if (!is.matrix(Y)) {
+        stop("In check_input: Y (bulk expression) is not a matrix")
+    }
+
+    # Check if X_sc is a matrix
+    if (!is.matrix(X_sc)) {
+        stop("In check_input: X_sc (cell reference profile) is not a matrix")
+    }
+
+    # All matching below is name based. Without dimnames the `%in%` checks pass
+    # vacuously and the reordering would silently return empty matrices.
+    has_dimnames <- vapply(
+        list(C, Y, X_sc),
+        function(m) !is.null(rownames(m)) && !is.null(colnames(m)),
+        logical(1)
+    )
+    if (!all(has_dimnames)) {
+        stop("In check_input: C, Y and X_sc must all have row and column names")
+    }
+
+    # Sample names of Y and C must be the same set
+    if (!all(colnames(Y) %in% colnames(C)) || !all(colnames(C) %in% colnames(Y))) {
+        stop("In check_input: sample names in Y (bulk expression) and C (cell counts)  must match exactly")
+    }
+
+    # Cell type names of C and X_sc must be the same set
+    if (!all(rownames(C) %in% colnames(X_sc)) || !all(colnames(X_sc) %in% rownames(C))) {
+        stop("In check_input: cell type names in C (cell counts)  and X_sc (cell reference profile) must match exactly")
+    }
+
+    # Genes are dropped from whichever matrix has extra ones, so report a
+    # mismatch in either direction
+    if (!setequal(rownames(Y), rownames(X_sc))) {
+        message("Genes in  cell reference profile and bulk expression profile do not completely match; mutual genes will be selected.")
+    }
+
+    # Select mutual genes
+    gene <- intersect(rownames(X_sc), rownames(Y))
+
+    # Stop if no mutual genes are found
+    if (length(gene) == 0) {
+        stop("In check_input: No mutual genes found between reference profile and bulk expression profile")
+    }
+
+    # Produce a warning if mutual genes are fewer than 500
+    if (length(gene) < 500) {
+        warning("In check_input: Fewer than 500 mutual genes found between reference profile and bulk profile")
+    }
+
+    # Reorder samples in Y and cell types in X_sc to match the order in C.
+    # drop = FALSE keeps the results matrices even when only a single gene,
+    # sample or cell type remains.
+    Y <- Y[gene, colnames(C), drop = FALSE]
+    X_sc <- X_sc[gene, rownames(C), drop = FALSE]
+    message("Input data validation complete.")
+
+    list(C = C, Y = Y, X_sc = X_sc)
+}
+
+#' test_number
+#'
+#' Safety check for functions in harp: validates that `value` is a single,
+#' non-missing number within `[min, max]`.
+#'
+#' @param value value to be tested for number properties
+#' @param validation.source vector of length 2: [1] the calling function's name, [2] the parameter name being tested
+#' @param min minimum allowed value (inclusive)
+#' @param max maximum allowed value (inclusive)
+#' @param integer_only logical, if TRUE `value` must be a whole number, if FALSE any non-negative number is allowed
+#'
+#' @return TRUE, if no error is detected, stops with error otherwise
+#' @noRd
+test_number <- function(value,
+                        validation.source,
+                        min,
+                        max,
+                        integer_only = TRUE) {
+    error.message <- paste0("In ", validation.source[1], ": ", "'", validation.source[2], "'")
+
+    if (!is.numeric(value) || length(value) != 1) {
+        stop(error.message, " is not a single numeric value", call. = FALSE)
+    }
+
+    # NA/NaN is numeric, but every comparison below would evaluate to NA and
+    # abort with "missing value where TRUE/FALSE needed"; fail clearly instead
+    if (is.na(value)) {
+        stop(error.message, " must not be NA", call. = FALSE)
+    }
+
+    # for non-integer case, ensure value is >= 0
+    if (!integer_only && value < 0) {
+        stop(error.message, " must be non-negative", call. = FALSE)
+    }
+
+    # check for integer if required (whole-number doubles such as 5 are accepted)
+    if (integer_only && round(value) != value) {
+        stop(error.message, " is not an integer", call. = FALSE)
+    }
+
+    if (value < min) {
+        stop(error.message, " is below minimal value", call. = FALSE)
+    }
+
+    if (value > max) {
+        stop(error.message, " is above maximal value", call. = FALSE)
+    }
+
+    TRUE
+}
+
+#' validate lambda sequence
+#'
+#' Safety check for functions in harp.
+#' @param lambda_seq single non-negative number or sequence of non-negative numbers (at least two values recommended)
+#' @param caller_function_name name of the calling function for error messages
+#' @param allow_single logical; if TRUE, lambda_seq can be a single non-negative number.
+#'
+#'
+#' @return TRUE if validation passes, stops with error otherwise
+#' @noRd
+validate_lambda_seq <- function(lambda_seq, caller_function_name, allow_single = TRUE) {
+    # check if caller_function_name is provided
+    if (missing(caller_function_name)) {
+        stop("caller_function_name must be provided", call. = FALSE)
+    }
+    prefix <- paste0("In ", caller_function_name, ": ")
+
+    # check if input is numeric vector
+    if (!is.numeric(lambda_seq)) {
+        stop(prefix, "lambda_seq must be numeric", call. = FALSE)
+    }
+
+    # NA would turn the sign checks below into `if (NA)` and abort with an
+    # unrelated "missing value where TRUE/FALSE needed" error
+    if (any(is.na(lambda_seq))) {
+        stop(prefix, "lambda_seq must not contain NA values", call. = FALSE)
+    }
+
+    n_values <- length(lambda_seq)
+
+    # an empty vector is neither a single value nor a sequence
+    if (n_values == 0) {
+        stop(prefix, "lambda_seq must contain at least two values (more values recommended)", call. = FALSE)
+    }
+
+    # for single value
+    if (n_values == 1) {
+        if (!allow_single) {
+            stop(prefix, "lambda_seq cannot be a single value when allow_single is FALSE", call. = FALSE)
+        }
+        test_number(
+            value = lambda_seq,
+            validation.source = c(caller_function_name, "lambda_seq"),
+            min = 0,
+            max = Inf,
+            integer_only = FALSE # allow any non-negative number
+        )
+        return(TRUE)
+    }
+
+    # for sequence: check if any values are negative
+    if (any(lambda_seq < 0)) {
+        stop(prefix, "all values in lambda_seq must be non-negative", call. = FALSE)
+    }
+
+    TRUE
+}
+
+#' check_logical
+#'
+#' Safety check for functions in harp: validates that `value` is a single
+#' `TRUE` or `FALSE`.
+#' @param value value to be tested for being a single, non-missing logical
+#' @param validation.source vector of length 2: [1] the calling function's name, [2] the parameter name being tested
+#'
+#' @return TRUE, or it throws an error
+#' @noRd
+check_logical <- function(value,
+                          validation.source) {
+    error.message <- paste0("In ", validation.source[1], ": ", "'", validation.source[2], "'")
+
+    # NA is of type logical with length 1, so it has to be rejected explicitly;
+    # `||` short-circuits, so is.na() only ever sees a length-one logical
+    if (!is.logical(value) || length(value) != 1 || is.na(value)) {
+        stop(error.message, " must be a single value, either 'TRUE' or 'FALSE' ", call. = FALSE)
+    }
+
+    TRUE
+}
@@ -0,0 +1,71 @@
+#' Calculate correlations between estimated and true cellular compositions
+#'
+#' Calculates row-wise correlations between two matrices (estimated and true
+#' cell proportions). Both matrices must have identical dimensions, row names
+#' and column names.
+#' @param ... Additional arguments passed to stats::cor(). `use` is already
+#'   set to `"complete.obs"` and must not be supplied again.
+#' @param estimated_c a matrix of estimated cellular compositions. Because it
+#'   follows `...` in the signature it has to be passed by name.
+#' @param true_c a matrix of true cellular compositions with the same
+#'   dimensions as estimated_c. Has to be passed by name as well.
+#'
+#' @return named numeric vector containing one correlation coefficient per
+#'   row. Rows whose true values have zero standard deviation yield 0; rows
+#'   that are entirely `NA` in either matrix, or whose true values have an
+#'   undefined standard deviation, yield `NA`. If the computation fails, a
+#'   warning is issued and `NA` is returned for every row.
+#' @export
+#' @examples
+#' # create example matrices
+#' estimated <- matrix(rnorm(20), nrow = 4)
+#' true <- matrix(rnorm(20), nrow = 4)
+#' rownames(estimated) <- paste0("cell", 1:4)
+#' rownames(true) <- paste0("cell", 1:4)
+#' colnames(estimated) <- paste0("sample", 1:5)
+#' colnames(true) <- paste0("sample", 1:5)
+#'
+#' # calculate correlations
+#' results <- estimated_c_correlation(estimated_c = estimated, true_c = true)
+estimated_c_correlation <- function(..., estimated_c, true_c) {
+  # Input validation
+  if (!is.matrix(estimated_c) || !is.matrix(true_c)) {
+    stop("Both estimated_c and true_c must be matrices")
+  }
+  if (!identical(dim(estimated_c), dim(true_c))) {
+    stop("Dimensions of estimated_c and true_c must match")
+  }
+
+  # check row names
+  if (is.null(rownames(estimated_c)) || is.null(rownames(true_c))) {
+    stop("Both matrices must have row names")
+  }
+  if (!identical(rownames(estimated_c), rownames(true_c))) {
+    stop("Row names of estimated_c and true_c must match exactly")
+  }
+
+  # check column names
+  if (is.null(colnames(estimated_c)) || is.null(colnames(true_c))) {
+    stop("Both matrices must have column names")
+  }
+  if (!identical(colnames(estimated_c), colnames(true_c))) {
+    stop("Column names of estimated_c and true_c must match exactly")
+  }
+
+  n_rows <- nrow(estimated_c)
+  row_names <- rownames(estimated_c)
+
+  # The value of tryCatch() must be assigned: an assignment made inside the
+  # tryCatch expression is lost when an error unwinds it, and the handler's
+  # return value is only the value of the tryCatch() call itself.
+  correlation <- tryCatch({
+    # standard deviation of the true values per row; NA when fewer than two
+    # non-missing values are available
+    sd_values <- apply(true_c, 1, stats::sd, na.rm = TRUE)
+
+    row_correlations <- vapply(seq_len(n_rows), function(i) {
+      est_row <- estimated_c[i, ]
+      true_row <- true_c[i, ]
+
+      # missing-data checks come first so that `sd == 0` is never evaluated
+      # on an NA standard deviation
+      if (all(is.na(est_row)) || all(is.na(true_row))) return(NA_real_)
+      if (is.na(sd_values[i])) return(NA_real_)
+      if (sd_values[i] == 0) return(0)
+
+      stats::cor(est_row, true_row, use = "complete.obs", ...)
+    }, numeric(1))
+
+    stats::setNames(row_correlations, row_names)
+  }, error = function(e) {
+    warning("Error in correlation calculation: ", conditionMessage(e))
+    stats::setNames(rep(NA_real_, n_rows), row_names)
+  })
+
+  correlation
+}
@@ -0,0 +1,17 @@
+#' Sample Deconvolution Data
+#'
+#' This dataset contains a cell type reference constructed from single cell data of 
+#' [The landscape of tumor cell states and ecosystems in diffuse large B cell lymphoma](https://www.cell.com/cancer-cell/fulltext/S1535-6108(21)00451-7)
+#' by [Steen et al., 2021].
+#' Furthermore it contains bulk data (split on the patient level into train and test) artificially constructed by averaging cell profiles of
+#' [Dissecting intratumour heterogeneity of nodal B-cell lymphomas at the transcriptional, genetic and drug-response levels](https://www.nature.com/articles/s41556-020-0532-x)
+#' by [Roider et al., 2020].
+#' Lastly, it contains ground truth cell proportions simulating FACS measurements that capture the abundance of cell types.
+#' @format A list containing three data frames:
+#' \describe{
+#'   \item{cell_reference_profile}{The anchor reference that is harmonized by harp}
+#'   \item{train_data}{The training data for harp, that means bulk counts and cell proportions}
+#'   \item{bulk_counts_test}{Independent test counts for the model to be validated on}
+#' }
+#' @source Deconvolution data of real world sc datasets to be used for a MWE for harp
+"harp_data"