Skip to content

Commit d3fa161

Browse files
committed
initial commit
1 parent bd42558 commit d3fa161

21 files changed

+1493
-0
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
.Rproj.user
2+
.Rhistory
3+
.RData
4+
.Ruserdata

DESCRIPTION

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
Package: harp
2+
Type: Package
3+
Title: Estimates reference profiles and cellular abundance for deconvolution of bulk transcriptomic data
4+
Version: 0.1.0
5+
Authors@R: c(
6+
person(given = "Zahra", family = "Nozari", role = c("aut", "cre"), email = "[email protected]"),
7+
person(given = "Paul", family = "Hüttl", role = c("ctb"), email = "[email protected]"))
8+
Description: HARP is a computational method that improves tissue deconvolution accuracy by addressing both biological and technical inconsistencies in the data. The method harmonizes discrepancies between experimentally measured tissue compositions, deconvolution results, and reconstructed bulk profiles. This process leads to more reliable cell type proportion estimates for bulk tissue samples.
9+
License: will be determined
10+
Encoding: UTF-8
11+
LazyData: true
12+
RoxygenNote: 7.3.2
13+
Depends:
14+
stats,
15+
ggplot2,
16+
reshape2,
17+
stringr,
18+
DTD,
19+
scales,
20+
matlib,
21+
R (>= 3.5)

NAMESPACE

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Generated by roxygen2: do not edit by hand
2+
3+
export(estimated_c_correlation)
4+
export(harp_deconvolution_model)
5+
export(harp_pipeline)
6+
import(DTD)
7+
importFrom(DTD,estimate_c)

R/build_buckets.R

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#' build_buckets
2+
#' @description this function makes different buckets for the n_fold cross validation function
3+
#' it takes the train data and put it into differenct buckets. the number of buckets depends on the n_fold number
4+
#'
5+
#' @param train.matrix numeric matrix, training data
6+
#' @param n_folds integer, number of buckets to build
7+
#' @return vector
8+
#' @noRd
9+
10+
build_buckets <- function(train_matrix, n_folds = 5) {
11+
if (!is.matrix(train_matrix)) {
12+
stop("train_matrix must be a matrix")
13+
}
14+
15+
n_cols <- ncol(train_matrix)
16+
17+
test_number(
18+
value = n_folds,
19+
validation.source = c("build_buckets", "n_folds"),
20+
min = 2, # minimum 2 folds for cross-validation
21+
max = floor(n_cols / 3), # maximum folds based on ensuring 3 samples per bucket
22+
integer_only = TRUE # n_folds must be an integer
23+
)
24+
# Check if it's possible to have each bucket with at least 3 samples
25+
if (n_cols < n_folds * 3) {
26+
stop("Number of columns in train_matrix is too small to ensure a minimum of 3 samples per bucket.
27+
You can either increase the sample size of the data or use a smaller number of folds")
28+
}
29+
# Calculate the number of columns per fold
30+
columns_per_fold <- n_cols %/% n_folds
31+
extra_columns <- n_cols %% n_folds
32+
33+
# Create the fold indices
34+
fold_indices <- rep(1:n_folds, each = columns_per_fold)
35+
36+
# Distribute the extra columns
37+
if (extra_columns > 0) {
38+
fold_indices <- c(fold_indices, sample(1:n_folds, extra_columns))
39+
}
40+
41+
# Shuffle the fold indices
42+
index_buckets <- sample(fold_indices)
43+
44+
# Assign the names
45+
names(index_buckets) <- colnames(train_matrix)
46+
47+
return(index_buckets)
48+
}

R/check_inupts.R

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
#' check_input_data
2+
#'
3+
#' saftey check for function in harp
4+
#' This function checks the input data for correctness, ensuring that the matrices for the reference profile (`C`),
5+
#' the bulk expression profile (`Y`), and the scaled expression matrix (`X_sc`) fullfill required structural
6+
#' and naming standards. It also verifies the presence of mutual genes between `Y` and `X_sc`, rearranges the data
7+
#' to align them properly, and confirms that sample and gene names are consistent across all matrices.
8+
#' @param C matrix representing the reference profile, with row names as cell types and column names as samples.
9+
#' @param Y matrix representing the bulk expression profile, with row names as genes and column names as samples.
10+
#' @param X_sc matrix representing the reference profiles of cell types, with row names as genes and column names as cell types.
11+
#'
12+
#' @return list containing the validated and reordered matrices
13+
#' @noRd
14+
check_input_data <- function(C, Y, X_sc) {
15+
message("Starting validation of the input data...")
16+
17+
# Check if C is a matrix
18+
if (!is.matrix(C)) {
19+
stop("In check_input: C (cell counts) is not a matrix")
20+
}
21+
22+
# Check if Y is a matrix
23+
if (!is.matrix(Y)) {
24+
stop("In check_input: Y (bulk expression) is not a matrix")
25+
}
26+
# Check if Y is a matrix
27+
if (!is.matrix(X_sc)) {
28+
stop("In check_input: X_sc (cell reference profile) is not a matrix")
29+
}
30+
31+
# Check if column names of Y are in column names of C
32+
if (!all(colnames(Y) %in% colnames(C)) || !all(colnames(C) %in% colnames(Y))) {
33+
stop("In check_input: sample names in Y (bulk expression) and C (cell counts) must match exactly")
34+
}
35+
36+
# Check if rownames of C match colnames of X_sc
37+
if (!all(rownames(C) %in% colnames(X_sc)) || !all(colnames(X_sc) %in% rownames(C))) {
38+
stop("In check_input: cell type names in C (cell counts) and X_sc (cell reference profile) must match exactly")
39+
}
40+
# Check if rownames of Y are in rownames of X_sc
41+
if (!any(all(rownames(Y) %in% rownames(X_sc)))) {
42+
message("Genes in cell reference profile and bulk expression profile do not completely match; mutual genes will be selected.")
43+
}
44+
45+
# Select mutual genes
46+
gene <- intersect(rownames(X_sc), rownames(Y))
47+
48+
# Stop if no mutual genes are found
49+
if (length(gene) == 0) {
50+
stop("In check_input: No mutual genes found between reference profile and bulk expression profile")
51+
}
52+
53+
# Produce a warning if mutual genes are fewer than 500
54+
if (length(gene) < 500) {
55+
warning("In check_input: Fewer than 500 mutual genes found between reference profile and bulk profile")
56+
}
57+
58+
# reorder samples in Y to match the order in c
59+
Y <- Y[gene, colnames(C)]
60+
# reorder cell types of X_sc to match the order in C
61+
X_sc <- X_sc[gene, rownames(C)]
62+
message("Input data validation complete.")
63+
64+
list(C = C, Y = Y, X_sc = X_sc)
65+
}
66+
67+
#' test_number
68+
#' saftey check for function in harp
69+
#'
70+
#' @param value value to be tested for number properties
71+
#' @param validation.source vector of length 2: [1] the calling function's name, [2] the parameter name being tested
72+
#' @param min minimum
73+
#' @param max maximum
74+
#' @param integer_only logical, if TRUE checks for integer, if FALSE allows any non-negative number
75+
#'
76+
#' @return TRUE, if no error is detected, stops with error otherwise
77+
#' @noRd
78+
test_number <- function(value,
79+
validation.source,
80+
min,
81+
max,
82+
integer_only = TRUE) {
83+
error.message <- paste0("In ", validation.source[1], ": ", "'", validation.source[2], "'")
84+
85+
if (!is.numeric(value) || length(value) != 1) {
86+
error.message <- paste0(error.message, " is not a single numeric value")
87+
stop(error.message, call. = FALSE)
88+
}
89+
90+
# for non-integer case, ensure value is >= 0
91+
if (!integer_only && value < 0) {
92+
error.message <- paste0(error.message, " must be non-negative")
93+
stop(error.message, call. = FALSE)
94+
}
95+
# check for integer if required
96+
if (integer_only && round(value) != value) {
97+
error.message <- paste0(error.message, " is not an integer")
98+
stop(error.message, call. = FALSE)
99+
}
100+
101+
if (value < min) {
102+
error.message <- paste0(error.message, " is below minimal value")
103+
stop(error.message, call. = FALSE)
104+
}
105+
106+
if (value > max) {
107+
error.message <- paste0(error.message, " is above maximal value")
108+
stop(error.message, call. = FALSE)
109+
}
110+
111+
return(TRUE)
112+
}
113+
114+
#' validate lambda sequence
115+
#'
116+
#' saftey check for function in harp
117+
#' @param lambda_seq single non-negative number or sequence of non-negative numbers (at least two values recommended)
118+
#' @param caller_function_name name of the calling function for error messages
119+
#' @param allow_single logical; if TRUE, lambda_seq can be a single non-negative number.
120+
#'
121+
#'
122+
#' @return TRUE if validation passes, stops with error otherwise
123+
#' @noRd
124+
validate_lambda_seq <- function(lambda_seq, caller_function_name, allow_single = TRUE) {
125+
# check if caller_function_name is provided
126+
if (missing(caller_function_name)) {
127+
stop("caller_function_name must be provided")
128+
}
129+
130+
# check if input is numeric vector
131+
if (!is.numeric(lambda_seq)) {
132+
stop(paste0("In ", caller_function_name, ": lambda_seq must be numeric"))
133+
}
134+
135+
# for single value
136+
if (length(lambda_seq) == 1) {
137+
if (!allow_single) {
138+
stop(paste0("In ", caller_function_name, ": lambda_seq cannot be a single value when allow_single is FALSE"))
139+
}
140+
test_number(
141+
value = lambda_seq,
142+
validation.source = c(caller_function_name, "lambda_seq"),
143+
min = 0,
144+
max = Inf,
145+
integer_only = FALSE # allow any non-negative number
146+
)
147+
}
148+
# for sequence
149+
else {
150+
# check if any values are negative
151+
if (any(lambda_seq < 0)) {
152+
stop(paste0("In ", caller_function_name, ": all values in lambda_seq must be non-negative"))
153+
}
154+
155+
# check if sequence has at least two values
156+
if (length(lambda_seq) < 2) {
157+
stop(paste0("In ", caller_function_name, ": lambda_seq must contain at least two values (more values recommended)"))
158+
}
159+
}
160+
161+
return(TRUE)
162+
}
163+
164+
#' check_logical
165+
#'
166+
#' saftey check for functions in harp
167+
#' @param value value to be tested for number properties
168+
#' @param validation.source vector of length 2: [1] the calling function's name, [2] the parameter name being tested
169+
#'
170+
#' @return TRUE, or it throws an error
171+
#' @noRd
172+
check_logical <- function(value,
173+
validation.source) {
174+
error.message <- paste0("In ", validation.source[1], ": ", "'", validation.source[2], "'")
175+
176+
if (any(!is.logical(value)) || length(value) != 1) {
177+
error.message <- paste0(error.message, " must be a single value, either 'TRUE' or 'FALSE' ")
178+
stop(error.message, call. = FALSE)
179+
}
180+
return(TRUE)
181+
}

R/correlation_c.R

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#' calculate Correlations Between Estimated and True cellular compositions
2+
#'
3+
#' calculates row-wise correlations between two matrices (estimated and true cell proportions)
4+
#' @param estimated_c a matrix estimated cellular composions
5+
#' @param true_c a matrix of true cellulat compositions with same dimensions as estimated_c
6+
#' @param ... Additional arguments passed to stats::cor()
7+
#'
8+
#' @return numeric vector containing correlation coefficients
9+
#' @export
10+
#' @examples
11+
#' # create example matrices
12+
#' estimated <- matrix(rnorm(20), nrow = 4)
13+
#' true <- matrix(rnorm(20), nrow = 4)
14+
#' rownames(estimated) <- paste0("cell", 1:4)
15+
#' rownames(true) <- paste0("cell", 1:4)
16+
#' colnames(estimated) <- paste0("sample", 1:5)
17+
#' colnames(true) <- paste0("sample", 1:5)
18+
#'
19+
#' # calculate correlations
20+
#' results <- estimated_c_correlation (estimated_c = estimated, true_c = true)
21+
estimated_c_correlation <- function(..., estimated_c, true_c) {
22+
# Input validation
23+
if (!is.matrix(estimated_c) || !is.matrix(true_c)) {
24+
stop("Both estimated_c and true_c must be matrices")
25+
}
26+
if (!identical(dim(estimated_c), dim(true_c))) {
27+
stop("Dimensions of estimated_c and true_c must match")
28+
}
29+
30+
# check row names
31+
if (is.null(rownames(estimated_c)) || is.null(rownames(true_c))) {
32+
stop("Both matrices must have row names")
33+
}
34+
if (!identical(rownames(estimated_c), rownames(true_c))) {
35+
stop("Row names of estimated_c and true_c must match exactly")
36+
}
37+
38+
# check column names
39+
if (is.null(colnames(estimated_c)) || is.null(colnames(true_c))) {
40+
stop("Both matrices must have column names")
41+
}
42+
if (!identical(colnames(estimated_c), colnames(true_c))) {
43+
stop("Column names of estimated_c and true_c must match exactly")
44+
}
45+
46+
# intiate result vector
47+
n_rows <- nrow(estimated_c)
48+
correlation <- numeric(n_rows)
49+
names(correlation) <- rownames(estimated_c)
50+
51+
52+
tryCatch({
53+
# calculate standard deviations
54+
sd_values <- apply(true_c, 1, stats::sd, na.rm = TRUE)
55+
56+
correlation <- sapply(seq_len(n_rows), function(i) {
57+
if (sd_values[i] == 0) return(0)
58+
if (all(is.na(estimated_c[i, ])) || all(is.na(true_c[i, ]))) return(NA)
59+
stats::cor(estimated_c[i, ], true_c[i, ], use = "complete.obs", ...)
60+
})
61+
62+
63+
names(correlation) <- rownames(estimated_c)
64+
65+
}, error = function(e) {
66+
warning("Error in correlation calculation: ", e$message)
67+
return(rep(NA, n_rows))
68+
})
69+
70+
return(correlation)
71+
}

R/data.R

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#' Sample Deconvolution Data
2+
#'
3+
#' This dataset contains a cell type reference constructed from single cell data of
4+
#' [The landscape of tumor cell states and ecosystems in diffuse large B cell lymphoma](https://www.cell.com/cancer-cell/fulltext/S1535-6108(21)00451-7)
5+
#' by [Steen et al., 2021].
6+
#' Furthermore it contains bulk data (split on the patient level into train and test) artificially constructed by averaging cell profiles of
7+
#' [Dissecting intratumour heterogeneity of nodal B-cell lymphomas at the transcriptional, genetic and drug-response levels](https://www.nature.com/articles/s41556-020-0532-x)
8+
#' by [Roider et al., 2020]
9+
#' Lastly it contains ground truth cell proportions simulating FACS measurments capturing the abundance of cell types
10+
#' @format A list containing three data frames:
11+
#' \describe{
12+
#' \item{cell_reference_profile}{The anchor reference that is harmonized by harp}
13+
#' \item{train_data}{The training data for harp, that means bulk counts and cell proportions}
14+
#' \item{bulk_counts_test}{Independent test counts for the model to be validated on}
15+
#' }
16+
#' @source Deconvolution data of real world sc datasets to be used for a MWE for harp
17+
"harp_data"

0 commit comments

Comments
 (0)