diff --git a/_viash.yaml b/_viash.yaml index ad91fd543..f6472c608 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -11,9 +11,9 @@ keywords: [openproblems, benchmarking, single-cell omics] references: doi: # Malte Luecken, Scott Gigante, Daniel Burkhardt, Robrecht Cannoodt, et al. - # Defining and benchmarking open problems in single-cell analysis, - # 03 April 2024, PREPRINT (Version 1) available at Research Square [https://doi.org/10.21203/rs.3.rs-4181617/v1] - - 10.21203/rs.3.rs-4181617/v1 + # Defining and benchmarking open problems in single-cell analysis. + # Nat Biotechnol 43, 1035–1040 (2025). + - 10.1038/s41587-025-02694-w links: issue_tracker: https://github.com/openproblems-bio/openproblems/issues diff --git a/common b/common index f01ff2170..0effaf2ad 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit f01ff2170161295e89014ee5453c61b29b4e4e77 +Subproject commit 0effaf2addbb8df6c0d11caae04d0ca63aa6345d diff --git a/scripts/create_resources/reprocess_task_results_v4.sh b/scripts/create_resources/reprocess_task_results_v4.sh new file mode 100755 index 000000000..dd6171da0 --- /dev/null +++ b/scripts/create_resources/reprocess_task_results_v4.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +OUT_DIR="resources" + +echo ">>> Fetching raw results..." +aws s3 sync --profile op \ + s3://openproblems-data/resources/ \ + "$OUT_DIR/" \ + --exclude "*" \ + --include "**/results/run_*/*" \ + --delete + +echo ">>> Patch state.yaml files..." +# fix state.yaml id and output_trace +python <>> Creating params.yaml..." +cat > /tmp/params.yaml << HERE +input_states: resources/*/results/run_*/state.yaml +rename_keys: 'input_task_info:output_task_info;input_dataset_info:output_dataset_info;input_method_configs:output_method_configs;input_metric_configs:output_metric_configs;input_scores:output_scores;input_trace:output_trace' +output_state: '\$id/state.yaml' +settings: '{"output_combined": "\$id/output_combined.json", "output_report": "\$id/output_report.html", "output_task_info": "\$id/output_task_info.json", "output_dataset_info": "\$id/output_dataset_info.json", "output_method_info": "\$id/output_method_info.json", "output_metric_info": "\$id/output_metric_info.json", "output_results": "\$id/output_results.json", "output_scores": "\$id/output_quality_control.json"}' +publish_dir: "$OUT_DIR" +HERE + +echo ">>> Processing results..." +nextflow run target/nextflow/reporting/process_task_results/main.nf \ + -profile docker \ + -params-file /tmp/params.yaml \ + -c common/nextflow_helpers/labels_ci.config \ + -entry auto \ + -resume + +# find all files in $OUT with the pattern output_report.html +echo ">>> List reports..." +find "$OUT_DIR" -name "output_report.html" + +# echo ">>> Uploading processed results to S3..." +# aws s3 sync --profile op \ +# "resources_test/openproblems/task_results_v4/" \ +# "s3://openproblems-data/resources_test/openproblems/task_results_v4/" \ +# --delete --dryrun + +# echo +# echo ">>> Done!" 
diff --git a/scripts/create_resources/task_results_v4.sh b/scripts/create_resources/task_results_v4.sh new file mode 100755 index 000000000..8bcb0220a --- /dev/null +++ b/scripts/create_resources/task_results_v4.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +OUT_DIR="resources_test/openproblems/task_results_v4" + +echo ">>> Fetching raw results..." +aws s3 sync --profile op \ + s3://openproblems-data/resources/task_batch_integration/results/run_2025-01-23_18-03-16/ \ + "$OUT_DIR/raw/" \ + --delete + +echo +echo ">>> Processing results..." +if [ -d "$OUT_DIR/processed" ]; then rm -Rf $OUT_DIR/processed; fi +nextflow run target/nextflow/reporting/process_task_results/main.nf \ + -profile docker \ + --input_task_info $OUT_DIR/raw/task_info.yaml \ + --input_dataset_info $OUT_DIR/raw/dataset_uns.yaml \ + --input_method_configs $OUT_DIR/raw/method_configs.yaml \ + --input_metric_configs $OUT_DIR/raw/metric_configs.yaml \ + --input_scores $OUT_DIR/raw/score_uns.yaml \ + --input_trace $OUT_DIR/raw/trace.txt \ + --output_state state.yaml \ + --publishDir $OUT_DIR/processed + +echo ">>> Uploading processed results to S3..." +aws s3 sync --profile op \ + "resources_test/openproblems/task_results_v4/" \ + "s3://openproblems-data/resources_test/openproblems/task_results_v4/" \ + --delete --dryrun + +echo +echo ">>> Done!" diff --git a/src/reporting/combine_output/config.vsh.yaml b/src/reporting/combine_output/config.vsh.yaml new file mode 100644 index 000000000..87673b9e0 --- /dev/null +++ b/src/reporting/combine_output/config.vsh.yaml @@ -0,0 +1,102 @@ +name: combine_output +namespace: reporting +description: Combine task outputs into a single JSON + +argument_groups: + - name: Inputs + arguments: + - name: --input_task_info + type: file + description: Task info file + info: + format: + type: json + schema: /common/schemas/results_v4/task_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/task_info.json + - name: --input_dataset_info + type: file + description: Dataset info file + info: + format: + type: json + schema: /common/schemas/results_v4/dataset_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/dataset_info.json + - name: --input_method_info + type: file + description: Method info file + info: + format: + type: json + schema: /common/schemas/results_v4/method_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/method_info.json + - name: --input_metric_info + type: file + description: Metric info file + info: + format: + type: json + schema: /common/schemas/results_v4/metric_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/metric_info.json + - name: --input_results + type: file + description: Results file + info: + format: + type: json + schema: /common/schemas/results_v4/results.json + required: true + example: resources_test/openproblems/task_results_v4/processed/results.json + - name: --input_quality_control + type: file + description: Quality control file + info: + format: + type: json + schema: /common/schemas/results_v4/quality_control.json + required: true + example: resources_test/openproblems/task_results_v4/processed/quality_control.json + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: Combined output JSON 
+ default: combined_output.json + info: + format: + type: json + schema: /common/schemas/results_v4/combined_output.json + +resources: + - type: r_script + path: script.R + - path: /common/schemas + dest: schemas + +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + +engines: + - type: docker + image: openproblems/base_r:1 + setup: + - type: apt + packages: + - nodejs + - npm + - type: docker + run: npm install -g ajv-cli + +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/reporting/combine_output/script.R b/src/reporting/combine_output/script.R new file mode 100644 index 000000000..32734c3c8 --- /dev/null +++ b/src/reporting/combine_output/script.R @@ -0,0 +1,105 @@ +## VIASH START +processed_dir <- "resources_test/openproblems/task_results_v4/processed" + +par <- list( + # Inputs + input_task_info = paste0(processed_dir, "/task_info.json"), + input_quality_control = paste0(processed_dir, "/quality_control.json"), + input_metric_info = paste0(processed_dir, "/metric_info.json"), + input_method_info = paste0(processed_dir, "/method_info.json"), + input_dataset_info = paste0(processed_dir, "/dataset_info.json"), + input_results = paste0(processed_dir, "/results.json"), + # Outputs + output = "task_results.json" +) +## VIASH END + +################################################################################ +# MAIN SCRIPT +################################################################################ + +cat("====== Combine output ======\n") + +cat("\n>>> Reading input files...\n") +cat("Reading task info from '", par$input_task_info, "'...\n", sep = "") +task_info <- jsonlite::read_json(par$input_task_info) + +cat( + "Reading quality control from '", + par$input_quality_control, + "'...\n", + sep = "" +) +quality_control <- jsonlite::read_json(par$input_quality_control) + +cat("Reading metric info from '", par$input_metric_info, "'...\n", sep = "") +metric_info <- jsonlite::read_json(par$input_metric_info) + +cat("Reading method info from '", par$input_method_info, "'...\n", sep = "") +method_info <- jsonlite::read_json(par$input_method_info) + +cat("Reading dataset info from '", par$input_dataset_info, "'...\n", sep = "") +dataset_info <- jsonlite::read_json(par$input_dataset_info) + +cat("Reading results from '", par$input_results, "'...\n", sep = "") +results <- jsonlite::read_json(par$input_results) + +cat("\n>>> Combining outputs...\n") +# Create combined output according to task_results.json +combined_output <- list( + task_info = task_info, + dataset_info = dataset_info, + method_info = method_info, + metric_info = metric_info, + results = results, + quality_control = quality_control +) + +cat("\n>>> Writing output file...\n") +cat("Writing combined output to '", par$output, "'...\n", sep = "") +jsonlite::write_json( + combined_output, + par$output, + pretty = TRUE, + null = "null", + na = "null", + auto_unbox = TRUE +) + +cat("\n>>> Validating output against schema...\n") +results_schemas <- file.path(meta$resources_dir, "schemas", "results_v4") +ajv_args <- paste( + "validate", + "--spec draft2020", + "-s", + file.path(results_schemas, "combined_output.json"), + "-r", + file.path(results_schemas, "task_info.json"), + "-r", + file.path(results_schemas, "dataset_info.json"), + "-r", + file.path(results_schemas, "method_info.json"), + "-r", + 
file.path(results_schemas, "metric_info.json"), + "-r", + file.path(results_schemas, "results.json"), + "-r", + file.path(results_schemas, "quality_control.json"), + "-r", + file.path(results_schemas, "core.json"), + "-d", + par$output +) + +cat("Running validation command:", "ajv", ajv_args, "\n") +cat("Output:\n") +validation_result <- system2("ajv", ajv_args) + +if (validation_result == 0) { + cat("JSON validation passed successfully!\n") +} else { + cat("JSON validation failed!\n") + stop("Output JSON does not conform to schema") +} + +cat("\n>>> Done!\n") diff --git a/src/reporting/generate_qc/config.vsh.yaml b/src/reporting/generate_qc/config.vsh.yaml index c141575cb..17818c14f 100644 --- a/src/reporting/generate_qc/config.vsh.yaml +++ b/src/reporting/generate_qc/config.vsh.yaml @@ -1,47 +1,98 @@ name: generate_qc namespace: reporting description: Generate task QC metrics -arguments: - - name: --task_info - type: file - description: Task info file - example: resources_test/openproblems/task_results_v3/processed/task_info.json - - name: --method_info - type: file - description: Method info file - example: resources_test/openproblems/task_results_v3/processed/method_info.json - - name: --metric_info - type: file - description: Metric info file - example: resources_test/openproblems/task_results_v3/processed/metric_info.json - - name: --dataset_info - type: file - description: Dataset info file - example: resources_test/openproblems/task_results_v3/processed/dataset_info.json - - name: --results - type: file - description: Results file - example: resources_test/openproblems/task_results_v3/processed/results.json - - name: --output - type: file - direction: output - default: output.json - description: Output json - info: - format: - type: json - # TODO: add schema + +argument_groups: + - name: Inputs + arguments: + - name: --input_task_info + type: file + description: Task info file + info: + format: + type: json + schema: /common/schemas/results_v4/task_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/task_info.json + - name: --input_dataset_info + type: file + description: Dataset info file + info: + format: + type: json + schema: /common/schemas/results_v4/dataset_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/dataset_info.json + - name: --input_method_info + type: file + description: Method info file + info: + format: + type: json + schema: /common/schemas/results_v4/method_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/method_info.json + - name: --input_metric_info + type: file + description: Metric info file + info: + format: + type: json + schema: /common/schemas/results_v4/metric_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/metric_info.json + - name: --input_results + type: file + description: Results JSON file + info: + format: + type: json + schema: /common/schemas/results_v4/results.json + required: true + example: resources_test/openproblems/task_results_v4/processed/results.json + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + default: quality_control.json + description: Output quality control JSON file + info: + format: + type: json + schema: /common/schemas/results_v4/quality_control.json + resources: - - type: python_script - path: script.py + - type: r_script + path: script.R + - path: /common/schemas + dest: schemas + test_resources: - type: python_script 
path: /common/component_tests/run_and_check_output.py - - path: /resources_test/openproblems/task_results_v3 - dest: resources_test/openproblems/task_results_v3 + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + engines: - type: docker - image: openproblems/base_python:1 + image: openproblems/base_r:1 + setup: + - type: apt + packages: + - nodejs + - npm + - type: docker + run: npm install -g ajv-cli + - type: r + cran: + - dplyr + - purrr + - readr + - stringr + - tidyr + runners: - type: executable - type: nextflow diff --git a/src/reporting/generate_qc/script.R b/src/reporting/generate_qc/script.R new file mode 100644 index 000000000..8725e546f --- /dev/null +++ b/src/reporting/generate_qc/script.R @@ -0,0 +1,816 @@ +## VIASH START +processed_dir <- "resources_test/openproblems/task_results_v4/processed" + +par <- list( + # Inputs + input_task_info = paste0(processed_dir, "/task_info.json"), + input_method_info = paste0(processed_dir, "/method_info.json"), + input_metric_info = paste0(processed_dir, "/metric_info.json"), + input_dataset_info = paste0(processed_dir, "/dataset_info.json"), + input_results = paste0(processed_dir, "/results.json"), + # Outputs + output = "quality_control.json" +) +## VIASH END + +################################################################################ +# FUNCTIONS +################################################################################ + +create_qc_entry <- function( + category, + label, + value, + severity_value, + condition, + message +) { + # If values are missing, set to -1 + # This can happen if a method/metric is not run and therefore has no results + if (is.null(value) || is.na(value) || length(value) == 0) { + value <- -1 + } + + if ( + is.null(severity_value) || + is.na(severity_value) || + length(severity_value) == 0 + ) { + severity_value <- -1 + } + + severity <- dplyr::case_when( + severity_value < 0 ~ 3L, + severity_value < 1 ~ 0L, + severity_value < 2 ~ 1L, + severity_value < 3 ~ 2L, + TRUE ~ 3L + ) + + list( + category = category, + label = label, + value = value, + severity = severity, + severity_value = severity_value, + condition = condition, + message = message + ) +} + +percent_missing <- function(items, field) { + is_missing <- purrr::map_lgl(items, \(.item) { + if (field == "references") { + return(references_missing(.item)) + } + + field_value <- .item[[field]] + is.null(field_value) || + is.na(field_value) || + (is.character(field_value) && field_value == "") + }) + + mean(is_missing) +} + +references_missing <- function(item) { + # Special case for control methods without references + if ("type" %in% names(item) && item$type == "control_method") { + return(FALSE) + } + + references <- item$references + if (length(references) == 0) { + return(TRUE) + } + + if ( + length(references) == 2 && all(c("doi", "bibtex") %in% names(references)) + ) { + if (length(references$doi) == 0 && length(references$bibtex) == 0) { + return(TRUE) + } + } + + return(FALSE) +} + +check_info_fields <- function(info, type, expected_fields, task_name) { + category <- paste(stringr::str_to_title(type), "info") + + purrr::map(expected_fields, function(.field) { + pct_missing <- percent_missing(info, .field) + create_qc_entry( + category = category, + label = paste0("Info field '", .field, "' % missing"), + value = pct_missing, + severity_value = ifelse(pct_missing > 0, 3.0, 0.0), + condition = "pct_missing <= 0", + message = paste0( + category, + " field '", + .field, + "' should be 
defined\n", + " Task: ", + task_name, + "\n", + " Field: ", + .field, + "\n", + " Percentage missing: ", + round(pct_missing * 100, 0) + ) + ) + }) +} + +check_missing_results <- function( + results_long, + name, + type, + n_datasets, + n_methods, + n_metrics, + task_name +) { + n_expected <- switch( + type, + "dataset" = n_methods * n_metrics, + "method" = n_datasets * n_metrics, + "metric" = n_datasets * n_methods + ) + + name_col <- paste0(type, "_name") + n_results <- results_long |> + dplyr::filter(!!rlang::sym(name_col) == name) |> + nrow() + pct_missing <- 1 - (n_results / n_expected) + + title <- type |> + stringr::str_replace_all("_", " ") |> + stringr::str_to_sentence() + + create_qc_entry( + category = "Raw results", + label = paste0(title, " '", name, "' % missing"), + value = pct_missing, + severity_value = pct_missing / 0.1, + condition = "pct_missing <= 0.1", + message = paste0( + "Percentage of missing results should be less than 10%\n", + " Task: ", + task_name, + "\n", + " ", + title, + ": ", + name, + "\n", + " Number of results: ", + n_results, + "\n", + " Expected number of results: ", + n_expected, + "\n", + " Percentage missing: ", + round(pct_missing * 100, 0), + "%\n" + ) + ) +} + +check_failed_processes <- function(results, name, type, task_name) { + name_col <- paste0(type, "_name") + results_name <- results |> + dplyr::filter(!!rlang::sym(name_col) == name) + + n_expected <- nrow(results_name) + n_succeeded <- sum(results_name$succeeded) + pct_failed <- 1 - (n_succeeded / n_expected) + + title <- type |> + stringr::str_replace_all("_", " ") |> + stringr::str_to_sentence() + + create_qc_entry( + category = "Raw results", + label = paste0(title, " '", name, "' % failed"), + value = pct_failed, + severity_value = pct_failed / 0.1, + condition = "pct_failed <= 0.1", + message = paste0( + "Percentage of failed processes should be less than 10%\n", + " Task: ", + task_name, + "\n", + " ", + title, + ": ", + name, + "\n", + " Succeeded processes: ", + n_succeeded, + "\n", + " Attempted processes: ", + n_expected, + "\n", + " Percentage failed: ", + round(pct_failed * 100, 0), + "%\n" + ) + ) +} + +check_metric_scaling <- function( + results_long, + metric, + control_methods, + task_name +) { + `%||%` <- rlang::`%||%` + + metric_results <- results_long |> + dplyr::filter(metric_name == metric) |> + dplyr::select(-metric_name) + + if ( + nrow(metric_results) == 0 || + !any(control_methods %in% metric_results$method_name) + ) { + return(list()) + } + + control_range <- metric_results |> + dplyr::filter( + method_name %in% control_methods + ) |> + dplyr::group_by(dataset_name) |> + dplyr::summarise( + control_min = min(metric_value), + control_max = max(metric_value) + ) + + scaled_metrics <- metric_results |> + dplyr::left_join(control_range, by = "dataset_name") |> + dplyr::mutate( + scaled_value = (metric_value - control_min) / (control_max - control_min), + outside = scaled_value < 0 | scaled_value > 1, + pct_outside = dplyr::case_when( + scaled_value < 0 ~ 0 - scaled_value, + scaled_value > 1 ~ scaled_value - 1, + TRUE ~ NA + ) + ) + + pct_outside <- sum(scaled_metrics$outside) / nrow(scaled_metrics) + worst_score <- min(scaled_metrics$scaled_value) + worst_pct_outside <- if (!is.na(worst_score) && worst_score < 0) { + max(scaled_metrics$pct_outside[scaled_metrics$scaled_value < 0]) + } else { + 0 + } + best_score <- max(scaled_metrics$scaled_value) + best_pct_outside <- if (!is.na(best_score) && best_score > 1) { + 
max(scaled_metrics$pct_outside[scaled_metrics$scaled_value > 1]) + } else { + 0 + } + + metric_checks <- list( + create_qc_entry( + category = "Scaling", + label = paste0("Metric '", metric, "' % outside range"), + value = pct_outside, + severity_value = pct_outside / 0.1, + condition = "pct_outside <= 0.1", + message = paste0( + "Percentage of scaled scores outside control range should be less than 10%\n", + " Task: ", + task_name, + "\n", + " Metric: ", + metric, + "\n", + " Inside range: ", + sum(!scaled_metrics$outside), + "\n", + " Scaled scores: ", + nrow(scaled_metrics), + "\n", + " Percentage outside: ", + round(pct_outside * 100, 0), + "%\n" + ) + ), + create_qc_entry( + category = "Scaling", + label = paste0("Metric '", metric, "' worst score % outside range"), + value = worst_pct_outside, + severity_value = worst_pct_outside / 0.1, + condition = "worst_pct_outside <= 0.1", + message = paste0( + "The worst scaled score should be less than 10% outside the control range\n", + " Task: ", + task_name, + "\n", + " Metric: ", + metric, + "\n", + " Worst score: ", + worst_score, + "\n", + " Percentage outside range: ", + round(worst_pct_outside * 100, 0), + "%\n" + ) + ), + create_qc_entry( + category = "Scaling", + label = paste0("Metric '", metric, "' best score % outside range"), + value = best_pct_outside, + severity_value = best_pct_outside / 0.1, + condition = "best_pct_outside <= 0.1", + message = paste0( + "The best scaled score should be less than 10% outside the control range\n", + " Task: ", + task_name, + "\n", + " Metric: ", + metric, + "\n", + " Best score: ", + best_score, + "\n", + " Percentage outside range: ", + round(best_pct_outside * 100, 0), + "%\n" + ) + ) + ) + + method_metric_checks <- purrr::map( + sort(unique(scaled_metrics$method_name)), + function(.method) { + check_method_metric_scaling(scaled_metrics, .method, task_name, metric) + } + ) |> + purrr::list_flatten() + + c(metric_checks, method_metric_checks) +} + +check_method_metric_scaling <- function( + scaled_metrics, + method, + task_name, + metric_name +) { + method_scaled_metrics <- scaled_metrics |> + dplyr::filter(method_name == method) + + worst_score <- min(method_scaled_metrics$scaled_value) + worst_pct_outside <- if (!is.na(worst_score) && worst_score < 0) { + max(method_scaled_metrics$pct_outside[ + method_scaled_metrics$scaled_value < 0 + ]) + } else { + 0 + } + best_score <- max(method_scaled_metrics$scaled_value) + best_pct_outside <- if (!is.na(best_score) && best_score > 1) { + max(method_scaled_metrics$pct_outside[ + method_scaled_metrics$scaled_value > 1 + ]) + } else { + 0 + } + + list( + create_qc_entry( + category = "Scaling", + label = paste0("Worst '", metric_name, "' score for '", method, "'"), + value = worst_score, + severity_value = ifelse(worst_score < -1, worst_pct_outside, 0), + condition = "worst_score < -1", + message = paste0( + "Method '", + method, + "' performs much worse than controls for metric '", + metric_name, + "'\n", + " Task: ", + task_name, + "\n", + " Method: ", + method, + "\n", + " Metric: ", + metric_name, + "\n", + " Worst score: ", + worst_score, + "\n", + " Percentage outside range: ", + round(worst_pct_outside * 100, 0), + "%\n" + ) + ), + create_qc_entry( + category = "Scaling", + label = paste0("Best '", metric_name, "' score for '", method, "'"), + value = best_score, + severity_value = ifelse(best_score > 2, best_pct_outside, 0), + condition = "best_score > 2", + message = paste0( + "Method '", + method, + "' performs much better than controls for 
metric '", + metric_name, + "'\n", + " Task: ", + task_name, + "\n", + " Method: ", + method, + "\n", + " Metric: ", + metric_name, + "\n", + " Best score: ", + best_score, + "\n", + " Percentage outside range: ", + round(best_pct_outside * 100, 0), + "%\n" + ) + ) + ) +} + +################################################################################ +# MAIN SCRIPT +################################################################################ + +cat("====== Generate QC ======\n") + +cat("\n>>> Reading input files...\n") +cat("Reading task info from '", par$input_task_info, "'...\n", sep = "") +task_info <- jsonlite::read_json(par$input_task_info) + +cat("Reading dataset info from '", par$input_dataset_info, "'...\n", sep = "") +dataset_info <- jsonlite::read_json(par$input_dataset_info) + +cat("Reading method info from '", par$input_method_info, "'...\n", sep = "") +method_info <- jsonlite::read_json(par$input_method_info) + +cat("Reading metric info from '", par$input_metric_info, "'...\n", sep = "") +metric_info <- jsonlite::read_json(par$input_metric_info) + +cat("Reading results from '", par$input_results, "'...\n", sep = "") +results <- jsonlite::read_json(par$input_results, simplifyVector = TRUE) + +cat("\n>>> Checking expected info fields...\n") + +expected_task_fields <- c("name", "label", "summary", "description") +expected_dataset_fields <- c( + "name", + "label", + "summary", + "description", + "references" +) +expected_method_fields <- c( + "name", + "label", + "commit", + "summary", + "description", + "references" +) +expected_metric_fields <- c( + "name", + "label", + "commit", + "summary", + "description", + "references" +) + +task_name <- task_info$name %||% "unknown" + +info_task <- check_info_fields( + list(task_info), + "task", + expected_task_fields, + task_name +) +info_datasets <- check_info_fields( + dataset_info, + "dataset", + expected_dataset_fields, + task_name +) +info_methods <- check_info_fields( + method_info, + "method", + expected_method_fields, + task_name +) +info_metrics <- check_info_fields( + metric_info, + "metric", + expected_metric_fields, + task_name +) + +cat("\n>>> Checking missing results...\n") +results_long <- results |> + dplyr::select(dataset_name, method_name, metric_names, metric_values) |> + tidyr::unnest_longer(c("metric_names", "metric_values")) |> + dplyr::rename( + metric_name = metric_names, + metric_value = metric_values + ) |> + dplyr::filter(!is.na(metric_value)) + +dataset_names <- purrr::map_chr(dataset_info, "name") +method_names <- purrr::map_chr(method_info, "name") +metric_names <- purrr::map_chr(metric_info, "name") + +n_datasets <- length(dataset_names) +n_methods <- length(method_names) +n_metrics <- length(metric_names) + +n_results_expected <- n_datasets * n_methods * n_metrics +n_results <- nrow(results_long) +pct_results_missing <- 1 - (n_results / n_results_expected) + +results_task <- list( + create_qc_entry( + category = "Raw results", + label = "Task number of results", + value = n_results, + severity_value = pct_results_missing / 0.1, + condition = "length(dataset_info) * length(results) == length(method_info) * length(metric_info)", + message = paste0( + "Number of results should be equal to #datasets × #methods × #metrics \n", + " Task: ", + task_name, + "\n", + " Number of results: ", + n_results, + "\n", + " Number of datasets: ", + n_datasets, + "\n", + " Number of methods: ", + n_methods, + "\n", + " Number of metrics: ", + n_metrics, + "\n", + " Expected number of results: ", + n_results_expected, + 
"\n" + ) + ) +) + +results_datasets <- purrr::map(dataset_names, function(.dataset) { + check_missing_results( + results_long, + .dataset, + "dataset", + n_datasets, + n_methods, + n_metrics, + task_name + ) +}) +results_methods <- purrr::map(method_names, function(.method) { + check_missing_results( + results_long, + .method, + "method", + n_datasets, + n_methods, + n_metrics, + task_name + ) +}) +results_metrics <- purrr::map(metric_names, function(.metric) { + check_missing_results( + results_long, + .metric, + "metric", + n_datasets, + n_methods, + n_metrics, + task_name + ) +}) + +cat("\n>>> Checking failed processes\n") +metric_component_results <- results |> + dplyr::select(dataset_name, method_name, metric_components) |> + tidyr::unnest(metric_components) |> + dplyr::rename(metric_component_name = component_name) + +n_processes <- nrow(results) + nrow(metric_component_results) +n_succeeded <- sum(results$succeeded) + sum(metric_component_results$succeeded) +pct_failed <- 1 - (n_succeeded / n_processes) + +failed_task <- list( + create_qc_entry( + category = "Raw results", + label = "Task number of successful processes", + value = n_succeeded, + severity_value = pct_failed / 0.1, + condition = "sum(results$succeeded) + sum(metric_component_results$succeeded) == nrow(results) + nrow(metric_component_results)", + message = paste0( + "Number of successful processes should be equal to the number of attempted processes\n", + " Task: ", + task_name, + "\n", + " Succeeded processes: ", + n_succeeded, + "\n", + " Attempted processes: ", + n_processes, + "\n", + " Percentage failed: ", + round(pct_failed * 100, 0), + "%\n" + ) + ) +) + +failed_datasets <- purrr::map(dataset_names, function(.dataset) { + check_failed_processes(results, .dataset, "dataset", task_name) +}) +failed_methods <- purrr::map(method_names, function(.method) { + check_failed_processes(results, .method, "method", task_name) +}) + +failed_metrics <- purrr::map( + unique(metric_component_results$metric_component_name), + function(.component) { + check_failed_processes( + metric_component_results, + .component, + "metric_component", + task_name + ) + } +) + +cat("\n>>> Checking control methods...\n") +is_control <- purrr::map_lgl(method_info, \(.method) { + .method$type == "control_method" +}) +control_methods <- method_names[is_control] + +dataset_controls <- results_long |> + dplyr::filter(method_name %in% control_methods) |> + dplyr::select(dataset_name, method_name) |> + dplyr::distinct() |> + dplyr::group_by(dataset_name) |> + dplyr::count(name = "n_controls") |> + dplyr::ungroup() |> + dplyr::mutate(dataset_name = factor(dataset_name, levels = dataset_names)) |> + tidyr::complete(dataset_name, fill = list(n_controls = 0)) + +controls_datasets <- purrr::map( + seq_len(nrow(dataset_controls)), + function(.idx) { + dataset_name <- dataset_controls$dataset_name[.idx] + n_controls <- dataset_controls$n_controls[.idx] + + create_qc_entry( + category = "Raw results", + label = paste0("Dataset `", dataset_name, "' number of control methods"), + value = n_controls, + severity_value = ifelse(n_controls != length(control_methods), 3, 0), + condition = "n_controls != length(control_methods)", + message = paste0( + "Number of successful control methods for a dataset should equal the number of controls\n", + " Task: ", + task_name, + "\n", + " Dataset: ", + dataset_name, + "\n", + " Succeeded control_methods: ", + n_controls, + "\n", + " Total control methods: ", + length(control_methods), + "\n", + " Percentage succeeded: ", + 
round(n_controls / length(control_methods) * 100, 0), + "%\n" + ) + ) + } +) + +metric_controls <- results_long |> + dplyr::filter(method_name %in% control_methods) |> + dplyr::select(method_name, metric_name) |> + dplyr::group_by(metric_name) |> + dplyr::count(name = "n_controls") |> + dplyr::ungroup() |> + dplyr::mutate(metric_name = factor(metric_name, levels = metric_names)) |> + tidyr::complete(metric_name, fill = list(n_controls = 0)) + +n_expected <- length(dataset_names) * length(control_methods) +controls_metrics <- purrr::map(seq_len(nrow(metric_controls)), function(.idx) { + metric_name <- metric_controls$metric_name[.idx] + n_controls <- metric_controls$n_controls[.idx] + + create_qc_entry( + category = "Raw results", + label = paste0("Metric '", metric_name, "' number of control methods"), + value = n_controls, + severity_value = ifelse(n_controls != n_expected, 3, 0), + condition = "n_controls != length(datasets) * length(control_methods)", + message = paste0( + "Number of metric scores for control methods should be equal to #datasets × #control_methods\n", + " Task: ", + task_name, + "\n", + " Metric: ", + metric_name, + "\n", + " Control method scores: ", + n_controls, + "\n", + " Expected control method scores: ", + n_expected, + "\n", + " Percentage succeeded: ", + round(n_controls / n_expected * 100, 0), + "%\n" + ) + ) +}) + +cat("\n>>> Checking metric scaling...\n") +scaling <- purrr::map(metric_names, function(.metric) { + check_metric_scaling(results_long, .metric, control_methods, task_name) +}) |> + purrr::list_flatten() + +cat("\n>>> Collecting QC results...\n") +qc_results <- c( + info_task, + info_datasets, + info_methods, + info_metrics, + results_task, + results_datasets, + results_methods, + results_metrics, + failed_task, + failed_datasets, + failed_methods, + failed_metrics, + controls_datasets, + controls_metrics, + scaling +) + +cat("\n>>> Writing output file...\n") +cat("Writing quality control to '", par$output, "'...\n", sep = "") +jsonlite::write_json( + qc_results, + par$output, + pretty = TRUE, + null = "null", + na = "null", + auto_unbox = TRUE +) + +cat("\n>>> Validating output against schema...\n") +results_schemas <- file.path(meta$resources_dir, "schemas", "results_v4") +ajv_args <- paste( + "validate", + "--spec draft2020", + "-s", + file.path(results_schemas, "quality_control.json"), + "-d", + par$output +) + +cat("Running validation command:", "ajv", ajv_args, "\n") +cat("Output:\n") +validation_result <- system2("ajv", ajv_args) + +if (validation_result == 0) { + cat("JSON validation passed successfully!\n") +} else { + cat("JSON validation failed!\n") + stop("Output JSON does not conform to schema") +} + +cat("\n>>> Done!\n") diff --git a/src/reporting/generate_qc/script.py b/src/reporting/generate_qc/script.py deleted file mode 100644 index 685cc6436..000000000 --- a/src/reporting/generate_qc/script.py +++ /dev/null @@ -1,302 +0,0 @@ -import json -import numpy as np - -## VIASH START -par = { - "task_info": "resources_test/openproblems/task_results_v3/processed/task_info.json", - "method_info": "resources_test/openproblems/task_results_v3/processed/method_info.json", - "metric_info": "resources_test/openproblems/task_results_v3/processed/metric_info.json", - "dataset_info": "resources_test/openproblems/task_results_v3/processed/dataset_info.json", - "results": "resources_test/openproblems/task_results_v3/processed/results.json", - "output": "output.json" -} -## VIASH END - -EXPECTED_TASK_FIELDS = ["task_id", "task_name", "task_summary", 
"task_description"] -EXPECTED_METHOD_FIELDS = ["task_id", "commit_sha", "method_id", "method_name", "method_summary", "paper_reference", "is_baseline"] -EXPECTED_METRIC_FIELDS = ["task_id", "commit_sha", "metric_id", "metric_name", "metric_summary", "paper_reference", "maximize"] -EXPECTED_DATASET_FIELDS = ["task_id", "dataset_id", "dataset_name", "dataset_summary", "data_reference", "data_url"] - -def dump_json(obj, fp): - """Dump to JSON in a numpy-safe fashion.""" - json.dump( - obj, - fp, - indent=4, - sort_keys=False, - separators=(", ", ": "), - ensure_ascii=False, - ) - -def create_quality_control(task_info, dataset_info, method_info, metric_info, results): - """Quality control to detect anomalies in the results.""" - task_id = task_info["task_id"] - - result_qc = [] - - def add_qc( - category: str, - name: str, - value, - severity_value: float, - code: str, - message: str, - ) -> None: - "Add an entry to the result qc" - if severity_value <= 1: - severity = 0 - elif severity_value <= 2: - severity = 1 - elif severity_value <= 3: - severity = 2 - else: - severity = 3 - result_qc.append({ - "task_id": task_id, - "category": category, - "name": name, - "value": value, - "severity": severity, - "severity_value": severity_value, - "code": code, - "message": message - }) - - def percent_missing(list_of_dicts, field): - are_missing = [] - for item in list_of_dicts: - if field == "paper_reference" and item.get("is_baseline", False): - are_missing.append(0.0) - elif field in item and item[field] is not None: - are_missing.append(0.0) - else: - are_missing.append(1.0) - return np.mean(are_missing) - - # check task_info - for field in EXPECTED_TASK_FIELDS: - pct_missing = percent_missing([task_info], field) - add_qc( - "Task info", - f"Pct '{field}' missing", - pct_missing, - 3.0 if pct_missing > 0 else 0.0, - "percent_missing([task_info], field)", - f"Task metadata field '{field}' should be defined\n" - f" Task id: {task_id}\n" - f" Field: {field}\n" - ) - - # check method_info - for field in EXPECTED_METHOD_FIELDS: - pct_missing = percent_missing(method_info, field) - add_qc( - "Method info", - f"Pct '{field}' missing", - pct_missing, - 3.0 if pct_missing > 0 else 0.0, - "percent_missing(method_info, field)", - f"Method metadata field '{field}' should be defined\n" - f" Task id: {task_id}\n" - f" Field: {field}\n" - ) - - # check metric_info - for field in EXPECTED_METRIC_FIELDS: - pct_missing = percent_missing(metric_info, field) - add_qc( - "Metric info", - f"Pct '{field}' missing", - pct_missing, - 3.0 if pct_missing > 0 else 0.0, - "percent_missing(metric_info, field)", - f"Metric metadata field '{field}' should be defined\n" - f" Task id: {task_id}\n" - f" Field: {field}\n" - ) - - # check dataset_info - for field in EXPECTED_DATASET_FIELDS: - pct_missing = percent_missing(dataset_info, field) - add_qc( - "Dataset info", - f"Pct '{field}' missing", - pct_missing, - 3.0 if pct_missing > 0 else 0.0, - "percent_missing(dataset_info, field)", - f"Dataset metadata field '{field}' should be defined\n" - f" Task id: {task_id}\n" - f" Field: {field}\n" - ) - - # turn results into long format for easier processing - results_long = [ - { - "task_id": task_id, - "method_id": x["method_id"], - "dataset_id": x["dataset_id"], - "metric_id": metric["metric_id"], - "metric_value" : x["metric_values"].get(metric["metric_id"]), - "scaled_score" : x["scaled_scores"].get(metric["metric_id"]), - } - for metric in metric_info - for x in results - ] - - # check percentage missing - pct_missing = 1 - 
len(results_long) / (len(method_info) * len(metric_info) * len(dataset_info)) - add_qc( - "Raw data", - "Number of results", - len(results), - pct_missing / .1, - "len(results) == len(method_info) * len(metric_info) * len(dataset_info)", - f"Number of results should be equal to #methods × #metrics × #datasets.\n" - f" Task id: {task_id}\n" - f" Number of results: {len(results)}\n" - f" Number of methods: {len(method_info)}\n" - f" Number of metrics: {len(metric_info)}\n" - f" Number of datasets: {len(dataset_info)}\n" - ) - - # QC per metric - for metric in metric_info: - metric_id = metric["metric_id"] - values = [ - res - for res in results_long - if res["metric_id"] == metric_id - and res["metric_value"] is not None - and np.isreal(res["metric_value"]) - ] - pct_missing = 1 - len(values) / len(dataset_info) / len(method_info) - - add_qc( - "Raw results", - f"Metric '{metric_id}' %missing", - pct_missing, - pct_missing / .1, - "pct_missing <= .1", - f"Percentage of missing results should be less than 10%.\n" - f" Task id: {task_id}\n" - f" Metric id: {metric_id}\n" - f" Percentage missing: {pct_missing*100:.0f}%\n" - ) - - # QC per method - for method in method_info: - method_id = method["method_id"] - values = [ - res - for res in results_long - if res["method_id"] == method_id - and res["metric_value"] is not None - and np.isreal(res["metric_value"]) - ] - pct_missing = 1 - len(values) / len(dataset_info) / len(metric_info) - - add_qc( - "Raw results", - f"Method '{method_id}' %missing", - pct_missing, - pct_missing / .1, - "pct_missing <= .1", - f"Percentage of missing results should be less than 10%.\n" - f" Task id: {task_id}\n" - f" method id: {method_id}\n" - f" Percentage missing: {pct_missing*100:.0f}%\n" - ) - - # QC per dataset - for dataset in dataset_info: - dataset_id = dataset["dataset_id"] - values = [ - res - for res in results_long - if res["dataset_id"] == dataset_id - and res["metric_value"] is not None - and np.isreal(res["metric_value"]) - ] - pct_missing = 1 - len(values) / len(metric_info) / len(method_info) - - add_qc( - "Raw results", - f"Dataset '{dataset_id}' %missing", - pct_missing, - pct_missing / .1, - "pct_missing <= .1", - f"Percentage of missing results should be less than 10%.\n" - f" Task id: {task_id}\n" - f" dataset id: {dataset_id}\n" - f" Percentage missing: {pct_missing*100:.0f}%\n" - ) - - - # QC per metric and method - for metric in metric_info: - for method in method_info: - metric_id = metric["metric_id"] - method_id = method["method_id"] - scores = [ - res["scaled_score"] - for res in results_long - if res["metric_id"] == metric_id - and res["method_id"] == method_id - and res["scaled_score"] is not None - and np.isreal(res["scaled_score"]) - ] - - if len(scores) >= 1: - worst_score = np.min(scores).item() - best_score = np.max(scores).item() - - add_qc( - "Scaling", - f"Worst score {method_id} {metric_id}", - worst_score, - worst_score / -1, - "worst_score >= -1", - f"Method {method_id} performs much worse than baselines.\n" - f" Task id: {task_id}\n" - f" Method id: {method_id}\n" - f" Metric id: {metric_id}\n" - f" Worst score: {worst_score}%\n" - ) - - add_qc( - "Scaling", - f"Best score {method_id} {metric_id}", - best_score, - best_score / 2, - "best_score <= 2", - f"Method {method_id} performs a lot better than baselines.\n" - f" Task id: {task_id}\n" - f" Method id: {method_id}\n" - f" Metric id: {metric_id}\n" - f" Best score: {best_score}%\n" - ) - - return result_qc - -def main(par): - # read data from files - with 
open(par["task_info"], "r", encoding="utf8") as file: - task_info = json.load(file) - with open(par["method_info"], "r", encoding="utf8") as file: - method_info = json.load(file) - with open(par["metric_info"], "r", encoding="utf8") as file: - metric_info = json.load(file) - with open(par["dataset_info"], "r", encoding="utf8") as file: - dataset_info = json.load(file) - with open(par["results"], "r", encoding="utf8") as file: - results = json.load(file) - - # create info objects - quality_control = create_quality_control(task_info, dataset_info, method_info, metric_info, results) - - # write data to files - with open(par["output"], "w", encoding="utf8") as file: - dump_json(quality_control, file) - -if __name__ == "__main__": - main(par) diff --git a/src/reporting/get_dataset_info/config.vsh.yaml b/src/reporting/get_dataset_info/config.vsh.yaml index ff43cb478..9cfbbe640 100644 --- a/src/reporting/get_dataset_info/config.vsh.yaml +++ b/src/reporting/get_dataset_info/config.vsh.yaml @@ -1,35 +1,61 @@ name: get_dataset_info namespace: reporting -description: Extract dataset info and convert to expected format for website results -arguments: - - name: --input - type: file - description: A yaml file - required: true - example: resources_test/openproblems/task_results_v3/raw/dataset_uns.yaml - - name: --output - type: file - direction: output - default: output.json - description: Output json - info: - format: - type: json - # TODO: add schema +description: Convert dataset uns YAML to schema-compliant JSON + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: A YAML file containing dataset uns + required: true + example: resources_test/openproblems/task_results_v4/raw/dataset_uns.yaml + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + default: dataset_info.json + description: Output JSON file matching dataset info schema + info: + format: + type: json + schema: /common/schemas/results_v4/dataset_info.json + example: resources_test/openproblems/task_results_v4/processed/dataset_info.json + resources: - type: r_script path: script.R + - path: /src/reporting/shared/functions.R + dest: functions.R + - path: /common/schemas + dest: schemas + - path: /src/reporting/shared/bibliography.bib + dest: bibliography.bib + test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/openproblems/task_results_v3 - dest: resources_test/openproblems/task_results_v3 + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + engines: - type: docker image: openproblems/base_r:1 setup: + - type: apt + packages: + - nodejs + - npm + - type: docker + run: npm install -g ajv-cli - type: r - cran: [ purrr, yaml, rlang, processx ] + cran: + - bibtex + - purrr + - stringr + runners: - type: executable - type: nextflow diff --git a/src/reporting/get_dataset_info/script.R b/src/reporting/get_dataset_info/script.R index 797fdb1ad..19e3033da 100644 --- a/src/reporting/get_dataset_info/script.R +++ b/src/reporting/get_dataset_info/script.R @@ -1,53 +1,112 @@ -requireNamespace("jsonlite", quietly = TRUE) -requireNamespace("yaml", quietly = TRUE) -library(purrr, warn.conflicts = FALSE) -library(rlang, warn.conflicts = FALSE) - ## VIASH START par <- list( - input = "resources_test/openproblems/task_results_v3/raw/dataset_uns.yaml", - output = "resources_test/openproblems/task_results_v3/processed/dataset_info.json" + input = 
"resources_test/openproblems/task_results_v4/raw/dataset_uns.yaml", + output = "resources_test/openproblems/task_results_v4/processed/dataset_info.json" ) ## VIASH END -datasets <- yaml::yaml.load_file(par$input) +source(file.path(meta$resources_dir, "functions.R")) -# transform into format expected by website -outputs <- map(datasets, function(dataset) { - # ↑ the 'dataset' object could be used as the new format +`%||%` <- rlang::`%||%` - # TODO: it'd be nice if the s3 path was also included in the dataset info +cat("====== Get dataset info ======\n") - # construct v1 format - out <- list( - "dataset_id" = dataset$dataset_id, - "dataset_name" = dataset$dataset_name, - "dataset_summary" = dataset$dataset_summary, - "dataset_description" = dataset$dataset_description %||% NA_character_, - "data_reference" = dataset$dataset_reference %||% NA_character_, - "data_url" = dataset$dataset_url %||% NA_character_, - "date_created" = dataset$date_created %||% NA_character_, - "file_size" = dataset$file_size %||% NA_character_ - ) +cat("\n>>> Reading input files...\n") +cat("Reading dataset uns from '", par$input, "'...\n", sep = "") +dataset_uns <- yaml::yaml.load_file( + par$input, + # Read file sizes as floats to avoid issues with big integers + handlers = list(int = \(x) { + as.numeric(x) + }) +) + +cat( + "\n>>> Processing ", + length(dataset_uns), + " datasets...\n", + sep = "" +) +bibliography <- read_bibliography( + file.path(meta$resources_dir, "bibliography.bib") +) +dataset_info_json <- purrr::map(dataset_uns, function(.dataset) { + cat("Processing dataset uns '", .dataset$dataset_id, "'\n", sep = "") + + authors <- get_authors_list(.dataset$authors) - if (!is.null(dataset[["common_dataset_id"]])) { - out[["common_dataset_id"]] <- dataset[["common_dataset_id"]] + if ("dataset_reference" %in% names(.dataset)) { + reference_name <- "dataset_reference" + } else if ("data_reference" %in% names(.dataset)) { + reference_name <- "data_reference" + } else { + stop("No reference found in dataset uns for '", .dataset$dataset_id, "'") } - # show warning when certain data is missing and return null? 
- for (n in names(out)) { - if (is.null(out[[n]])) { - out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) - stop("missing value for value '", n, "' in ", out_as_str) - } + references <- get_references_list(.dataset[[reference_name]], bibliography) + + if ("dataset_url" %in% names(.dataset)) { + url_name <- "dataset_url" + } else if ("data_url" %in% names(.dataset)) { + url_name <- "data_url" + } else { + stop("No URL found in dataset uns for '", .dataset$dataset_id, "'") } - out + list( + name = jsonlite::unbox(.dataset$dataset_id), + label = jsonlite::unbox(.dataset$dataset_name), + commit = jsonlite::unbox(.dataset$dataset_commit %||% "missing-sha"), + summary = .dataset$dataset_summary |> + stringr::str_trim() |> + stringr::str_remove_all('(^"|"$|^\'|\'$)') |> + jsonlite::unbox(), + description = .dataset$dataset_description |> + stringr::str_trim() |> + stringr::str_remove_all('(^"|"$|^\'|\'$)') |> + jsonlite::unbox(), + source_url = jsonlite::unbox(.dataset[[url_name]]), + common_dataset_names = .dataset$common_dataset_id, + modalities = jsonlite::unbox(.dataset$dataset_modality), + organisms = .dataset$dataset_organism, + authors = authors, + references = references, + date_created = jsonlite::unbox(.dataset$date_created), + file_size_mb = jsonlite::unbox(.dataset$file_size / 1048576) + ) }) +cat("\n>>> Writing output files...\n") +cat("Writing dataset info to '", par$output, "'...\n", sep = "") jsonlite::write_json( - outputs, + dataset_info_json, par$output, - auto_unbox = TRUE, - pretty = TRUE + pretty = TRUE, + null = "null" +) + +cat("\n>>> Validating output against schema...\n") +results_schemas <- file.path(meta$resources_dir, "schemas", "results_v4") +ajv_args <- paste( + "validate", + "--spec draft2020", + "-s", + file.path(results_schemas, "dataset_info.json"), + "-r", + file.path(results_schemas, "core.json"), + "-d", + par$output ) + +cat("Running validation command:", "ajv", ajv_args, "\n") +cat("Output:\n") +validation_result <- system2("ajv", ajv_args) + +if (validation_result == 0) { + cat("JSON validation passed successfully!\n") +} else { + cat("JSON validation failed!\n") + stop("Output JSON does not conform to schema") +} + +cat("\n>>> Done!\n") diff --git a/src/reporting/get_method_info/config.vsh.yaml b/src/reporting/get_method_info/config.vsh.yaml index 23528273b..884418b7c 100644 --- a/src/reporting/get_method_info/config.vsh.yaml +++ b/src/reporting/get_method_info/config.vsh.yaml @@ -1,35 +1,61 @@ name: get_method_info namespace: reporting -description: Extract method info -arguments: - - name: --input - type: file - description: A yaml file - required: true - example: resources_test/openproblems/task_results_v3/raw/method_configs.yaml - - name: --output - type: file - direction: output - default: output.json - description: Output json - info: - format: - type: json - # TODO: add schema +description: Convert method configs YAML to schema-compliant JSON + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: A YAML file containing method configs + required: true + example: resources_test/openproblems/task_results_v4/raw/method_configs.yaml + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + default: method_info.json + description: Output JSON file matching method info schema + info: + format: + type: json + schema: /common/schemas/results_v4/method_info.json + example: resources_test/openproblems/task_results_v4/processed/method_info.json + resources: - type: 
r_script path: script.R + - path: /src/reporting/shared/functions.R + dest: functions.R + - path: /common/schemas + dest: schemas + - path: /src/reporting/shared/bibliography.bib + dest: bibliography.bib + test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/openproblems/task_results_v3 - dest: resources_test/openproblems/task_results_v3 + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + engines: - type: docker image: openproblems/base_r:1 setup: + - type: apt + packages: + - nodejs + - npm + - type: docker + run: npm install -g ajv-cli - type: r - cran: [ purrr, yaml, rlang, processx ] + cran: + - bibtex + - purrr + - stringr + runners: - type: executable - type: nextflow diff --git a/src/reporting/get_method_info/script.R b/src/reporting/get_method_info/script.R index 0623d89fd..7e0eed9cf 100644 --- a/src/reporting/get_method_info/script.R +++ b/src/reporting/get_method_info/script.R @@ -1,118 +1,175 @@ -requireNamespace("jsonlite", quietly = TRUE) -requireNamespace("yaml", quietly = TRUE) -library(purrr, warn.conflicts = FALSE) -library(rlang, warn.conflicts = FALSE) - ## VIASH START par <- list( - input = "method_configs.yaml", - output = "resources_test/openproblems/task_results_v3/processed/method_info.json" + input = "resources_test/openproblems/task_results_v4/raw/method_configs.yaml", + output = "resources_test/openproblems/task_results_v4/processed/method_info.json" ) ## VIASH END -configs <- yaml::yaml.load_file(par$input) +source(file.path(meta$resources_dir, "functions.R")) -outputs <- map(configs, function(config) { - if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") { - return(NULL) - } +################################################################################ +# FUNCTIONS +################################################################################ - # prep for viash 0.9.0 - build_info <- config$build_info %||% config$info - if ("functionality" %in% names(config)) { - config[names(config$functionality)] <- config$functionality - config[["functionality"]] <- NULL - } +get_implementation_url <- function(config) { + paste0( + config$build_info$git_remote, + "/blob/", + config$build_info$git_commit, + "/", + config$build_info$config |> + stringr::str_replace(".*/src/", "src/") |> + stringr::str_remove("/config.vsh.yaml") + ) +} - info <- config$info - - # add extra info - info$comp_path <- gsub(".*/src/", "src/", build_info$config) %>% gsub("/config.vsh.yaml", "", .) - info$task_id <- gsub("/.*", "", config$namespace) - info$id <- config$name - info$namespace <- config$namespace - info$label <- config$label %||% info$label - info$summary <- config$summary %||% info$summary - info$description <- config$description %||% info$description - info$commit_sha <- build_info$git_commit %||% "missing-sha" - info$code_version <- config$version - info$code_url <- config$links$repository - info$documentation_url <- config$links$documentation - # Check if the method has a docker container to create an image url. If it does not have a docker it will be a nextflow component consisting of different components that will have a docker image. +get_container_image <- function(config) { + # Check if the method has a docker container to create an image url. + # If it does not have a docker it will be a nextflow component consisting of + # different components that will have a docker image.
engines <- config$engines - has_docker <- any(map_lgl(engines, ~ .x$type == "docker")) + has_docker <- any(purrr::map_lgl(engines, ~ .x$type == "docker")) if (has_docker) { - info$image <- paste0( + paste0( "https://", - config$links$docker_registry, "/", - config$package_config$organization, "/", - config$package_config$name, "/", - gsub("src/", "", info$comp_path), + config$links$docker_registry, + "/", + config$package_config$organization, + "/", + config$package_config$name, + "/", + config$build_info$config |> + stringr::str_remove(".*/src/") |> + stringr::str_remove("/config.vsh.yaml"), ":", - info$code_version + config$version ) - } else { - info$image <- paste0( + } else { + paste0( "https://github.com/orgs/openproblems-bio/packages?repo_name=", config$package_config$name, "&q=", - gsub("src/", "", info$comp_path) + config$build_info$config |> + stringr::str_remove(".*/src/") |> + stringr::str_remove("/config.vsh.yaml") ) } - info$implementation_url <- paste0( - build_info$git_remote, "/blob/", - build_info$git_commit, "/", - info$comp_path +} + +get_additional_info <- function(config) { + # Fields that are stored elsewhere and we don't want to save here + exclude <- c( + "type", + "type_info", + "label", + "summary", + "description", + "documentation_url", + "authors" ) - info$type_info <- NULL - - # Flatten references - if (!is.null(config$references) && config$references != "") { - info <- imap(config$references, function(value, key) { - info[[paste0("references_", key)]] <- value - return(info) - })[[1]] + + config$info[setdiff(names(config$info), exclude)] |> + purrr::map(recurse_unbox) +} + +recurse_unbox <- function(x) { + if (is.list(x)) { + purrr::map(x, recurse_unbox) + } else if (length(x) == 1) { + jsonlite::unbox(x) + } else { + x } - info$references <- NULL - - print(info) - - - # ↑ this could be used as the new format - - # construct v1 format - out <- list( - task_id = info$task_id, - method_id = info$id, - method_name = info$label, - method_summary = info$summary, - method_description = info$description, - is_baseline = grepl("control", info$type), - references_doi = info$references_doi %||% NA_character_, - references_bibtex = info$references_bibtex %||% NA_character_, - code_url = info$code_url %||% NA_character_, - documentation_url = info$documentation_url %||% NA_character_, - image = info$image %||% NA_character_, - implementation_url = info$implementation_url %||% NA_character_, - code_version = info$code_version %||% NA_character_, - commit_sha = info$commit_sha - ) +} + +################################################################################ +# MAIN SCRIPT +################################################################################ + +cat("====== Get method info ======\n") - # show warning when certain data is missing and return null? 
- for (n in names(out)) { - if (is.null(out[[n]])) { - out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) - stop("missing value for value '", n, "' in ", out_as_str) - } +`%||%` <- rlang::`%||%` + +cat("\n>>> Reading input files...\n") +cat("Reading method info from '", par$input, "'...\n", sep = "") +method_configs <- yaml::yaml.load_file(par$input) + +cat( + "\n>>> Processing ", + length(method_configs), + " method configs...\n", + sep = "" +) +bibliography <- read_bibliography( + file.path(meta$resources_dir, "bibliography.bib") +) +method_info_json <- purrr::map(method_configs, function(.config) { + if (.config$status == "disabled") { + cat("Skipping disabled method '", .config$name, "'\n", sep = "") + return(NULL) + } else { + cat("Processing method '", .config$name, "'\n", sep = "") } - # return output - out + list( + name = jsonlite::unbox(.config$name), + label = jsonlite::unbox(.config$label %||% .config$info$label), + commit = jsonlite::unbox(.config$build_info$git_commit %||% "missing-sha"), + summary = .config$summary %||% + .config$info$summary |> + stringr::str_trim() |> + stringr::str_remove_all('(^"|"$|^\'|\'$)') |> + jsonlite::unbox(), + description = .config$description %||% + .config$info$description |> + stringr::str_trim() |> + stringr::str_remove_all('(^"|"$|^\'|\'$)') |> + jsonlite::unbox(), + type = jsonlite::unbox(.config$info$type), + link_code = jsonlite::unbox(.config$links$repository), + link_documentation = jsonlite::unbox( + .config$links$documentation %||% .config$info$documentation_url + ), + link_implementation = jsonlite::unbox(get_implementation_url(.config)), + link_container_image = jsonlite::unbox(get_container_image(.config)), + authors = get_authors_list(.config$authors), + references = get_references_list(.config$references, bibliography), + additional_info = get_additional_info(.config), + version = jsonlite::unbox(.config$version) + ) }) +cat("\n>>> Writing output files...\n") +cat("Writing task info to '", par$output, "'...\n", sep = "") jsonlite::write_json( - outputs, + method_info_json, par$output, - auto_unbox = TRUE, - pretty = TRUE -) \ No newline at end of file + pretty = TRUE, + null = "null" +) + +cat("\n>>> Validating output against schema...\n") +results_schemas <- file.path(meta$resources_dir, "schemas", "results_v4") +ajv_args <- paste( + "validate", + "--spec draft2020", + "-s", + file.path(results_schemas, "method_info.json"), + "-r", + file.path(results_schemas, "core.json"), + "-d", + par$output +) + +cat("Running validation command:", "ajv", ajv_args, "\n") +cat("Output:\n") +validation_result <- system2("ajv", ajv_args) + +if (validation_result == 0) { + cat("JSON validation passed successfully!\n") +} else { + cat("JSON validation failed!\n") + stop("Output JSON does not conform to schema") +} + +cat("\n>>> Done!\n") diff --git a/src/reporting/get_metric_info/config.vsh.yaml b/src/reporting/get_metric_info/config.vsh.yaml index 597f4a420..fc5f05949 100644 --- a/src/reporting/get_metric_info/config.vsh.yaml +++ b/src/reporting/get_metric_info/config.vsh.yaml @@ -1,35 +1,61 @@ name: get_metric_info namespace: reporting -description: Extract metric info -arguments: - - name: --input - type: file - description: A yaml file - required: true - example: resources_test/openproblems/task_results_v3/raw/metric_configs.yaml - - name: --output - type: file - direction: output - default: output.json - description: Output json - info: - format: - type: json - # TODO: add schema +description: Convert metric configs YAML 
to schema-compliant JSON + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: A YAML file containing metric configs + required: true + example: resources_test/openproblems/task_results_v4/raw/metric_configs.yaml + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + default: metric_info.json + description: Output JSON file matching metric info schema + info: + format: + type: json + schema: /common/schemas/results_v4/metric_info.json + example: resources_test/openproblems/task_results_v4/processed/metric_info.json + resources: - type: r_script path: script.R + - path: /src/reporting/shared/functions.R + dest: functions.R + - path: /common/schemas + dest: schemas + - path: /src/reporting/shared/bibliography.bib + dest: bibliography.bib + test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/openproblems/task_results_v3 - dest: resources_test/openproblems/task_results_v3 + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + engines: - type: docker image: openproblems/base_r:1 setup: + - type: apt + packages: + - nodejs + - npm + - type: docker + run: npm install -g ajv-cli - type: r - cran: [ purrr, yaml, rlang, processx ] + cran: + - bibtex + - purrr + - stringr + runners: - type: executable - type: nextflow diff --git a/src/reporting/get_metric_info/script.R b/src/reporting/get_metric_info/script.R index 0f046bd90..778c84b99 100644 --- a/src/reporting/get_metric_info/script.R +++ b/src/reporting/get_metric_info/script.R @@ -1,102 +1,187 @@ -requireNamespace("jsonlite", quietly = TRUE) -requireNamespace("yaml", quietly = TRUE) -library(purrr, warn.conflicts = FALSE) -library(rlang, warn.conflicts = FALSE) - ## VIASH START par <- list( - input = "resources_test/openproblems/task_results_v3/raw/metric_configs.yaml", - output = "resources_test/openproblems/task_results_v3/processed/metric_info.json" + input = "resources_test/openproblems/task_results_v4/raw/metric_configs.yaml", + output = "resources_test/openproblems/task_results_v4/processed/metric_info.json" ) ## VIASH END -configs <- yaml::yaml.load_file(par$input) +source(file.path(meta$resources_dir, "functions.R")) -outputs <- map(configs, function(config) { - if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") { - return(NULL) +################################################################################ +# FUNCTIONS +################################################################################ + +get_implementation_url <- function(config) { + paste0( + config$build_info$git_remote, + "/blob/", + config$build_info$git_commit, + "/", + config$build_info$config |> + stringr::str_replace(".*/src/", "src/") |> + stringr::str_remove("/config.vsh.yaml") + ) +} + +get_container_image <- function(config) { + # Check if the method has a docker container to create an image url. + # If it does not have a docker it will be a nextflow component consisting of + # different components that will have a docker image. 
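For orientation, the metric_configs.yaml consumed here is a list of built component configs, each carrying one or more metric definitions under info$metrics. A quick way to inspect it (path taken from the example above, assuming the test resources have been synced locally):

metric_configs <- yaml::yaml.load_file(
  "resources_test/openproblems/task_results_v4/raw/metric_configs.yaml"
)
length(metric_configs)                                       # number of metric components
purrr::map_chr(metric_configs, "name")                       # component names
purrr::map_int(metric_configs, ~ length(.x$info$metrics))    # metrics per component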
+ engines <- config$engines + has_docker <- any(purrr::map_lgl(engines, ~ .x$type == "docker")) + if (has_docker) { + paste0( + "https://", + config$links$docker_registry, + "/", + config$package_config$organization, + "/", + config$package_config$name, + "/", + config$build_info$config |> + stringr::str_remove(".*/src/") |> + stringr::str_remove("/config.vsh.yaml"), + ":", + config$version + ) + } else { + paste0( + "https://github.com/orgs/openproblems-bio/packages?repo_name=", + config$package_config$name, + "&q=", + config$build_info$config |> + stringr::str_remove(".*/src/") |> + stringr::str_remove("/config.vsh.yaml") + ) } +} + +get_additional_info <- function(info, exclude, name_prefix = "") { + additional <- info[setdiff(names(info), exclude)] |> + purrr::map(recurse_unbox) - # prep for viash 0.9.0 - build_info <- config$build_info %||% config$info - if ("functionality" %in% names(config)) { - config[names(config$functionality)] <- config$functionality - config[["functionality"]] <- NULL + rlang::set_names(additional, paste0(name_prefix, names(additional))) +} + +recurse_unbox <- function(x) { + if (is.list(x)) { + purrr::map(x, recurse_unbox) + } else if (length(x) == 1) { + jsonlite::unbox(x) + } else { + x } +} - map( - config$info$metrics, - function(info) { - # add extra info - info$comp_path <- gsub(".*/src/", "src/", build_info$config) %>% gsub("/config.vsh.yaml", "", .) - info$task_id <- gsub("/.*", "", config$namespace) - info$id <- info$name - info$name <- NULL - info$component_name <- config$name - info$namespace <- config$namespace - info$commit_sha <- build_info$git_commit %||% "missing-sha" - info$code_version <- config$version %||% "missing-version" - info$image_url <- paste0( - "https://", - config$links$docker_registry, "/", - config$package_config$organization, "/", - config$package_config$name, "/", - gsub("src/", "", info$comp_path), - ":", - info$code_version - ) - info$implementation_url <- paste0( - build_info$git_remote, "/blob/", - build_info$git_commit, "/", - info$comp_path - ) - # Flatten references - if (!is.null(info$references) && info$references != "") { - info <- imap(info$references, function(value, key) { - info[[paste0("references_", key)]] <- value - return(info) - })[[1]] - } - info$references <- NULL - - # ↑ this could be used as the new format - - # construct v1 format - out <- list( - task_id = info$task_id, - component_name = info$component_name, - metric_id = info$id, - metric_name = info$label, - metric_summary = info$summary, - metric_description = info$description, - references_doi = info$references_doi %||% NA_character_, - references_bibtex = info$references_bibtex %||% NA_character_, - implementation_url = info$implementation_url %||% NA_character_, - image = info$image_url %||% NA_character_, - code_version = info$code_version %||% NA_character_, - commit_sha = info$commit_sha, - maximize = info$maximize - ) - - # show warning when certain data is missing and return null? 
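The get_container_image() helper above yields either a registry image URL (when the component has a Docker engine) or a GitHub Packages search URL otherwise. A sketch of the Docker branch, again with made-up config values:

# Illustrative values only; the real fields come from the metric/method config.
config <- list(
  version = "1.0.0",
  links = list(docker_registry = "ghcr.io"),
  package_config = list(
    organization = "openproblems-bio",
    name = "task_batch_integration"
  ),
  build_info = list(config = "/tmp/viash/src/metrics/asw/config.vsh.yaml")
)
paste0(
  "https://",
  config$links$docker_registry, "/",
  config$package_config$organization, "/",
  config$package_config$name, "/",
  config$build_info$config |>
    stringr::str_remove(".*/src/") |>
    stringr::str_remove("/config.vsh.yaml"),
  ":",
  config$version
)
# -> "https://ghcr.io/openproblems-bio/task_batch_integration/metrics/asw:1.0.0"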
- for (n in names(out)) { - if (is.null(out[[n]])) { - out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) - stop("missing value for value '", n, "' in ", out_as_str) - } - } - - # return output - out - } - ) -}) +################################################################################ +# MAIN SCRIPT +################################################################################ + +cat("====== Get metric info ======\n") + +`%||%` <- rlang::`%||%` + +cat("\n>>> Reading input files...\n") +cat("Reading metric info from '", par$input, "'...\n", sep = "") +metric_configs <- yaml::yaml.load_file(par$input) -outputs <- unlist(outputs, recursive = FALSE) +cat( + "\n>>> Processing ", + length(metric_configs), + " metric configs...\n", + sep = "" +) +bibliography <- read_bibliography( + file.path(meta$resources_dir, "bibliography.bib") +) +metric_info_json <- purrr::map(metric_configs, function(.config) { + if (.config$status == "disabled") { + cat("Skipping disabled metric component '", .config$name, "'\n", sep = "") + return(NULL) + } else { + cat("Processing metric component '", .config$name, "'\n", sep = "") + } + + purrr::map(.config$info$metrics, function(.metric) { + list( + name = jsonlite::unbox(.metric$name), + label = jsonlite::unbox(.metric$label), + commit = jsonlite::unbox( + .config$build_info$git_commit %||% "missing-sha" + ), + summary = .metric$summary |> + stringr::str_trim() |> + stringr::str_remove_all('(^"|"$|^\'|\'$)') |> + jsonlite::unbox(), + description = .metric$description |> + stringr::str_trim() |> + stringr::str_remove_all('(^"|"$|^\'|\'$)') |> + jsonlite::unbox(), + maximize = jsonlite::unbox(.metric$maximize), + link_implementation = jsonlite::unbox(get_implementation_url(.config)), + link_container_image = jsonlite::unbox(get_container_image(.config)), + component_name = jsonlite::unbox(.config$name), + authors = get_authors_list(.metric$authors), + references = get_references_list(.metric$references, bibliography), + additional_info = c( + get_additional_info( + .config$info, + exclude = c("metrics", "type", "type_info"), + name_prefix = "component_" + ), + get_additional_info( + .metric, + exclude = c( + "name", + "label", + "summary", + "description", + "maximize", + "min", + "max", + "links", + "authors", + "references" + ) + ) + ), + version = jsonlite::unbox(.config$version) + ) + }) +}) |> + purrr::list_flatten() +cat("\n>>> Writing output files...\n") +cat("Writing task info to '", par$output, "'...\n", sep = "") jsonlite::write_json( - outputs, + metric_info_json, par$output, - auto_unbox = TRUE, - pretty = TRUE -) \ No newline at end of file + pretty = TRUE, + null = "null" +) + +cat("\n>>> Validating output against schema...\n") +results_schemas <- file.path(meta$resources_dir, "schemas", "results_v4") +ajv_args <- paste( + "validate", + "--spec draft2020", + "-s", + file.path(results_schemas, "metric_info.json"), + "-r", + file.path(results_schemas, "core.json"), + "-d", + par$output +) + +cat("Running validation command:", "ajv", ajv_args, "\n") +cat("Output:\n") +validation_result <- system2("ajv", ajv_args) + +if (validation_result == 0) { + cat("JSON validation passed successfully!\n") +} else { + cat("JSON validation failed!\n") + stop("Output JSON does not conform to schema") +} + +cat("\n>>> Done!\n") diff --git a/src/reporting/get_results/config.vsh.yaml b/src/reporting/get_results/config.vsh.yaml index 3b5f7c6eb..20477062c 100644 --- a/src/reporting/get_results/config.vsh.yaml +++ 
b/src/reporting/get_results/config.vsh.yaml @@ -1,63 +1,92 @@ name: get_results namespace: reporting -description: Extract execution info +description: Create a schema-compliant results JSON + argument_groups: - name: Inputs arguments: - name: --input_scores type: file - description: Scores file - example: resources_test/openproblems/task_results_v3/raw/score_uns.yaml - - name: --input_execution + description: Scores YAML file + required: true + example: resources_test/openproblems/task_results_v4/raw/score_uns.yaml + - name: --input_trace type: file - description: Nextflow log file - example: resources_test/openproblems/task_results_v3/raw/trace.txt + description: Nextflow trace file + required: true + example: resources_test/openproblems/task_results_v4/raw/trace.txt - name: --input_dataset_info type: file - description: Method info file - example: resources_test/openproblems/task_results_v3/processed/dataset_info.json + description: Dataset info JSON file + info: + format: + type: json + schema: /common/schemas/results_v4/dataset_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/dataset_info.json - name: --input_method_info type: file - description: Method info file - example: resources_test/openproblems/task_results_v3/processed/method_info.json + description: Method info JSON file + info: + format: + type: json + schema: /common/schemas/results_v4/method_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/method_info.json - name: --input_metric_info type: file - description: Metric info file - example: resources_test/openproblems/task_results_v3/processed/metric_info.json - - name: Outputs - arguments: - - name: --output_results - type: file - direction: output - description: Output json - default: results.json + description: Metric info JSON file info: format: type: json - # TODO: add schema - - name: --output_metric_execution_info + schema: /common/schemas/results_v4/metric_info.json + required: true + example: resources_test/openproblems/task_results_v4/processed/metric_info.json + + - name: Outputs + arguments: + - name: --output type: file direction: output - description: Output metric execution info - default: metric_execution_info.json + description: Output JSON file matching results schema + default: results.json info: format: type: json - # TODO: add schema + schema: /common/schemas/results_v4/results.json + example: resources_test/openproblems/task_results_v4/processed/results.json + resources: - type: r_script path: script.R + - path: /common/schemas + dest: schemas + test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/openproblems/task_results_v3 - dest: resources_test/openproblems/task_results_v3 + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + engines: - type: docker image: openproblems/base_r:1 setup: + - type: apt + packages: + - nodejs + - npm + - type: docker + run: npm install -g ajv-cli - type: r - cran: [ purrr, yaml, rlang, dplyr, tidyr, readr, lubridate, dynutils, processx ] + cran: + - dplyr + - lubridate + - purrr + - readr + - stringr + - tidyr + runners: - type: executable - type: nextflow diff --git a/src/reporting/get_results/script.R b/src/reporting/get_results/script.R index 6b4555665..82c2b1691 100644 --- a/src/reporting/get_results/script.R +++ b/src/reporting/get_results/script.R @@ -1,302 +1,342 @@ -requireNamespace("jsonlite", quietly = TRUE) 
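The rewritten script below turns each score_uns.yaml entry into rows of a tidy scores tibble (one row per metric, with dataset and method recycled). A hedged sketch with a single made-up entry, following the fields the script uses:

entry <- list(
  dataset_id = "cellxgene_census/immune_cell_atlas",
  method_id = "scvi",
  metric_ids = c("asw_batch", "asw_label"),
  metric_values = c(0.83, 0.61)
)
entry[c("dataset_id", "method_id", "metric_ids", "metric_values")] |>
  tibble::as_tibble() |>
  dplyr::rename(
    dataset_name = dataset_id,
    method_name = method_id,
    metric_name = metric_ids,
    metric_value = metric_values
  )
# -> two rows, one per metric, with the dataset/method columns recycled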
-requireNamespace("yaml", quietly = TRUE) -requireNamespace("dynutils", quietly = TRUE) -requireNamespace("readr", quietly = TRUE) -requireNamespace("lubridate", quietly = TRUE) -library(dplyr, warn.conflicts = FALSE) -library(tidyr, warn.conflicts = FALSE) -library(purrr, warn.conflicts = FALSE) -library(rlang, warn.conflicts = FALSE) - ## VIASH START -# raw_dir <- "resources_test/openproblems/task_results_v3/raw" -# processed_dir <- "resources_test/openproblems/task_results_v3/processed" -# raw_dir <- "/home/rcannood/workspace/openproblems-bio/task_perturbation_prediction/resources/results/run_2024-10-31_06-14-14" -# processed_dir <- "/home/rcannood/workspace/openproblems-bio/website/results/perturbation_prediction/data" -raw_dir <- "/home/rcannood/workspace/openproblems-bio/task_batch_integration/resources/results/run_2024-11-20_12-47-03" -processed_dir <- "/home/rcannood/workspace/openproblems-bio/website/results/batch_integration/data" +raw_dir <- "resources_test/openproblems/task_results_v4/raw" +processed_dir <- "resources_test/openproblems/task_results_v4/processed" par <- list( - # inputs + # Inputs input_scores = paste0(raw_dir, "/score_uns.yaml"), - input_execution = paste0(raw_dir, "/trace.txt"), + input_trace = paste0(raw_dir, "/trace.txt"), input_dataset_info = paste0(processed_dir, "/dataset_info.json"), input_method_info = paste0(processed_dir, "/method_info.json"), - input_method_configs = paste0(raw_dir, "/method_configs.yaml"), input_metric_info = paste0(processed_dir, "/metric_info.json"), - # outputs - output_results = paste0(processed_dir, "/results.json"), - output_metric_execution_info = paste0(processed_dir, "/metric_execution_info.json") + # Outputs + output = paste0(processed_dir, "/results.json") ) ## VIASH END -# --- helper functions --------------------------------------------------------- -cat("Loading helper functions\n") -parse_exit <- function(x) { - if (is.na(x) || x == "-") { - NA_integer_ - } else { - as.integer(x) - } -} -parse_duration <- function(x) { - if (is.na(x) || x == "-") { - NA_real_ - } else { - as.numeric(lubridate::duration(toupper(x))) - } +################################################################################ +# FUNCTIONS +################################################################################ + +parse_exit_code <- function(exit_codes) { + exit_codes <- as.integer(exit_codes) + # Set missing exit codes to -1 for "Unknown error" + exit_codes[is.na(exit_codes)] <- -1L + exit_codes } -parse_cpu <- function(x) { - if (is.na(x) || x == "-") { - NA_real_ - } else { - as.numeric(gsub(" *%", "", x)) - } + +parse_duration <- function(durations) { + durations |> + toupper() |> + lubridate::duration() |> + as.numeric() } -parse_size <- function(x) { - out <- - if (is.na(x) || x == "-") { - NA_integer_ - } else if (grepl("TB", x)) { - as.numeric(gsub(" *TB", "", x)) * 1024 * 1024 - } else if (grepl("GB", x)) { - as.numeric(gsub(" *GB", "", x)) * 1024 - } else if (grepl("MB", x)) { - as.numeric(gsub(" *MB", "", x)) - } else if (grepl("KB", x)) { - as.numeric(gsub(" *KB", "", x)) / 1024 - } else if (grepl("B", x)) { - as.numeric(gsub(" *B", "", x)) / 1024 / 1024 - } else { - NA_integer_ - } - as.integer(ceiling(out)) + +parse_cpu_pct <- function(cpu_pcts) { + cpu_pcts |> + stringr::str_remove(" *%") |> + as.numeric() } -# --- read input files --------------------------------------------------------- -cat("Reading input files\n") -# read scores -raw_scores <- - yaml::yaml.load_file(par$input_scores) %>% - map_df(function(x) { - 
tryCatch({ - as_tibble(as.data.frame( - x[c("dataset_id", "method_id", "metric_ids", "metric_values")] - )) - }, error = function(e) { - message("Encountered error while reading scores.\n Error: ", e$message, "\n Data: ", paste(paste0(names(x), "=", x), collapse = ", ")) - NULL - }) - }) +parse_memory <- function(memories) { + values <- memories |> + stringr::str_remove("[[:blank:][:alpha:]]+") |> + as.numeric() -# read metric info -dataset_info <- jsonlite::read_json(par$input_dataset_info, simplifyVector = TRUE) -method_info <- jsonlite::read_json(par$input_method_info, simplifyVector = TRUE) -metric_info <- jsonlite::read_json(par$input_metric_info, simplifyVector = TRUE) + units <- stringr::str_remove(memories, "[[:digit:]\\.[:blank:]]+") -# --- process scores and execution info ---------------------------------------- -cat("Processing scores and execution info\n") -scale_scores <- function(values, is_control, maximize) { - control_values <- values[is_control & !is.na(values)] - if (length(control_values) < 2) { - return(NA_real_) - } - - min_control_value <- min(control_values) - max_control_value <- max(control_values) + multipliers <- dplyr::case_when( + units == "TB" ~ 1024 * 1024, + units == "GB" ~ 1024, + units == "MB" ~ 1, + units == "KB" ~ 1 / 1024, + units == "B" ~ 1 / 1024 / 1024, + TRUE ~ NA + ) - if (min_control_value == max_control_value) { - return(NA_real_) - } + (values * multipliers) |> + ceiling() |> + as.integer() +} - scaled <- (values - min_control_value) / (max_control_value - min_control_value) +missing_to_empty <- function( + values, + mode = c("character", "numeric", "integer") +) { + mode <- match.arg(mode) - if (maximize) { - scaled + if (is.null(values) || (length(values) == 1 && is.na(values))) { + switch( + mode, + character = character(0), + numeric = numeric(0), + integer = integer(0) + ) } else { - 1 - scaled + values } } -aggregate_scores <- function(scaled_score) { - mean(pmin(1, pmax(0, scaled_score)) %|% 0) -} -scores <- raw_scores %>% - complete( - dataset_id, - method_id, - metric_ids, - fill = list(metric_values = NA_real_) - ) %>% - left_join(method_info %>% select(method_id, is_baseline), by = "method_id") %>% - left_join(metric_info %>% select(metric_ids = metric_id, maximize), by = "metric_ids") %>% - group_by(metric_ids, dataset_id) %>% - mutate(scaled_score = scale_scores(metric_values, is_baseline, maximize[[1]]) %|% 0) %>% - group_by(dataset_id, method_id) %>% - summarise( - metric_values = list(as.list(setNames(metric_values, metric_ids))), - scaled_scores = list(as.list(setNames(scaled_score, metric_ids))), - mean_score = aggregate_scores(scaled_score), - .groups = "drop" - ) - -# read execution info -# -> only keep the last execution of each process -input_execution <- readr::read_tsv(par$input_execution) |> - group_by(name) |> - mutate(num_runs = n()) |> - slice(which.max(submit)) |> - ungroup() +map_missing_to_empty <- function( + values_list, + mode = c("character", "numeric") +) { + purrr::map(values_list, missing_to_empty, mode = mode) +} -method_lookup <- map_dfr(method_info$method_id, function(method_id) { - regex <- paste0("(.*:", method_id, ":[^ ]*)") - name <- - input_execution$name[grepl(regex, input_execution$name)] |> - unique() - name_ <- name[!grepl(":publishStatesProc", name)] - tibble(method_id = method_id, name = name_) -}) -dataset_lookup <- map_dfr(dataset_info$dataset_id, function(dataset_id) { - regex <- paste0(".*[(.](", dataset_id, ")[)./].*") - name <- - input_execution$name[grepl(regex, input_execution$name)] 
|> - unique() - tibble(dataset_id = dataset_id, name = name) -}) +################################################################################ +# MAIN SCRIPT +################################################################################ -# parse values -execution_info_ind <- input_execution |> - left_join(method_lookup, by = "name") |> - left_join(dataset_lookup, by = "name") |> - filter(!is.na(method_id)) %>% - rowwise() |> - mutate( - process_id = gsub(" .*", "", name), - submit = strptime(submit, "%Y-%m-%d %H:%M:%S"), - exit_code = parse_exit(exit), - duration_sec = parse_duration(realtime), - cpu_pct = parse_cpu(`%cpu`), - peak_memory_mb = parse_size(peak_vmem), - disk_read_mb = parse_size(rchar), - disk_write_mb = parse_size(wchar) - ) |> - ungroup() +cat("====== Get results ======\n") -execution_info <- execution_info_ind |> - group_by(dataset_id, method_id) |> - summarise( - resources = list(list( - submit = min(submit), - exit_code = max(exit_code), - duration_sec = sum(duration_sec), - cpu_pct = sum(cpu_pct * duration_sec) / sum(duration_sec), - peak_memory_mb = max(peak_memory_mb), - disk_read_mb = sum(disk_read_mb), - disk_write_mb = sum(disk_write_mb) - )), - .groups = "drop" +cat("\n>>> Reading input files...\n") +cat("Reading method info from '", par$input_method_info, "'...\n", sep = "") +method_info <- jsonlite::read_json(par$input_method_info) +cat("Reading dataset info from '", par$input_dataset_info, "'...\n", sep = "") +dataset_info <- jsonlite::read_json(par$input_dataset_info) +cat("Reading metric info from '", par$input_metric_info, "'...\n", sep = "") +metric_info <- jsonlite::read_json(par$input_metric_info) +cat("Reading scores from '", par$input_scores, "'...\n", sep = "") +scores <- yaml::yaml.load_file(par$input_scores) |> + purrr::map_dfr(\(.x) { + .x[c("dataset_id", "method_id", "metric_ids", "metric_values")] |> + tibble::as_tibble() + }) |> + dplyr::rename( + dataset_name = dataset_id, + method_name = method_id, + metric_name = metric_ids, + metric_value = metric_values ) - -# combine scores with execution info -# fill up missing entries with NAs and 0s -metric_ids <- unique(raw_scores$metric_ids) -rep_names <- function(val) { - setNames( - as.list(rep(val, length(metric_ids))), - metric_ids +cat("Reading execution trace from '", par$input_trace, "'...\n", sep = "") +method_names <- purrr::map_chr(method_info, "name") +metric_components <- unique(purrr::map_chr(metric_info, "component_name")) +trace <- readr::read_tsv( + par$input_trace, + col_types = readr::cols( + task_id = readr::col_integer(), + submit = readr::col_datetime(), + .default = readr::col_character(), + ), + na = c("", "-", "NA") +) |> + # Only keep the most recent run of each process + dplyr::group_by(name) |> + dplyr::slice_max(submit) |> + dplyr::ungroup() |> + # Separate process name and id + dplyr::mutate(name_copy = name) |> + tidyr::separate_wider_delim(name_copy, " ", names = c("process", "id")) |> + # Extract component from process name + dplyr::mutate( + component = purrr::map_chr(process, \(.process) { + rev(stringr::str_split(.process, ":")[[1]])[1] + }) + ) |> + dplyr::mutate(component = stringr::str_remove(component, "_process")) |> + # Only keep method and metric components + dplyr::filter( + component %in% method_names | component %in% metric_components + ) |> + dplyr::mutate(id = stringr::str_remove_all(id, "\\(|\\)")) |> + # Split ID into dataset, method, metric + tidyr::separate_wider_delim( + id, + delim = ".", + names = c("dataset_name", "method_name", 
"metric_component"), + too_few = "align_start" + ) |> + # Parse resources + dplyr::mutate( + run_exit_code = parse_exit_code(exit), + run_duration_secs = parse_duration(realtime), + run_cpu_pct = parse_cpu_pct(`%cpu`), + run_peak_memory_mb = parse_memory(peak_vmem), + run_disk_read_mb = parse_memory(rchar), + run_disk_write_mb = parse_memory(wchar) + ) |> + # Select columns + dplyr::select( + name, + process, + component, + dataset_name, + method_name, + metric_component, + tidyselect::starts_with("run_") ) -} -out <- full_join( - scores, - execution_info, - by = c("method_id", "dataset_id") -) %>% - rowwise() %>% - mutate( - task_id = par$task_id, - metric_values = list(metric_values %||% rep_names(NA_real_)), - scaled_scores = list(scaled_scores %||% rep_names(0)), - mean_score = mean_score %|% 0, - ) %>% - ungroup() +# Dataset names in the trace may have normalisations appended, map back to the name +dataset_names <- purrr::map_chr(dataset_info, "name") +process_datasets <- unique(trace$dataset_name) +dataset_map <- purrr::map_chr(process_datasets, function(.dataset) { + dataset_names[stringr::str_detect(.dataset, dataset_names)][1] +}) |> + purrr::set_names(process_datasets) +trace$dataset_name <- dataset_map[trace$dataset_name] -# --- process metric execution info -------------------------------------------- -cat("Processing metric execution info\n") - -# manually add component id to metric info -metric_info$component_name <- metric_info$component_name %||% rep(NA_character_, nrow(metric_info)) %|% - gsub(".*/([^/]*)/config\\.vsh\\.yaml", "\\1", metric_info$implementation_url) - -metric_lookup2 <- pmap_dfr(metric_info, function(metric_id, component_name, ...) { - regex <- paste0("(.*:", component_name, ":[^ ]*)") - name <- - input_execution$name[grepl(regex, input_execution$name)] |> - unique() - name_ <- name[!grepl(":publishStatesProc", name)] - tibble(metric_id = metric_id, component_name = component_name, name = name_) -}) -dataset_lookup2 <- map_dfr(dataset_info$dataset_id, function(dataset_id) { - regex <- paste0(".*[(.](", dataset_id, ")[)./].*") - name <- - input_execution$name[grepl(regex, input_execution$name)] |> - unique() - tibble(dataset_id = dataset_id, name = name) -}) -method_lookup2 <- map_dfr(method_info$method_id, function(method_id) { - regex <- paste0(".*[(.](", method_id, ")[)./].*") - name <- - input_execution$name[grepl(regex, input_execution$name)] |> - unique() - tibble(method_id = method_id, name = name) -}) +cat("\n>>> Extracting resources...\n") +cat("Extracting method resources...\n", sep = "") +method_resources <- trace |> + dplyr::filter(component %in% method_names) |> + dplyr::group_by(dataset_name, method_name) |> + dplyr::summarise( + run_exit_code = list(run_exit_code), + run_duration_secs = list(run_duration_secs), + run_cpu_pct = list(run_cpu_pct), + run_peak_memory_mb = list(run_peak_memory_mb), + run_disk_read_mb = list(run_disk_read_mb), + run_disk_write_mb = list(run_disk_write_mb), + .groups = "drop" + ) |> + dplyr::mutate( + succeeded = purrr::map_lgl(run_exit_code, ~ all(.x == 0)), + run_exit_code = map_missing_to_empty(run_exit_code, mode = "integer"), + run_duration_secs = map_missing_to_empty( + run_duration_secs, + mode = "numeric" + ), + run_cpu_pct = map_missing_to_empty(run_cpu_pct, mode = "numeric"), + run_peak_memory_mb = map_missing_to_empty( + run_peak_memory_mb, + mode = "numeric" + ), + run_disk_read_mb = map_missing_to_empty(run_disk_read_mb, mode = "numeric"), + run_disk_write_mb = map_missing_to_empty( + run_disk_write_mb, 
+ mode = "numeric" + ) + ) |> + dplyr::relocate(succeeded, .after = method_name) -metric_execution_info_ind <- input_execution |> - left_join(metric_lookup2, by = "name") |> - left_join(dataset_lookup2, by = "name") |> - left_join(method_lookup2, by = "name") |> - filter(!is.na(metric_id)) %>% - rowwise() |> - mutate( - process_id = gsub(" .*", "", name), - submit = strptime(submit, "%Y-%m-%d %H:%M:%S"), - exit_code = parse_exit(exit), - duration_sec = parse_duration(realtime), - cpu_pct = parse_cpu(`%cpu`), - peak_memory_mb = parse_size(peak_vmem), - disk_read_mb = parse_size(rchar), - disk_write_mb = parse_size(wchar) +cat("Extracting metric resources...\n", sep = "") +metric_resources <- trace |> + dplyr::filter(component %in% metric_components) |> + dplyr::group_by(dataset_name, method_name, metric_component) |> + dplyr::summarise( + run_exit_code = list(run_exit_code), + run_duration_secs = list(run_duration_secs), + run_cpu_pct = list(run_cpu_pct), + run_peak_memory_mb = list(run_peak_memory_mb), + run_disk_read_mb = list(run_disk_read_mb), + run_disk_write_mb = list(run_disk_write_mb), + .groups = "drop" ) |> - ungroup() + dplyr::mutate( + succeeded = purrr::map_lgl(run_exit_code, ~ all(.x == 0)), + run_exit_code = map_missing_to_empty(run_exit_code, mode = "integer"), + run_duration_secs = map_missing_to_empty( + run_duration_secs, + mode = "numeric" + ), + run_cpu_pct = map_missing_to_empty(run_cpu_pct, mode = "numeric"), + run_peak_memory_mb = map_missing_to_empty( + run_peak_memory_mb, + mode = "numeric" + ), + run_disk_read_mb = map_missing_to_empty(run_disk_read_mb, mode = "numeric"), + run_disk_write_mb = map_missing_to_empty( + run_disk_write_mb, + mode = "numeric" + ) + ) |> + dplyr::relocate(succeeded, .after = method_name) -metric_execution_info <- metric_execution_info_ind |> - group_by(dataset_id, method_id, metric_component_name = component_name) |> - summarise( - resources = list(list( - submit = min(submit), - exit_code = max(exit_code), - duration_sec = sum(duration_sec), - cpu_pct = sum(cpu_pct * duration_sec) / sum(duration_sec), - peak_memory_mb = max(peak_memory_mb), - disk_read_mb = sum(disk_read_mb), - disk_write_mb = sum(disk_write_mb) - )), +cat("\n>>> Summarising results...\n") +metric_component_names <- purrr::map_chr(metric_info, "component_name") +metric_component_map <- purrr::map_chr(metric_info, "name") |> + purrr::set_names(metric_component_names) +results <- scores |> + # There shouldn't be any but remove missing/NaN values just in case + dplyr::filter( + !is.na(metric_value) & is.finite(metric_value) + ) |> + dplyr::arrange(dataset_name, method_name, metric_name) |> + dplyr::group_by(dataset_name, method_name) |> + dplyr::summarise( + metric_names = list(metric_name), + metric_values = list(metric_value), .groups = "drop" + ) |> + dplyr::full_join(method_resources, by = c("dataset_name", "method_name")) |> + dplyr::mutate( + metric_components = purrr::map2( + dataset_name, + method_name, + function(.dataset, .method) { + metric_resources |> + dplyr::filter( + dataset_name == .dataset, + method_name == .method + ) |> + dplyr::mutate( + metric_names = purrr::map(metric_component, function(.component) { + metric_component_map[names(metric_component_map) == .component] + }) + ) |> + dplyr::select( + component_name = metric_component, + metric_names, + succeeded, + tidyselect::starts_with("run_") + ) + } + ) + ) |> + # TODO: Add these once available in output + dplyr::mutate( + paramset_name = NA, + paramset = NA + ) |> + dplyr::mutate( + metric_names = 
map_missing_to_empty(metric_names, mode = "character"), + metric_values = map_missing_to_empty(metric_values, mode = "numeric") + ) |> + dplyr::select( + dataset_name, + method_name, + paramset_name, + paramset, + succeeded, + tidyselect::starts_with("run_"), + metric_names, + metric_values, + metric_components ) +dplyr::glimpse(results) -# --- write output files ------------------------------------------------------- -cat("Writing output files\n") -# write output files +cat("\n>>> Writing output files...\n") +cat("Writing results to '", par$output, "'...\n", sep = "") jsonlite::write_json( - purrr::transpose(out), - par$output_results, - auto_unbox = TRUE, - pretty = TRUE + results, + par$output, + pretty = TRUE, + null = "null", + na = "null" ) -jsonlite::write_json( - purrr::transpose(metric_execution_info), - par$output_metric_execution_info, - auto_unbox = TRUE, - pretty = TRUE + +cat("\n>>> Validating output against schema...\n") +results_schemas <- file.path(meta$resources_dir, "schemas", "results_v4") +ajv_args <- paste( + "validate", + "--spec draft2020", + "-s", + file.path(results_schemas, "results.json"), + "-d", + par$output ) + +cat("Running validation command:", "ajv", ajv_args, "\n") +cat("Output:\n") +validation_result <- system2("ajv", ajv_args) + +if (validation_result == 0) { + cat("JSON validation passed successfully!\n") +} else { + cat("JSON validation failed!\n") + stop("Output JSON does not conform to schema") +} + +cat("\n>>> Done!\n") diff --git a/src/reporting/get_task_info/config.vsh.yaml b/src/reporting/get_task_info/config.vsh.yaml index 0798159ee..408147d3e 100644 --- a/src/reporting/get_task_info/config.vsh.yaml +++ b/src/reporting/get_task_info/config.vsh.yaml @@ -1,35 +1,61 @@ name: get_task_info namespace: reporting -description: Extract task info -arguments: - - name: --input - type: file - description: A yaml file - required: true - example: resources_test/openproblems/task_results_v3/raw/task_info.yaml - - name: --output - type: file - direction: output - default: output.json - description: Output json - info: - format: - type: json - # TODO: add schema +description: Convert task info YAML to schema-compliant JSON + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: Task info YAML file + required: true + example: resources_test/openproblems/task_results_v4/raw/task_info.yaml + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + default: task_info.json + description: Output JSON file matching task info schema + info: + format: + type: json + schema: /common/schemas/results_v4/task_info.json + example: resources_test/openproblems/task_results_v4/processed/task_info.json + resources: - type: r_script path: script.R + - path: /src/reporting/shared/functions.R + dest: functions.R + - path: /common/schemas + dest: schemas + - path: /src/reporting/shared/bibliography.bib + dest: bibliography.bib + test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/openproblems/task_results_v3 - dest: resources_test/openproblems/task_results_v3 + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + engines: - type: docker image: openproblems/base_r:1 setup: + - type: apt + packages: + - nodejs + - npm + - type: docker + run: npm install -g ajv-cli - type: r - cran: [ purrr, yaml, rlang, processx ] + cran: + - bibtex + - purrr + - stringr + runners: - type: executable - type: 
nextflow diff --git a/src/reporting/get_task_info/script.R b/src/reporting/get_task_info/script.R index 5e22fe485..9f1b249ec 100644 --- a/src/reporting/get_task_info/script.R +++ b/src/reporting/get_task_info/script.R @@ -1,57 +1,83 @@ -requireNamespace("jsonlite", quietly = TRUE) -requireNamespace("yaml", quietly = TRUE) -library(purrr, warn.conflicts = FALSE) -library(rlang, warn.conflicts = FALSE) - -## VIASH START +### VIASH START par <- list( - input = "resources_test/openproblems/task_results_v3/raw/task_info.yaml", - output = "resources_test/openproblems/task_results_v3/processed/task_info.json" + input = "resources_test/openproblems/task_results_v4/raw/task_info.yaml", + output = "task_info.json" ) ## VIASH END -info <- yaml::yaml.load_file(par$input) -# ↑ this could be used as the new format - -# construct v1 format -repo <- - if ("links" %in% names(info) && "repository" %in% names(info$links)) { - info$links$repository - } else if ("name" %in% names(info) && "organization" %in% names(info)) { - paste0(info$organization, "/", info$name) - } else { - "openproblems-bio/openproblems" - } -description <- - if ("motivation" %in% names(info)) { - paste0(info$motivation, "\n\n", info$description) - } else { - info$description - } -out <- list( - task_id = info$name, - commit_sha = NA_character_, - task_name = info$label, - task_summary = info$summary, - task_description = description, - repo = repo, - issue_tracker = info$links$issue_tracker %||% NA_character_, - authors = info$authors, - version = info$version, - license = info$license %||% NA_character_ +source(file.path(meta$resources_dir, "functions.R")) + +cat("====== Get task info ======\n") + +`%||%` <- rlang::`%||%` +cat("\n>>> Reading input files...\n") +cat("Reading task info from '", par$input, "'...\n", sep = "") +task_info_yaml <- yaml::read_yaml(par$input) + +cat("\n>>> Getting references...\n") +bibliography <- read_bibliography( + file.path(meta$resources_dir, "bibliography.bib") ) +references <- get_references_list(task_info_yaml$references, bibliography) +str(references) -# show warning when certain data is missing and return null? 
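read_bibliography() and get_references_list() come from the shared functions.R, which is not part of this diff. Assuming the helper is a thin wrapper around the bibtex package (added to the engine setup above), resolving reference IDs against the bundled bibliography could look roughly like this; treat it as a sketch, not the actual implementation:

bib <- bibtex::read.bib("bibliography.bib")  # bundled as a component resource
names(bib)        # citation keys, which the configs refer to as reference IDs
format(bib[[1]])  # plain-text rendering of the first entry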
-for (n in names(out)) { - if (is.null(out[[n]])) { - out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) - stop("missing value for value '", n, "' in ", out_as_str) - } -} +cat("\n>>> Getting authors...\n") +authors <- get_authors_list(task_info_yaml$authors) +cat("Found", length(authors), "authors\n") + +cat("\n>>> Creating JSON list...\n") +task_info_json <- list( + name = jsonlite::unbox(sub("^task_", "", task_info_yaml$name)), # Remove "task_" prefix + commit = jsonlite::unbox(NA_character_), # TODO: Add when available in task_info.yaml + label = jsonlite::unbox(task_info_yaml$label), + summary = task_info_yaml$summary |> + stringr::str_trim() |> + stringr::str_remove_all('(^"|"$|^\'|\'$)') |> + jsonlite::unbox(), + description = task_info_yaml$description |> + stringr::str_trim() |> + stringr::str_remove_all('(^"|"$|^\'|\'$)') |> + jsonlite::unbox(), + repository = jsonlite::unbox(task_info_yaml$links$repository), + authors = authors, + license = jsonlite::unbox(task_info_yaml$license), + references = references, + version = jsonlite::unbox(task_info_yaml$version), + is_prerelease = jsonlite::unbox(TRUE) +) +str(task_info_json) +cat("\n>>> Writing output files...\n") +cat("Writing task info to '", par$output, "'...\n", sep = "") jsonlite::write_json( - out, + task_info_json, par$output, - auto_unbox = TRUE, - pretty = TRUE + pretty = TRUE, + null = "null" ) + +cat("\n>>> Validating output against schema...\n") +results_schemas <- file.path(meta$resources_dir, "schemas", "results_v4") +ajv_args <- paste( + "validate", + "--spec draft2020", + "-s", + file.path(results_schemas, "task_info.json"), + "-r", + file.path(results_schemas, "core.json"), + "-d", + par$output +) + +cat("Running validation command:", "ajv", ajv_args, "\n") +cat("Output:\n") +validation_result <- system2("ajv", ajv_args) + +if (validation_result == 0) { + cat("JSON validation passed successfully!\n") +} else { + cat("JSON validation failed!\n") + stop("Output JSON does not conform to schema") +} + +cat("\n>>> Done!\n") diff --git a/src/reporting/process_task_results/config.vsh.yaml b/src/reporting/process_task_results/config.vsh.yaml index 60b687615..e1703bf52 100644 --- a/src/reporting/process_task_results/config.vsh.yaml +++ b/src/reporting/process_task_results/config.vsh.yaml @@ -1,84 +1,136 @@ name: process_task_results namespace: reporting -description: >- - This workflow transforms the meta information of the results into a format +description: | + This workflow summarises and collects the output from a task run in a format that can be used by the website. 
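One detail worth noting from the get_*_info scripts above: summaries and descriptions are trimmed and stripped of a single surrounding quote pair before being unboxed, so that YAML quoting does not leak into the JSON. In isolation (the string is a made-up example):

raw_summary <- "  \"Remove unwanted batch effects while retaining biological variation\"  "
raw_summary |>
  stringr::str_trim() |>
  stringr::str_remove_all('(^"|"$|^\'|\'$)') |>
  jsonlite::unbox()
# -> a scalar string without the surrounding quotes or padding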
+ argument_groups: - name: Inputs arguments: - - name: "--input_scores" + - name: "--input_task_info" type: file required: true direction: input - description: A yaml file containing the scores of each of the methods - example: score_uns.yaml - - name: "--input_method_configs" + description: A YAML file containing task information + example: resources_test/openproblems/task_results_v4/raw/task_info.yaml + - name: "--input_dataset_info" type: file required: true direction: input - example: method_configs.yaml - - name: "--input_metric_configs" + description: A YAML file containing dataset information + example: resources_test/openproblems/task_results_v4/raw/dataset_info.yaml + - name: "--input_method_configs" type: file required: true direction: input - example: metric_configs.yaml - - name: "--input_dataset_info" + description: A YAML file containing method configurations + example: resources_test/openproblems/task_results_v4/raw/method_configs.yaml + - name: "--input_metric_configs" type: file required: true direction: input - example: dataset_info.yaml - - name: "--input_execution" + description: A YAML file containing metric configurations + example: resources_test/openproblems/task_results_v4/raw/metric_configs.yaml + - name: "--input_scores" type: file required: true direction: input - example: trace.txt - - name: "--input_task_info" + description: A YAML file containing the scores of each of the methods + example: resources_test/openproblems/task_results_v4/raw/score_uns.yaml + - name: "--input_trace" type: file required: true direction: input - example: task_info.yaml + description: Nextflow execution trace file + example: resources_test/openproblems/task_results_v4/raw/trace.txt + - name: Outputs arguments: - - name: "--output_scores" + - name: "--output_combined" type: file required: true direction: output - description: A yaml file containing the scores of each of the methods - default: results.json - - name: "--output_method_info" + description: Combined task results JSON file + default: combined_output.json + info: + format: + type: json + schema: /common/schemas/results_v4/task_results.json + - name: "--output_report" type: file required: true direction: output - default: method_info.json - - name: "--output_metric_info" + description: HTML run report + default: report.html + info: + format: + type: html + - name: "--output_task_info" type: file required: true direction: output - default: metric_info.json + description: Task info JSON file + default: task_info.json + info: + format: + type: json + schema: /common/schemas/results_v4/task_info.json - name: "--output_dataset_info" type: file required: true direction: output + description: Dataset info JSON file default: dataset_info.json - - name: "--output_task_info" + info: + format: + type: json + schema: /common/schemas/results_v4/dataset_info.json + - name: "--output_method_info" type: file required: true direction: output - default: task_info.json - - name: "--output_qc" + description: Method info JSON file + default: method_info.json + info: + format: + type: json + schema: /common/schemas/results_v4/method_info.json + - name: "--output_metric_info" type: file required: true direction: output - default: quality_control.json - - name: "--output_metric_execution_info" + description: Metric info JSON file + default: metric_info.json + info: + format: + type: json + schema: /common/schemas/results_v4/metric_info.json + - name: "--output_results" type: file required: true direction: output - default: metric_execution_info.json 
+ description: Results JSON file + default: results.json + info: + format: + type: json + schema: /common/schemas/results_v4/results.json + - name: "--output_quality_control" + type: file + required: true + direction: output + description: Quality control JSON file + default: quality_control.json + info: + format: + type: json + schema: /common/schemas/results_v4/quality_control.json + resources: - type: nextflow_script path: main.nf entrypoint: run_wf + dependencies: - name: reporting/get_results - name: reporting/get_method_info @@ -86,5 +138,8 @@ dependencies: - name: reporting/get_dataset_info - name: reporting/get_task_info - name: reporting/generate_qc + - name: reporting/combine_output + - name: reporting/render_report + runners: - - type: nextflow \ No newline at end of file + - type: nextflow diff --git a/src/reporting/process_task_results/main.nf b/src/reporting/process_task_results/main.nf index fc85605ff..1fc64f389 100644 --- a/src/reporting/process_task_results/main.nf +++ b/src/reporting/process_task_results/main.nf @@ -1,3 +1,10 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + workflow run_wf { take: input_ch @@ -18,62 +25,81 @@ workflow run_wf { [id, state + ["task_id": task_id]] } + | get_dataset_info.run( + fromState: [ + "input": "input_dataset_info", + ], + toState: ["output_dataset": "output"] + ) + | get_method_info.run( - fromState: [ + fromState: [ "input": "input_method_configs", ], toState: ["output_method": "output"] ) | get_metric_info.run( - fromState: [ + fromState: [ "input": "input_metric_configs", ], toState: ["output_metric": "output"] ) - | get_dataset_info.run( - fromState: [ - "input": "input_dataset_info", - ], - toState: ["output_dataset": "output"] - ) - | get_results.run( - fromState: [ + fromState: [ "input_scores": "input_scores", - "input_execution": "input_execution", + "input_trace": "input_trace", "input_dataset_info": "output_dataset", "input_method_info": "output_method", "input_metric_info": "output_metric" ], toState: [ - "output_results": "output_results", - "output_metric_execution_info": "output_metric_execution_info" + "output_results": "output" ] ) | generate_qc.run( fromState: [ - "task_info": "output_task", - "method_info": "output_method", - "metric_info": "output_metric", - "dataset_info": "output_dataset", - "results": "output_results" + "input_task_info": "output_task", + "input_dataset_info": "output_dataset", + "input_method_info": "output_method", + "input_metric_info": "output_metric", + "input_results": "output_results" ], toState: ["output_qc": "output"] ) + | combine_output.run( + fromState: [ + "input_task_info": "output_task", + "input_quality_control": "output_qc", + "input_metric_info": "output_metric", + "input_method_info": "output_method", + "input_dataset_info": "output_dataset", + "input_results": "output_results" + ], + toState: ["output_combined": "output"] + ) + + | render_report.run( + fromState: [ + "input_task_results": "output_combined" + ], + toState: ["output_report": "output"] + ) + | setState([ - "output_scores": "output_results", + "output_combined": "output_combined", + "output_report": "output_report", + "output_task_info": "output_task", + "output_dataset_info": "output_dataset", "output_method_info": "output_method", "output_metric_info": "output_metric", - "output_dataset_info": "output_dataset", - "output_task_info": "output_task", - "output_qc": "output_qc", - "output_metric_execution_info": "output_metric_execution_info" + 
"output_results": "output_results", + "output_quality_control": "output_qc" ]) emit: output_ch -} \ No newline at end of file +} diff --git a/src/reporting/render_report/config.vsh.yaml b/src/reporting/render_report/config.vsh.yaml new file mode 100644 index 000000000..8864c7978 --- /dev/null +++ b/src/reporting/render_report/config.vsh.yaml @@ -0,0 +1,82 @@ +name: render_report +namespace: reporting +description: Render a HTML report summarizing the results + +argument_groups: + - name: Inputs + arguments: + - name: --input_task_results + type: file + description: Combined task results JSON file + info: + format: + type: json + schema: /common/schemas/results_v4/combined_output.json + required: true + example: resources_test/openproblems/task_results_v4/processed/combined_output.json + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + description: HTML report file + default: report.html + info: + format: + type: html + +resources: + - type: r_script + path: script.R + - path: /src/reporting/render_report/report-template.qmd + dest: report.qmd + - path: /src/reporting/render_report/report-functions.R + dest: functions.R + - path: /src/reporting/render_report/logo.svg + dest: logo.svg + +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: docker + run: | + export QUARTO_VERSION="1.7.32" && \ + mkdir -p /opt/quarto/${QUARTO_VERSION} && \ + wget -O quarto.tar.gz "https://github.com/quarto-dev/quarto-cli/releases/download/v${QUARTO_VERSION}/quarto-${QUARTO_VERSION}-linux-amd64.tar.gz" && \ + tar -zxvf quarto.tar.gz -C "/opt/quarto/${QUARTO_VERSION}" --strip-components=1 && \ + ln -s /opt/quarto/${QUARTO_VERSION}/bin/quarto /usr/local/bin/quarto && \ + rm quarto.tar.gz + - type: r + cran: + - commonmark + - dplyr + - funkyheatmap + - ggplot2 + - here + - htmltools + - jsonlite + - knitr + - patchwork + - purrr + - quarto + - rcrossref + - reactable + - scales + - stringr + - tibble + - tidyr + - xfun + +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowtime, lowcpu] diff --git a/src/reporting/render_report/logo.svg b/src/reporting/render_report/logo.svg new file mode 100644 index 000000000..70228226e --- /dev/null +++ b/src/reporting/render_report/logo.svg @@ -0,0 +1,54 @@ + + + + + + + diff --git a/src/reporting/render_report/report-functions.R b/src/reporting/render_report/report-functions.R new file mode 100644 index 000000000..fcb3fea28 --- /dev/null +++ b/src/reporting/render_report/report-functions.R @@ -0,0 +1,549 @@ +# Tables ---- + +#' Get authors table +#' +#' @param authors Authors list from results JSON +#' +#' @returns A `reactable` table containing the authors +get_authors_table <- function(authors) { + authors_data <- purrr::map_dfr(authors, function(.author) { + other_info <- purrr::map_chr(names(.author$info), \(.info) { + paste(.info, toString(.author$info[[.info]]), sep = ": ") + }) |> + paste(collapse = ", ") + + data.frame( + name = .author$name, + roles = paste(.author$roles, collapse = ", "), + github = .author$github %||% NA_character_, + orcid = .author$orcid %||% NA_character_, + info = other_info + ) + }) + + colnames(authors_data) <- stringr::str_to_sentence(colnames(authors_data)) + reactable::reactable( + authors_data, + columns = list( + Roles = 
reactable::colDef(name = "Roles"), + Github = reactable::colDef( + name = "GitHub", + cell = function(value, index, column) { + if (!is.na(value)) { + paste0("", value, "") + } else { + "" + } + }, + style = list("font-family" = "monospace"), + html = TRUE + ), + Orcid = reactable::colDef( + name = "ORCiD", + cell = function(value, index, column) { + if (!is.na(value)) { + paste0("", value, "") + } else { + "" + } + }, + html = TRUE + ) + ), + striped = TRUE, + sortable = FALSE + ) +} + +#' Get references table +#' +#' @param references References list from results JSON +#' +#' @returns A `reactable` table containing the references +#' +#' @details +#' Information for DOI references is retrieved from CrossRef. BibTeX references +#' are formatted as code and ID references are shown as IDs. +get_references_table <- function(references) { + if (all(c("doi", "bibtex") %in% names(references))) { + references_df <- data.frame( + reference_type = character(0), + reference = character(0) + ) + + dois <- references$doi + if (!(is.null(dois) || length(dois) == 0)) { + doi_strs <- unlist(rcrossref::cr_cn(references$doi, format = "text")) + references_df <- dplyr::bind_rows( + references_df, + data.frame( + reference_type = "DOI", + reference = doi_strs + ) + ) + } + + bibtex <- references$bibtex + if (!(is.null(bibtex) || length(bibtex) == 0)) { + bibtex_strs <- purrr::map_chr(bibtex, function(.bibtex) { + prettify_bibtex(.bibtex, output = "html") + }) + references_df <- dplyr::bind_rows( + references_df, + data.frame( + reference_type = "BibTeX", + reference = bibtex_strs + ) + ) + } + } else { + references_df <- data.frame( + reference_type = "ID", + reference = unlist(references) + ) + } + + reactable::reactable( + references_df, + columns = list( + reference = reactable::colDef( + name = "References", + cell = function(value, index, column) { + reference_type <- references_df$reference_type[[index]] + + if (reference_type == "ID") { + paste("ID:", value) + } else { + value + } + }, + style = function(value, row) { + reference_type <- references_df$reference_type[[row]] + + if (reference_type == "BibTeX") { + list("font-family" = "monospace") + } else if (reference_type == "ID") { + list("font-family" = "monospace") + } + }, + html = TRUE + ), + reference_type = reactable::colDef(show = FALSE) + ), + striped = TRUE, + sortable = FALSE + ) +} + +#' Get source table +#' +#' @param details_df A data frame containing details +#' @param source_columns A character vector of column names to include in the +#' source table +#' +#' @returns A `reactable` table containing the source information +#' +#' @details +#' The source columns are formatted as monospace text +get_source_table <- function(details_df, source_columns) { + source_df <- details_df[, source_columns, drop = FALSE] + + reactable::reactable( + source_df, + columns = purrr::map(names(source_columns), function(.label) { + reactable::colDef( + name = .label, + style = list("font-family" = "monospace") + ) + }) |> + purrr::set_names(source_columns), + sortable = FALSE + ) +} + + +#' Get description table +#' +#' @param description_df A data frame containing the description information +#' +#' @returns A `reactable` table containing the description +#' +#' @details +#' The description Markdown is rendered as HTML +get_description_table <- function(description_df) { + reactable::reactable( + description_df, + columns = list( + description = reactable::colDef( + name = "Description", + cell = function(value) { + commonmark::markdown_html(value) + }, 
+ html = TRUE + ) + ), + sortable = FALSE + ) +} + +#' Get links table +#' +#' @param details_df A data frame containing details +#' @param link_columns A character vector of column names to include in the +#' links table +#' +#' @returns A `reactable` table containing the links +#' +#' @details +#' The link columns are formatted as HTML links +get_links_table <- function(details_df, link_columns) { + links_df <- details_df[, link_columns, drop = FALSE] + + reactable::reactable( + links_df, + columns = purrr::map(names(link_columns), function(.label) { + reactable::colDef( + name = .label, + cell = format_html_link, + html = TRUE + ) + }) |> + purrr::set_names(link_columns), + sortable = FALSE + ) +} + +#' Get additional information table +#' +#' @param additional_info A list containing additional information to display in +#' a table +#' +#' @returns A `reactable` table containing the additional information or a HTML +#' div with a message +#' +#' @details +#' Nicer heading are created from the column names, otherwise values are shown +#' as given. The additional information can contain any fields so we cannot +#' handle them specifically. +#' +#' If there are is no additional information, a div containing a message is +#' returned. A message is also returned if the additional information fails to +#' render. +get_additional_info_table <- function(additional_info) { + if (is.null(additional_info) || length(additional_info) == 0) { + return(htmltools::div( + "No additional information found", + style = "padding: 0.5rem" + )) + } + + tryCatch( + { + additional_data <- additional_info |> + purrr::map(\(.x) { + paste(.x, collapse = ", ") + }) |> + as.data.frame() + + colnames(additional_data) <- colnames(additional_data) |> + stringr::str_replace_all("_", " ") |> + stringr::str_to_sentence() + + reactable::reactable( + additional_data + ) + }, + error = function(e) { + htmltools::div( + paste( + "Additional information failed to render with error: ", + e$message + ), + style = "padding: 0.5rem", + ) + } + ) +} + +#' Get quality control table +#' +#' @param qc_df A data frame containing quality control information +#' +#' @returns A `reactable` table containing the quality control checks +get_qc_table <- function(qc_df) { + reactable::reactable( + qc_df[, c("label", "severity")], + + columns = list( + label = reactable::colDef(name = "Check"), + severity = reactable::colDef( + name = "Severity", + cell = function(value) { + switch(value, "1" = "❌", "2" = "❌❌", "3" = "❌❌❌", ) + } + ) + ), + + details = function(index, column) { + details_df <- qc_df[index, , drop = FALSE] + + details_table <- reactable::reactable( + details_df[, c("value", "condition", "severity_value")], + columns = list( + value = reactable::colDef( + name = "Value", + format = reactable::colFormat(digits = 2), + width = 100 + ), + condition = reactable::colDef( + name = "Condition", + style = list("font-family" = "monospace") + ), + severity_value = reactable::colDef( + name = "Severity value", + format = reactable::colFormat(digits = 2) + ) + ), + sortable = FALSE + ) + + message_table <- reactable::reactable( + details_df[, "message", drop = FALSE], + columns = list( + message = reactable::colDef( + name = "Message", + cell = function(value) { + stringr::str_replace_all( + value, + "\n", + "
" + ) + }, + html = TRUE + ) + ), + sortable = FALSE + ) + + htmltools::div( + style = "padding: 1rem", + details_table, + message_table + ) + }, + + striped = TRUE, + highlight = TRUE, + defaultSorted = "severity", + defaultSortOrder = "desc", + defaultPageSize = 25, + showPageSizeOptions = TRUE, + + rowStyle = reactable::JS( + "function(rowInfo) { + return { + borderLeft: '2px solid #104E8B', + fontWeight: 400 + } + }" + ) + ) +} + +# Plotting ---- + +#' Plot scaling +#' +#' @param complete_scores A long data frame containing all scaled metric scores +#' @param sel_metric The metric to plot +#' @param method_details A data frame containing method details +#' @param metric_details A data frame containing metric details +#' +#' @returns A `ggplot` object showing the scaling of the selected metric +#' +#' @details +#' Creates a normalization plot showing scaling of metric values, highlighting +#' values outside the [0, 1] range. A main panel shows all datasets and a +#' secondary panel is faceted by dataset. +plot_scaling <- function( + complete_scores, + sel_metric, + method_details, + metric_details +) { + plot_data <- complete_scores |> + dplyr::filter(metric == sel_metric) |> + dplyr::mutate( + method = factor( + method, + levels = method_details$method, + labels = method_details$method_label + ), + method_type = factor( + method_type, + levels = sort(unique(method_type)), + labels = sort(unique(method_type)) |> + stringr::str_replace_all("_", " ") |> + stringr::str_to_sentence() + ), + ) + + norm_plot <- ggplot2::ggplot( + plot_data, + ggplot2::aes(x = scaled_value, y = method) + ) + + ggplot2::annotate( + geom = "rect", + xmin = -Inf, + xmax = 0, + ymin = -Inf, + ymax = Inf, + fill = "red", + alpha = 0.1 + ) + + ggplot2::annotate( + geom = "rect", + xmin = 1, + xmax = Inf, + ymin = -Inf, + ymax = Inf, + fill = "red", + alpha = 0.1 + ) + + ggplot2::geom_vline( + xintercept = c(0, 1), + linetype = "dashed", + colour = "red" + ) + + ggplot2::geom_path(ggplot2::aes(group = dataset)) + + ggplot2::geom_point(ggplot2::aes(colour = method_type)) + + ggplot2::scale_y_discrete(limits = rev) + + ggplot2::scale_colour_brewer(palette = "Set1") + + ggplot2::labs(x = "Scaled value") + + ggplot2::theme_minimal() + + ggplot2::theme( + panel.border = ggplot2::element_rect(fill = NA), + legend.position = "bottom", + legend.title = ggplot2::element_blank(), + axis.title.y = ggplot2::element_blank() + ) + + norm_facets <- norm_plot + + ggplot2::facet_wrap( + ~dataset, + scales = "free_x", + labeller = ggplot2::as_labeller( + \(.x) { + stringr::str_wrap(.x, width = 10, whitespace_only = FALSE) + } + ) + ) + + norm_panel <- patchwork::wrap_plots( + norm_plot + ggplot2::labs(title = "Overall"), + norm_facets + ggplot2::labs(title = "By dataset"), + ncol = 1, + guides = "collect" + ) & + ggplot2::theme(legend.position = "bottom") + + norm_panel + + patchwork::plot_annotation( + title = metric_details$metric_label[metric_details$metric == sel_metric], + ) +} + +# Formatting ---- + +#' Prettify BibTeX +#' +#' @param bibtex BibTeX string to prettify +#' @param output Output format, either "md" for Markdown or "html" for HTML +#' +#' @returns A prettified BibTeX string formatted for the specified output +prettify_bibtex <- function(bibtex, output = c("md", "html")) { + output <- match.arg(output) + + newline <- switch( + output, + md = "\n", + html = "
" + ) + + bibtex_str <- bibtex |> + stringr::str_squish() |> + stringr::str_replace(", ", paste0(",", newline, " ")) |> + stringr::str_replace_all("\\}, ", paste0("\\},", newline, " ")) |> + stringr::str_replace("\\s?\\}$", paste0(newline, "\\}")) + + if (output == "html") { + bibtex_str <- paste0("
", bibtex_str, "
") + } + + bibtex_str +} + +#' Format HTML link +#' +#' @param value The URL to format as an HTML link +#' +#' @returns A string containing the HTML link +format_html_link <- function(value) { + paste0("", value, "") +} + +#' Label memory +#' +#' @param x_mb A numeric vector of memory sizes in megabytes (MB) +#' @param include_mb A logical indicating whether to include label values less +#' than 1 GB +#' +#' @returns A character vector with memory labels +label_memory <- function(x_mb, include_mb = TRUE) { + dplyr::case_when( + is.na(x_mb) | x_mb < 0 ~ "NA", + x_mb < 1 ~ "<1M", + x_mb < 1e3 & !include_mb ~ "<1G", + x_mb < 1e3 ~ paste0(round(x_mb), "M"), + x_mb < 1e6 ~ paste0(round(x_mb / 1e3), "G"), + x_mb < 1e9 ~ paste0(round(x_mb / 1e6), "T"), + TRUE ~ ">1P" + ) +} + +#' Label time +#' +#' @param time A numeric vector of time values in seconds +#' +#' @returns A character vector with time labels +label_time <- function(time) { + dplyr::case_when( + is.na(time) | time < 0 ~ "NA", + time < 1e-5 ~ "0s", + time < 1 ~ "<1s", + time < 60 ~ paste0(floor(time), "s"), + time < 3600 ~ paste0(floor(time / 60), "m"), + time < 3600 * 24 ~ paste0(floor(time / 3600), "h"), + time < 3600 * 24 * 7 ~ paste0(floor(time / 3600 / 24), "d"), + TRUE ~ ">7d" + ) +} + +# Helpers ---- + +#' Aggregate scores +#' +#' @param scores A vector of scores to aggregate +#' +#' @returns An aggregated mean score +#' +#' @details +#' Values are restricted to between 0 and 1 and missing values are replaced by +#' 0. For use in creating the summary FunkyHeatmap +aggregate_scores <- function(scores) { + scores[is.na(scores)] <- 0 + scores[scores < 0] <- 0 + scores[scores > 1] <- 1 + + mean(scores, na.rm = TRUE) +} diff --git a/src/reporting/render_report/report-template.qmd b/src/reporting/render_report/report-template.qmd new file mode 100644 index 000000000..133d261b4 --- /dev/null +++ b/src/reporting/render_report/report-template.qmd @@ -0,0 +1,1217 @@ +--- +title: "Open Problems task run report" +date: today + +format: + html: + theme: cosmo + toc: true + toc-depth: 2 + embed-resources: true + grid: + body-width: 1000px + +brand: + logo: + medium: favicon.svg + + color: + palette: + black: "#1A1A1A" + white: "#FFFFFF" + blue: "#165AE3" + foreground: black + background: white + primary: blue + + typography: + fonts: + - family: DM Sans + source: google + - family: Plus Jakarta Sans + source: google + + base: DM Sans + headings: + family: Plus Jakarta Sans + weight: 400 + +lightbox: true +number-sections: true + +execute: + echo: false + +knitr: + opts_chunk: + out.width: "100%" + +params: + task_results_json: task_results.json + logo: logo.svg + functions: functions.R +--- + +```{r params, eval=FALSE} +params <- list( + task_results_json = "resources_test/openproblems/task_results_v4/processed/combined_output.json", + logo = "src/reporting/render_report/logo.svg", + functions = "src/reporting/render_report/report-functions.R" +) +``` + +```{r} +#| label: source +source(params$functions) + +`%||%` <- rlang::`%||%` +``` + +```{r} +#| label: load +task_results <- jsonlite::read_json( + params$task_results_json, + simplifyVector = FALSE, + simplifyDataFrame = FALSE +) +``` + +![](`r params$logo`){fig-align="center" width=80%} + +# Introduction + +This report displays and summarizes the output from an Open Problems task run. +You can use it to check the results before they are uploaded to the Open Problems website. 
+ +Please pay particular attention to **@sec-normalization Normalization** and **@sec-quality-control Quality control** to see if there are any issues with the task run. + +::: {.callout-caution} +The results in this report are preliminary and may be slightly different to what +are displayed in the final version on the Open Problems website +::: + +# Task information + +::: {.callout-note} +This section displays the task information as provided in the task `_viash.yaml` file +::: + +```{r} +#| label: task-info +task_info <- task_results$task_info +``` + +## Summary + +**Task:** `r task_info$label` + +`r task_info$summary` + +```{r} +#| label: task-repo_str +task_repo_str <- stringr::str_remove(task_info$repository, "https://github.com/") +``` + +**Repository:** [``r task_repo_str``](`r task_info$repository`) + +**License:** `r task_info$license` + +```{r} +#| label: task-info-version +task_version <- task_info$version +task_prerelease <- ifelse(task_info$is_prerelease, "(Pre-release)", "") +``` + +**Version:** ``r task_version`` `r task_prerelease` + +`r if(!is.null(task_info$commit)) paste0("**Commit:** \x60", task_info$commit, "\x60")` + +## Description + +`r task_info$description` + +## Authors + +```{r} +#| label: task-authors +get_authors_table(task_info$authors) +``` + +## References + +```{r} +#| label: task-references +get_references_table(task_info$references) +``` + +# Dataset information + +::: {.callout-note} +This section displays the dataset information as provided in the dataset `config.vsh.yaml` files. + +Expand each row of the table for more details. +::: + +```{r} +#| label: dataset-info +dataset_info <- task_results$dataset_info + +dataset_summary <- purrr::map_dfr(dataset_info, function(.dataset) { + data.frame( + dataset = .dataset$name, + label = .dataset$label, + summary = .dataset$summary + ) +}) + +dataset_details <- purrr::map_dfr(dataset_info, function(.dataset) { + data.frame( + description = .dataset$description, + modalities = paste(.dataset$modalities, collapse = ", "), + organisms = paste(.dataset$organisms, collapse = ", "), + file_size_mb = .dataset$file_size_mb, + commit = .dataset$commit, + source_url = .dataset$source_url, + common_dataset_names = paste(.dataset$common_dataset_names, collapse = ", "), + date_created = .dataset$date_created + ) +}) + +detail_columns <- c("modalities","organisms", "file_size_mb") +source_columns <- c("commit", "source_url", "common_dataset_names", "date_created") + +reactable::reactable( + dataset_summary, + columns = list( + dataset = reactable::colDef( + name = "Dataset", + style = list("font-family" = "monospace"), + html = TRUE + ), + label = reactable::colDef(name = "Label"), + summary = reactable::colDef(name = "Summary") + ), + + details = function(index, column) { + description_table <- get_description_table( + dataset_details[index, c("description"), drop = FALSE] + ) + + details_table <- reactable::reactable( + dataset_details[index, detail_columns], + columns = list( + modalities = reactable::colDef(name = "Modalities"), + organisms = reactable::colDef(name = "Organisms"), + file_size_mb = reactable::colDef( + name = "File size (MB)", + format = reactable::colFormat(digits = 2) + ) + ), + sortable = FALSE + ) + + source_table <- reactable::reactable( + dataset_details[index, source_columns], + columns = list( + commit = reactable::colDef( + name = "Commit", + style = list("font-family" = "monospace") + ), + source_url = reactable::colDef( + name = "Source URL", + cell = format_html_link, + html = TRUE + ), + 
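      # format_html_link() (defined in the report functions) turns the raw URL
      # into an HTML link, which is why html = TRUE is set on this column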
common_dataset_names = reactable::colDef(name = "Common datasets"), + date_created = reactable::colDef(name = "Date created") + ), + sortable = FALSE + ) + + if (length(dataset_info[[index]]$authors) > 0) { + authors_table <- get_authors_table(dataset_info[[index]]$authors) + } else { + authors_table <- NULL + } + + references_table <- get_references_table(dataset_info[[index]]$references) + + htmltools::div( + style = "padding: 1rem", + description_table, + details_table, + source_table, + authors_table, + references_table + ) + }, + + highlight = TRUE, + striped = TRUE, + pagination = FALSE, + + rowStyle = reactable::JS( + "function(rowInfo) { + if (rowInfo.level == 0) { // corresponds to row group + return { + borderLeft: '2px solid #104E8B', + fontWeight: 400 + } + } + }" + ) +) +``` + +# Method information + +::: {.callout-note} +This section displays the method information as provided in the method `config.vsh.yaml` files. + +Expand each row of the table for more details. +::: + +```{r} +#| label: method-info +method_info <- task_results$method_info + +method_summary <- purrr::map_dfr(method_info, function(.method) { + data.frame( + method = .method$name, + label = .method$label, + type = .method$type, + summary = .method$summary + ) +}) + +method_details <- purrr::map_dfr(method_info, function(.method) { + method_data <- purrr::map(.method, \(.x) {ifelse(is.null(.x), "", .x)}) + + data.frame( + description = method_data$description, + commit = method_data$commit, + version = method_data$version, + link_code = method_data$link_code, + link_documentation = method_data$link_documentation, + link_implementation = method_data$link_implementation, + link_container_image = method_data$link_container_image + ) +}) + +source_columns <- c( + "Commit" = "commit", + "Version" = "version" +) + +link_columns <- c( + "Code" = "link_code", + "Documentation" = "link_documentation", + "Implementation" = "link_implementation", + "Image" = "link_container_image" +) + +reactable::reactable( + method_summary, + columns = list( + method = reactable::colDef( + name = "Method", + style = list("font-family" = "monospace"), + html = TRUE + ), + label = reactable::colDef(name = "Label"), + type = reactable::colDef( + name = "Type", + cell = function(value) { + value |> + stringr::str_replace_all("_", " ") |> + stringr::str_to_sentence() + }, + ), + summary = reactable::colDef(name = "Summary") + ), + + details = function(index, column) { + description_table <- get_description_table( + method_details[index, c("description"), drop = FALSE] + ) + + source_table <- get_source_table(method_details[index, ], source_columns) + + links_table <- get_links_table(method_details[index, ], link_columns) + + additional_table <- get_additional_info_table( + method_info[[index]]$additional_info + ) + + if (length(method_info[[index]]$authors) > 0) { + authors_table <- get_authors_table(method_info[[index]]$authors) + } else { + authors_table <- NULL + } + + references_table <- get_references_table(method_info[[index]]$references) + + htmltools::div( + style = "padding: 1rem", + description_table, + source_table, + links_table, + additional_table, + authors_table, + references_table + ) + }, + + highlight = TRUE, + striped = TRUE, + pagination = FALSE, + + rowStyle = reactable::JS( + "function(rowInfo) { + if (rowInfo.level == 0) { // corresponds to row group + return { + borderLeft: '2px solid #104E8B', + fontWeight: 400 + } + } + }" + ) +) +``` + +# Metric information + +::: {.callout-note} +This section displays the metric 
information as provided in the metric `config.vsh.yaml` files. + +Expand each row of the table for more details. +::: + +```{r} +#| label: metric-info +metric_info <- task_results$metric_info + +metric_summary <- purrr::map_dfr(metric_info, function(.metric) { + data.frame( + metric = .metric$name, + label = .metric$label, + summary = .metric$summary + ) +}) + +metric_details <- purrr::map_dfr(metric_info, function(.metric) { + metric_data <- purrr::map(.metric, \(.x) {ifelse(is.null(.x), "", .x)}) + + data.frame( + description = metric_data$description, + component_name = metric_data$component_name, + commit = metric_data$commit, + version = metric_data$version, + maximize = metric_data$maximize, + link_implementation = metric_data$link_implementation, + link_container_image = metric_data$link_container_image + ) +}) + +source_columns <- c( + "Component" = "component_name", + "Commit" = "commit", + "Version" = "version", + "Maximize?" = "maximize" +) +link_columns <- c( + "Implementation" = "link_implementation", + "Image" = "link_container_image" +) + +reactable::reactable( + metric_summary, + columns = list( + metric = reactable::colDef( + name = "Metric", + style = list("font-family" = "monospace"), + html = TRUE + ), + label = reactable::colDef(name = "Label"), + summary = reactable::colDef(name = "Summary") + ), + + details = function(index, column) { + description_table <- get_description_table( + metric_details[index, c("description"), drop = FALSE] + ) + + source_table <- get_source_table(metric_details[index, ], source_columns) + + links_table <- get_links_table(metric_details[index, ], link_columns) + + additional_table <- get_additional_info_table( + metric_info[[index]]$additional_info + ) + + if (length(metric_info[[index]]$authors) > 0) { + authors_table <- get_authors_table(metric_info[[index]]$authors) + } else { + authors_table <- NULL + } + + references_table <- get_references_table(metric_info[[index]]$references) + + htmltools::div( + style = "padding: 1rem", + description_table, + source_table, + links_table, + additional_table, + authors_table, + references_table + ) + }, + + highlight = TRUE, + striped = TRUE, + pagination = FALSE, + + rowStyle = reactable::JS( + "function(rowInfo) { + if (rowInfo.level == 0) { // corresponds to row group + return { + borderLeft: '2px solid #104E8B', + fontWeight: 400 + } + } + }" + ) +) +``` + +# Normalization {#sec-normalization} + +::: {.callout-note} +This section displays the normalization information for each metric. +The scores for control methods are used to create a control range and scale the scores from other methods. +Points outside the control range indicate that a metric is lacking an appropriate control method. + +Click the tabs to see the plots for each metric. 
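As a rough illustration (hypothetical numbers, not taken from this run), each raw score is rescaled linearly against the range spanned by the control-method scores for the same dataset and metric:

```r
# Hypothetical example of the control-based scaling used in this section
control_min <- 0.20 # lowest control-method score for a dataset/metric pair
control_max <- 0.85 # highest control-method score for the same pair
raw_value <- 0.70   # raw score for a method being evaluated

(raw_value - control_min) / (control_max - control_min)
#> [1] 0.7692308
```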
+::: + +```{r} +#| label: normalization-controls +#| results: asis +dataset_names <- purrr::map_chr(dataset_info, "name") +method_names <- purrr::map_chr(method_info, "name") +metric_names <- purrr::map_chr(metric_info, "name") + +control_method_names <- method_summary$method[method_summary$type == "control_method"] + +n_controls <- purrr::map_dfr(task_results$results, function(.result) { + data.frame(dataset = .result$dataset_name, method = .result$method) +}) |> + dplyr::filter( + method %in% control_method_names + ) |> + dplyr::group_by(dataset) |> + dplyr::count(name = "n_controls") |> + dplyr::ungroup() |> + dplyr::mutate(dataset = factor(dataset, levels = dataset_names)) |> + tidyr::complete(dataset, fill = list(n_controls = 0)) + +has_controls <- all(n_controls$n_controls >= 2) + +if (isFALSE(has_controls)) { + out <- c("## No normalization performed") + + cat( + c( + "::: {.callout-important}", + "There are less than two control methods for some datasets.", + "**Scaling cannot be peformed and the results sections will be empty.**", + "See the quality control section for more information.", + "", + knitr::kable( + n_controls, + col.names = c("Dataset", "Number of control methods") + ), + ":::" + ), + sep = "\n" + ) +} else if (!all(n_controls$n_controls == length(control_method_names))) { + cat( + c( + "::: {.callout-warning}", + "Some control method results are missing for some datasets.", + "This may affect scaling and results.", + "See the normalization plots and the quality control section for more information.", + "", + knitr::kable( + n_controls, + col.names = c("Dataset", "Number of control methods") + ), + ":::" + ), + sep = "\n" + ) +} +``` + +```{r} +#| label: normalization +#| eval: !expr has_controls +dataset_details <- purrr::map_dfr(dataset_info, function(.dataset) { + data.frame( + dataset = .dataset$name, + dataset_label = .dataset$label + ) +}) |> + dplyr::arrange(dataset) + +method_details <- purrr::map_dfr(method_info, function(.method) { + data.frame( + method = .method$name, + method_label = .method$label, + method_type = .method$type + ) +}) |> + dplyr::arrange(method) + +metric_details <- purrr::map_dfr(metric_info, function(.metric) { + data.frame( + metric = .metric$name, + metric_label = .metric$label + ) +}) |> + dplyr::arrange(metric) + +scores <- purrr::map_dfr(task_results$results, function(.result) { + if (!.result$succeeded) { + return(NULL) + } + + data.frame( + dataset = .result$dataset_name, + method = .result$method, + metric = unlist(.result$metric_names), + value = unlist(.result$metric_values) + ) +}) + +control_ranges <- scores |> + dplyr::left_join(method_details, by = "method") |> + dplyr::filter(method_type == "control_method") |> + dplyr::group_by(dataset, metric) |> + dplyr::summarise( + control_min = min(value, na.rm = TRUE), + control_max = max(value, na.rm = TRUE), + .groups = "drop" + ) + +scaled_scores <- scores |> + dplyr::left_join(control_ranges, by = c("dataset", "metric")) |> + dplyr::mutate( + scaled_value = (value - control_min) / (control_max - control_min) + ) + +complete_scores <- tidyr::expand_grid( + dataset = dataset_names, + method = method_names, + metric = metric_names +) |> + dplyr::left_join(dataset_details, by = "dataset") |> + dplyr::relocate(method, metric, .after = dplyr::last_col()) |> + dplyr::left_join(method_details, by = "method") |> + dplyr::relocate(metric, .after = dplyr::last_col()) |> + dplyr::left_join(metric_details, by = "metric") |> + dplyr::left_join(scaled_scores, by = c("dataset", "method", 
"metric")) |> + tidyr::replace_na(list(scaled_value = 0)) |> + dplyr::arrange(dataset, method, metric) +``` + +::: {.panel-tabset} + +```{r} +#| label: normalization-plots +#| results: hide +#| eval: !expr has_controls +fig_height <- 0.8 * length(metric_names) + 1 + +src_list <- purrr::map(metric_names, function(.metric) { + metric_label <- metric_details$metric_label[metric_details$metric == .metric] + + src <- c( + "## <> {.unnumbered .unlisted}", + "", + "```{r normalization-plot-<<.metric>>, fig.height=<>}", + "plot_scaling(complete_scores, '<<.metric>>', method_details, metric_details)", + "```", + "" + ) + knitr::knit_expand(text = src, delim = c("<<", ">>")) +}) + +out <- knitr::knit_child(text = unlist(src_list), options = list(cache = FALSE)) +``` + +`r out` + +::: + +# Quality control {#sec-quality-control} + +::: {.callout-note} +This section displays quality control information about the task run. + +Click on the tabs to see each category of quality control checks and expand the rows to see more information. +::: + +```{r} +#| label: quality-control +quality_control <- task_results$quality_control |> + purrr::map_dfr(as.data.frame) |> + dplyr::mutate(check = ifelse(severity > 0, "failed", "passed")) + +qc_summary <- quality_control |> + dplyr::group_by(category, check) |> + dplyr::count(name = "amount") |> + dplyr::ungroup() |> + tidyr::spread(check, amount, fill = 0) |> + dplyr::mutate( + category = factor( + category, + levels = c( + "Task info", + "Dataset info", + "Method info", + "Metric info", + "Raw results", + "Scaling" + ) + ) + ) |> + tidyr::complete(category, fill = list(passed = 0, failed = 0)) + +reactable::reactable( + qc_summary, + columns = list( + category = reactable::colDef(name = "Category"), + passed = reactable::colDef(name = "Passed checks"), + failed = reactable::colDef(name = "Failed checks") + ), + sortable = FALSE, + striped = TRUE +) +``` + +::: {.panel-tabset} + +## Task information {.unnumbered .unlisted} + +```{r} +#| label: quality-control-task +quality_control |> + dplyr::filter(category == "Task info", check == "failed") |> + dplyr::select(-category, -check) |> + get_qc_table() +``` + +## Dataset information {.unnumbered .unlisted} + +```{r} +#| label: quality-control-datasets +quality_control |> + dplyr::filter(category == "Dataset info", check == "failed") |> + dplyr::select(-category, -check) |> + get_qc_table() +``` + +## Method information {.unnumbered .unlisted} + +```{r} +#| label: quality-control-methods +quality_control |> + dplyr::filter(category == "Method info", check == "failed") |> + dplyr::select(-category, -check) |> + get_qc_table() +``` + +## Metric information {.unnumbered .unlisted} + +```{r} +#| label: quality-control-metrics +quality_control |> + dplyr::filter(category == "Metric info", check == "failed") |> + dplyr::select(-category, -check) |> + get_qc_table() +``` + +## Raw results {.unnumbered .unlisted} + +```{r} +#| label: quality-control-results +quality_control |> + dplyr::filter(category == "Raw results", check == "failed") |> + dplyr::select(-category, -check) |> + get_qc_table() +``` + +## Scaling {.unnumbered .unlisted} + +```{r} +#| label: quality-control-scaling +quality_control |> + dplyr::filter(category == "Scaling", check == "failed") |> + dplyr::select(-category, -check) |> + get_qc_table() +``` + +::: + +# Results + +```{r} +#| label: results +#| eval: !expr has_controls +mean_scores <- complete_scores |> + dplyr::group_by(dataset, method) |> + dplyr::summarise( + mean_score = 
aggregate_scores(scaled_value), + .groups = "drop" + ) + +dataset_scores <- complete_scores |> + dplyr::select(dataset, method, metric, scaled_value) |> + tidyr::pivot_wider( + names_from = metric, + values_from = scaled_value + ) |> + dplyr::left_join(mean_scores, by = c("dataset", "method")) + +overall_scores <- dataset_scores |> + dplyr::group_by(method) |> + dplyr::summarise( + dataset = "overall", + dplyr::across( + tidyselect::where(is.numeric), + aggregate_scores + ), + .groups = "drop" + ) + +exit_names <- c( + "Memory limit exceeded", + "Time limit exceeded", + "Execution error", + "Unknown error", + "Not applicable", + "No error" +) + +exit_codes <- purrr::map_dfr(task_results$results, function(.result) { + data.frame( + dataset = .result$dataset_name, + method = .result$method + ) +}) |> + dplyr::mutate( + exit_codes = purrr::map(task_results$results, "run_exit_code") + ) |> + dplyr::mutate( + exit_codes = purrr::map(exit_codes, \(.codes) { + if (length(.codes) == 0) { + 0 + } else { + .codes + } + }) + ) |> + dplyr::group_by(method) |> + dplyr::summarise( + exit_codes = list(unlist(exit_codes)), + .groups = "drop" + ) |> + dplyr::mutate( + all_codes = purrr::map_chr(exit_codes, function(.codes) { + paste(.codes, collapse = ", ") + }), + pct_oom = purrr::map_dbl(exit_codes, function(.codes) { + mean(.codes == 137) + }), + pct_timeout = purrr::map_dbl(exit_codes, function(.codes) { + mean(.codes == 143) + }), + pct_error = purrr::map_dbl(exit_codes, function(.codes) { + mean(.codes > 0 & .codes != 137 & .codes != 143 & .codes != 99) + }), + pct_unknown = purrr::map_dbl(exit_codes, function(.codes) { + mean(.codes < 0) + }), + pct_na = purrr::map_dbl(exit_codes, function(.codes) { + mean(.codes == 99) + }), + pct_ok = purrr::map_dbl(exit_codes, function(.codes) { + mean(.codes == 0) + }), + ) |> + tidyr::nest(exit_summary = tidyselect::starts_with("pct")) |> + dplyr::mutate(exit_summary = purrr::map(exit_summary, \(.summary) { + exit_vec <- unlist(as.vector(.summary)) + names(exit_vec) <- exit_names + + exit_vec + })) + +resources <- purrr::map_dfr(task_results$results, function(.result) { + data.frame( + dataset = .result$dataset_name, + method = .result$method + ) +}) |> + dplyr::mutate( + run_duration_secs = purrr::map(task_results$results, "run_duration_secs"), + run_cpu_pct = purrr::map(task_results$results, "run_cpu_pct"), + run_peak_memory_mb = purrr::map(task_results$results, "run_peak_memory_mb"), + run_disk_read_mb = purrr::map(task_results$results, "run_disk_read_mb"), + run_disk_write_mb = purrr::map(task_results$results, "run_disk_write_mb") + ) |> + # Summarise per task + dplyr::mutate( + run_cpu_pct = purrr::map_dbl(run_cpu_pct, function(.values) { + if (length(.values) == 0) { + return(NA_real_) + } + + mean(unlist(.values), na.rm = TRUE) + }), + run_peak_memory_mb = purrr::map_dbl(run_peak_memory_mb, function(.values) { + if (length(.values) == 0) { + return(NA_real_) + } + + max(unlist(.values), na.rm = TRUE) + }), + run_disk_read_mb = purrr::map_dbl(run_disk_read_mb, function(.values) { + if (length(.values) == 0) { + return(NA_real_) + } + + sum(unlist(.values), na.rm = TRUE) + }), + run_disk_write_mb = purrr::map_dbl(run_disk_write_mb, function(.values) { + if (length(.values) == 0) { + return(NA_real_) + } + + sum(unlist(.values), na.rm = TRUE) + }), + run_duration_secs = purrr::map_dbl(run_duration_secs, function(.values) { + if (length(.values) == 0) { + return(NA_real_) + } + + sum(unlist(.values), na.rm = TRUE) + }) + ) |> + # Summarise by method + 
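  # (the -log10 transforms added after this summary flip the direction so that
  # lower resource usage ends up as a larger value after rescaling to [0, 1])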
dplyr::group_by(method) |> + dplyr::summarise( + mean_cpu_pct = mean(run_cpu_pct, na.rm = TRUE), + mean_peak_memory_mb = mean(run_peak_memory_mb, na.rm = TRUE), + mean_disk_read_mb = mean(run_disk_read_mb, na.rm = TRUE), + mean_disk_write_mb = mean(run_disk_write_mb, na.rm = TRUE), + mean_duration_secs = mean(run_duration_secs, na.rm = TRUE), + .groups = "drop" + ) |> + dplyr::mutate( + mean_peak_memory_mb_log = -log10(mean_peak_memory_mb), + mean_peak_memory_label = paste0(" ", label_memory(mean_peak_memory_mb), " "), + mean_disk_read_mb_log = -log10(mean_disk_read_mb), + mean_disk_read_label = paste0(" ", label_memory(mean_disk_read_mb), " "), + mean_disk_write_mb_log = -log10(mean_disk_write_mb), + mean_disk_write_label = paste0(" ", label_memory(mean_disk_write_mb), " "), + mean_duration_secs_log = -log10(mean_duration_secs), + mean_duration_label = paste0(" ", label_time(mean_duration_secs), " ") + ) +``` + +## Summary figure + +::: {.callout-note} +This is a static version of the main summary figure shown on the Open Problems website. + +Click on the image to expand it. +::: + +```{r} +#| label: results-figure +#| message: false +#| fig-width: 18 +#| fig-height: 16 +#| eval: !expr has_controls +figure_data <- overall_scores |> + dplyr::select(-dataset) |> + dplyr::relocate(mean_score, .after = method) |> + dplyr::left_join( + dplyr::select(exit_codes, method, exit_summary), + by = "method" + ) |> + dplyr::relocate(exit_summary, .after = mean_score) |> + # Fill in missing exit summaries for methods that were skipped + dplyr::mutate( + exit_summary = purrr::map(exit_summary, \(.summary) { + if (!is.null(.summary)) { + return(.summary) + } else { + rep(0, length(exit_names)) |> + setNames(exit_names) + } + }) + ) |> + dplyr::left_join( + mean_scores |> + dplyr::arrange(dataset) |> + tidyr::pivot_wider(names_from = "dataset", values_from = "mean_score"), + by = "method" + ) |> + dplyr::relocate( + tidyselect::all_of(dataset_details$dataset), + .after = exit_summary + ) |> + dplyr::left_join( + resources |> + dplyr::select( + method, + mean_cpu_pct, + mean_peak_memory_mb_log, + mean_peak_memory_label, + mean_disk_read_mb_log, + mean_disk_read_label, + mean_disk_write_mb_log, + mean_disk_write_label, + mean_duration_secs_log, + mean_duration_label + ), + by = "method" + ) |> + # Resources are not 0-1 so need to be rescaled + dplyr::mutate( + mean_cpu_pct = scales::rescale(mean_cpu_pct), + mean_peak_memory_mb_log = scales::rescale(mean_peak_memory_mb_log), + mean_disk_read_mb_log = scales::rescale(mean_disk_read_mb_log), + mean_disk_write_mb_log = scales::rescale(mean_disk_write_mb_log), + mean_duration_secs_log = scales::rescale(mean_duration_secs_log) + ) |> + dplyr::arrange(dplyr::desc(mean_score)) |> + dplyr::mutate( + method = factor( + method, + levels = method_details$method, + labels = method_details$method_label + ) + ) |> + dplyr::rename(id = method) + +column_info <- tibble::tibble( + id = colnames(figure_data), + name = c( + "Method", + "Overall score", + "Error reason", + dataset_details$dataset_label, + metric_details$metric_label, + "% CPU", + "Peak memory", + "", + "Disk read", + "", + "Disk write", + "", + "Duration", + "" + ), + geom = c( + "text", + "bar", + "pie", + rep("funkyrect", length(dataset_names)), + rep("funkyrect", length(metric_names)), + c("funkyrect", rep(c("rect", "text"), 4)) + ), + group = c( + NA, + "overall", + "overall", + rep("datasets", length(dataset_names)), + rep("metrics", length(metric_names)), + rep("resources", 9) + ), + palette = c( + NA, + 
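    # one palette entry per figure_data column, in the same order as `name`
    # and `geom` above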
"overall_palette", + "error_reason_palette", + rep("datasets_palette", length(dataset_names)), + rep("metrics_palette", length(metric_names)), + "resources_palette", rep(c("resources_palette", "black"), 4) + ), + width = c( + 12, + 4, + 1, + rep(1, length(dataset_names)), + rep(1, length(metric_names)), + rep(1, 9) + ), + overlay = c( + FALSE, + FALSE, + FALSE, + rep(FALSE, length(dataset_names)), + rep(FALSE, length(metric_names)), + FALSE, rep(c(FALSE, TRUE), 4) + ), + hjust = c( + 0, + 0, + 0.5, + rep(0.5, length(dataset_names)), + rep(0.5, length(metric_names)), + rep(0.5, 9) + ) +) + +column_groups <- tibble::tibble( + group = c("overall", "datasets", "metrics", "resources"), + category = c("Overall", "Datasets", "Metrics", "Resources"), + palette = c("overall_palette", "datasets_palette", "metrics_palette", "resources_palette"), +) + +palettes <- list( + overall_palette = "Greys", + error_reason_palette = c( + "#8DD3C7", + "#FFFFB3", + "#BEBADA", + "#fdb462", + "#999999", + "#FFFFFF" + ), + datasets_palette = "Blues", + metrics_palette = "Reds", + resources_palette = "YlOrBr", + black = c("black", "black") +) +names(palettes$error_reason_palette) <- exit_names + +legends <- list( + list( + geom = "funkyrect", + title = "Score", + colour = "white" + ), + list( + palette = "overall_palette", + enabled = FALSE + ), + list( + palette = "error_reason_palette", + geom = "pie", + title = "", + label_width = 5 + ), + list( + palette = "datasets_palette", + enabled = FALSE + ), + list( + palette = "metrics_palette", + enabled = FALSE + ), + list( + palette = "resources_palette", + enabled = FALSE + ) +) + +funkyheatmap::funky_heatmap( + figure_data, + column_info = column_info, + column_groups = column_groups, + palettes = palettes, + legends = legends, + scale_column = FALSE, + position_args = funkyheatmap::position_arguments( + col_space = 0.2, + col_bigspace = 0.8, + col_annot_offset = 6 + ) +) +``` + +## Table + +::: {.callout-note} +This table displays the scaled metric scores. +The "Overall" dataset gives the mean score across all of the actual datasets. + +Sort and filter the table to check scores you are interested in. 
+::: + +```{r} +#| label: results-table +#| eval: !expr has_controls +table_data <- dataset_scores |> + dplyr::bind_rows(overall_scores) |> + dplyr::mutate( + dataset = factor( + dataset, + levels = c("overall", dataset_details$dataset), + labels = c("Overall", dataset_details$dataset_label) + ), + method = factor( + method, + levels = method_details$method, + labels = method_details$method_label + ) + ) |> + dplyr::relocate(dataset, .after = method) |> + dplyr::relocate(mean_score, .after = dataset) |> + dplyr::arrange(dataset, method) + +reactable::reactable( + table_data, + + columns = c( + list( + method = reactable::colDef( + name = "Method", + sticky = "left" + ), + dataset = reactable::colDef( + name = "Dataset", + sticky = "left", + style = list(borderRight = "2px solid #999"), + headerStyle = list(borderRight = "2px solid #999") + ), + mean_score = reactable::colDef( + name = "Mean score", + format = reactable::colFormat(digits = 3) + ) + ), + purrr::map( metric_details$metric_label, + function(.metric_label) { + reactable::colDef( + name = .metric_label, + format = reactable::colFormat(digits = 3) + ) + } + ) |> + purrr::set_names(metric_details$metric) + ), + + highlight = TRUE, + striped = TRUE, + defaultPageSize = 25, + showPageSizeOptions = TRUE, + filterable = TRUE, + searchable = TRUE +) +``` diff --git a/src/reporting/render_report/script.R b/src/reporting/render_report/script.R new file mode 100644 index 000000000..286e20f37 --- /dev/null +++ b/src/reporting/render_report/script.R @@ -0,0 +1,70 @@ +## VIASH START +processed_dir <- "resources_test/openproblems/task_results_v4/processed" + +par <- list( + # Inputs + input_task_results = paste0(processed_dir, "/task_info.json"), + # Outputs + output = "report.html" +) +## VIASH END + +################################################################################ +# MAIN SCRIPT +################################################################################ + +cat("====== Render report ======\n") + +cat("\n>>> Copying input file to temporary directory...\n") +tmp_dir <- file.path(tempdir(), "render-report") +dir.create(tmp_dir, recursive = TRUE) +cat("Temporary directory: ", tmp_dir, "\n", sep = "") +file.copy( + par$input_task_results, + file.path(tmp_dir, "task_results.json"), + overwrite = TRUE +) + +cat("\n>>> Copying resources to temporary directory...\n") +cat("Copying 'report.qmd'...\n") +file.copy( + file.path(meta$resources_dir, "report.qmd"), + tmp_dir, + overwrite = TRUE +) +cat("Copying 'logo.svg'...\n") +file.copy( + file.path(meta$resources_dir, "logo.svg"), + tmp_dir, + overwrite = TRUE +) +cat("Copying 'functions.R'...\n") +file.copy( + file.path(meta$resources_dir, "functions.R"), + tmp_dir, + overwrite = TRUE +) + +cat("\n>>> Rendering report...\n") +cat("Quarto version: ", as.character(quarto::quarto_version()), sep = "") +xfun::in_dir( + tmp_dir, + quarto::quarto_render( + input = "report.qmd", + output_file = "report.html", + execute_params = list( + task_results_json = "task_results.json", + logo = "logo.svg", + functions = "functions.R" + ) + ) +) + +cat("\n>>> Copying output file...\n") +file.copy( + file.path(tmp_dir, "report.html"), + par$output, + overwrite = TRUE +) + +cat("\n>>> Done!\n") diff --git a/src/reporting/shared/bibliography.bib b/src/reporting/shared/bibliography.bib new file mode 100644 index 000000000..639c74476 --- /dev/null +++ b/src/reporting/shared/bibliography.bib @@ -0,0 +1,2058 @@ +@misc{10x2018pbmc, + title = {1k PBMCs from a Healthy Donor (v3 chemistry)}, + author = {{10x 
Genomics}}, + year = {2018}, + url = {https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0} +} + +@misc{10x2019heart, + title = {Human Heart}, + author = {{10x Genomics}}, + year = {2019}, + url = {https://www.10xgenomics.com/datasets/human-heart-1-standard-1-0-0} +} + +@misc{10x2019lymph, + title = {Human Lymph Node}, + author = {{10x Genomics}}, + year = {2019}, + url = {https://www.10xgenomics.com/datasets/human-lymph-node-1-standard-1-0-0} +} + +@misc{10x2019pbmc, + title = {5k Peripheral Blood Mononuclear Cells (PBMCs) from a Healthy Donor with a Panel of TotalSeq-B Antibodies (v3 chemistry)}, + author = {{10x Genomics}}, + year = {2019}, + url = {https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0} +} + +@misc{10x2020breast, + title = {Human Breast Cancer: Whole Transcriptome Analysis}, + author = {{10x Genomics}}, + year = {2020}, + url = {https://www.10xgenomics.com/datasets/human-breast-cancer-whole-transcriptome-analysis-1-standard-1-2-0} +} + +@misc{10x2020cerebellum, + title = {Human Cerebellum: Whole Transcriptome Analysis}, + author = {{10x Genomics}}, + year = {2020}, + url = {https://www.10xgenomics.com/datasets/human-cerebellum-whole-transcriptome-analysis-1-standard-1-2-0} +} + +@misc{10x2020kidney, + title = {Mouse Kidney Section (Coronal)}, + author = {{10x Genomics}}, + year = {2020}, + url = {https://www.10xgenomics.com/datasets/mouse-kidney-section-coronal-1-standard-1-1-0} +} + +@misc{10x2021breast, + title = {Human Breast Cancer: Ductal Carcinoma In Situ, Invasive Carcinoma (FFPE)}, + author = {{10x Genomics}}, + year = {2021}, + url = {https://www.10xgenomics.com/datasets/human-breast-cancer-ductal-carcinoma-in-situ-invasive-carcinoma-ffpe-1-standard-1-3-0} +} + +@misc{10x2021prostate, + title = {Normal Human Prostate (FFPE)}, + author = {{10x Genomics}}, + year = {2021}, + url = {https://www.10xgenomics.com/datasets/normal-human-prostate-ffpe-1-standard-1-3-0} +} + +@misc{10x2022brain, + title = {Mouse Brain Coronal Section 1 (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard} +} + +@misc{10x2022cervical, + title = {Human Cervical Cancer (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-cervical-cancer-1-standard} +} + +@misc{10x2022olfactory, + title = {Adult Mouse Olfactory Bulb}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/adult-mouse-olfactory-bulb-1-standard-1} +} + +@misc{10x2022intestine, + title = {Human Intestine Cancer (FPPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-intestine-cancer-1-standard} +} + +@misc{10x2022melanoma, + title = {Human Melanoma, IF Stained (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-melanoma-if-stained-ffpe-2-standard} +} + +@misc{10x2022prostate, + title = {Human Prostate Cancer, Adjacent Normal Section with IF Staining (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-prostate-cancer-adjacent-normal-section-with-if-staining-ffpe-1-standard} +} + +@misc{10x2023brain, + title = {Human Brain Cancer, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = 
{https://www.10xgenomics.com/datasets/human-brain-cancer-11-mm-capture-area-ffpe-2-standard} +} + +@misc{10x2023colon, + title = {Visium CytAssist Gene Expression Libraries of Post-Xenium Human Colon Cancer (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-human-colon-cancer-ffpe-using-the-human-whole-transcriptome-probe-set-2-standard} +} + +@misc{10x2023colorectal, + title = {Human Colorectal Cancer, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-colorectal-cancer-11-mm-capture-area-ffpe-2-standard} +} + +@misc{10x2023embryo, + title = {Visium CytAssist, Mouse Embryo, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/visium-cytassist-mouse-embryo-11-mm-capture-area-ffpe-2-standard} +} + +@misc{10x2023kidney, + title = {Human Kidney, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-kidney-11-mm-capture-area-ffpe-2-standard} +} + +@misc{10x2023lung, + title = {Human Lung Cancer, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-lung-cancer-11-mm-capture-area-ffpe-2-standard} +} + +@misc{10x2023mousebrain, + title = {Visium CytAssist Gene Expression Libraries of Post-Xenium Mouse Brain (FF)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-mouse-brain-ff-using-the-mouse-whole-transcriptome-probe-set-2-standard} +} + +@article{agostinis2022newwave, + doi = {10.1093/bioinformatics/btac149}, + url = {https://doi.org/10.1093/bioinformatics/btac149}, + year = {2022}, + month = {Mar.}, + publisher = {Oxford University Press ({OUP})}, + volume = {38}, + number = {9}, + pages = {2648--2650}, + author = {Federico Agostinis and Chiara Romualdi and Gabriele Sales and Davide Risso}, + editor = {Yann Ponty}, + title = {NewWave: a scalable R/Bioconductor package for the dimensionality reduction and batch effect removal of single-cell {RNA}-seq data}, + journal = {Bioinformatics} +} + +@article{agrawal2021mde, + title = {Minimum-Distortion Embedding}, + author = {Akshay Agrawal and Alnur Ali and Stephen Boyd}, + year = {2021}, + journal = {Foundations and Trends{\textregistered} in Machine Learning}, + publisher = {Now Publishers}, + volume = {14}, + number = {3}, + pages = {211--378}, + doi = {10.1561/2200000090}, + url = {https://doi.org/10.1561/2200000090} +} + +@article{aliee2021autogenes, + title = {{AutoGeneS}: Automatic gene selection using multi-objective optimization for {RNA}-seq deconvolution}, + author = {Hananeh Aliee and Fabian J. 
Theis}, + year = {2021}, + month = {Jul.}, + journal = {Cell Systems}, + publisher = {Elsevier {BV}}, + volume = {12}, + number = {7}, + pages = {706--715.e4}, + doi = {10.1016/j.cels.2021.05.006}, + url = {https://doi.org/10.1016/j.cels.2021.05.006} +} + +@inproceedings{amelio2015normalized, + doi = {10.1145/2808797.2809344}, + url = {https://doi.org/10.1145/2808797.2809344}, + year = {2015}, + month = {Aug.}, + publisher = {{ACM}}, + author = {Alessia Amelio and Clara Pizzuti}, + title = {Is Normalized Mutual Information a Fair Measure for Comparing Community Detection Methods?}, + booktitle = {Proceedings of the 2015 {IEEE}/{ACM} International Conference on Advances in Social Networks Analysis and Mining 2015} +} + +@article{andersson2020single, + title = {Single-cell and spatial transcriptomics enables probabilistic inference of cell type topography}, + author = {Alma Andersson and Joseph Bergenstr{\aa}hle and Michaela Asp and Ludvig Bergenstr{\aa}hle and Aleksandra Jurek and Jos{\'{e}} Fern{\'{a}}ndez Navarro and Joakim Lundeberg}, + year = {2020}, + month = {Oct.}, + journal = {Communications Biology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {3}, + number = {1}, + doi = {10.1038/s42003-020-01247-y}, + url = {https://doi.org/10.1038/s42003-020-01247-y} +} + +@article{andersson2021sepal, + title = {sepal: Identifying transcript profiles with spatial patterns by diffusion-based modeling}, + author = {Andersson, Alma and Lundeberg, Joakim}, + journal = {Bioinformatics}, + volume = {37}, + number = {17}, + pages = {2644--2650}, + year = {2021}, + publisher = {Oxford University Press}, + doi = {10.1093/bioinformatics/btab164} +} + +@article{batson2019molecular, + title = {Molecular Cross-Validation for Single-Cell RNA-seq}, + author = {Batson, Joshua and Royer, Lo{\"\i}c and Webber, James}, + year = {2019}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/786269}, + url = {https://www.biorxiv.org/content/early/2019/09/30/786269}, + elocation-id = {786269}, + eprint = {https://www.biorxiv.org/content/early/2019/09/30/786269.full.pdf} +} + +@article{biancalani2021deep, + title = {Deep learning and alignment of spatially resolved single-cell transcriptomes with Tangram}, + author = {Tommaso Biancalani and Gabriele Scalia and Lorenzo Buffoni and Raghav Avasthi and Ziqing Lu and Aman Sanger and Neriman Tokcan and Charles R. Vanderburg and {\AA}sa Segerstolpe and Meng Zhang and Inbal Avraham-Davidi and Sanja Vickovic and Mor Nitzan and Sai Ma and Ayshwarya Subramanian and Michal Lipinski and Jason Buenrostro and Nik Bear Brown and Duccio Fanelli and Xiaowei Zhuang and Evan Z. 
Macosko and Aviv Regev}, + year = {2021}, + month = {Oct.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {18}, + number = {11}, + pages = {1352--1362}, + doi = {10.1038/s41592-021-01264-7}, + url = {https://doi.org/10.1038/s41592-021-01264-7} +} + +@article{bintayyash2021non, + author = {BinTayyash, Nuha and Georgaka, Sokratia and John, S T and Ahmed, Sumon and Boukouvalas, Alexis and Hensman, James and Rattray, Magnus}, + title = {{Non-parametric modelling of temporal and spatial counts data from RNA-seq experiments}}, + journal = {Bioinformatics}, + volume = {37}, + number = {21}, + pages = {3788-3795}, + year = {2021}, + month = {07}, + issn = {1367-4803}, + doi = {10.1093/bioinformatics/btab486}, + url = {https://doi.org/10.1093/bioinformatics/btab486}, + eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/21/3788/50336570/btab486.pdf} +} + +@article{bland2000odds, + title = {Statistics Notes: The odds ratio}, + author = {J. M. Bland}, + year = {2000}, + month = {May}, + journal = {{BMJ}}, + publisher = {{BMJ}}, + volume = {320}, + number = {7247}, + pages = {1468--1468}, + doi = {10.1136/bmj.320.7247.1468}, + url = {https://doi.org/10.1136/bmj.320.7247.1468} +} + +@article{breiman2001random, + title = {{Random forests}}, + author = {Breiman, Leo}, + journal = {Machine learning}, + publisher = {Springer Science and Business Media LLC}, + volume = 45, + number = 1, + pages = {5--32}, + month = oct, + year = 2001, + doi = {10.1023/a:1010933404324}, + issn = {0885-6125,1573-0565}, + language = {en} +} + +@article{bttner2018test, + title = {A test metric for assessing single-cell {RNA}-seq batch correction}, + author = {Maren B\"{u}ttner and Zhichao Miao and F. Alexander Wolf and Sarah A. Teichmann and Fabian J. Theis}, + year = {2018}, + month = {Dec.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {1}, + pages = {43--49}, + doi = {10.1038/s41592-018-0254-1}, + url = {https://doi.org/10.1038/s41592-018-0254-1} +} + +@article{cabello2020singlecellsignalr, + title = {{SingleCellSignalR}: inference of intercellular networks from single-cell transcriptomics}, + author = {Simon Cabello-Aguilar and M{\'{e}}lissa Alame and Fabien Kon-Sun-Tack and Caroline Fau and Matthieu Lacroix and Jacques Colinge}, + year = {2020}, + month = {Mar.}, + journal = {Nucleic Acids Research}, + publisher = {Oxford University Press ({OUP})}, + volume = {48}, + number = {10}, + pages = {e55--e55}, + doi = {10.1093/nar/gkaa183}, + url = {https://doi.org/10.1093/nar/gkaa183} +} + +@article{cable2021robust, + title = {Robust decomposition of cell type mixtures in spatial transcriptomics}, + author = {Dylan M. Cable and Evan Murray and Luli S. Zou and Aleksandrina Goeva and Evan Z. Macosko and Fei Chen and Rafael A. 
Irizarry}, + year = {2021}, + month = {Feb.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {4}, + pages = {517--526}, + doi = {10.1038/s41587-021-00830-w}, + url = {https://doi.org/10.1038/s41587-021-00830-w} +} + +@misc{cannoodt2021viashfromscripts, + doi = {10.48550/ARXIV.2110.11494}, + url = {https://arxiv.org/abs/2110.11494}, + author = {Cannoodt, Robrecht and Cannoodt, Hendrik and Van de Kerckhove, Eric and Boschmans, Andy and De Maeyer, Dries and Verbeiren, Toni}, + keywords = {Software Engineering (cs.SE), FOS: Computer and information sciences, FOS: Computer and information sciences}, + title = {Viash: from scripts to pipelines}, + publisher = {arXiv}, + year = {2021}, + copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International} +} + +@article{cai2023spanve, + title = {Spanve: an Statistical Method to Detect Clustering-friendly Spatially Variable Genes in Large-scale Spatial Transcriptomics Data}, + author = {Cai, Guoxin and Chen, Yichang and Chen, Shuqing and Gu, Xun and Zhou, Zhan}, + journal = {bioRxiv}, + pages = {2023--02}, + year = {2023}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2023.02.08.527623} +} + +@article{cao2018joint, + title = {Joint profiling of chromatin accessibility and gene expression in thousands of single cells}, + author = {Junyue Cao and Darren A. Cusanovich and Vijay Ramani and Delasa Aghamirzaie and Hannah A. Pliner and Andrew J. Hill and Riza M. Daza and Jose L. McFaline-Figueroa and Jonathan S. Packer and Lena Christiansen and Frank J. Steemers and Andrew C. Adey and Cole Trapnell and Jay Shendure}, + year = {2018}, + month = {Sep}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {361}, + number = {6409}, + pages = {1380--1385}, + doi = {10.1126/science.aau0730}, + url = {https://doi.org/10.1126/science.aau0730} +} + +@article{cao2020human, + title = {A human cell atlas of fetal gene expression}, + author = {Junyue Cao and Diana R. O'Day and Hannah A. Pliner and Paul D. Kingsley and Mei Deng and Riza M. Daza and Michael A. Zager and Kimberly A. Aldinger and Ronnie Blecher-Gonen and Fan Zhang and Malte Spielmann and James Palis and Dan Doherty and Frank J. Steemers and Ian A. 
Glass and Cole Trapnell and Jay Shendure}, + year = {2020}, + month = {Nov.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {370}, + number = {6518}, + doi = {10.1126/science.aba7721}, + url = {https://doi.org/10.1126/science.aba7721} +} + +@article{chai2014root, + title = {{Root mean square error (RMSE) or mean absolute error (MAE)?}}, + author = {Chai, T and Draxler, R R}, + journal = {Geoscientific model development discussions}, + publisher = {Copernicus GmbH}, + volume = 7, + number = 1, + pages = {1525--1534}, + month = feb, + year = 2014, + doi = {10.5194/gmdd-7-1525-2014}, + issn = {1991-962X}, + language = {en} +} + +@article{chang2022spatial, + title = {Spatial omics representation and functional tissue module inference using graph Fourier transform}, + author = {Chang, Yuzhou and Liu, Jixin and Ma, Anjun and Jiang, Sizun and Krull, Jordan and Yeo, Yao Yu and Liu, Yang and Rodig, Scott J and Barouch, Dan H and Fan, Rong and others}, + journal = {bioRxiv}, + pages = {2022--12}, + year = {2022}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2022.12.10.519929} +} + +@article{chazarragil2021flexible, + doi = {10.1093/nar/gkab004}, + url = {https://doi.org/10.1093/nar/gkab004}, + year = {2021}, + month = {Feb.}, + publisher = {Oxford University Press ({OUP})}, + volume = {49}, + number = {7}, + pages = {e42--e42}, + author = {Ruben Chazarra-Gil and Stijn van~Dongen and Vladimir~Yu Kiselev and Martin Hemberg}, + title = {Flexible comparison of batch correction methods for single-cell {RNA}-seq using {BatchBench}}, + journal = {Nucleic Acids Research} +} + +@article{chen2009local, + title = {Local Multidimensional Scaling for Nonlinear Dimension Reduction, Graph Drawing, and Proximity Analysis}, + author = {Lisha Chen and Andreas Buja}, + year = {2009}, + month = {Mar.}, + journal = {Journal of the American Statistical Association}, + publisher = {Informa {UK} Limited}, + volume = {104}, + number = {485}, + pages = {209--219}, + doi = {10.1198/jasa.2009.0111}, + url = {https://doi.org/10.1198/jasa.2009.0111} +} + +@inproceedings{chen2016xgboost, + title = {{XGBoost}}, + author = {Tianqi Chen and Carlos Guestrin}, + year = {2016}, + month = {Aug.}, + booktitle = {Proceedings of the 22nd {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining}, + publisher = {{Acm}}, + doi = {10.1145/2939672.2939785}, + url = {https://doi.org/10.1145/2939672.2939785} +} + +@article{cichocki2009fast, + title = {Fast Local Algorithms for Large Scale Nonnegative Matrix and Tensor Factorizations}, + author = {Andrzej Cichocki and Anh-Huy Phan}, + year = {2009}, + journal = {{IEICE} Transactions on Fundamentals of Electronics, Communications and Computer Sciences}, + publisher = {Institute of Electronics, Information and Communications Engineers ({IEICE})}, + volume = {E92-a}, + number = {3}, + pages = {708--721}, + doi = {10.1587/transfun.e92.a.708}, + url = {https://doi.org/10.1587/transfun.e92.a.708} +} + +@article{coifman2006diffusion, + title = {Diffusion maps}, + author = {Ronald R. Coifman and St{\'{e}}phane Lafon}, + year = {2006}, + month = {Jul.}, + journal = {Applied and Computational Harmonic Analysis}, + publisher = {Elsevier {BV}}, + volume = {21}, + number = {1}, + pages = {5--30}, + doi = {10.1016/j.acha.2006.04.006}, + url = {https://doi.org/10.1016/j.acha.2006.04.006} +} + +@article{cover1967nearest, + title = {Nearest neighbor pattern classification}, + author = {T. Cover and P. 
Hart}, + year = {1967}, + month = {Jan}, + journal = {{IEEE} Transactions on Information Theory}, + publisher = {Institute of Electrical and Electronics Engineers ({IEEE})}, + volume = {13}, + number = {1}, + pages = {21--27}, + doi = {10.1109/tit.1967.1053964}, + url = {https://doi.org/10.1109/tit.1967.1053964} +} + +@inproceedings{davis2006prauc, + title = {The relationship between Precision-Recall and {ROC} curves}, + author = {Jesse Davis and Mark Goadrich}, + year = {2006}, + booktitle = {Proceedings of the 23rd international conference on Machine learning - {ICML} {\textquotesingle}06}, + publisher = {{ACM} Press}, + doi = {10.1145/1143844.1143874}, + url = {https://doi.org/10.1145/1143844.1143874} +} + +@article{Demetci2020scot, + author = {Pinar Demetci and Rebecca Santorella and Bj{\"o}rn Sandstede and William Stafford Noble and Ritambhara Singh}, + title = {Gromov-Wasserstein optimal transport to align single-cell multi-omics data}, + elocation-id = {2020.04.28.066787}, + year = {2020}, + doi = {10.1101/2020.04.28.066787}, + publisher = {Cold Spring Harbor Laboratory}, + url = {https://www.biorxiv.org/content/early/2020/11/11/2020.04.28.066787}, + eprint = {https://www.biorxiv.org/content/early/2020/11/11/2020.04.28.066787.full.pdf}, + journal = {bioRxiv} +} + +@article{dimitrov2022comparison, + title = {Comparison of methods and resources for cell-cell communication inference from single-cell {RNA}-Seq data}, + author = {Daniel Dimitrov and D{\'{e}}nes T\"{u}rei and Martin Garrido-Rodriguez and Paul L. Burmedi and James S. Nagai and Charlotte Boys and Ricardo O. Ramirez Flores and Hyojin Kim and Bence Szalai and Ivan G. Costa and Alberto Valdeolivas and Aur{\'{e}}lien Dugourd and Julio Saez-Rodriguez}, + year = {2022}, + month = {Jun.}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {13}, + number = {1}, + doi = {10.1038/s41467-022-30755-0}, + url = {https://doi.org/10.1038/s41467-022-30755-0} +} + +@article{donoho2017yearsdatascience, + doi = {10.1080/10618600.2017.1384734}, + url = {https://doi.org/10.1080/10618600.2017.1384734}, + year = {2017}, + month = {Oct.}, + publisher = {Informa {UK} Limited}, + volume = {26}, + number = {4}, + pages = {745--766}, + author = {David Donoho}, + title = {50 Years of Data Science}, + journal = {Journal of Computational and Graphical Statistics} +} + +@article{efremova2020cellphonedb, + title = {{CellPhoneDB}: inferring cell-cell communication from combined expression of multi-subunit ligand-receptor complexes}, + author = {Mirjana Efremova and Miquel Vento-Tormo and Sarah A. 
Teichmann and Roser Vento-Tormo}, + year = {2020}, + month = {Feb.}, + journal = {Nature Protocols}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {15}, + number = {4}, + pages = {1484--1506}, + doi = {10.1038/s41596-020-0292-x}, + url = {https://doi.org/10.1038/s41596-020-0292-x} +} + +@article{emmons2016analysis, + title = {Analysis of Network Clustering Algorithms and Cluster Quality Metrics at Scale}, + volume = {11}, + issn = {1932-6203}, + url = {http://dx.doi.org/10.1371/journal.pone.0159161}, + doi = {10.1371/journal.pone.0159161}, + number = {7}, + journal = {PLOS ONE}, + publisher = {Public Library of Science (PLoS)}, + author = {Emmons, Scott and Kobourov, Stephen and Gallant, Mike and B\"{o}rner, Katy}, + editor = {Dovrolis, Constantine}, + year = {2016}, + month = jul, + pages = {e0159161} +} + +@article{eraslan2019single, + title = {Single-cell {RNA}-seq denoising using a deep count autoencoder}, + author = {G\"{o}kcen Eraslan and Lukas M. Simon and Maria Mircea and Nikola S. Mueller and Fabian J. Theis}, + year = {2019}, + month = {Jan}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {10}, + number = {1}, + doi = {10.1038/s41467-018-07931-2}, + url = {https://doi.org/10.1038/s41467-018-07931-2} +} + +@article{fang2022conservation, + title = {Conservation and divergence of cortical cell organization in human and mouse revealed by MERFISH}, + volume = {377}, + issn = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abm1741}, + doi = {10.1126/science.abm1741}, + number = {6601}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Fang, Rongxin and Xia, Chenglong and Close, Jennie L. and Zhang, Meng and He, Jiang and Huang, Zhengkai and Halpern, Aaron R. and Long, Brian and Miller, Jeremy A. and Lein, Ed S. and Zhuang, Xiaowei}, + year = {2022}, + month = jul, + pages = {56-62} +} + +@article{fix1989discriminatory, + doi = {10.2307/1403797}, + url = {https://doi.org/10.2307/1403797}, + year = {1989}, + month = {Dec.}, + publisher = {{JSTOR}}, + volume = {57}, + number = {3}, + pages = {238}, + author = {Evelyn Fix and J. L. Hodges}, + title = {Discriminatory Analysis. Nonparametric Discrimination: Consistency Properties}, + journal = {International Statistical Review / Revue Internationale de Statistique} +} + +@article{gower1975generalized, + title = {Generalized procrustes analysis}, + author = {J. C. Gower}, + year = {1975}, + month = {Mar.}, + journal = {Psychometrika}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {1}, + pages = {33--51}, + doi = {10.1007/bf02291478}, + url = {https://doi.org/10.1007/bf02291478} +} + +@article{grandini2020metrics, + title = {Metrics for Multi-Class Classification: an Overview}, + author = {Grandini, Margherita and Bagli, Enrico and Visani, Giorgio}, + year = {2020}, + journal = {arXiv}, + publisher = {Cornell University}, + doi = {10.48550/arxiv.2008.05756}, + url = {https://arxiv.org/abs/2008.05756}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (stat.ML), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences} +} + +@article{granja2021archr, + title = {{ArchR} is a scalable software package for integrative single-cell chromatin accessibility analysis}, + author = {Jeffrey M. Granja and M. Ryan Corces and Sarah E. Pierce and S. 
Tansu Bagdatli and Hani Choudhry and Howard Y. Chang and William J. Greenleaf}, + year = {2021}, + month = {Feb.}, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {3}, + pages = {403--411}, + doi = {10.1038/s41588-021-00790-6}, + url = {https://doi.org/10.1038/s41588-021-00790-6} +} + +@article{grn2014validation, + title = {Validation of noise models for single-cell transcriptomics}, + author = {Dominic Gr\"{u}n and Lennart Kester and Alexander van Oudenaarden}, + year = {2014}, + month = {Apr.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {11}, + number = {6}, + pages = {637--640}, + doi = {10.1038/nmeth.2930}, + url = {https://doi.org/10.1038/nmeth.2930} +} + +@article{haghverdi2018batch, + title = {Batch effects in single-cell {RNA}-sequencing data are corrected by matching mutual nearest neighbors}, + author = {Laleh Haghverdi and Aaron T L Lun and Michael D Morgan and John C Marioni}, + year = {2018}, + month = {Apr.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {36}, + number = {5}, + pages = {421--427}, + doi = {10.1038/nbt.4091}, + url = {https://doi.org/10.1038/nbt.4091} +} + +@article{hammarlund2018cengen, + title = {The {CeNGEN} Project: The Complete Gene Expression Map of an Entire Nervous System}, + author = {Marc Hammarlund and Oliver Hobert and David M. Miller and Nenad Sestan}, + year = {2018}, + month = {Aug.}, + journal = {Neuron}, + publisher = {Elsevier {BV}}, + volume = {99}, + number = {3}, + pages = {430--433}, + doi = {10.1016/j.neuron.2018.07.042}, + url = {https://doi.org/10.1016/j.neuron.2018.07.042} +} + +@article{hansen2012removing, + title = {Adjusting batch effects in microarray expression data using empirical Bayes methods}, + author = {W. Evan Johnson and Cheng Li and Ariel Rabinovic}, + year = {2006}, + month = {Apr.}, + journal = {Biostatistics}, + publisher = {Oxford University Press ({OUP})}, + volume = {8}, + number = {1}, + pages = {118--127}, + doi = {10.1093/biostatistics/kxj037}, + url = {https://doi.org/10.1093/biostatistics/kxj037} +} + +@article{hao2021integrated, + title = {Integrated analysis of multimodal single-cell data}, + author = {Yuhan Hao and Stephanie Hao and Erica Andersen-Nissen and William M. Mauck and Shiwei Zheng and Andrew Butler and Maddie J. Lee and Aaron J. Wilk and Charlotte Darby and Michael Zager and Paul Hoffman and Marlon Stoeckius and Efthymia Papalexi and Eleni P. Mimitou and Jaison Jain and Avi Srivastava and Tim Stuart and Lamar M. Fleming and Bertrand Yeung and Angela J. Rogers and Juliana M. McElrath and Catherine A. 
Blish and Raphael Gottardo and Peter Smibert and Rahul Satija}, + year = {2021}, + month = {Jun.}, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {184}, + number = {13}, + pages = {3573--3587.e29}, + doi = {10.1016/j.cell.2021.04.048}, + url = {https://doi.org/10.1016/j.cell.2021.04.048} +} + +@article{hao2021somde, + title = {SOMDE: a scalable method for identifying spatially variable genes with self-organizing map}, + author = {Hao, Minsheng and Hua, Kui and Zhang, Xuegong}, + journal = {Bioinformatics}, + volume = {37}, + number = {23}, + pages = {4392--4398}, + year = {2021}, + publisher = {Oxford University Press}, + doi = {10.1093/bioinformatics/btab471} +} + +@article{hie2019efficient, + title = {Efficient integration of heterogeneous single-cell transcriptomes using Scanorama}, + author = {Brian Hie and Bryan Bryson and Bonnie Berger}, + year = {2019}, + month = {May}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {37}, + number = {6}, + pages = {685--691}, + doi = {10.1038/s41587-019-0113-3}, + url = {https://doi.org/10.1038/s41587-019-0113-3} +} + +@article{hinton1989connectionist, + title = {Connectionist learning procedures}, + author = {Geoffrey E. Hinton}, + year = {1989}, + month = {Sep}, + journal = {Artificial Intelligence}, + publisher = {Elsevier {BV}}, + volume = {40}, + number = {1-3}, + pages = {185--234}, + doi = {10.1016/0004-3702(89)90049-0}, + url = {https://doi.org/10.1016/0004-3702(89)90049-0} +} + +@book{hosmer2013applied, + title = {Applied logistic regression}, + author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, + year = {2013}, + publisher = {John Wiley \& Sons}, + volume = {398} +} + +@article{hou2019scmatch, + title = {{scMatch}: a single-cell gene expression profile annotation tool using reference datasets}, + author = {Rui Hou and Elena Denisenko and Alistair R R Forrest}, + year = {2019}, + month = {Apr.}, + journal = {Bioinformatics}, + publisher = {Oxford University Press ({OUP})}, + volume = {35}, + number = {22}, + pages = {4688--4695}, + doi = {10.1093/bioinformatics/btz292}, + url = {https://doi.org/10.1093/bioinformatics/btz292}, + editor = {Janet Kelso} +} + +@article{hou2020predicting, + title = {Predicting cell-to-cell communication networks using {NATMI}}, + author = {Rui Hou and Elena Denisenko and Huan Ting Ong and Jordan A. Ramilowski and Alistair R. R. Forrest}, + year = {2020}, + month = {Oct.}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {11}, + number = {1}, + doi = {10.1038/s41467-020-18873-z}, + url = {https://doi.org/10.1038/s41467-020-18873-z} +} + +@article{hou2020systematic, + title = {A systematic evaluation of single-cell {RNA}-sequencing imputation methods}, + author = {Wenpin Hou and Zhicheng Ji and Hongkai Ji and Stephanie C. 
Hicks}, + year = {2020}, + month = {Aug.}, + journal = {Genome Biology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {21}, + number = {1}, + doi = {10.1186/s13059-020-02132-x}, + url = {https://doi.org/10.1186/s13059-020-02132-x} +} + +@article{hubert1985comparing, + doi = {10.1007/bf01908075}, + url = {https://doi.org/10.1007/bf01908075}, + year = {1985}, + month = {Dec.}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {2}, + number = {1}, + pages = {193--218}, + author = {Lawrence Hubert and Phipps Arabie}, + title = {Comparing partitions}, + journal = {Journal of Classification} +} + +@article{hu2021spagcn, + title = {SpaGCN: Integrating gene expression, spatial location and histology to identify spatial domains and spatially variable genes by graph convolutional network}, + author = {Hu, Jian and Li, Xiangjie and Coleman, Kyle and Schroeder, Amelia and Ma, Nan and Irwin, David J and Lee, Edward B and Shinohara, Russell T and Li, Mingyao}, + journal = {Nature methods}, + volume = {18}, + number = {11}, + pages = {1342--1351}, + year = {2021}, + publisher = {Nature Publishing Group US New York}, + doi = {10.1038/s41592-021-01255-8} +} + +@article{kats2021spatialde2, + title = {SpatialDE2: fast and localized variance component analysis of spatial transcriptomics}, + author = {Kats, Ilia and Vento-Tormo, Roser and Stegle, Oliver}, + journal = {bioRxiv}, + pages = {2021--10}, + year = {2021}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2021.10.27.466045} +} + +@article{kendall1938new, + doi = {10.1093/biomet/30.1-2.81}, + url = {https://doi.org/10.1093/biomet/30.1-2.81}, + year = {1938}, + month = {Jun.}, + publisher = {Oxford University Press ({OUP})}, + volume = {30}, + number = {1-2}, + pages = {81--93}, + author = {M. G. Kendall}, + title = {A new measure of rank correlation}, + journal = {Biometrika} +} + +@article{kiselev2019challenges, + title = {Challenges in unsupervised clustering of single-cell {RNA}-seq data}, + author = {Vladimir Yu Kiselev and Tallulah S. Andrews and Martin Hemberg}, + year = {2019}, + month = {Jan}, + journal = {Nature Reviews Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {20}, + number = {5}, + pages = {273--282}, + doi = {10.1038/s41576-018-0088-9}, + url = {https://doi.org/10.1038/s41576-018-0088-9} +} + +@article{kleshchevnikov2022cell2location, + title = {Cell2location maps fine-grained cell types in spatial transcriptomics}, + author = {Vitalii Kleshchevnikov and Artem Shmatko and Emma Dann and Alexander Aivazidis and Hamish W. 
King and Tong Li and Rasa Elmentaite and Artem Lomakin and Veronika Kedlian and Adam Gayoso and Mika Sarkin Jain and Jun Sung Park and Lauma Ramona and Elizabeth Tuck and Anna Arutyunyan and Roser Vento-Tormo and Moritz Gerstung and Louisa James and Oliver Stegle and Omer Ali Bayraktar}, + year = {2022}, + month = {Jan}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {5}, + pages = {661--671}, + doi = {10.1038/s41587-021-01139-4}, + url = {https://doi.org/10.1038/s41587-021-01139-4} +} + +@article{korsunsky2019fast, + title = {Fast, sensitive and accurate integration of single-cell data with Harmony}, + author = {Ilya Korsunsky and Nghia Millard and Jean Fan and Kamil Slowikowski and Fan Zhang and Kevin Wei and Yuriy Baglaenko and Michael Brenner and Po-ru Loh and Soumya Raychaudhuri}, + year = {2019}, + month = {Nov.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {12}, + pages = {1289--1296}, + doi = {10.1038/s41592-019-0619-0}, + url = {https://doi.org/10.1038/s41592-019-0619-0} +} + +@article{kraemer2018dimred, + title = {{dimRed} and {coRanking} - Unifying Dimensionality Reduction in R}, + author = {Guido Kraemer and Markus Reichstein and Miguel, D. Mahecha}, + year = {2018}, + journal = {The R Journal}, + publisher = {The R Foundation}, + volume = {10}, + number = {1}, + pages = {342}, + doi = {10.32614/rj-2018-039}, + url = {https://doi.org/10.32614/rj-2018-039} +} + +@article{kruskal1964mds, + title = {Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis}, + author = {J. B. Kruskal}, + year = {1964}, + month = {Mar.}, + journal = {Psychometrika}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {29}, + number = {1}, + pages = {1--27}, + doi = {10.1007/bf02289565}, + url = {https://doi.org/10.1007/bf02289565} +} + +@article{kuppe2022spatial, + title = {Spatial multi-omic map of human myocardial infarction}, + author = {Kuppe, Christoph and Ramirez Flores, Ricardo O and Li, Zhijian and Hayat, Sikander and Levinson, Rebecca T and Liao, Xian and Hannani, Monica T and Tanevski, Jovan and W{\"u}nnemann, Florian and Nagai, James S and others}, + journal = {Nature}, + volume = {608}, + number = {7924}, + pages = {766--777}, + year = {2022}, + publisher = {Nature Publishing Group UK London} +} + +@article{lance2022multimodal, + title = {Multimodal single cell data integration challenge: results and lessons learned}, + author = {Lance, Christopher and Luecken, Malte D. and Burkhardt, Daniel B. 
and Cannoodt, Robrecht and Rautenstrauch, Pia and Laddach, Anna and Ubingazhibov, Aidyn and Cao, Zhi-Jie and Deng, Kaiwen and Khan, Sumeer and Liu, Qiao and Russkikh, Nikolay and Ryazantsev, Gleb and Ohler, Uwe and , and Pisco, Angela Oliveira and Bloom, Jonathan and Krishnaswamy, Smita and Theis, Fabian J.}, + year = {2022}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2022.04.11.487796}, + url = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796}, + elocation-id = {2022.04.11.487796}, + eprint = {https://www.biorxiv.org/content/early/2022/04/12/2022.04.11.487796.full.pdf} +} + +@article{lance2024predicting, + title = {Predicting cellular profiles across modalities in longitudinal single-cell data: An Open Problems competition}, + author = {...}, + year = {2024}, + journal = {In preparation} +} + +@book{lawson1995solving, + title = {Solving Least Squares Problems}, + author = {Charles L. Lawson and Richard J. Hanson}, + year = {1995}, + month = {Jan}, + publisher = {Society for Industrial and Applied Mathematics}, + doi = {10.1137/1.9781611971217}, + url = {https://doi.org/10.1137/1.9781611971217} +} + +@article{lee2009quality, + title = {Quality assessment of dimensionality reduction: Rank-based criteria}, + author = {John A. Lee and Michel Verleysen}, + year = {2009}, + month = {Mar.}, + journal = {Neurocomputing}, + publisher = {Elsevier {BV}}, + volume = {72}, + number = {7-9}, + pages = {1431--1443}, + doi = {10.1016/j.neucom.2008.12.017}, + url = {https://doi.org/10.1016/j.neucom.2008.12.017} +} + +@article{li2021bayesian, + author = {Li, Qiwei and Zhang, Minzhe and Xie, Yang and Xiao, Guanghua}, + title = {{Bayesian modeling of spatial molecular profiling data via Gaussian process}}, + journal = {Bioinformatics}, + volume = {37}, + number = {22}, + pages = {4129-4136}, + year = {2021}, + month = {06}, + abstract = {{The location, timing and abundance of gene expression (both mRNA and proteins) within a tissue define the molecular mechanisms of cell functions. Recent technology breakthroughs in spatial molecular profiling, including imaging-based technologies and sequencing-based technologies, have enabled the comprehensive molecular characterization of single cells while preserving their spatial and morphological contexts. This new bioinformatics scenario calls for effective and robust computational methods to identify genes with spatial patterns.We represent a novel Bayesian hierarchical model to analyze spatial transcriptomics data, with several unique characteristics. It models the zero-inflated and over-dispersed counts by deploying a zero-inflated negative binomial model that greatly increases model stability and robustness. Besides, the Bayesian inference framework allows us to borrow strength in parameter estimation in a de novo fashion. As a result, the proposed model shows competitive performances in accuracy and robustness over existing methods in both simulation studies and two real data applications.The related R/C++ source code is available at https://github.com/Minzhe/BOOST-GP.Supplementary data are available at Bioinformatics online. }}, + issn = {1367-4803}, + doi = {10.1093/bioinformatics/btab455}, + url = {https://doi.org/10.1093/bioinformatics/btab455}, + eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/22/4129/50335106/btab455.pdf} +} + +@article{linderman2018zero, + title = {Zero-preserving imputation of scRNA-seq data using low-rank approximation}, + author = {Linderman, George C. 
and Zhao, Jun and Kluger, Yuval}, + year = {2018}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/397588}, + url = {https://www.biorxiv.org/content/early/2018/08/22/397588}, + elocation-id = {397588}, + eprint = {https://www.biorxiv.org/content/early/2018/08/22/397588.full.pdf} +} + +@article{liu2020high, + title = {High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue}, + volume = {183}, + issn = {0092-8674}, + url = {http://dx.doi.org/10.1016/j.cell.2020.10.026}, + doi = {10.1016/j.cell.2020.10.026}, + number = {6}, + journal = {Cell}, + publisher = {Elsevier BV}, + author = {Liu, Yang and Yang, Mingyu and Deng, Yanxiang and Su, Graham and Enninful, Archibald and Guo, Cindy C. and Tebaldi, Toma and Zhang, Di and Kim, Dongjoo and Bai, Zhiliang and Norris, Eileen and Pan, Alisia and Li, Jiatong and Xiao, Yang and Halene, Stephanie and Fan, Rong}, + year = {2020}, + month = dec, + pages = {1665--1681.e18} +} + +@article{lohoff2021integration, + title = {Integration of spatial and single-cell transcriptomic data elucidates mouse organogenesis}, + volume = {40}, + issn = {1546-1696}, + url = {http://dx.doi.org/10.1038/s41587-021-01006-2}, + doi = {10.1038/s41587-021-01006-2}, + number = {1}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + author = {Lohoff, T. and Ghazanfar, S. and Missarova, A. and Koulena, N. and Pierson, N. and Griffiths, J. A. and Bardot, E. S. and Eng, C.-H. L. and Tyser, R. C. V. and Argelaguet, R. and Guibentif, C. and Srinivas, S. and Briscoe, J. and Simons, B. D. and Hadjantonakis, A.-K. and G\"{o}ttgens, B. and Reik, W. and Nichols, J. and Cai, L. and Marioni, J. C.}, + year = {2021}, + month = sep, + pages = {74-85} +} + +@article{lopez2018deep, + title = {Deep generative modeling for single-cell transcriptomics}, + author = {Romain Lopez and Jeffrey Regier and Michael B. Cole and Michael I. Jordan and Nir Yosef}, + year = {2018}, + month = {Nov.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {15}, + number = {12}, + pages = {1053--1058}, + doi = {10.1038/s41592-018-0229-2}, + url = {https://doi.org/10.1038/s41592-018-0229-2} +} + +@article{lopez2022destvi, + title = {{DestVI} identifies continuums of cell types in spatial transcriptomics data}, + author = {Romain Lopez and Baoguo Li and Hadas Keren-Shaul and Pierre Boyeau and Merav Kedmi and David Pilzer and Adam Jelinski and Ido Yofe and Eyal David and Allon Wagner and Can Ergen and Yoseph Addadi and Ofra Golani and Franca Ronchese and Michael I. Jordan and Ido Amit and Nir Yosef}, + year = {2022}, + month = {Apr.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {40}, + number = {9}, + pages = {1360--1369}, + doi = {10.1038/s41587-022-01272-8}, + url = {https://doi.org/10.1038/s41587-022-01272-8} +} + +@article{lotfollahi2020query, + title = {Query to reference single-cell integration with transfer learning}, + author = {Lotfollahi, Mohammad and Naghipourfar, Mohsen and Luecken, Malte D. and Khajavi, Matin and B{\"u}ttner, Maren and Avsec, Ziga and Misharin, Alexander V. 
and Theis, Fabian J.}, + year = {2020}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/2020.07.16.205997}, + url = {https://doi.org/10.1101/2020.07.16.205997}, + elocation-id = {2020.07.16.205997}, + eprint = {https://www.biorxiv.org/content/early/2020/07/16/2020.07.16.205997.full.pdf} +} + +@article{luecken2022benchmarking, + title = {Benchmarking atlas-level data integration in single-cell genomics}, + author = {Malte D. Luecken and M. B\"{u}ttner and K. Chaichoompu and A. Danese and M. Interlandi and M. F. Mueller and D. C. Strobl and L. Zappia and M. Dugas and M. Colom{\'{e}}-Tatch{\'{e}} and Fabian J. Theis}, + year = {2021}, + month = {Dec.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {19}, + number = {1}, + pages = {41--50}, + doi = {10.1038/s41592-021-01336-8}, + url = {https://doi.org/10.1038/s41592-021-01336-8} +} + +@article{lueks2011evaluate, + title = {How to Evaluate Dimensionality Reduction? - Improving the Co-ranking Matrix}, + author = {Lueks, Wouter and Mokbel, Bassam and Biehl, Michael and Hammer, Barbara}, + year = {2011}, + journal = {arXiv}, + doi = {10.48550/ARXIV.1110.3917}, + url = {https://arxiv.org/abs/1110.3917}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (cs.LG), Information Retrieval (cs.IR), FOS: Computer and information sciences, FOS: Computer and information sciences} +} + +@misc{lun2019fastmnn, + title = {A description of the theory behind the fastMNN algorithm}, + author = {Lun, Aaron}, + year = {2019}, + url = {https://marionilab.github.io/FurtherMNN2018/theory/description.html} +} + +@article{mcinnes2018umap, + title = {UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction}, + author = {McInnes, Leland and Healy, John and Melville, James}, + year = {2018}, + journal = {arXiv}, + publisher = {Cornell University}, + doi = {10.48550/arxiv.1802.03426}, + url = {https://arxiv.org/abs/1802.03426}, + copyright = {arXiv.org perpetual, non-exclusive license}, + keywords = {Machine Learning (stat.ML), Computational Geometry (cs.CG), Machine Learning (cs.LG), FOS: Computer and information sciences, FOS: Computer and information sciences} +} + +@article{mereu2020benchmarking, + doi = {10.1038/s41587-020-0469-4}, + author = {Mereu, Elisabetta and Lafzi, Atefeh and Moutinho, Catia and Ziegenhain, Christoph and McCarthy, Davis J and Alvarez-Varela, Adrian and Batlle, Eduard and Sagar and Gruen, Dominic and Lau, Julia K and others}, + journal = {Nature biotechnology}, + number = {6}, + pages = {747--755}, + publisher = {Nature Publishing Group US New York}, + title = {Benchmarking single-cell {RNA}-sequencing protocols for cell atlas projects}, + volume = {38}, + year = {2020} +} + +@inbook{miles2005rsquared, + title = "{{R-Squared}, Adjusted {R-Squared}}", + chapter = "{{R-Squared}, Adjusted {R-Squared}}", + author = "Miles, Jeremy", + booktitle = "{Encyclopedia of Statistics in Behavioral Science}", + publisher = "John Wiley \& Sons, Ltd", + address = "Chichester, UK", + month = oct, + year = 2005, + doi = "10.1002/0470013192.bsa526", + isbn = "9780470860809,9780470860809" +} + +@article{moon2019visualizing, + title = {Visualizing structure and transitions in high-dimensional biological data}, + author = {Kevin R. Moon and David van Dijk and Zheng Wang and Scott Gigante and Daniel B. Burkhardt and William S. Chen and Kristina Yim and Antonia van den Elzen and Matthew J. Hirn and Ronald R. 
Coifman and Natalia B. Ivanova and Guy Wolf and Smita Krishnaswamy}, + year = {2019}, + month = {Dec.}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {37}, + number = {12}, + pages = {1482--1492}, + doi = {10.1038/s41587-019-0336-3}, + url = {https://doi.org/10.1038/s41587-019-0336-3} +} + +@article{narayan2021assessing, + title = {Assessing single-cell transcriptomic variability through density-preserving data visualization}, + author = {Ashwin Narayan and Bonnie Berger and Hyunghoon Cho}, + year = {2021}, + month = {Jan}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {39}, + number = {6}, + pages = {765--774}, + doi = {10.1038/s41587-020-00801-7}, + url = {https://doi.org/10.1038/s41587-020-00801-7} +} + +@article{nestorowa2016single, + title = {A single-cell resolution map of mouse hematopoietic stem and progenitor cell differentiation}, + author = {Sonia Nestorowa and Fiona K. Hamey and Blanca Pijuan Sala and Evangelia Diamanti and Mairi Shepherd and Elisa Laurenti and Nicola K. Wilson and David G. Kent and Berthold G\"{o}ttgens}, + year = {2016}, + month = {Aug.}, + journal = {Blood}, + publisher = {American Society of Hematology}, + volume = {128}, + number = {8}, + pages = {e20--e31}, + doi = {10.1182/blood-2016-05-716480}, + url = {https://doi.org/10.1182/blood-2016-05-716480} +} + +@inproceedings{luecken2021neurips, + author = {Luecken, Malte and Burkhardt, Daniel and Cannoodt, Robrecht and Lance, Christopher and Agrawal, Aditi and Aliee, Hananeh and Chen, Ann and Deconinck, Louise and Detweiler, Angela and Granados, Alejandro and Huynh, Shelly and Isacco, Laura and Kim, Yang and Klein, Dominik and De Kumar, Bony and Kuppasani, Sunil and Lickert, Heiko and McGeever, Aaron and Melgarejo, Joaquin and Mekonen, Honey and Morri, Maurizio and M\"{u}ller, Michaela and Neff, Norma and Paul, Sheryl and Rieck, Bastian and Schneider, Kaylie and Steelman, Scott and Sterr, Michael and Treacy, Daniel and Tong, Alexander and Villani, Alexandra-Chloe and Wang, Guilin and Yan, Jia and Zhang, Ce and Pisco, Angela and Krishnaswamy, Smita and Theis, Fabian and Bloom, Jonathan M}, + booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks}, + editor = {J. Vanschoren and S. Yeung}, + publisher = {Curran}, + title = {A sandbox for prediction and integration of DNA, RNA, and proteins in single cells}, + url = {https://datasets-benchmarks-proceedings.neurips.cc/paper_files/paper/2021/file/158f3069a435b314a80bdcb024f8e422-Paper-round2.pdf}, + volume = {1}, + year = {2021} +} + +@article{olsson2016single, + title = {Single-cell analysis of mixed-lineage states leading to a binary cell fate choice}, + author = {Andre Olsson and Meenakshi Venkatasubramanian and Viren K. Chaudhri and Bruce J. Aronow and Nathan Salomonis and Harinder Singh and H. 
Leighton Grimes}, + year = {2016}, + month = {Aug.}, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {537}, + number = {7622}, + pages = {698--702}, + doi = {10.1038/nature19348}, + url = {https://doi.org/10.1038/nature19348} +} + +@misc{openproblems, + title = {Open Problems}, + author = {{Open Problems for Single Cell Analysis Consortium}}, + year = {2022}, + url = {https://openproblems.bio} +} + +@article{palla2022squidpy, + title = {Squidpy: a scalable framework for spatial omics analysis}, + author = {Palla, Giovanni and Spitzer, Hannah and Klein, Michal and Fischer, David and Schaar, Anna Christina and Kuemmerle, Louis Benedikt and Rybakov, Sergei and Ibarra, Ignacio L and Holmberg, Olle and Virshup, Isaac and others}, + journal = {Nature methods}, + volume = {19}, + number = {2}, + pages = {171--178}, + year = {2022}, + publisher = {Nature Publishing Group US New York}, + doi = {10.1038/s41592-021-01358-2} +} + +@article{pearson1895regression, + doi = {10.1098/rspl.1895.0041}, + title = {VII. Note on regression and inheritance in the case of two parents}, + author = {Pearson, Karl}, + journal = {proceedings of the royal society of London}, + volume = {58}, + number = {347-352}, + pages = {240--242}, + year = {1895}, + publisher = {The Royal Society London} +} + +@article{pearson1901pca, + title = {On lines and planes of closest fit to systems of points in space}, + author = {Karl Pearson}, + year = {1901}, + month = {Nov.}, + journal = {The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science}, + publisher = {Informa {UK} Limited}, + volume = {2}, + number = {11}, + pages = {559--572}, + doi = {10.1080/14786440109462720}, + url = {https://doi.org/10.1080/14786440109462720} +} + +@article{pliner2019supervised, + title = {Supervised classification enables rapid annotation of cell atlases}, + author = {Hannah A. Pliner and Jay Shendure and Cole Trapnell}, + year = {2019}, + month = {Sep}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {10}, + pages = {983--986}, + doi = {10.1038/s41592-019-0535-3}, + url = {https://doi.org/10.1038/s41592-019-0535-3} +} + +@article{polanski2020bbknn, + title = {{BBKNN}: fast batch alignment of single cell transcriptomes}, + author = {Krzysztof Pola{\'{n}}ski and Matthew D Young and Zhichao Miao and Kerstin B Meyer and Sarah A Teichmann and Jong-Eun Park}, + year = {2019}, + month = {Aug.}, + journal = {Bioinformatics}, + publisher = {Oxford University Press ({OUP})}, + doi = {10.1093/bioinformatics/btz625}, + url = {https://doi.org/10.1093/bioinformatics/btz625}, + editor = {Bonnie Berger} +} + +@article{raredon2022computation, + title = {Computation and visualization of cell-cell signaling topologies in single-cell systems data using Connectome}, + author = {Micha Sam Brickman Raredon and Junchen Yang and James Garritano and Meng Wang and Dan Kushnir and Jonas Christian Schupp and Taylor S. Adams and Allison M. Greaney and Katherine L. Leiby and Naftali Kaminski and Yuval Kluger and Andre Levchenko and Laura E. 
Niklason}, + year = {2022}, + month = {Mar.}, + journal = {Scientific Reports}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {12}, + number = {1}, + doi = {10.1038/s41598-022-07959-x}, + url = {https://doi.org/10.1038/s41598-022-07959-x} +} + +@article{rodriques2019slide, + title = {Slide-seq: A scalable technology for measuring genome-wide expression at high spatial resolution}, + author = {Samuel G. Rodriques and Robert R. Stickels and Aleksandrina Goeva and Carly A. Martin and Evan Murray and Charles R. Vanderburg and Joshua Welch and Linlin M. Chen and Fei Chen and Evan Z. Macosko}, + year = {2019}, + month = {Mar.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {363}, + number = {6434}, + pages = {1463--1467}, + doi = {10.1126/science.aaw1219}, + url = {https://doi.org/10.1126/science.aaw1219} +} + +@article{russell2023slide, + title = {Slide-tags enables single-nucleus barcoding for multimodal spatial genomics}, + volume = {625}, + issn = {1476-4687}, + url = {http://dx.doi.org/10.1038/s41586-023-06837-4}, + doi = {10.1038/s41586-023-06837-4}, + number = {7993}, + journal = {Nature}, + publisher = {Springer Science and Business Media LLC}, + author = {Russell, Andrew J. C. and Weir, Jackson A. and Nadaf, Naeem M. and Shabet, Matthew and Kumar, Vipin and Kambhampati, Sandeep and Raichur, Ruth and Marrero, Giovanni J. and Liu, Sophia and Balderrama, Karol S. and Vanderburg, Charles R. and Shanmugam, Vignesh and Tian, Luyi and Iorgulescu, J. Bryan and Yoon, Charles H. and Wu, Catherine J. and Macosko, Evan Z. and Chen, Fei}, + year = {2023}, + month = dec, + pages = {101--109} +} + +@inproceedings{santos2009on, + author = {Santos, Jorge M. and Embrechts, Mark}, + editor = {Alippi, Cesare and Polycarpou, Marios and Panayiotou, Christos and Ellinas, Georgios}, + title = {On the Use of the Adjusted Rand Index as a Metric for Evaluating Supervised Classification}, + booktitle = {Artificial Neural Networks -- ICANN 2009}, + year = {2009}, + publisher = {Springer Berlin Heidelberg}, + address = {Berlin, Heidelberg}, + pages = {175--184}, + isbn = {978-3-642-04277-5}, + doi = {10.1007/978-3-642-04277-5_18}, + url = {https://doi.org/10.1007/978-3-642-04277-5_18} +} + +@article{sarkar2021separating, + title = {Separating measurement and expression models clarifies confusion in single-cell {RNA} sequencing analysis}, + author = {Abhishek Sarkar and Matthew Stephens}, + year = {2021}, + month = {May}, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {6}, + pages = {770--777}, + doi = {10.1038/s41588-021-00873-4}, + url = {https://doi.org/10.1038/s41588-021-00873-4} +} + +@article{schober2018correlation, + title = {Correlation Coefficients}, + author = {Patrick Schober and Christa Boer and Lothar A. Schwarte}, + year = {2018}, + month = {May}, + journal = {Anesthesia {\&} Analgesia}, + publisher = {Ovid Technologies (Wolters Kluwer Health)}, + volume = {126}, + number = {5}, + pages = {1763--1768}, + doi = {10.1213/ane.0000000000002864}, + url = {https://doi.org/10.1213/ane.0000000000002864} +} + +@inproceedings{stanley2020harmonic, + title = {Harmonic Alignment}, + author = {Jay S. 
Stanley and Scott Gigante and Guy Wolf and Smita Krishnaswamy}, + year = {2020}, + month = {Jan}, + booktitle = {Proceedings of the 2020 {SIAM} International Conference on Data Mining}, + publisher = {Society for Industrial and Applied Mathematics}, + pages = {316--324}, + doi = {10.1137/1.9781611976236.36}, + url = {https://doi.org/10.1137/1.9781611976236.36} +} + +@article{stickels2020highly, + title = {Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2}, + volume = {39}, + issn = {1546-1696}, + url = {http://dx.doi.org/10.1038/s41587-020-0739-1}, + doi = {10.1038/s41587-020-0739-1}, + number = {3}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + author = {Stickels, Robert R. and Murray, Evan and Kumar, Pawan and Li, Jilong and Marshall, Jamie L. and Di Bella, Daniela J. and Arlotta, Paola and Macosko, Evan Z. and Chen, Fei}, + year = {2020}, + month = dec, + pages = {313–319} +} + +@article{stoeckius2017simultaneous, + title = {Simultaneous epitope and transcriptome measurement in single cells}, + author = {Marlon Stoeckius and Christoph Hafemeister and William Stephenson and Brian Houck-Loomis and Pratip K Chattopadhyay and Harold Swerdlow and Rahul Satija and Peter Smibert}, + year = {2017}, + month = {Jul.}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {14}, + number = {9}, + pages = {865--868}, + doi = {10.1038/nmeth.4380}, + url = {https://doi.org/10.1038/nmeth.4380} +} + +@article{stuart2019comprehensive, + title = {Comprehensive Integration of Single-Cell Data}, + author = {Stuart, T. and Butler, A. and Hoffman, P. and Hafemeister, C. and Papalexi, E. and Mauck, W.M. and Hao, Y. and Stoeckius, M. and Smibert, P. and Satija, R.}, + year = {2019}, + journal = {Cell}, + volume = {177}, + number = {7}, + pages = {1888--1902.e21}, + doi = {10.1016/j.cell.2019.05.031} +} + +@article{sun2020statistical, + title = {Statistical analysis of spatial expression patterns for spatially resolved transcriptomic studies}, + author = {Sun, Shiquan and Zhu, Jiaqiang and Zhou, Xiang}, + journal = {Nature methods}, + volume = {17}, + number = {2}, + pages = {193--200}, + year = {2020}, + publisher = {Nature Publishing Group US New York}, + doi = {10.1038/s41592-019-0701-7} +} + +@article{svensson2018spatialde, + title = {SpatialDE: identification of spatially variable genes}, + author = {Svensson, Valentine and Teichmann, Sarah A and Stegle, Oliver}, + journal = {Nature methods}, + volume = {15}, + number = {5}, + pages = {343--346}, + year = {2018}, + publisher = {Nature Publishing Group}, + doi = {10.1038/nmeth.4636} +} + +@article{szubert2019structurepreserving, + title = {Structure-preserving visualisation of high dimensional single-cell datasets}, + author = {Benjamin Szubert and Jennifer E. 
Cole and Claudia Monaco and Ignat Drozdov}, + year = {2019}, + month = {Jun.}, + journal = {Scientific Reports}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {9}, + number = {1}, + doi = {10.1038/s41598-019-45301-0}, + url = {https://doi.org/10.1038/s41598-019-45301-0} +} + +@article{tabula2018single, + title = {Single-cell transcriptomics of 20 mouse organs creates a Tabula Muris}, + author = {{Tabula Muris Consortium}}, + year = {2018}, + month = {Oct.}, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {562}, + number = {7727}, + pages = {367--372}, + doi = {10.1038/s41586-018-0590-4}, + url = {https://doi.org/10.1038/s41586-018-0590-4} +} + +@article{tabula2020single, + title = {A single-cell transcriptomic atlas characterizes ageing tissues in the mouse}, + author = {{Tabula Muris Consortium}}, + year = {2020}, + month = {Jul.}, + journal = {Nature}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {583}, + number = {7817}, + pages = {590--595}, + doi = {10.1038/s41586-020-2496-1}, + url = {https://doi.org/10.1038/s41586-020-2496-1} +} + +@article{tasic2016adult, + title = {Adult mouse cortical cell taxonomy revealed by single cell transcriptomics}, + author = {Bosiljka Tasic and Vilas Menon and Thuc Nghi Nguyen and Tae Kyung Kim and Tim Jarsky and Zizhen Yao and Boaz Levi and Lucas T Gray and Staci A Sorensen and Tim Dolbeare and Darren Bertagnolli and Jeff Goldy and Nadiya Shapovalova and Sheana Parry and Changkyu Lee and Kimberly Smith and Amy Bernard and Linda Madisen and Susan M Sunkin and Michael Hawrylycz and Christof Koch and Hongkui Zeng}, + year = {2016}, + month = {Jan}, + journal = {Nature Neuroscience}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {19}, + number = {2}, + pages = {335--346}, + doi = {10.1038/nn.4216}, + url = {https://doi.org/10.1038/nn.4216} +} + +@article{tian2019benchmarking, + title = {Benchmarking single cell {RNA}-sequencing analysis pipelines using mixture control experiments}, + author = {Luyi Tian and Xueyi Dong and Saskia Freytag and Kim-Anh L{\^{e}} Cao and Shian Su and Abolfazl JalalAbadi and Daniela Amann-Zalcenstein and Tom S. Weber and Azadeh Seidi and Jafar S. Jabbari and Shalin H. Naik and Matthew E. Ritchie}, + year = {2019}, + month = {May}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {16}, + number = {6}, + pages = {479--487}, + doi = {10.1038/s41592-019-0425-8}, + url = {https://doi.org/10.1038/s41592-019-0425-8} +} + +@article{tran2020benchmark, + doi = {10.1186/s13059-019-1850-9}, + url = {https://doi.org/10.1186/s13059-019-1850-9}, + year = {2020}, + month = {Jan}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {21}, + number = {1}, + author = {Hoa Thi Nhu Tran and Kok Siong Ang and Marion Chevrier and Xiaomeng Zhang and Nicole Yee Shin Lee and Michelle Goh and Jinmiao Chen}, + title = {A benchmark of batch-effect correction methods for single-cell {RNA} sequencing data}, + journal = {Genome Biology} +} + +@article{van2018recovering, + title = {Recovering Gene Interactions from Single-Cell Data Using Data Diffusion}, + author = {David van Dijk and Roshan Sharma and Juozas Nainys and Kristina Yim and Pooja Kathail and Ambrose J. Carr and Cassandra Burdziak and Kevin R. Moon and Christine L. 
Chaffer and Diwakar Pattabiraman and Brian Bierie and Linas Mazutis and Guy Wolf and Smita Krishnaswamy and Dana Pe'er}, + year = {2018}, + month = {Jul.}, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {174}, + number = {3}, + pages = {716--729.e27}, + doi = {10.1016/j.cell.2018.05.061}, + url = {https://doi.org/10.1016/j.cell.2018.05.061} +} + +@article{vandermaaten2008visualizing, + title = {Visualizing Data using t-SNE}, + author = {{van der} Maaten, Laurens and Hinton, Geoffrey}, + year = {2008}, + journal = {Journal of Machine Learning Research}, + volume = {9}, + number = {86}, + pages = {2579--2605}, + url = {http://jmlr.org/papers/v9/vandermaaten08a.html} +} + +@inproceedings{venna2001neighborhood, + title = {Neighborhood Preservation in Nonlinear Projection Methods: An Experimental Study}, + author = {Jarkko Venna and Samuel Kaski}, + year = {2001}, + booktitle = {Artificial Neural Networks {\textemdash} {ICANN} 2001}, + publisher = {Springer Berlin Heidelberg}, + pages = {485--491}, + doi = {{10.1007/3-540-44668-0_68}}, + url = {{https://doi.org/10.1007/3-540-44668-0_68}} +} + +@article{venna2006local, + title = {Local multidimensional scaling}, + author = {Jarkko Venna and Samuel Kaski}, + year = {2006}, + month = {Jul.}, + journal = {Neural Networks}, + publisher = {Elsevier {BV}}, + volume = {19}, + number = {6-7}, + pages = {889--899}, + doi = {10.1016/j.neunet.2006.05.014}, + url = {https://doi.org/10.1016/j.neunet.2006.05.014} +} + +@article{virshup2021anndataannotateddata, + title = {{anndata: Annotated data}}, + author = {Virshup, Isaac and Rybakov, Sergei and Theis, Fabian J and + Angerer, Philipp and Alexander Wolf, F}, + journal = {bioRxiv}, + pages = {2021.12.16.473007}, + month = dec, + year = 2021, + doi = {10.1101/2021.12.16.473007}, + language = {en} +} + +@article{wagner2018knearest, + title = {K-nearest neighbor smoothing for high-throughput single-cell RNA-Seq data}, + author = {Wagner, Florian and Yan, Yun and Yanai, Itai}, + year = {2018}, + journal = {bioRxiv}, + publisher = {Cold Spring Harbor Laboratory}, + doi = {10.1101/217737}, + url = {https://www.biorxiv.org/content/early/2018/04/09/217737}, + elocation-id = {217737}, + eprint = {https://www.biorxiv.org/content/early/2018/04/09/217737.full.pdf} +} + +@article{wagner2018single, + title = {Single-cell mapping of gene expression landscapes and lineage in the zebrafish embryo}, + author = {Daniel E. Wagner and Caleb Weinreb and Zach M. Collins and James A. Briggs and Sean G. Megason and Allon M. 
Klein}, + year = {2018}, + month = {Jun.}, + journal = {Science}, + publisher = {American Association for the Advancement of Science ({AAAS})}, + volume = {360}, + number = {6392}, + pages = {981--987}, + doi = {10.1126/science.aar4362}, + url = {https://doi.org/10.1126/science.aar4362} +} + +@article{wang2013target, + title = {Target analysis by integration of transcriptome and {ChIP}-seq data with {BETA}}, + author = {Su Wang and Hanfei Sun and Jian Ma and Chongzhi Zang and Chenfei Wang and Juan Wang and Qianzi Tang and Clifford A Meyer and Yong Zhang and X Shirley Liu}, + year = {2013}, + month = {Nov.}, + journal = {Nature Protocols}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {8}, + number = {12}, + pages = {2502--2515}, + doi = {10.1038/nprot.2013.150}, + url = {https://doi.org/10.1038/nprot.2013.150} +} + +@article{wang2017visualization, + title = {Visualization and analysis of single-cell {RNA}-seq data by kernel-based similarity learning}, + volume = {14}, + copyright = {2017 Springer Nature America, Inc.}, + issn = {1548-7105}, + url = {https://www.nature.com/articles/nmeth.4207}, + doi = {10.1038/nmeth.4207}, + abstract = {The SIMLR software identifies similarities between cells across a range of single-cell RNA-seq data, enabling effective dimension reduction, clustering and visualization.}, + language = {en}, + number = {4}, + journal = {Nature Methods}, + author = {Wang, Bo and Zhu, Junjie and Pierson, Emma and Ramazzotti, Daniele and Batzoglou, Serafim}, + month = apr, + year = {2017}, + publisher = {Nature Publishing Group}, + keywords = {Gene expression, Genome informatics, Machine learning, Statistical methods}, + pages = {414--416} +} + +@article{wang2018three, + title = {Three-dimensional intact-tissue sequencing of single-cell transcriptional states}, + volume = {361}, + issn = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.aat5691}, + doi = {10.1126/science.aat5691}, + number = {6400}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Wang, Xiao and Allen, William E. and Wright, Matthew A. and Sylwestrak, Emily L. and Samusik, Nikolay and Vesuna, Sam and Evans, Kathryn and Liu, Cindy and Ramakrishnan, Charu and Liu, Jia and Nolan, Garry P. 
and Bava, Felice-Alessio and Deisseroth, Karl}, + year = {2018}, + month = jul +} + +@article{wang2022high, + title = {High-resolution 3D spatiotemporal transcriptomic maps of developing Drosophila embryos and larvae}, + volume = {57}, + issn = {1534-5807}, + url = {http://dx.doi.org/10.1016/j.devcel.2022.04.006}, + doi = {10.1016/j.devcel.2022.04.006}, + number = {10}, + journal = {Developmental Cell}, + publisher = {Elsevier BV}, + author = {Wang, Mingyue and Hu, Qinan and Lv, Tianhang and Wang, Yuhang and Lan, Qing and Xiang, Rong and Tu, Zhencheng and Wei, Yanrong and Han, Kai and Shi, Chang and Guo, Junfu and Liu, Chao and Yang, Tao and Du, Wensi and An, Yanru and Cheng, Mengnan and Xu, Jiangshan and Lu, Haorong and Li, Wangsheng and Zhang, Shaofang and Chen, Ao and Chen, Wei and Li, Yuxiang and Wang, Xiaoshan and Xu, Xun and Hu, Yuhui and Liu, Longqi}, + year = {2022}, + month = may, + pages = {1271--1283.e4} +} + +@article{weber2023nnsvg, + title = {nnSVG for the scalable identification of spatially variable genes using nearest-neighbor Gaussian processes}, + author = {Weber, Lukas M and Saha, Arkajyoti and Datta, Abhirup and Hansen, Kasper D and Hicks, Stephanie C}, + journal = {Nature communications}, + volume = {14}, + number = {1}, + pages = {4059}, + year = {2023}, + publisher = {Nature Publishing Group UK London}, + doi = {10.1038/s41467-023-39748-z} +} + +@article{welch2019single, + title = {Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain Cell Identity}, + author = {Joshua D. Welch and Velina Kozareva and Ashley Ferreira and Charles Vanderburg and Carly Martin and Evan Z. Macosko}, + year = {2019}, + month = {Jun.}, + journal = {Cell}, + publisher = {Elsevier {BV}}, + volume = {177}, + number = {7}, + pages = {1873--1887.e17}, + doi = {10.1016/j.cell.2019.05.006}, + url = {https://doi.org/10.1016/j.cell.2019.05.006} +} + +@article{wilkinson1973symbolic, + doi = {10.2307/2346786}, + url = {https://doi.org/10.2307/2346786}, + year = {1973}, + publisher = {{JSTOR}}, + volume = {22}, + number = {3}, + pages = {392}, + author = {G. N. Wilkinson and C. E. Rogers}, + title = {Symbolic Description of Factorial Models for Analysis of Variance}, + journal = {Applied Statistics} +} + +@article{wu2021single, + title = {A single-cell and spatially resolved atlas of human breast cancers}, + author = {Sunny Z. Wu and Ghamdan Al-Eryani and Daniel Lee Roden and Simon Junankar and Kate Harvey and Alma Andersson and Aatish Thennavan and Chenfei Wang and James R. Torpy and Nenad Bartonicek and Taopeng Wang and Ludvig Larsson and Dominik Kaczorowski and Neil I. Weisenfeld and Cedric R. Uytingco and Jennifer G. Chew and Zachary W. Bent and Chia-Ling Chan and Vikkitharan Gnanasambandapillai and Charles-Antoine Dutertre and Laurence Gluch and Mun N. Hui and Jane Beith and Andrew Parker and Elizabeth Robbins and Davendra Segara and Caroline Cooper and Cindy Mak and Belinda Chan and Sanjay Warrier and Florent Ginhoux and Ewan Millar and Joseph E. Powell and Stephen R. Williams and X. Shirley Liu and Sandra O'Toole and Elgene Lim and Joakim Lundeberg and Charles M. 
Perou and Alexander Swarbrick}, + year = {2021}, + month = {Sep}, + journal = {Nature Genetics}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {53}, + number = {9}, + pages = {1334--1347}, + doi = {10.1038/s41588-021-00911-1}, + url = {https://doi.org/10.1038/s41588-021-00911-1} +} + +@article{xiong2020neuralee, + title = {{NeuralEE}: A {GPU}-Accelerated Elastic Embedding Dimensionality Reduction Method for Visualizing Large-Scale {scRNA}-Seq Data}, + author = {Jiankang Xiong and Fuzhou Gong and Lin Wan and Liang Ma}, + year = {2020}, + month = {Oct.}, + journal = {Frontiers in Genetics}, + publisher = {Frontiers Media {SA}}, + volume = {11}, + doi = {10.3389/fgene.2020.00786}, + url = {https://doi.org/10.3389/fgene.2020.00786} +} + +@article{xiong2021online, + title = {Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space}, + author = {Lei Xiong and Kang Tian and Yuzhe Li and Weixi Ning and Xin Gao and Qiangfeng Cliff Zhang}, + year = {2022}, + month = {Oct.}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {13}, + number = {1}, + doi = {10.1038/s41467-022-33758-z}, + url = {https://doi.org/10.1038/s41467-022-33758-z} +} + +@article{xu2021probabilistic, + title = {Probabilistic harmonization and annotation of single-cell transcriptomics data with deep generative models}, + author = {Chenling Xu and Romain Lopez and Edouard Mehlman and Jeffrey Regier and Michael I Jordan and Nir Yosef}, + year = {2021}, + month = {Jan}, + journal = {Molecular Systems Biology}, + publisher = {{Embo}}, + volume = {17}, + number = {1}, + doi = {10.15252/msb.20209620}, + url = {https://doi.org/10.15252/msb.20209620} +} + +@article{zappia2018exploring, + doi = {10.1371/journal.pcbi.1006245}, + url = {https://doi.org/10.1371/journal.pcbi.1006245}, + year = {2018}, + month = {Jun.}, + publisher = {Public Library of Science ({PLoS})}, + volume = {14}, + number = {6}, + pages = {e1006245}, + author = {Luke Zappia and Belinda Phipson and Alicia Oshlack}, + editor = {Dina Schneidman}, + title = {Exploring the single-cell {RNA}-seq analysis landscape with the {scRNA}-tools database}, + journal = {{PLOS} Computational Biology} +} + +@article{zhang2021pydrmetrics, + title = {{pyDRMetrics} - A Python toolkit for dimensionality reduction quality assessment}, + author = {Yinsheng Zhang and Qian Shang and Guoming Zhang}, + year = {2021}, + month = {Feb.}, + journal = {Heliyon}, + publisher = {Elsevier {BV}}, + volume = {7}, + number = {2}, + pages = {e06199}, + doi = {10.1016/j.heliyon.2021.e06199}, + url = {https://doi.org/10.1016/j.heliyon.2021.e06199} +} + +@article{zhang2022identification, + title = {Identification of spatially variable genes with graph cuts}, + author = {Zhang, Ke and Feng, Wanwan and Wang, Peng}, + journal = {Nature Communications}, + volume = {13}, + number = {1}, + pages = {5488}, + year = {2022}, + publisher = {Nature Publishing Group UK London}, + doi = {10.1038/s41467-022-33182-3} +} + +@article{zhu2021spark, + title = {SPARK-X: non-parametric modeling enables scalable and robust detection of spatial expression patterns for large spatial transcriptomic studies}, + author = {Zhu, Jiaqiang and Sun, Shiquan and Zhou, Xiang}, + journal = {Genome biology}, + volume = {22}, + number = {1}, + pages = {184}, + year = {2021}, + publisher = {Springer}, + doi = {10.1186/s13059-021-02404-0} +} + +@article{hrovatin2023delineating, + author = {Karin Hrovatin and 
Aim{\'e}e Bastidas-Ponce and Mostafa Bakhti and Luke Zappia and Maren B{\"u}ttner and Ciro Sallino and Michael Sterr and Anika B{\"o}ttcher and Adriana Migliorini and Heiko Lickert and Fabian J. Theis}, + title = {Delineating mouse β-cell identity during lifetime and in diabetes with a single cell atlas}, + elocation-id = {2022.12.22.521557}, + year = {2023}, + doi = {10.1101/2022.12.22.521557}, + publisher = {Cold Spring Harbor Laboratory}, + url = {https://www.biorxiv.org/content/early/2023/04/25/2022.12.22.521557}, + eprint = {https://www.biorxiv.org/content/early/2023/04/25/2022.12.22.521557.full.pdf}, + journal = {bioRxiv} +} + +@article{sikkema2023integrated, + title = {An integrated cell atlas of the lung in health and disease}, + volume = {29}, + issn = {1546-170X}, + url = {http://dx.doi.org/10.1038/s41591-023-02327-2}, + doi = {10.1038/s41591-023-02327-2}, + number = {6}, + journal = {Nature Medicine}, + publisher = {Springer Science and Business Media LLC}, + author = {Sikkema, Lisa and Ramírez-Suástegui, Ciro and Strobl, Daniel C. and Gillett, Tessa E. and Zappia, Luke and Madissoon, Elo and Markov, Nikolay S. and Zaragosi, Laure-Emmanuelle and Ji, Yuge and Ansari, Meshal and Arguel, Marie-Jeanne and Apperloo, Leonie and Banchero, Martin and Bécavin, Christophe and Berg, Marijn and Chichelnitskiy, Evgeny and Chung, Mei-i and Collin, Antoine and Gay, Aurore C. A. and Gote-Schniering, Janine and Hooshiar Kashani, Baharak and Inecik, Kemal and Jain, Manu and Kapellos, Theodore S. and Kole, Tessa M. and Leroy, Sylvie and Mayr, Christoph H. and Oliver, Amanda J. and von Papen, Michael and Peter, Lance and Taylor, Chase J. and Walzthoeni, Thomas and Xu, Chuan and Bui, Linh T. and De Donno, Carlo and Dony, Leander and Faiz, Alen and Guo, Minzhe and Gutierrez, Austin J. and Heumos, Lukas and Huang, Ni and Ibarra, Ignacio L. and Jackson, Nathan D. and Kadur Lakshminarasimha Murthy, Preetish and Lotfollahi, Mohammad and Tabib, Tracy and Talavera-López, Carlos and Travaglini, Kyle J. and Wilbrey-Clark, Anna and Worlock, Kaylee B. and Yoshida, Masahiro and Chen, Yuexin and Hagood, James S. and Agami, Ahmed and Horvath, Peter and Lundeberg, Joakim and Marquette, Charles-Hugo and Pryhuber, Gloria and Samakovlis, Chistos and Sun, Xin and Ware, Lorraine B. and Zhang, Kun and van den Berge, Maarten and Bossé, Yohan and Desai, Tushar J. and Eickelberg, Oliver and Kaminski, Naftali and Krasnow, Mark A. and Lafyatis, Robert and Nikolic, Marko Z. and Powell, Joseph E. and Rajagopal, Jayaraj and Rojas, Mauricio and Rozenblatt-Rosen, Orit and Seibold, Max A. and Sheppard, Dean and Shepherd, Douglas P. and Sin, Don D. and Timens, Wim and Tsankov, Alexander M. and Whitsett, Jeffrey and Xu, Yan and Banovich, Nicholas E. and Barbry, Pascal and Duong, Thu Elizabeth and Falk, Christine S. and Meyer, Kerstin B. and Kropski, Jonathan A. and Pe’er, Dana and Schiller, Herbert B. and Tata, Purushothama Rao and Schultze, Joachim L. and Teichmann, Sara A. and Misharin, Alexander V. and Nawijn, Martijn C. and Luecken, Malte D. and Theis, Fabian J.}, + year = {2023}, + month = jun, + pages = {1563–1577} +} + +@article{consortium2022tabula, + title = {The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans}, + volume = {376}, + issn = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abl4896}, + doi = {10.1126/science.abl4896}, + number = {6594}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Jones, Robert C. 
and Karkanias, Jim and Krasnow, Mark A. and Pisco, Angela Oliveira and Quake, Stephen R. and Salzman, Julia and Yosef, Nir and Bulthaup, Bryan and Brown, Phillip and Harper, William and Hemenez, Marisa and Ponnusamy, Ravikumar and Salehi, Ahmad and Sanagavarapu, Bhavani A. and Spallino, Eileen and Aaron, Ksenia A. and Concepcion, Waldo and Gardner, James M. and Kelly, Burnett and Neidlinger, Nikole and Wang, Zifa and Crasta, Sheela and Kolluru, Saroja and Morri, Maurizio and Pisco, Angela Oliveira and Tan, Serena Y. and Travaglini, Kyle J. and Xu, Chenling and Alcántara-Hernández, Marcela and Almanzar, Nicole and Antony, Jane and Beyersdorf, Benjamin and Burhan, Deviana and Calcuttawala, Kruti and Carter, Matthew M. and Chan, Charles K. F. and Chang, Charles A. and Chang, Stephen and Colville, Alex and Crasta, Sheela and Culver, Rebecca N. and Cvijović, Ivana and D’Amato, Gaetano and Ezran, Camille and Galdos, Francisco X. and Gillich, Astrid and Goodyer, William R. and Hang, Yan and Hayashi, Alyssa and Houshdaran, Sahar and Huang, Xianxi and Irwin, Juan C. and Jang, SoRi and Juanico, Julia Vallve and Kershner, Aaron M. and Kim, Soochi and Kiss, Bernhard and Kolluru, Saroja and Kong, William and Kumar, Maya E. and Kuo, Angera H. and Leylek, Rebecca and Li, Baoxiang and Loeb, Gabriel B. and Lu, Wan-Jin and Mantri, Sruthi and Markovic, Maxim and McAlpine, Patrick L. and de Morree, Antoine and Morri, Maurizio and Mrouj, Karim and Mukherjee, Shravani and Muser, Tyler and Neuh\"{o}fer, Patrick and Nguyen, Thi D. and Perez, Kimberly and Phansalkar, Ragini and Pisco, Angela Oliveira and Puluca, Nazan and Qi, Zhen and Rao, Poorvi and Raquer-McKay, Hayley and Schaum, Nicholas and Scott, Bronwyn and Seddighzadeh, Bobak and Segal, Joe and Sen, Sushmita and Sikandar, Shaheen and Spencer, Sean P. and Steffes, Lea C. and Subramaniam, Varun R. and Swarup, Aditi and Swift, Michael and Travaglini, Kyle J. and Van Treuren, Will and Trimm, Emily and Veizades, Stefan and Vijayakumar, Sivakamasundari and Vo, Kim Chi and Vorperian, Sevahn K. and Wang, Wanxin and Weinstein, Hannah N. W. and Winkler, Juliane and Wu, Timothy T. H. and Xie, Jamie and Yung, Andrea R. and Zhang, Yue and Detweiler, Angela M. and Mekonen, Honey and Neff, Norma F. and Sit, Rene V. and Tan, Michelle and Yan, Jia and Bean, Gregory R. and Charu, Vivek and Forgó, Erna and Martin, Brock A. and Ozawa, Michael G. and Silva, Oscar and Tan, Serena Y. and Toland, Angus and Vemuri, Venkata N. P. and Afik, Shaked and Awayan, Kyle and Botvinnik, Olga Borisovna and Byrne, Ashley and Chen, Michelle and Dehghannasiri, Roozbeh and Detweiler, Angela M. and Gayoso, Adam and Granados, Alejandro A. and Li, Qiqing and Mahmoudabadi, Gita and McGeever, Aaron and de Morree, Antoine and Olivieri, Julia Eve and Park, Madeline and Pisco, Angela Oliveira and Ravikumar, Neha and Salzman, Julia and Stanley, Geoff and Swift, Michael and Tan, Michelle and Tan, Weilun and Tarashansky, Alexander J. and Vanheusden, Rohan and Vorperian, Sevahn K. and Wang, Peter and Wang, Sheng and Xing, Galen and Xu, Chenling and Yosef, Nir and Alcántara-Hernández, Marcela and Antony, Jane and Chan, Charles K. F. and Chang, Charles A. and Colville, Alex and Crasta, Sheela and Culver, Rebecca and Dethlefsen, Les and Ezran, Camille and Gillich, Astrid and Hang, Yan and Ho, Po-Yi and Irwin, Juan C. and Jang, SoRi and Kershner, Aaron M. and Kong, William and Kumar, Maya E. and Kuo, Angera H. and Leylek, Rebecca and Liu, Shixuan and Loeb, Gabriel B. and Lu, Wan-Jin and Maltzman, Jonathan S. 
and Metzger, Ross J. and de Morree, Antoine and Neuh\"{o}fer, Patrick and Perez, Kimberly and Phansalkar, Ragini and Qi, Zhen and Rao, Poorvi and Raquer-McKay, Hayley and Sasagawa, Koki and Scott, Bronwyn and Sinha, Rahul and Song, Hanbing and Spencer, Sean P. and Swarup, Aditi and Swift, Michael and Travaglini, Kyle J. and Trimm, Emily and Veizades, Stefan and Vijayakumar, Sivakamasundari and Wang, Bruce and Wang, Wanxin and Winkler, Juliane and Xie, Jamie and Yung, Andrea R. and Artandi, Steven E. and Beachy, Philip A. and Clarke, Michael F. and Giudice, Linda C. and Huang, Franklin W. and Huang, Kerwyn Casey and Idoyaga, Juliana and Kim, Seung K. and Krasnow, Mark and Kuo, Christin S. and Nguyen, Patricia and Quake, Stephen R. and Rando, Thomas A. and Red-Horse, Kristy and Reiter, Jeremy and Relman, David A. and Sonnenburg, Justin L. and Wang, Bruce and Wu, Albert and Wu, Sean M. and Wyss-Coray, Tony}, + year = {2022}, + month = may +} + +@article{dominguez2022crosstissue, + title = {Cross-tissue immune cell analysis reveals tissue-specific features in humans}, + volume = {376}, + issn = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abl5197}, + doi = {10.1126/science.abl5197}, + number = {6594}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Domínguez Conde, C. and Xu, C. and Jarvis, L. B. and Rainbow, D. B. and Wells, S. B. and Gomes, T. and Howlett, S. K. and Suchanek, O. and Polanski, K. and King, H. W. and Mamanova, L. and Huang, N. and Szabo, P. A. and Richardson, L. and Bolt, L. and Fasouli, E. S. and Mahbubani, K. T. and Prete, M. and Tuck, L. and Richoz, N. and Tuong, Z. K. and Campos, L. and Mousa, H. S. and Needham, E. J. and Pritchard, S. and Li, T. and Elmentaite, R. and Park, J. and Rahmani, E. and Chen, D. and Menon, D. K. and Bayraktar, O. A. and James, L. K. and Meyer, K. B. and Yosef, N. and Clatworthy, M. R. and Sims, P. A. and Farber, D. L. and Saeb-Parsy, K. and Jones, J. L. and Teichmann, S. A.}, + year = {2022}, + month = may +} + +@article{eraslan2022singlenucleus, + title = {Single-nucleus cross-tissue molecular reference maps toward understanding disease gene function}, + volume = {376}, + issn = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abl4290}, + doi = {10.1126/science.abl4290}, + number = {6594}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Eraslan, G\"{o}kcen and Drokhlyansky, Eugene and Anand, Shankara and Fiskin, Evgenij and Subramanian, Ayshwarya and Slyper, Michal and Wang, Jiali and Van Wittenberghe, Nicholas and Rouhana, John M. and Waldman, Julia and Ashenberg, Orr and Lek, Monkol and Dionne, Danielle and Win, Thet Su and Cuoco, Michael S. and Kuksenko, Olena and Tsankov, Alexander M. and Branton, Philip A. and Marshall, Jamie L. and Greka, Anna and Getz, Gad and Segrè, Ayellet V. and Aguet, Fran\c{c}ois and Rozenblatt-Rosen, Orit and Ardlie, Kristin G. 
and Regev, Aviv}, + year = {2022}, + month = may +} + +@article{li2023integrated, + title = {{Integrated multi-omics single cell atlas of the human retina}}, + author = {Li, Jin and Wang, Jun and Ibarra, Ignacio L and Cheng, Xuesen and + Luecken, Malte D and Lu, Jiaxiong and Monavarfeshani, Aboozar and + Yan, Wenjun and Zheng, Yiqiao and Zuo, Zhen and Zayas Colborn, + Samantha Lynn and Cortez, Berenice Sarahi and Owen, Leah A and + Tran, Nicholas M and Shekhar, Karthik and Sanes, Joshua R and + Stout, J Timothy and Chen, Shiming and Li, Yumei and DeAngelis, + Margaret M and Theis, Fabian J and Chen, Rui}, + journal = {bioRxiv}, + month = nov, + year = 2023, + doi = {10.1101/2023.11.07.566105} +} + +@article{wilson2022multimodal, + title = {Multimodal single cell sequencing implicates chromatin accessibility and genetic background in diabetic kidney disease progression}, + volume = {13}, + issn = {2041-1723}, + url = {http://dx.doi.org/10.1038/s41467-022-32972-z}, + doi = {10.1038/s41467-022-32972-z}, + number = {1}, + journal = {Nature Communications}, + publisher = {Springer Science and Business Media LLC}, + author = {Wilson, Parker C. and Muto, Yoshiharu and Wu, Haojia and Karihaloo, Anil and Waikar, Sushrut S. and Humphreys, Benjamin D.}, + year = {2022}, + month = sep +} + +@article{steuernagel2022hypomap, + title = {HypoMap—a unified single-cell gene expression atlas of the murine hypothalamus}, + volume = {4}, + issn = {2522-5812}, + url = {http://dx.doi.org/10.1038/s42255-022-00657-y}, + doi = {10.1038/s42255-022-00657-y}, + number = {10}, + journal = {Nature Metabolism}, + publisher = {Springer Science and Business Media LLC}, + author = {Steuernagel, Lukas and Lam, Brian Y. H. and Klemm, Paul and Dowsett, Georgina K. C. and Bauder, Corinna A. and Tadross, John A. and Hitschfeld, Tamara Sotelo and del Rio Martin, Almudena and Chen, Weiyi and de Solis, Alain J. and Fenselau, Henning and Davidsen, Peter and Cimino, Irene and Kohnke, Sara N. and Rimmington, Debra and Coll, Anthony P. and Beyer, Andreas and Yeo, Giles S. H. and Br\"{u}ning, Jens C.}, + year = {2022}, + month = oct, + pages = {1402–1419} +} + +@article{tian2023singlecell, + title = {Single-cell DNA methylation and 3D genome architecture in the human brain}, + volume = {382}, + issn = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.adf5357}, + doi = {10.1126/science.adf5357}, + number = {6667}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Tian, Wei and Zhou, Jingtian and Bartlett, Anna and Zeng, Qiurui and Liu, Hanqing and Castanon, Rosa G. and Kenworthy, Mia and Altshul, Jordan and Valadon, Cynthia and Aldridge, Andrew and Nery, Joseph R. and Chen, Huaming and Xu, Jiaying and Johnson, Nicholas D. and Lucero, Jacinta and Osteen, Julia K. and Emerson, Nora and Rink, Jon and Lee, Jasper and Li, Yang E. and Siletti, Kimberly and Liem, Michelle and Claffey, Naomi and O’Connor, Carolyn and Yanny, Anna Marie and Nyhus, Julie and Dee, Nick and Casper, Tamara and Shapovalova, Nadiya and Hirschstein, Daniel and Ding, Song-Lin and Hodge, Rebecca and Levi, Boaz P. and Keene, C. Dirk and Linnarsson, Sten and Lein, Ed and Ren, Bing and Behrens, M. 
Margarita and Ecker, Joseph R.}, + year = {2023}, + month = oct +} + +@article{sonrel2023metaanalysis, + title = {Meta-analysis of (single-cell method) benchmarks reveals the need for extensibility and interoperability}, + volume = {24}, + issn = {1474-760X}, + url = {http://dx.doi.org/10.1186/s13059-023-02962-5}, + doi = {10.1186/s13059-023-02962-5}, + number = {1}, + journal = {Genome Biology}, + publisher = {Springer Science and Business Media LLC}, + author = {Sonrel, Anthony and Luetge, Almut and Soneson, Charlotte and Mallona, Izaskun and Germain, Pierre-Luc and Knyazev, Sergey and Gilis, Jeroen and Gerber, Reto and Seurinck, Ruth and Paul, Dominique and Sonder, Emanuel and Crowell, Helena L. and Fanaswala, Imran and Al-Ajami, Ahmad and Heidari, Elyas and Schmeing, Stephan and Milosavljevic, Stefan and Saeys, Yvan and Mangul, Serghei and Robinson, Mark D.}, + year = {2023}, + month = may +} + +@article{saelens2019comparison, + title = {A comparison of single-cell trajectory inference methods}, + volume = {37}, + issn = {1546-1696}, + url = {http://dx.doi.org/10.1038/s41587-019-0071-9}, + doi = {10.1038/s41587-019-0071-9}, + number = {5}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + author = {Saelens, Wouter and Cannoodt, Robrecht and Todorov, Helena and Saeys, Yvan}, + year = {2019}, + month = apr, + pages = {547–554} +} + +@article{huang2018savergene, + title = {SAVER: gene expression recovery for single-cell RNA sequencing}, + volume = {15}, + issn = {1548-7105}, + url = {http://dx.doi.org/10.1038/s41592-018-0033-z}, + doi = {10.1038/s41592-018-0033-z}, + number = {7}, + journal = {Nature Methods}, + publisher = {Springer Science and Business Media LLC}, + author = {Huang, Mo and Wang, Jingshu and Torre, Eduardo and Dueck, Hannah and Shaffer, Sydney and Bonasio, Roberto and Murray, John I. 
and Raj, Arjun and Li, Mingyao and Zhang, Nancy R.}, + year = {2018}, + month = jun, + pages = {539–542} +} + +@article{chari2023speciousart, + title = {The specious art of single-cell genomics}, + volume = {19}, + issn = {1553-7358}, + url = {http://dx.doi.org/10.1371/journal.pcbi.1011288}, + doi = {10.1371/journal.pcbi.1011288}, + number = {8}, + journal = {PLOS Computational Biology}, + publisher = {Public Library of Science (PLoS)}, + author = {Chari, Tara and Pachter, Lior}, + editor = {Papin, Jason A.}, + year = {2023}, + month = aug, + pages = {e1011288} +} + +@article{szalata2024transformers, + title = {{Transformers in single-cell omics: a review and new perspectives}}, + author = {Szałata, Artur and Hrovatin, Karin and Becker, Sören and Tejada-Lapuerta, Alejandro and Cui, Haotian and Wang, Bo and Theis, Fabian J}, + journal = {Nature methods}, + publisher = {Springer Science and Business Media LLC}, + volume = 21, + number = 8, + pages = {1430--1443}, + month = aug, + year = 2024, + doi = {10.1038/s41592-024-02353-z}, + pmid = 39122952, + issn = {1548-7091,1548-7105}, + language = {en} +} + +@article{boiarsky2023foundationmodels, + title = {{A deep dive into single-cell RNA sequencing foundation models}}, + author = {Boiarsky, Rebecca and Singh, Nalini and Buendia, Alejandro and Getz, Gad and Sontag, David}, + journal = {bioRxiv}, + pages = {2023.10.19.563100}, + month = oct, + year = 2023, + doi = {10.1101/2023.10.19.563100}, + language = {en} +} + +@article{liu2024foundationmodels, + title = {{Evaluating the utilities of foundation models in single-cell data analysis}}, + author = {Liu, Tianyu and Li, Kexing and Wang, Yuge and Li, Hongyu and Zhao, Hongyu}, + journal = {bioRxiv.org: the preprint server for biology}, + pages = {2023.09.08.555192}, + month = aug, + year = 2024, + doi = {10.1101/2023.09.08.555192}, + pmc = {PMC10925156}, + pmid = 38464157, + language = {en} +} diff --git a/src/reporting/shared/functions.R b/src/reporting/shared/functions.R new file mode 100644 index 000000000..42179d5ce --- /dev/null +++ b/src/reporting/shared/functions.R @@ -0,0 +1,95 @@ +#' Read bibliography +#' +#' @param bib_file Path to a bibliography BibTex file +#' +#' @returns A list with two elements `doi` and `bibtex` where the names are +#' reference keys and the values are the corresponding DOIs or BibTeX entries +read_bibliography <- function(bib_file) { + bibentries <- bibtex::read.bib(bib_file) + + dois <- lapply(bibentries, function(.entry) { + if (!is.null(.entry$doi)) { + .entry$doi + } else if (!is.null(.entry$DOI)) { + .entry$DOI + } else { + NULL + } + }) |> + purrr::compact() + + bibtex <- lapply(bibentries, function(.entry) { + format(.entry, "bibtex") + }) + + list( + doi = dois, + bibtex = bibtex + ) +} + +#' Get references list +#' +#' Convert a reference field from a config file into a references list +#' resolving any legacy reference keys +#' +#' @param reference The reference field value +#' @param bibliography A bibliography list as returned by `read_bibliography()` +#' +#' @returns A list with two elements `doi` and `bibtex`, where each is a character vector +#' containing corresponding DOIs or BibTeX entries +get_references_list <- function(reference, bibliography) { + # If null, return empty references + if (is.null(reference)) { + return(list(doi = character(0), bibtex = character(0))) + } + + # If reference is a list, assume it is in the current format + if (is.list(reference)) { + return( + list( + doi = reference$doi %||% character(0), + bibtex = 
reference$bibtex %||% character(0) + ) + ) + } + + # If not a list, check if it is a DOI or BibTeX entry + if (startsWith(reference, "@")) { + return(list(doi = character(0), bibtex = reference)) + } else if (startsWith(reference, "10.")) { + return(list(doi = reference, bibtex = character(0))) + } + + # Otherwise, assume it is a bibliography key + if (reference %in% names(bibliography$doi)) { + return(list(doi = bibliography$doi[[reference]], bibtex = character(0))) + } else if (reference %in% names(bibliography$bibtex)) { + return(list(doi = character(0), bibtex = bibliography$bibtex[[reference]])) + } else { + stop("Reference key '", reference, "' not found in bibliography") + } +} + +#' Get authors list +#' +#' Convert a list of authors from a config file into a structured list +#' +#' @param authors The authors field from a config file +#' +#' @returns An authors list in the expected format +get_authors_list <- function(authors) { + `%||%` <- rlang::`%||%` + + purrr::map(authors, function(.author) { + other_fields <- setdiff(names(.author$info), c("github", "orcid")) + + list( + name = jsonlite::unbox(.author$name), + roles = .author$roles %||% character(0), + github = jsonlite::unbox(.author$info$github), + orcid = jsonlite::unbox(.author$info$orcid), + info = .author$info[other_fields] + ) + }) +}
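+
+# --- Usage sketch (illustrative only; not called by the pipeline) ---
+# A minimal example of how the helpers above fit together. The bibliography
+# path below is hypothetical; "saelens2019comparison" is a key from the
+# project bibliography added in this change. Note that the list branch of
+# get_references_list() relies on `%||%` being available (base R >= 4.4.0,
+# or rlang attached).
+#
+# bib <- read_bibliography("path/to/library.bib")
+#
+# # Legacy bibliography key resolved to its DOI or BibTeX entry
+# get_references_list("saelens2019comparison", bib)
+#
+# # A plain DOI string is passed through as-is
+# get_references_list("10.1038/s41587-019-0071-9", bib)
+#
+# # Authors field from a config, converted to the structured format
+# get_authors_list(list(list(
+#   name = "Ada Example", # hypothetical author
+#   roles = c("author"),
+#   info = list(github = "ada-example", orcid = "0000-0000-0000-0000")
+# )))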