
Update process results workflow for new website #919


Open — wants to merge 57 commits into base: main
- e734440 Add combine_output reporting component (lazappi, Feb 3, 2025)
- 45abf32 Update process_task_results to combine outputs (lazappi, Feb 3, 2025)
- 37042df Update get_task_info to new schema (lazappi, Feb 5, 2025)
- a747b9b Update get_method_info to new schema (lazappi, Feb 5, 2025)
- e3439c9 Update get_metric_info to new schema (lazappi, Feb 5, 2025)
- fdbdff8 Update get_dataset_info to new schema (lazappi, Feb 6, 2025)
- 00be327 Update get_results to new schema (lazappi, Feb 10, 2025)
- d318832 Add metric resources to results component/schema (lazappi, Jul 17, 2025)
- d5d1b8f Update generate_qc to match new schema (lazappi, Jul 17, 2025)
- 11d6d9d Update combine_output to new schema (lazappi, Jul 18, 2025)
- 6a91673 Update viash version and reference (lazappi, Jul 18, 2025)
- 5f70ab9 Update process_results workflow to new components (lazappi, Jul 18, 2025)
- 99a4a50 Add render_report component (lazappi, Jul 28, 2025)
- 264790e Add render_report to process results workflow (lazappi, Jul 28, 2025)
- 272bba8 Handle missing values in generate_qc() (lazappi, Jul 29, 2025)
- 8ab4f60 Handle missing controls in results report (lazappi, Jul 29, 2025)
- d8fdfd9 update common submodule (rcannood, Jul 29, 2025)
- 15a2d18 Merge remote-tracking branch 'origin/main' into feature/no-ref/update… (rcannood, Jul 29, 2025)
- e9e3ef1 Merge remote-tracking branch 'origin' into feature/no-ref/update-proc… (lazappi, Jul 29, 2025)
- cbf0e50 Merge branch 'feature/no-ref/update-process-results' of github.com:op… (lazappi, Jul 29, 2025)
- c1198e5 Strip quotes from descriptions/summaries (lazappi, Jul 30, 2025)
- a7b9c6d Add roles to author details (lazappi, Jul 30, 2025)
- 19ecc9d Add QC check for number of successful controls (lazappi, Jul 30, 2025)
- dc75604 Handle missing exit codes in report (lazappi, Jul 30, 2025)
- 382a35d Add schema validation to process_results workflow (lazappi, Jul 30, 2025)
- caad282 Fix combine_output image version (lazappi, Jul 31, 2025)
- 4b3b585 Handle alternative field names in get_dataset_info (lazappi, Jul 31, 2025)
- 8d6a370 Handle v1 slots in get_method_info (lazappi, Jul 31, 2025)
- 186133f Handle null author fields in report (lazappi, Jul 31, 2025)
- 8ba2b4c Add missing information in control QC checks (lazappi, Aug 1, 2025)
- cd2eef7 Handle old doc URL location in get_method_info (lazappi, Aug 1, 2025)
- c44f13e Prefix component additional info in get_metric_info (lazappi, Aug 1, 2025)
- bca05d1 Cleanup removed argument in get_results (lazappi, Aug 1, 2025)
- 22361be Fix test script for generate_qc (lazappi, Aug 1, 2025)
- 14861b4 Add authors to datasets, methods, metrics (lazappi, Aug 4, 2025)
- 2f7943a schemas were moved to the common_resources repo (rcannood, Aug 5, 2025)
- de9e0b2 fix schema paths (rcannood, Aug 5, 2025)
- 01acc49 set common submodule to different branch for testing (rcannood, Aug 5, 2025)
- a40114e Fix resource (rcannood, Aug 5, 2025)
- 3098da9 fix schema paths in the script (rcannood, Aug 5, 2025)
- 7044804 authors and references were moved into core (rcannood, Aug 5, 2025)
- 05a33b2 add a params placeholder for ease of use (rcannood, Aug 5, 2025)
- 5ab5c3f show number of passed checks as well (rcannood, Aug 5, 2025)
- 8d14bc8 fix result schema path (rcannood, Aug 5, 2025)
- 3fbbbf6 Add bibliography file (lazappi, Aug 6, 2025)
- 5276ba0 Add shared util functions (lazappi, Aug 6, 2025)
- 072addf Use shared functions for authors and references (lazappi, Aug 6, 2025)
- 664eeb5 update submodule (#934) (rcannood, Aug 6, 2025)
- ada1876 Add scripts/create_resources/task_results_v4 (lazappi, Aug 6, 2025)
- 55b9e23 Update main reference (lazappi, Aug 6, 2025)
- 2c4ce88 Use temporary directory in render-report (lazappi, Aug 6, 2025)
- 6572871 Style reporting R scripts (lazappi, Aug 6, 2025)
- 18cd13d add auto wf (rcannood, Aug 6, 2025)
- 52d7f17 add script to reprocess task results (rcannood, Aug 6, 2025)
- 2239144 Handle missing scaled scores in generate_qc (lazappi, Aug 6, 2025)
- 9c1cd26 Set unknown error in get_results (lazappi, Aug 6, 2025)
- 5a18f35 fix script (rcannood, Aug 6, 2025)
6 changes: 3 additions & 3 deletions _viash.yaml
@@ -11,9 +11,9 @@ keywords: [openproblems, benchmarking, single-cell omics]
references:
doi:
# Malte Luecken, Scott Gigante, Daniel Burkhardt, Robrecht Cannoodt, et al.
# Defining and benchmarking open problems in single-cell analysis,
# 03 April 2024, PREPRINT (Version 1) available at Research Square [https://doi.org/10.21203/rs.3.rs-4181617/v1]
- 10.21203/rs.3.rs-4181617/v1
# Defining and benchmarking open problems in single-cell analysis.
# Nat Biotechnol 43, 1035–1040 (2025).
- 10.1038/s41587-025-02694-w

links:
issue_tracker: https://github.com/openproblems-bio/openproblems/issues
83 changes: 83 additions & 0 deletions scripts/create_resources/reprocess_task_results_v4.sh
@@ -0,0 +1,83 @@
#!/bin/bash

set -e

# get the root of the repository and run all commands from there
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

OUT_DIR="resources"

echo ">>> Fetching raw results..."
aws s3 sync --profile op \
  s3://openproblems-data/resources/ \
  "$OUT_DIR/" \
  --exclude "*" \
  --include "**/results/run_*/*" \
  --delete

echo ">>> Patching state.yaml files..."
# fix state.yaml id and output_trace
python <<HERE
import re
import glob

def update_state_file(file_path, new_id):
    with open(file_path, 'r') as file:
        content = file.read()

    # if output_trace is missing, add it
    if 'output_trace:' not in content:
        content += "\noutput_trace: !file trace.txt\n"

    # rewrite the id using the value extracted from the file's path
    content = re.sub(r'id: .+', f'id: {new_id}/processed', content)

    with open(file_path, 'w') as file:
        file.write(content)

# find all state.yaml files
state_files = glob.glob('resources/**/state.yaml', recursive=True)
for state_file in state_files:
    # extract the id from the path
    match = re.search(r'resources/(.+?)/state\.yaml', state_file)
    if match:
        new_id = match.group(1)
        update_state_file(state_file, new_id)
        print(f"Updated {state_file} with id: {new_id}")
    else:
        print(f"Could not extract id from {state_file}, skipping.")
HERE

echo ">>> Creating params.yaml..."
cat > /tmp/params.yaml << HERE
input_states: resources/*/results/run_*/state.yaml
rename_keys: 'input_task_info:output_task_info;input_dataset_info:output_dataset_info;input_method_configs:output_method_configs;input_metric_configs:output_metric_configs;input_scores:output_scores;input_trace:output_trace'
output_state: '\$id/state.yaml'
settings: '{"output_combined": "\$id/output_combined.json", "output_report": "\$id/output_report.html", "output_task_info": "\$id/output_task_info.json", "output_dataset_info": "\$id/output_dataset_info.json", "output_method_info": "\$id/output_method_info.json", "output_metric_info": "\$id/output_metric_info.json", "output_results": "\$id/output_results.json", "output_scores": "\$id/output_quality_control.json"}'
publish_dir: "$OUT_DIR"
HERE

echo ">>> Processing results..."
nextflow run target/nextflow/reporting/process_task_results/main.nf \
  -profile docker \
  -params-file /tmp/params.yaml \
  -c common/nextflow_helpers/labels_ci.config \
  -entry auto \
  -resume

# find all output_report.html files in $OUT_DIR
echo ">>> Listing reports..."
find "$OUT_DIR" -name "output_report.html"

# echo ">>> Uploading processed results to S3..."
# aws s3 sync --profile op \
# "resources_test/openproblems/task_results_v4/" \
# "s3://openproblems-data/resources_test/openproblems/task_results_v4/" \
# --delete --dryrun

# echo
# echo ">>> Done!"
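The patch applied by the embedded Python above can be exercised on its own. Here is a minimal sketch of that transformation; the `state.yaml` content and id below are invented for illustration and are not taken from a real run:

```python
import re

def patch_state(content: str, new_id: str) -> str:
    """Mirror the script's patch: ensure an output_trace entry exists
    and rewrite the id to '<new_id>/processed'."""
    if "output_trace:" not in content:
        content += "\noutput_trace: !file trace.txt\n"
    return re.sub(r"id: .+", f"id: {new_id}/processed", content)

# hypothetical state.yaml content for illustration
before = "id: some_old_id\noutput_scores: !file score_uns.yaml\n"
after = patch_state(before, "task_x/results/run_2025-01-23")
print(after)
```

Running this rewrites the `id:` line to `id: task_x/results/run_2025-01-23/processed` and appends the missing `output_trace` entry, which is exactly what lets the `auto` entry point pick the files up again.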
40 changes: 40 additions & 0 deletions scripts/create_resources/task_results_v4.sh
@@ -0,0 +1,40 @@
#!/bin/bash

set -e

# get the root of the repository and run all commands from there
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

OUT_DIR="resources_test/openproblems/task_results_v4"

echo ">>> Fetching raw results..."
aws s3 sync --profile op \
  s3://openproblems-data/resources/task_batch_integration/results/run_2025-01-23_18-03-16/ \
  "$OUT_DIR/raw/" \
  --delete

echo
echo ">>> Processing results..."
rm -rf "$OUT_DIR/processed"
nextflow run target/nextflow/reporting/process_task_results/main.nf \
  -profile docker \
  --input_task_info "$OUT_DIR/raw/task_info.yaml" \
  --input_dataset_info "$OUT_DIR/raw/dataset_uns.yaml" \
  --input_method_configs "$OUT_DIR/raw/method_configs.yaml" \
  --input_metric_configs "$OUT_DIR/raw/metric_configs.yaml" \
  --input_scores "$OUT_DIR/raw/score_uns.yaml" \
  --input_trace "$OUT_DIR/raw/trace.txt" \
  --output_state state.yaml \
  --publishDir "$OUT_DIR/processed"

echo ">>> Uploading processed results to S3..."
aws s3 sync --profile op \
  "resources_test/openproblems/task_results_v4/" \
  "s3://openproblems-data/resources_test/openproblems/task_results_v4/" \
  --delete --dryrun

echo
echo ">>> Done!"
102 changes: 102 additions & 0 deletions src/reporting/combine_output/config.vsh.yaml
@@ -0,0 +1,102 @@
name: combine_output
namespace: reporting
description: Combine task outputs into a single JSON

argument_groups:
  - name: Inputs
    arguments:
      - name: --input_task_info
        type: file
        description: Task info file
        info:
          format:
            type: json
            schema: /common/schemas/results_v4/task_info.json
        required: true
        example: resources_test/openproblems/task_results_v4/processed/task_info.json
      - name: --input_dataset_info
        type: file
        description: Dataset info file
        info:
          format:
            type: json
            schema: /common/schemas/results_v4/dataset_info.json
        required: true
        example: resources_test/openproblems/task_results_v4/processed/dataset_info.json
      - name: --input_method_info
        type: file
        description: Method info file
        info:
          format:
            type: json
            schema: /common/schemas/results_v4/method_info.json
        required: true
        example: resources_test/openproblems/task_results_v4/processed/method_info.json
      - name: --input_metric_info
        type: file
        description: Metric info file
        info:
          format:
            type: json
            schema: /common/schemas/results_v4/metric_info.json
        required: true
        example: resources_test/openproblems/task_results_v4/processed/metric_info.json
      - name: --input_results
        type: file
        description: Results file
        info:
          format:
            type: json
            schema: /common/schemas/results_v4/results.json
        required: true
        example: resources_test/openproblems/task_results_v4/processed/results.json
      - name: --input_quality_control
        type: file
        description: Quality control file
        info:
          format:
            type: json
            schema: /common/schemas/results_v4/quality_control.json
        required: true
        example: resources_test/openproblems/task_results_v4/processed/quality_control.json

  - name: Outputs
    arguments:
      - name: --output
        type: file
        direction: output
        description: Combined output JSON
        default: combined_output.json
        info:
          format:
            type: json
            schema: /common/schemas/results_v4/combined_output.json

resources:
  - type: r_script
    path: script.R
  - path: /common/schemas
    dest: schemas

test_resources:
  - type: python_script
    path: /common/component_tests/run_and_check_output.py
  - path: /resources_test/openproblems/task_results_v4
    dest: resources_test/openproblems/task_results_v4

engines:
  - type: docker
    image: openproblems/base_r:1
    setup:
      - type: apt
        packages:
          - nodejs
          - npm
      - type: docker
        run: npm install -g ajv-cli

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [lowmem, lowtime, lowcpu]
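Each input in this config is tied to a JSON Schema under `/common/schemas/results_v4`, and the component delegates the actual validation to ajv-cli at runtime. As a rough pure-Python illustration of one constraint such a schema can encode (the schema fragment and document here are invented, and this is only the `required`-keys part of what ajv checks):

```python
import json

def check_required(doc: dict, schema: dict) -> list:
    """Return the top-level required fields missing from doc."""
    return [key for key in schema.get("required", []) if key not in doc]

# illustrative schema fragment and document, not real results_v4 files
schema = {"required": ["task_info", "results"]}
doc = json.loads('{"task_info": {"name": "demo"}}')
print(check_required(doc, schema))  # → ['results']
```

A full JSON Schema validator additionally checks types, nested objects, and cross-file `$ref`s, which is why the component installs ajv-cli rather than reimplementing this.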
105 changes: 105 additions & 0 deletions src/reporting/combine_output/script.R
@@ -0,0 +1,105 @@
## VIASH START
processed_dir <- "resources_test/openproblems/task_results_v4/processed"

par <- list(
  # Inputs
  input_task_info = paste0(processed_dir, "/task_info.json"),
  input_quality_control = paste0(processed_dir, "/quality_control.json"),
  input_metric_info = paste0(processed_dir, "/metric_info.json"),
  input_method_info = paste0(processed_dir, "/method_info.json"),
  input_dataset_info = paste0(processed_dir, "/dataset_info.json"),
  input_results = paste0(processed_dir, "/results.json"),
  # Outputs
  output = "task_results.json"
)
## VIASH END

################################################################################
# MAIN SCRIPT
################################################################################

cat("====== Combine output ======\n")

cat("\n>>> Reading input files...\n")
cat("Reading task info from '", par$input_task_info, "'...\n", sep = "")
task_info <- jsonlite::read_json(par$input_task_info)

cat(
  "Reading quality control from '",
  par$input_quality_control,
  "'...\n",
  sep = ""
)
quality_control <- jsonlite::read_json(par$input_quality_control)

cat("Reading metric info from '", par$input_metric_info, "'...\n", sep = "")
metric_info <- jsonlite::read_json(par$input_metric_info)

cat("Reading method info from '", par$input_method_info, "'...\n", sep = "")
method_info <- jsonlite::read_json(par$input_method_info)

cat("Reading dataset info from '", par$input_dataset_info, "'...\n", sep = "")
dataset_info <- jsonlite::read_json(par$input_dataset_info)

cat("Reading results from '", par$input_results, "'...\n", sep = "")
results <- jsonlite::read_json(par$input_results)

cat("\n>>> Combining outputs...\n")
# Create combined output according to task_results.json
combined_output <- list(
  task_info = task_info,
  dataset_info = dataset_info,
  method_info = method_info,
  metric_info = metric_info,
  results = results,
  quality_control = quality_control
)

cat("\n>>> Writing output file...\n")
cat("Writing combined output to '", par$output, "'...\n", sep = "")
jsonlite::write_json(
  combined_output,
  par$output,
  pretty = TRUE,
  null = "null",
  na = "null",
  auto_unbox = TRUE
)

cat("\n>>> Validating output against schema...\n")
results_schemas <- file.path(meta$resources_dir, "schemas", "results_v4")
ajv_args <- paste(
  "validate",
  "--spec draft2020",
  "-s", file.path(results_schemas, "combined_output.json"),
  "-r", file.path(results_schemas, "task_info.json"),
  "-r", file.path(results_schemas, "dataset_info.json"),
  "-r", file.path(results_schemas, "method_info.json"),
  "-r", file.path(results_schemas, "metric_info.json"),
  "-r", file.path(results_schemas, "results.json"),
  "-r", file.path(results_schemas, "quality_control.json"),
  "-r", file.path(results_schemas, "core.json"),
  "-d", par$output
)

cat("Running validation command:", "ajv", ajv_args, "\n")
cat("Output:\n")
validation_result <- system2("ajv", ajv_args)

if (validation_result == 0) {
  cat("JSON validation passed successfully!\n")
} else {
  cat("JSON validation failed!\n")
  stop("Output JSON does not conform to schema")
}

cat("\n>>> Done!\n")
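The combine-and-write step of script.R can be sketched in Python as well: read one JSON file per section, nest each under its key, and write a single combined document. The file names follow the component's defaults, but the section contents below are minimal placeholders:

```python
import json
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    tmp = Path(tmp)
    # placeholder inputs, one file per section as in the component
    parts = {
        "task_info": {"task_id": "demo"},
        "dataset_info": [],
        "method_info": [],
        "metric_info": [],
        "results": [],
        "quality_control": [],
    }
    for name, content in parts.items():
        (tmp / f"{name}.json").write_text(json.dumps(content))

    # combine: nest each file's content under its section key,
    # in the same order as combined_output in script.R
    combined = {
        name: json.loads((tmp / f"{name}.json").read_text())
        for name in parts
    }
    (tmp / "combined_output.json").write_text(json.dumps(combined, indent=2))
    keys = list(combined)

print(keys)
```

Unlike this sketch, the R component also validates the combined document against `combined_output.json` before declaring success.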