diff --git a/.github/workflows/upload-to-s3.yml b/.github/workflows/upload-to-s3.yml new file mode 100644 index 000000000..f1c466e30 --- /dev/null +++ b/.github/workflows/upload-to-s3.yml @@ -0,0 +1,252 @@ +name: Upload to S3 + +on: + workflow_dispatch: + inputs: + release_tag: + description: "Tag of the draft release holding the file to upload" + required: true + type: string + s3_destination: + description: "S3 path within the stdpopsim bucket (e.g. annotations/HomSap/file.tar.gz)" + required: true + type: string + expected_sha256: + description: "Expected SHA256 checksum of the file" + required: true + type: string + species_id: + description: "Species ID (e.g. HomSap)" + required: true + type: string + resource_type: + description: "Resource type: genetic_map or annotation" + required: true + type: string + resource_id: + description: "Resource ID as defined in the catalog (e.g. HapMapII_GRCh38)" + required: true + type: string + dry_run: + description: "If true, run all validation but skip the actual S3 upload" + required: false + type: boolean + default: false + +concurrency: + group: s3-upload + cancel-in-progress: false + +jobs: + upload: + runs-on: ubuntu-latest + environment: s3-upload + permissions: + contents: write # needed to download release assets and delete releases + steps: + - name: Validate inputs + run: | + DEST="${{ inputs.s3_destination }}" + TYPE="${{ inputs.resource_type }}" + + # Resource type must be genetic_map or annotation + if [[ "$TYPE" != "genetic_map" && "$TYPE" != "annotation" ]]; then + echo "ERROR: resource_type must be 'genetic_map' or 'annotation', got: $TYPE" + exit 1 + fi + + # S3 destination must start with genetic_maps/ or annotations/ + if [[ ! "$DEST" =~ ^(genetic_maps|annotations)/ ]]; then + echo "ERROR: S3 destination must start with 'genetic_maps/' or 'annotations/'" + exit 1 + fi + + # S3 destination must end with .tar.gz or .tgz + if [[ ! 
"$DEST" =~ \.(tar\.gz|tgz)$ ]]; then + echo "ERROR: S3 destination must end with .tar.gz or .tgz" + exit 1 + fi + + echo "Input validation passed." + echo " Species: ${{ inputs.species_id }}" + echo " Type: ${{ inputs.resource_type }}" + echo " Resource: ${{ inputs.resource_id }}" + echo " S3 dest: $DEST" + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install stdpopsim + run: | + pip install -e . + + - name: Validate against catalog + run: | + python3 << 'PYEOF' + import stdpopsim + import sys + import re + + species_id = "${{ inputs.species_id }}" + resource_type = "${{ inputs.resource_type }}" + resource_id = "${{ inputs.resource_id }}" + s3_dest = "${{ inputs.s3_destination }}" + expected_sha256 = "${{ inputs.expected_sha256 }}" + + # Verify species exists + try: + species = stdpopsim.get_species(species_id) + except (ValueError, KeyError): + available = [s.id for s in stdpopsim.all_species()] + print(f"ERROR: Species '{species_id}' not found in catalog.") + print(f"Available species: {available}") + sys.exit(1) + + # Verify resource exists and extract expected URL/SHA256 + if resource_type == "genetic_map": + resources = {gm.id: gm for gm in species.genetic_maps} + if resource_id not in resources: + print(f"ERROR: Genetic map '{resource_id}' not found for {species_id}.") + print(f"Available genetic maps: {list(resources.keys())}") + sys.exit(1) + resource = resources[resource_id] + catalog_url = resource.url + catalog_sha256 = resource.sha256 + + elif resource_type == "annotation": + resources = {a.id: a for a in species.annotations} + if resource_id not in resources: + print(f"ERROR: Annotation '{resource_id}' not found for {species_id}.") + print(f"Available annotations: {list(resources.keys())}") + sys.exit(1) + resource = resources[resource_id] + catalog_url = resource.intervals_url + catalog_sha256 = resource.intervals_sha256 + + # Verify the 
S3 destination matches what the catalog expects + m = re.match(r"https://stdpopsim\.s3[.-]us-west-2\.amazonaws\.com/(.*)", catalog_url) + if not m: + print(f"ERROR: Could not parse S3 URL from catalog: {catalog_url}") + sys.exit(1) + + catalog_s3_dest = m.group(1) + if catalog_s3_dest != s3_dest: + print(f"ERROR: S3 destination mismatch!") + print(f" Provided: {s3_dest}") + print(f" Catalog expects: {catalog_s3_dest}") + sys.exit(1) + + # Verify SHA256 matches what the catalog expects + if catalog_sha256 != expected_sha256: + print(f"ERROR: SHA256 mismatch with catalog!") + print(f" Provided: {expected_sha256}") + print(f" Catalog expects: {catalog_sha256}") + sys.exit(1) + + print("Catalog validation passed!") + print(f" Species: {species_id} ({species.name})") + print(f" Type: {resource_type}") + print(f" Resource: {resource_id}") + print(f" URL: {catalog_url}") + print(f" SHA256: {catalog_sha256}") + PYEOF + + - name: Download asset from draft release + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + echo "Downloading asset from release: ${{ inputs.release_tag }}" + gh release download "${{ inputs.release_tag }}" \ + --repo "${{ github.repository }}" \ + --dir ./download + + # Expect exactly one file + FILE_COUNT=$(find ./download -type f | wc -l) + if [ "$FILE_COUNT" -ne 1 ]; then + echo "ERROR: Expected exactly 1 asset, found $FILE_COUNT" + exit 1 + fi + + DOWNLOADED_FILE=$(find ./download -type f) + echo "Downloaded: $DOWNLOADED_FILE" + echo "DOWNLOADED_FILE=$DOWNLOADED_FILE" >> "$GITHUB_ENV" + + - name: Verify SHA256 + run: | + ACTUAL_SHA256=$(sha256sum "$DOWNLOADED_FILE" | awk '{print $1}') + EXPECTED="${{ inputs.expected_sha256 }}" + + echo "Expected SHA256: $EXPECTED" + echo "Actual SHA256: $ACTUAL_SHA256" + + if [ "$ACTUAL_SHA256" != "$EXPECTED" ]; then + echo "ERROR: SHA256 mismatch!" + exit 1 + fi + + echo "SHA256 verified." 
+ + - name: Check if S3 object already exists + if: ${{ inputs.dry_run != true }} + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + run: | + DEST="${{ inputs.s3_destination }}" + if aws s3api head-object --bucket stdpopsim --key "$DEST" 2>/dev/null; then + echo "ERROR: s3://stdpopsim/$DEST already exists. Refusing to overwrite." + echo "If you need to replace this file, delete it from S3 first." + exit 1 + fi + echo "Confirmed: s3://stdpopsim/$DEST does not exist yet." + + - name: Upload to S3 + if: ${{ inputs.dry_run != true }} + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + run: | + DEST="${{ inputs.s3_destination }}" + echo "Uploading to s3://stdpopsim/$DEST" + aws s3 cp "$DOWNLOADED_FILE" "s3://stdpopsim/$DEST" + echo "" + echo "Upload complete." + echo "S3 URL: https://stdpopsim.s3-us-west-2.amazonaws.com/$DEST" + echo "SHA256: ${{ inputs.expected_sha256 }}" + + - name: Dry run summary + if: ${{ inputs.dry_run == true }} + run: | + echo "=== DRY RUN COMPLETE ===" + echo "All validation passed. Skipped S3 upload." + echo "" + echo " Species: ${{ inputs.species_id }}" + echo " Type: ${{ inputs.resource_type }}" + echo " Resource: ${{ inputs.resource_id }}" + echo " S3 dest: s3://stdpopsim/${{ inputs.s3_destination }}" + echo " SHA256: ${{ inputs.expected_sha256 }}" + echo "" + echo "To perform the actual upload, re-run without --dry-run." + + - name: Clean up draft release + if: always() + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + TAG="${{ inputs.release_tag }}" + echo "Cleaning up draft release: $TAG" + + # Delete the release + gh release delete "$TAG" \ + --repo "${{ github.repository }}" \ + --yes \ + --cleanup-tag 2>/dev/null || echo "Warning: could not delete release $TAG" + + echo "Cleanup complete." 
#!/usr/bin/env bash
#
# Upload a tarball to the stdpopsim S3 bucket via GitHub Actions.
#
# Usage:
#   ./maintenance/upload_to_s3.sh [--dry-run] <local_file> <species_id> <resource_type> <resource_id>
#
# Examples:
#   ./maintenance/upload_to_s3.sh data/HapMapII_GRCh38.tar.gz HomSap genetic_map HapMapII_GRCh38
#   ./maintenance/upload_to_s3.sh data/ensembl_havana_104_exons_v1.tar.gz HomSap annotation ensembl_havana_104_exons
#   ./maintenance/upload_to_s3.sh --dry-run data/test.tar.gz HomSap genetic_map HapMapII_GRCh38
#
# The script looks up the resource in the stdpopsim catalog to find the
# expected S3 URL and SHA256, verifies the local file matches, then uploads
# via a GitHub Actions workflow.
#
# Options:
#   --dry-run   Run all validation (local + remote) but skip the actual S3 upload.
#               The workflow will still run to verify everything end-to-end.
#
# Prerequisites:
#   - GitHub CLI (gh) installed and authenticated
#   - stdpopsim installed in current Python environment (pip install -e .)
#   - Push access to the stdpopsim repository

set -euo pipefail

REPO="popsim-consortium/stdpopsim"
WORKFLOW="upload-to-s3.yml"
DRY_RUN=false

die() { echo "ERROR: $*" >&2; exit 1; }

# --- Parse --dry-run flag ---
if [ "${1:-}" = "--dry-run" ]; then
    DRY_RUN=true
    shift
fi

# --- Validate arguments ---
if [ $# -ne 4 ]; then
    echo "Usage: $0 [--dry-run] <local_file> <species_id> <resource_type> <resource_id>" >&2
    echo "" >&2
    echo "Examples:" >&2
    echo "  $0 data/HapMapII_GRCh38.tar.gz HomSap genetic_map HapMapII_GRCh38" >&2
    echo "  $0 data/exons_v1.tar.gz HomSap annotation ensembl_havana_104_exons" >&2
    echo "  $0 --dry-run data/test.tar.gz HomSap genetic_map HapMapII_GRCh38" >&2
    exit 1
fi

LOCAL_FILE="$1"
SPECIES_ID="$2"
RESOURCE_TYPE="$3"
RESOURCE_ID="$4"

if [ "$DRY_RUN" = true ]; then
    echo "*** DRY RUN MODE — will validate but skip S3 upload ***"
    echo ""
fi

[ -f "$LOCAL_FILE" ] || die "File not found: $LOCAL_FILE"

case "$LOCAL_FILE" in
    *.tar.gz|*.tgz) ;;
    *) die "File must be a .tar.gz or .tgz archive: $LOCAL_FILE" ;;
esac

case "$RESOURCE_TYPE" in
    genetic_map|annotation) ;;
    *) die "Resource type must be 'genetic_map' or 'annotation', got: $RESOURCE_TYPE" ;;
esac

# --- Check prerequisites ---
command -v gh >/dev/null 2>&1 || die "GitHub CLI (gh) is not installed. See https://cli.github.com/"
gh auth status >/dev/null 2>&1 || die "GitHub CLI is not authenticated. Run: gh auth login"
python3 -c "import stdpopsim" 2>/dev/null || die "stdpopsim is not installed in the current Python environment"

# --- Compute SHA256 ---
# sha256sum (GNU coreutils) on Linux, shasum on macOS.
if command -v sha256sum >/dev/null 2>&1; then
    SHA256=$(sha256sum "$LOCAL_FILE" | awk '{print $1}')
elif command -v shasum >/dev/null 2>&1; then
    SHA256=$(shasum -a 256 "$LOCAL_FILE" | awk '{print $1}')
else
    die "Neither sha256sum nor shasum found"
fi

# --- Validate against stdpopsim catalog ---
# The Python validator prints a single JSON object: either {"error": ...}
# or the validated {s3_url, s3_dest, sha256}.
#
# Inputs are handed to Python through the environment with a QUOTED
# heredoc delimiter ('PYEOF'), so the shell never expands anything inside
# the Python source. Splicing "${VAR}" into Python string literals would
# break (or inject code) if an argument contained quotes or backslashes.
echo "Validating against stdpopsim catalog..."
VALIDATION=$(
    SPECIES_ID="$SPECIES_ID" \
    RESOURCE_TYPE="$RESOURCE_TYPE" \
    RESOURCE_ID="$RESOURCE_ID" \
    LOCAL_SHA256="$SHA256" \
    python3 << 'PYEOF'
import json
import os
import re
import sys

import stdpopsim

species_id = os.environ["SPECIES_ID"]
resource_type = os.environ["RESOURCE_TYPE"]
resource_id = os.environ["RESOURCE_ID"]
local_sha256 = os.environ["LOCAL_SHA256"]

try:
    species = stdpopsim.get_species(species_id)
except (ValueError, KeyError):
    print(json.dumps({"error": f"Species '{species_id}' not found in catalog. "
                      f"Available: {[s.id for s in stdpopsim.all_species()]}"}))
    sys.exit(0)

if resource_type == "genetic_map":
    resources = {gm.id: gm for gm in species.genetic_maps}
    if resource_id not in resources:
        avail = list(resources.keys())
        print(json.dumps({"error": f"Genetic map '{resource_id}' not found for "
                          f"{species_id}. Available: {avail}"}))
        sys.exit(0)
    resource = resources[resource_id]
    s3_url = resource.url
    expected_sha256 = resource.sha256
elif resource_type == "annotation":
    resources = {a.id: a for a in species.annotations}
    if resource_id not in resources:
        avail = list(resources.keys())
        print(json.dumps({"error": f"Annotation '{resource_id}' not found for "
                          f"{species_id}. Available: {avail}"}))
        sys.exit(0)
    resource = resources[resource_id]
    s3_url = resource.intervals_url
    expected_sha256 = resource.intervals_sha256

# Extract S3 path from URL (handles both s3-us-west-2 and s3.us-west-2 styles)
m = re.match(r"https://stdpopsim\.s3[.-]us-west-2\.amazonaws\.com/(.*)", s3_url)
if not m:
    print(json.dumps({"error": f"Could not parse S3 URL from catalog: {s3_url}"}))
    sys.exit(0)

s3_dest = m.group(1)

if local_sha256 != expected_sha256:
    print(json.dumps({"error": f"SHA256 mismatch!\n"
                      f"  Local file:      {local_sha256}\n"
                      f"  Catalog expects: {expected_sha256}"}))
    sys.exit(0)

print(json.dumps({
    "s3_url": s3_url,
    "s3_dest": s3_dest,
    "sha256": expected_sha256,
}))
PYEOF
) || die "Python validation script failed"

# Check for validation error
ERROR=$(echo "$VALIDATION" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('error',''))")
if [ -n "$ERROR" ]; then
    die "$ERROR"
fi

# Extract validated fields
S3_DEST=$(echo "$VALIDATION" | python3 -c "import sys,json; print(json.load(sys.stdin)['s3_dest'])")
S3_URL=$(echo "$VALIDATION" | python3 -c "import sys,json; print(json.load(sys.stdin)['s3_url'])")

echo ""
echo "Catalog validation passed!"
echo "  Species:  $SPECIES_ID"
echo "  Type:     $RESOURCE_TYPE"
echo "  Resource: $RESOURCE_ID"
echo "  File:     $LOCAL_FILE"
echo "  S3 URL:   $S3_URL"
echo "  SHA256:   $SHA256"
echo ""

# --- Create draft release as file transport ---
# Timestamp + $RANDOM keeps concurrent invocations from colliding on the tag.
TAG="s3-upload-$(date +%Y%m%d%H%M%S)-$RANDOM"

echo "Creating draft release ($TAG) to transport the file..."
gh release create "$TAG" \
    --repo "$REPO" \
    --draft \
    --title "S3 upload: $SPECIES_ID/$RESOURCE_TYPE/$RESOURCE_ID" \
    --notes "Temporary draft release for S3 upload. Will be cleaned up automatically." \
    "$LOCAL_FILE"

echo "Draft release created."
echo ""

# --- Trigger the workflow ---
echo "Triggering upload workflow..."
gh workflow run "$WORKFLOW" \
    --repo "$REPO" \
    --field "release_tag=$TAG" \
    --field "s3_destination=$S3_DEST" \
    --field "expected_sha256=$SHA256" \
    --field "species_id=$SPECIES_ID" \
    --field "resource_type=$RESOURCE_TYPE" \
    --field "resource_id=$RESOURCE_ID" \
    --field "dry_run=$DRY_RUN"

echo ""
if [ "$DRY_RUN" = true ]; then
    echo "Dry-run workflow triggered. It will validate everything but skip the S3 upload."
else
    echo "Workflow triggered successfully."
fi
echo ""
echo "Monitor the upload with:"
echo "  gh run list --repo $REPO --workflow $WORKFLOW --limit 3"
echo ""
echo "Or watch the latest run:"
echo "  gh run watch --repo $REPO \$(gh run list --repo $REPO --workflow $WORKFLOW --limit 1 --json databaseId --jq '.[0].databaseId')"
echo ""
echo "Final S3 URL: $S3_URL"