Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions gnomad_qc/v5/annotations/compute_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
get_logging_path,
qc_temp_prefix,
)
from gnomad_qc.v5.resources.constants import WORKSPACE_BUCKET
from gnomad_qc.v5.resources.constants import GNOMAD_TMP_BUCKET
from gnomad_qc.v5.resources.meta import meta
from gnomad_qc.v5.resources.release import (
release_all_sites_an_tsv_path,
Expand Down Expand Up @@ -641,16 +641,24 @@ def join_aou_and_gnomad_qual_hists_ht(
def main(args):
"""Compute all sites coverage, allele number, and quality histograms for v5 genomes (AoU v8 + gnomAD v4)."""
project = args.project_name
environment = "rwb" if project == "aou" else "dataproc"
if environment == "rwb":
environment = "batch" if project == "aou" else "dataproc"
if environment == "batch":
hl.init(
log="/home/jupyter/workspaces/gnomadproduction/compute_coverage.log",
tmp_dir=f"gs://{WORKSPACE_BUCKET}/tmp/4_day",
backend="batch",
app_name="compute_coverage",
log="compute_coverage.log",
tmp_dir=f"gs://{GNOMAD_TMP_BUCKET}/tmp/4_day",
driver_memory="highmem",
driver_cores=8,
worker_memory="highmem",
worker_cores=8,
gcs_requester_pays_configuration=args.gcp_billing_project,
regions=["us-central1"],
)
else:
hl.init(
log="compute_coverage.log",
tmp_dir="gs://gnomad-tmp-4day",
tmp_dir=f"gs://{GNOMAD_TMP_BUCKET}/tmp/30_day",
)
hl.default_reference("GRCh38")

Expand Down Expand Up @@ -968,6 +976,12 @@ def get_script_argument_parser() -> argparse.ArgumentParser:
type=str,
choices=["aou", "gnomad"],
)
parser.add_argument(
"--gcp-billing-project",
type=str,
default="broad-mpg-gnomad",
help="Google Cloud billing project for reading requester pays buckets.",
)
parser.add_argument(
"--overwrite", help="Overwrite existing hail Tables.", action="store_true"
)
Expand Down
5 changes: 3 additions & 2 deletions gnomad_qc/v5/resources/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def _annotations_root(
test: bool = False,
data_type: str = "genomes",
data_set: str = "aou",
environment: str = "batch",
) -> str:
"""
Get root path to the variant annotation files.
Expand All @@ -25,17 +26,17 @@ def _annotations_root(
full v4 VDS.
:param data_type: Data type of annotation resource. e.g. "exomes" or "genomes". Default is "genomes".
:param data_set: Data set of annotation resource. Default is "aou".
:param environment: Compute environment. One of 'rwb', 'batch', or 'dataproc'. Defaults to 'batch'.
:return: Root path of the variant annotation files.
"""
path_suffix = f"sample_qc/{data_type}/{data_set}"

if test:
environment = "rwb" if data_set == "aou" else "dataproc"
return (
f"{qc_temp_prefix(version=version, environment=environment)}{path_suffix}"
)

base_bucket = WORKSPACE_BUCKET if data_set == "aou" else GNOMAD_BUCKET
base_bucket = WORKSPACE_BUCKET if environment == "rwb" else GNOMAD_BUCKET
return f"gs://{base_bucket}/v{version}/{path_suffix}"


Expand Down
11 changes: 8 additions & 3 deletions gnomad_qc/v5/resources/basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,22 @@ def qc_temp_prefix(
"""
Return path to temporary QC bucket.

.. note::

Function supports three environments because AoU QC started in RWB,
then moved to Batch in November 2025.

:param version: Version of annotation path to return.
:param environment: Compute environment, either 'dataproc' or 'rwb'. Defaults to 'dataproc'.
:param environment: Compute environment. One of 'rwb', 'batch', or 'dataproc'. Defaults to 'dataproc'.
:return: Path to bucket with temporary QC data.
"""
if environment == "rwb":
env_bucket = f"{WORKSPACE_BUCKET}/tmp"
elif environment == "dataproc":
elif environment in ("batch", "dataproc"):
env_bucket = GNOMAD_TMP_BUCKET
else:
raise ValueError(
f"Environment {environment} not recognized. Choose 'rwb' or 'dataproc'."
f"Environment {environment} not recognized. Choose 'rwb', 'batch', or 'dataproc'."
)

return f"gs://{env_bucket}/gnomad.genomes.v{version}.qc_data/"
Expand Down
2 changes: 2 additions & 0 deletions gnomad_qc/v5/resources/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
WORKSPACE_BUCKET = "fc-secure-b25d1307-7763-48b8-8045-fcae9caadfa1"
GNOMAD_BUCKET = "gnomad"
GNOMAD_TMP_BUCKET = "gnomad-tmp"

# TODO: Update these constants for Batch if necessary.
AOU_BUCKET = "fc-aou-datasets-controlled/v8"
AOU_WGS_BUCKET = f"{AOU_BUCKET}/wgs/short_read/snpindel"
AOU_WGS_AUX_BUCKET = f"{AOU_WGS_BUCKET}/aux"
Expand Down
26 changes: 13 additions & 13 deletions gnomad_qc/v5/resources/release.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,17 @@ def _release_root(
test: bool = False,
data_type: str = "genomes",
extension: str = "ht",
environment: str = "rwb",
environment: str = "batch",
) -> str:
"""
Get root path to the release files.

:param version: Version of release path to return.
:param test: Whether to use a tmp path for testing.
:param data_type: Data type of annotation resource. e.g. "exomes" or "genomes".
Default is "exomes".
Default is "genomes".
:param extension: File extension of release file. Default is "ht".
:param environment: Environment to use. Default is "rwb". Must be "rwb" for AoU.
:param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU.
:return: Root path of the release files.
"""
path_suffix = f"release/{extension}/{data_type}"
Expand All @@ -57,7 +57,7 @@ def release_coverage_path(
public: bool = False,
test: bool = False,
coverage_type: str = "coverage",
environment: str = "rwb",
environment: str = "batch",
) -> str:
"""
Fetch filepath for v5 (AoU + gnomAD v4 genomes) all sites coverage or allele number release Table.
Expand All @@ -67,7 +67,7 @@ def release_coverage_path(
private (False) bucket. Default is False.
:param test: Whether to use a tmp path for testing. Default is False.
:param coverage_type: 'coverage' or 'allele_number'. Default is 'coverage'.
:param environment: Environment to use. Default is "rwb".
:param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU.
:return: File path for desired coverage Hail Table.
"""
assert coverage_type in [
Expand Down Expand Up @@ -101,14 +101,14 @@ def release_coverage_path(
def release_coverage_tsv_path(
release_version: str = CURRENT_COVERAGE_RELEASE["genomes"],
test: bool = False,
environment: str = "rwb",
environment: str = "batch",
) -> str:
"""
Fetch path to coverage TSV file.

:param release_version: Release version. Default is CURRENT_COVERAGE_RELEASE["genomes"].
:param test: Whether to use a tmp path for testing. Default is False.
:param environment: Environment to use. Default is "rwb".
:param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU.
:return: Coverage TSV path.
"""
return f"{_release_root(release_version, test=test, extension='tsv', environment=environment)}/gnomad.genomes.v{release_version}.coverage.tsv.bgz"
Expand All @@ -117,14 +117,14 @@ def release_coverage_tsv_path(
def release_all_sites_an_tsv_path(
release_version: str = None,
test: bool = False,
environment: str = "rwb",
environment: str = "batch",
) -> str:
"""
Fetch path to all sites AN TSV file.

:param release_version: Release version. Default is None.
:param test: Whether to use a tmp path for testing. Default is False.
:param environment: Environment to use. Default is "rwb".
:param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU.
:return: All sites AN TSV path.
"""
release_version = (
Expand All @@ -138,15 +138,15 @@ def release_all_sites_an_tsv_path(
def release_coverage(
public: bool = False,
test: bool = False,
environment: str = "rwb",
environment: str = "batch",
) -> VersionedTableResource:
"""
Retrieve versioned resource for coverage release Table.

:param public: Determines whether release coverage Table is read from public (True) or
private (False) bucket. Default is False.
:param test: Whether to use a tmp path for testing. Default is False.
:param environment: Environment to use. Default is "rwb".
:param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU.
:return: Coverage release Table.
"""
return VersionedTableResource(
Expand All @@ -168,15 +168,15 @@ def release_coverage(
def release_all_sites_an(
public: bool = False,
test: bool = False,
environment: str = "rwb",
environment: str = "batch",
) -> VersionedTableResource:
"""
Retrieve versioned resource for all sites allele number release Table.

:param public: Determines whether release allele number Table is read from public (True) or
private (False) bucket. Default is False.
:param test: Whether to use a tmp path for testing. Default is False.
:param environment: Environment to use. Default is "rwb".
:param environment: Environment to use. Default is "batch". Must be "rwb" or "batch" for AoU.
:return: All sites allele number release Table.
"""
return VersionedTableResource(
Expand Down