diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..cafcf25 Binary files /dev/null and b/.DS_Store differ diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..2a23620 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,3 @@ +FROM debian:bookworm + +WORKDIR /workspaces/genome-exploration \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..651c1b8 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,25 @@ +{ + "name": "Pathogenic Variant Finder Dev Container", + "dockerFile": "Dockerfile", + "customizations": { + "vscode": { + "extensions": [ + "ExodiusStudios.comment-anchors", + "ms-vscode-remote.remote-containers", + "ms-azuretools.vscode-docker" + ], + "settings": { + "editor.tabSize": 2, + "terminal.integrated.defaultProfile.linux": "zsh" + } + } + }, + "features": { + "ghcr.io/stuartleeks/dev-container-features/shell-history:0": {}, + "ghcr.io/schlich/devcontainer-features/powerlevel10k:1": {}, + "ghcr.io/nils-geistmann/devcontainers-features/zsh:0": {}, + "ghcr.io/devcontainers/features/rust:1": {}, + "ghcr.io/devcontainers/features/python:1": {} + }, + "postStartCommand": "apt-get update && apt-get install -y pkg-config libssl-dev && cargo build --release && ln -sf \"${PWD}/target/release/pathogenic_variant_finder\" /usr/local/bin/pathogenic" +} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a940b9f --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb + +#Additional filetypes to ignore +*.log +*.csv + +#Specific directories / files to ignore +reports/ +data/ +clinvar_data/ +Cargo.lock +.cursorrules diff --git a/Cargo.toml b/Cargo.toml index 
f380e40..6bcaf84 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,13 @@ csv = "1.3" rayon = "1.10" # For .gz flate2 = "1.1.0" +# Noodles for VCF parsing/handling - use the meta crate +noodles = { version = "0.95.0", features = ["vcf", "csi", "core"] } +# Date/time handling for logging +chrono = "0.4" +# Progress bars +indicatif = "0.17" +# CPU count for parallel processing +num_cpus = "1.16" +# XZ compression support +xz2 = "0.1" diff --git a/README.md b/README.md index f68e772..9315115 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This tool analyzes VCF (Variant Call Format) files to identify potentially patho ## Features -- **Comprehensive Pathogenic Variant Detection**: Identifies variants classified as pathogenic or likely pathogenic in ClinVar +- **Comprehensive Variant Detection**: Identifies variants classified as pathogenic, benign, or of uncertain significance in ClinVar - **Population Frequency Integration**: Incorporates allele frequencies from 1000 Genomes (global and by population) - **High Performance**: - Parallel processing using Rayon's work-stealing thread pool @@ -21,10 +21,36 @@ This tool analyzes VCF (Variant Call Format) files to identify potentially patho - Mode of inheritance if available - Citations and additional evidence - **Automatic Reference Data Management**: Downloads and maintains necessary reference databases +- **Advanced Reporting**: Generates detailed CSV reports and summary statistics files +- **Flexible Variant Selection**: Configurable inclusion of pathogenic, VUS, and benign variants ## Installation +There are multiple ways to install the Pathogenic Variant Finder: + +### Option 1: Using the installer script (recommended) + +```bash +# Clone the repository +git clone https://github.com/SauersML/pathogenic.git +cd pathogenic + +# Run the installer script (you might need to make it executable first) +chmod +x install.sh +./install.sh + +# Or run it directly with bash +bash install.sh ``` + +The installer will: 
+1. Build the release version of the tool +2. Give you options to create a symlink in a directory in your PATH +3. Allow you to run the tool simply as `pathogenic` from anywhere + +### Option 2: Manual installation + +```bash # Clone the repository git clone https://github.com/SauersML/pathogenic.git cd pathogenic @@ -32,28 +58,59 @@ cd pathogenic # Build the project cargo build --release -# The binary will be available at target/release/pathogenic +# The binary will be available at target/release/pathogenic_variant_finder +# You can run it directly: +./target/release/pathogenic_variant_finder -b GRCh38 -i your_variants.vcf + +# Optional: Create a symlink for easier access +sudo ln -sf "$(pwd)/target/release/pathogenic_variant_finder" /usr/local/bin/pathogenic ``` +### Development Container Users + +If you're using the provided Dev Container, the tool will be automatically built and made available as `pathogenic` in your PATH when the container starts. + ## Usage -``` -# Basic usage -pathogenic --build GRCh38 --input your_variants.vcf > results.csv +Once installed, you can use the tool simply as: + +```bash +# Basic usage (pathogenic variants only) +pathogenic -b GRCh38 -i your_variants.vcf + +# Include Variants of Uncertain Significance (VUS) +pathogenic -b GRCh38 -i your_variants.vcf -v + +# Include benign variants +pathogenic -b GRCh38 -i your_variants.vcf -n + +# Include both VUS and benign variants +pathogenic -b GRCh38 -i your_variants.vcf -v -n + +# Disable markdown report generation +pathogenic -b GRCh38 -i your_variants.vcf --markdown-report=false ``` ### Command-line Arguments - `--build`, `-b`: Genome build, must be either "GRCh37" (hg19) or "GRCh38" (hg38) - `--input`, `-i`: Path to the input VCF file (can be uncompressed or gzipped) +- `--include-vus`, `-v`, `--vus`: Include variants of uncertain significance in the output +- `--include-benign`, `-n`, `--benign`: Include benign variants in the output +- `--markdown-report`, `--md-report`: Generate 
markdown report (enabled by default, use `--markdown-report=false` to disable) ## Output -The tool outputs a CSV file to stdout with the following columns: +The tool generates the following output files in the `reports/` directory: + +### 1. CSV Report File + +The CSV report contains detailed information about all identified variants with the following columns: - Chromosome, Position, Reference Allele, Alternate Allele - Clinical Significance - Is Alt Pathogenic +- Significance Category (pathogenic, benign, vus, conflicting) - Gene - ClinVar Allele ID - Clinical Disease Name (CLNDN) @@ -74,6 +131,46 @@ The tool outputs a CSV file to stdout with the following columns: Depending on available data, many of these fields may not be available. +### 2. Statistics Text File + +A companion statistics file summarizing the analysis settings and results: + +- Analysis settings (input file, genome build, variant types included) +- Command used to run the analysis +- Total variants processed and reported +- Number of unique genes +- Counts of each variant classification type +- Distribution of allele frequencies across populations + +### 3. Markdown Report File + +A comprehensive, human-readable markdown report that includes: + +- Analysis settings and command used +- Summary statistics with variant counts by category +- Table of contents linking to different sections +- Detailed variant information organized by clinical significance +- Variants grouped by gene with sortable tables +- Detailed information about each variant including: + - Location, DNA change, and genotype + - Clinical significance and disease associations + - Population frequencies + - Mode of inheritance and other annotations +- Understanding section with explanations of terms + +The markdown report is generated by default but can be disabled using `--markdown-report=false`. 
+ +### Output File Naming + +Output files follow this naming convention: +``` +[input_filename]_[analysis_type]_[timestamp].csv +[input_filename]_[analysis_type]_[timestamp]_stats.txt +[input_filename]_[analysis_type]_[timestamp].md +``` + +Where `analysis_type` indicates which variant types were included in the report. + ## How It Works 1. **Data Collection**: The tool automatically downloads necessary reference databases: @@ -84,18 +181,30 @@ Depending on available data, many of these fields may not be available. 2. **Variant Processing**: - Parses the user's VCF input - Filters for variants present in the sample's genotype - - Matches variants against ClinVar to identify pathogenic variants + - Matches variants against ClinVar based on selected variant types - Integrates 1000 Genomes allele frequency data - Adds detailed annotations from ClinVar summary data 3. **Output Generation**: - Sorts variants by chromosome and position - Outputs comprehensive CSV with all annotations + - Generates a statistics file summarizing the analysis + - Creates a detailed markdown report with interactive sections (unless disabled) ## Logging The tool maintains a log file (`pathogenic.log`) that captures all processing steps and can be useful for troubleshooting. +## Documentation + +For more detailed information, see the documentation in the `docs/` directory: + +- [Implementation Details](docs/implementation_details.md) +- [Noodles Integration](docs/noodles_integration.md) +- [Parallel Processing](docs/parallel_processing.md) +- [Reporting Features](docs/reporting_features.md) +- [1000 Genome Frequency Extraction](docs/1000genome_frequency_extraction.md) + ## Usage Do not use this for clinical purposes. - I might have written a bug in the code. 
diff --git a/docs/1000genome_frequency_extraction.md b/docs/1000genome_frequency_extraction.md new file mode 100644 index 0000000..fce2a08 --- /dev/null +++ b/docs/1000genome_frequency_extraction.md @@ -0,0 +1,294 @@ +# 1000 Genomes Frequency Extraction in Pathogenic Variant Finder + +This document provides a detailed explanation of how population allele frequencies are extracted from the 1000 Genomes Project data in the Pathogenic Variant Finder tool, a component that has historically been prone to issues during updates and functional improvements. + +## Overview + +The 1000 Genomes Project provides genomic data for over 2,500 individuals from various populations worldwide. The allele frequencies from this dataset are critical for determining how common a particular variant is in different populations, which can help assess its potential clinical significance. + +In the Pathogenic Variant Finder tool, the 1000 Genomes data is used to supplement ClinVar pathogenic variants with population-specific allele frequencies, enabling more comprehensive variant interpretation. 
+ +```mermaid +flowchart TD + subgraph "Initial Preparation" + A[Matched Variants] --> B[Extract Unique Positions] + B --> C[Create Position Set] + end + + subgraph "1000G Indexed Access" + D[1000G VCF File] --- E[CSI Index] + E --> F[Create IndexedReader] + D --> F + F --> G[Read VCF Header] + end + + subgraph "Query Processing" + C -->|Position by Position| H[Create Query Region] + G --> H + H --> I[Query VCF by Position] + I --> J{Match Found?} + J -->|Yes| K[Process Records] + J -->|No| L[Skip Position] + end + + subgraph "Population Frequencies" + K --> M1[Extract AFR Frequency] + K --> M2[Extract AMR Frequency] + K --> M3[Extract EAS Frequency] + K --> M4[Extract EUR Frequency] + K --> M5[Extract SAS Frequency] + + M1 --> N[Population Frequencies Map] + M2 --> N + M3 --> N + M4 --> N + M5 --> N + end + + N --> O[Annotate Variants] + O --> P[Final Records with Frequencies] + + style E fill:#ad02ad,stroke:#333,stroke-width:2px + style I fill:#ad02ad,stroke:#333,stroke-width:2px + style B fill:#0000f5,stroke:#333,stroke-width:1px + style K fill:#0000f5,stroke:#333,stroke-width:1px + style N fill:#0000f5,stroke:#333,stroke-width:1px +``` + +## Data Source and Structure + +The tool uses the 1000 Genomes Phase 3 VCF file, which is downloaded from the Ensembl FTP server: + +``` +# For GRCh37 (hg19) +https://ftp.ensembl.org/pub/grch37/variation/vcf/homo_sapiens/1000GENOMES-phase_3.vcf.gz + +# For GRCh38 (hg38) +https://ftp.ensembl.org/pub/current_variation/vcf/homo_sapiens/1000GENOMES-phase_3.vcf.gz +``` + +The corresponding CSI (Coordinate-Sorted Index) files are also downloaded to enable efficient random access to the VCF. 
+ +The 1000 Genomes VCF contains frequency information in the INFO field for several populations: +- **AFR**: African populations +- **AMR**: American populations +- **EAS**: East Asian populations +- **EUR**: European populations +- **SAS**: South Asian populations + +## Indexed Access Strategy + +Rather than parsing the entire 1000 Genomes VCF file (which would be computationally expensive), the tool uses an indexed access approach to efficiently query only the specific positions of interest. + +### Workflow for Frequency Extraction + +1. **Identify Variants of Interest** + The tool first processes ClinVar data to identify pathogenic variants from the user's input file. These positions form the "keys of interest" for querying the 1000 Genomes data. + +2. **Create a Set of Unique Genomic Regions** + To optimize performance, duplicate positions are removed by creating a set of unique chromosomes and positions: + + ```rust + let mut unique_regions: HashSet<(String, u32)> = HashSet::new(); + for (chr, pos, _ref, _alt) in keys_of_interest.iter() { + unique_regions.insert((chr.clone(), *pos)); + } + ``` + +3. **Load the CSI Index** + The CSI index is loaded to enable random access to the VCF file: + + ```rust + let index_file = File::open(&onekg_index_path)?; + let mut csi_reader = csi::io::Reader::new(index_file); + let index = csi_reader.read_index()?; + ``` + +4. **Create an Indexed Reader** + The indexed reader combines the VCF file with its index: + + ```rust + let file = File::open(&onekg_file_path)?; + let mut reader = IndexedReader::new(file, index); + let header = reader.read_header()?; + ``` + +5. **Query Each Region** + For each unique region, the tool creates a query using the Noodles library: + + ```rust + let region_str = format!("{}:{}-{}", chr, pos, pos); + let region_obj: Region = region_str.parse()?; + if let Ok(mut query) = reader.query(&header, ®ion_obj) { + // Process query results + } + ``` + +6. 
**Extract Allele Frequencies** + For each record returned by a query, the tool checks if the variant matches one of the keys of interest. If so, it extracts the population-specific allele frequencies from the INFO field. + +## Frequency Extraction Implementation + +The core of the frequency extraction logic is in the processing of each VCF record's INFO field. For a given record, the tool first checks if it contains one of the variants of interest: + +```rust +for (alt_idx, alt) in record_alts.iter().enumerate() { + let key = (record_chr.clone(), record_pos.get() as u32, record_ref.clone(), alt.clone()); + if keys_of_interest.contains(&key) { + // Extract frequencies for this variant + } +} +``` + +### The `get_freq` Helper Function + +A key component is the `get_freq` helper function, which extracts population frequencies for a specific alternate allele from the INFO field: + +```rust +let get_freq = |key: &str, alt_idx: usize| -> Option { + match info.get(&header, key) { + Some(Ok(Some(Value::Array(array)))) => { + match array { + noodles::vcf::variant::record::info::field::value::Array::Float(values) => { + let mut iter = values.iter(); + if let Some(Ok(Some(f))) = iter.nth(alt_idx) { + Some(f64::from(*f)) + } else { + None + } + }, + _ => None, // Handle other Array variants + } + }, + _ => None, + } +}; +``` + +This function: +1. Attempts to retrieve a specific INFO field (like "AFR", "EUR", etc.) +2. Checks if the field exists and contains an array of values +3. Verifies the array contains floating-point values +4. Extracts the value at the index corresponding to the alternate allele being processed +5. 
Converts the value to an f64 and returns it wrapped in an Option + +### Extracting Population-Specific Frequencies + +The tool extracts frequencies for each population group: + +```rust +let afr = get_freq("AFR", alt_idx); +let amr = get_freq("AMR", alt_idx); +let eas = get_freq("EAS", alt_idx); +let eur = get_freq("EUR", alt_idx); +let sas = get_freq("SAS", alt_idx); +``` + +These frequencies are then stored in a HashMap, using the variant's key (chromosome, position, reference allele, alternate allele) as the map key: + +```rust +onekg_freqs.insert(key, (afr, amr, eas, eur, sas)); +``` + +## Integrating Frequencies into Final Results + +After extracting all the frequencies, the tool integrates them with the ClinVar data to create the final records: + +```rust +for r in &temp_results { + let key = (r.chr.clone(), r.pos, r.ref_allele.clone(), r.alt_allele.clone()); + let (af_afr, af_amr, af_eas, af_eur, af_sas) = match onekg_freqs.get(&key) { + None => (None, None, None, None, None), + Some((afr, amr, eas, eur, sas)) => (*afr, *amr, *eas, *eur, *sas), + }; + + // Create final record with these frequencies + let final_rec = FinalRecord { + // Other fields... + af_afr, + af_amr, + af_eas, + af_eur, + af_sas, + }; + final_records.push(final_rec); +} +``` + +## Common Issues and Fragility Points + +The 1000 Genomes frequency extraction has several points of potential fragility: + +### 1. VCF Format and INFO Field Changes + +The INFO fields in the 1000 Genomes VCF may change in structure or naming between different releases. For example: +- Population codes might be changed or expanded +- Array structure might be altered +- Field types might shift between Float, Integer, or String + +### 2. Noodles API Changes + +The Noodles library undergoes regular updates, and its API may change. 
The code that extracts values from INFO fields is particularly sensitive to these changes: + +```rust +match info.get(&header, key) { + Some(Ok(Some(Value::Array(array)))) => { + match array { + noodles::vcf::variant::record::info::field::value::Array::Float(values) => { + // Array access pattern is dependent on Noodles internal implementation + }, + _ => None, + } + }, + _ => None, +} +``` + +Particular areas of concern include: +- Changes to the `Result` and `Option` nesting pattern in the return value of `info.get()` +- Changes in how arrays are represented or accessed +- Changes in type conversion semantics + +### 3. Indexed Access Reliability + +The indexed access approach depends on correctly functioning CSI indexes. Issues can arise if: +- The downloaded index is corrupted or incompatible with the VCF +- The index uses a different coordinate system than expected +- The region query syntax changes in the Noodles library + +### 4. Chromosome Naming Consistency + +Different resources may use different chromosome naming conventions (e.g., "1" vs. "chr1"). The 1000 Genomes data might use a different convention than the ClinVar or user input data. + +## Best Practices for Maintaining the Frequency Extraction Code + +1. **Version-Lock Dependencies**: + Explicitly specify the Noodles dependency version to prevent breaking changes: + + ```toml + noodles = { version = "0.95.0", features = ["vcf", "csi", "core"] } + ``` + +2. **Add Comprehensive Error Handling**: + Every step of the frequency extraction process should have robust error handling with informative error messages. + +3. **Add Defensive Null Checking**: + The code should defensively check for None/null values at every step. The existing pattern of using Option types and match statements helps with this. + +4. 
**Add Logging for Debugging**: + Add detailed logging to track the frequency extraction process: + + ```rust + println!("Querying region {}:{}", chr, pos); + writeln!(log_file, "Found {} matching variants at position {}:{}", count, chr, pos)?; + ``` + +5. **Add Unit Tests**: + Create unit tests with mock 1000 Genomes data to verify that the frequency extraction works correctly. + +6. **Add Data Validation**: + Validate the extracted frequencies to ensure they are within expected ranges (0.0 to 1.0). + +## Conclusion + +The 1000 Genomes frequency extraction is a critical component of the Pathogenic Variant Finder that enhances the tool's ability to interpret variants by providing population-specific allele frequencies. By understanding the implementation details, potential fragility points, and following best practices, the code can be maintained and updated with minimal risk of introducing bugs. \ No newline at end of file diff --git a/docs/implementation_details.md b/docs/implementation_details.md new file mode 100644 index 0000000..240c2e4 --- /dev/null +++ b/docs/implementation_details.md @@ -0,0 +1,451 @@ +# Pathogenic Variant Finder: Implementation Details + +This document provides an in-depth explanation of how the Pathogenic Variant Finder tool works, including its architecture, data flow, and key implementation details. + +## Table of Contents + +1. [Overview](#overview) +2. [Program Flow](#program-flow) +3. [Key Data Structures](#key-data-structures) +4. [Key Functions and Components](#key-functions-and-components) +5. [Performance Optimizations](#performance-optimizations) +6. [File Format Handling](#file-format-handling) +7. 
[Future Enhancement Opportunities](#future-enhancement-opportunities) + +## Overview + +The Pathogenic Variant Finder is a Rust-based tool that identifies potentially pathogenic genetic variants by cross-referencing user-supplied VCF files with the ClinVar database and incorporating population frequency data from the 1000 Genomes Project. It supports both GRCh37 (hg19) and GRCh38 (hg38) genome builds. + +The tool leverages Rust's performance capabilities along with several key libraries: +- **Rayon** for parallel processing +- **Noodles** for efficient bioinformatics file format handling +- **Flate2** for gzip compression/decompression +- **Reqwest** for downloading reference data + +## Program Flow + +The program follows this high-level flow: + +1. **Parse Command-Line Arguments**: Process user inputs (genome build, input file). +2. **Setup Environment**: Ensure reference data is available or download it. +3. **Process Reference Data**: + - Parse ClinVar VCF to identify pathogenic variants. + - Parse ClinVar TSV for detailed variant annotations. +4. **Process User Input**: + - Parse user's VCF to extract variants. +5. **Match and Annotate Variants**: + - Identify user variants that match pathogenic variants in ClinVar. + - Enrich matched variants with 1000 Genomes allele frequency data. + - Add detailed annotations from ClinVar TSV. +6. **Output Results**: + - Generate CSV with annotated variants. + - Create summary statistics text file. + - Produce interactive markdown report (unless disabled). 
+ +```mermaid +flowchart TD + A[Command Line Arguments] --> B[Setup Environment] + + B --> C1[Download ClinVar VCF] + B --> C2[Download ClinVar TSV] + B --> C3[Download 1000G Data] + + subgraph "Reference Processing" + C1 --> D1[Parse ClinVar VCF] + D1 --> E1[ClinVarMap] + C2 --> D2[Parse ClinVar TSV] + D2 --> E2[TSV Annotations] + end + + subgraph "User Data Processing" + F[Input VCF] --> G[Parse User VCF] + G --> H[InputVariants] + end + + E1 --> I[Variant Matching] + H --> I + + I --> J[Matched Variants] + J --> K[Query 1000G Database] + C3 --> K + + K --> L[Frequency Enriched Variants] + E2 --> M[Add TSV Annotations] + L --> M + + M --> N[FinalRecords] + + N --> O1[Generate CSV] + N --> O2[Generate Statistics] + N --> O3[Generate Markdown] + + style I fill:#0000f5,stroke:#333,stroke-width:2px + style M fill:#0000f5,stroke:#333,stroke-width:2px + style E1 fill:#007d00,stroke:#333,stroke-width:1px + style E2 fill:#007d00,stroke:#333,stroke-width:1px + style H fill:#007d00,stroke:#333,stroke-width:1px + style N fill:#007d00,stroke:#333,stroke-width:1px + style J fill:#cc6804,stroke:#333,stroke-width:1px +``` + +## Key Data Structures + +### ClinVarRecord + +```rust +struct ClinVarRecord { + chr: String, + pos: u32, + ref_allele: String, + alt_allele: String, + clnsig: String, + is_alt_pathogenic: bool, + gene: Option, + allele_id: Option, + clnrevstat: Option, + clndn: Option, + af_esp: Option, + af_exac: Option, + af_tgp: Option, +} +``` + +This structure holds essential information extracted from the ClinVar VCF file. Each instance represents a single variant with clinical significance information. The `is_alt_pathogenic` flag indicates whether the alternate allele is classified as pathogenic. + +### InputVariant + +```rust +struct InputVariant { + chr: String, + pos: u32, + ref_allele: String, + alts: Vec<(String, bool)>, // (alt_allele, is_present_in_genotype) + genotype: String, +} +``` + +This structure represents a variant from the user's input VCF file. 
The `alts` field contains a vector of tuples, each holding an alternate allele and a boolean indicating whether that allele is present in the sample's genotype. + +### ClinVarTsvRecord + +```rust +struct ClinVarTsvRecord { + molecular_consequence: Option, + functional_consequence: Option, + mode_of_inheritance: Option, + preferred_values: Option, + citations: Option, + comments: Option, + family_data: Option, + record_status: Option, + description: Option, + date_last_evaluated: Option, +} +``` + +This structure contains detailed annotations extracted from the ClinVar TSV summary file, providing deeper context about variants. + +### FinalRecord + +```rust +struct FinalRecord { + // Basic variant information + chr: String, + pos: u32, + ref_allele: String, + alt_allele: String, + + // ClinVar information + clnsig: String, + is_alt_pathogenic: bool, + gene: Option, + allele_id: Option, + genotype: String, + review_stars: u8, + clndn: Option, + + // Allele frequencies + af_esp: Option, + af_exac: Option, + af_tgp: Option, + af_afr: Option, + af_amr: Option, + af_eas: Option, + af_eur: Option, + af_sas: Option, + + // Detailed ClinVar TSV annotations + molecular_consequence: Option, + functional_consequence: Option, + mode_of_inheritance: Option, + preferred_values: Option, + citations: Option, + comments: Option, + family_data: Option, + record_status: Option, + description: Option, + date_last_evaluated: Option, +} +``` + +This comprehensive structure combines all information about a variant, including basic coordinates, clinical significance, genotype information, allele frequencies from different populations, and detailed annotations. 
+ +## Key Functions and Components + +### Command-Line Argument Parsing + +The tool uses Clap to parse command-line arguments: + +```rust +#[derive(Parser)] +struct Args { + /// Genome build: must be "GRCh37" (hg19) or "GRCh38" (hg38) + #[arg(short, long)] + build: String, + + /// Path to the input VCF (can be uncompressed or gzipped) + #[arg(short, long, value_name = "FILE")] + input: PathBuf, + + /// Include variants of uncertain significance (VUS) + #[arg(long)] + include_vus: bool, + + /// Include benign variants + #[arg(long)] + include_benign: bool, + + /// Generate markdown report (enabled by default) + #[arg(long, default_value = "true")] + markdown_report: bool, +} +``` + +This defines the following arguments: +- `build`: The genome build (GRCh37 or GRCh38) +- `input`: Path to the user's VCF file +- `include_vus`: Flag to include variants of uncertain significance +- `include_benign`: Flag to include benign variants +- `markdown_report`: Flag to toggle generation of markdown report (enabled by default) + +### Reference Data Management + +#### `download_file` + +This function handles downloading reference files with a progress bar. 
It attempts to use parallel chunk downloads when supported by the server, falling back to a single-threaded download otherwise: + +```rust +fn download_file( + url: &str, + out_path: &Path, + log_file: &mut File, +) -> Result<(), DownloadError> +``` + +Key features: +- Checks if the server supports HTTP range requests for parallel downloads +- Uses atomic counters for tracking download progress +- Implements a fallback mechanism for servers not supporting range requests +- Provides real-time progress visualization + +### VCF Parsing + +#### `parse_clinvar_vcf_gz` + +This function parses the ClinVar VCF to extract pathogenic variants: + +```rust +fn parse_clinvar_vcf_gz( + path_gz: &Path, + log_file: &mut File, +) -> Result<(ClinVarMap, String), Box> +``` + +It utilizes Rayon for parallel processing of VCF lines, significantly improving performance on multi-core systems. The function returns a map of variants keyed by their genomic coordinates and a string containing the file date. + +#### `parse_input_line` + +This helper function parses a single line from the user's VCF file: + +```rust +fn parse_input_line(line: &str) -> Option<(String, InputVariant)> +``` + +It normalizes chromosome names (removing "chr" prefix and handling mitochondrial chromosomes), extracts genotype information, and determines which alternate alleles are present in the sample. + +#### `parse_input_vcf` + +This function processes the user's input VCF file, handling both compressed and uncompressed formats: + +```rust +fn parse_input_vcf(path: &Path, log_file: &mut File) -> Result, Box> +``` + +It dispatches the parsing to Rayon's parallel iterator for efficient multi-threaded processing. 
+ +### Noodles Integration + +The tool uses Noodles for efficient handling of genomic file formats, particularly for indexing and querying the 1000 Genomes VCF: + +```rust +let mut csi_reader = csi::io::Reader::new(index_file); +let index = csi_reader.read_index()?; +let mut reader = vcf::io::indexed_reader::IndexedReader::new(file, index); +``` + +This allows efficient random access to the 1000 Genomes VCF file by genomic position, avoiding the need to read the entire file. + +### Variant Matching and Annotation + +The tool matches user variants against ClinVar pathogenic variants using a HashMap for efficient lookups: + +```rust +let temp_results: Vec = input_variants + .par_iter() + .flat_map_iter(|(_, iv)| { + let mut local_found = Vec::new(); + for (alt_a, is_present) in &iv.alts { + if !is_present { + continue; + } + let key = (iv.chr.clone(), iv.pos, iv.ref_allele.clone(), alt_a.clone()); + if let Some(cv) = clinvar_map.get(&key) { + if !cv.is_alt_pathogenic { + continue; + } + // Add variant to results + // ... + } + } + local_found + }) + .collect(); +``` + +This approach: +1. Processes variants in parallel using Rayon +2. Checks if each alternate allele is present in the genotype +3. Looks up each variant in the ClinVar map +4. Filters for pathogenic variants + +### 1000 Genomes Frequency Integration + +The tool extracts population-specific allele frequencies from the 1000 Genomes VCF using Noodles' indexed querying capabilities: + +```rust +for (chr, pos) in unique_regions { + let region_str = format!("{}:{}-{}", chr, pos, pos); + let region_obj: Region = region_str.parse()?; + if let Ok(mut query) = reader.query(&header, ®ion_obj) { + // Process query results + } +} +``` + +This approach: +1. Creates a set of unique regions (chromosome + position) to query +2. Parses each region into a Noodles Region object +3. Queries the indexed VCF at each region +4. 
Extracts population-specific frequencies from the INFO field + +## Performance Optimizations + +### Parallel Processing + +The tool extensively uses Rayon for parallel processing: + +```rust +let chunk_maps: Vec = lines + .par_iter() + .map(|line| { + // Process line + }) + .collect(); +``` + +This enables efficient utilization of all available CPU cores, significantly improving performance for parsing large files. + +### Efficient Data Structures + +The tool uses HashMaps and HashSets for fast lookups: + +```rust +type ClinVarMap = HashMap<(String, u32, String, String), ClinVarRecord>; +``` + +This key structure enables O(1) lookups when matching variants by their genomic coordinates. + +### Indexed Access + +For the 1000 Genomes data, the tool uses indexed access via Noodles to avoid scanning the entire file: + +```rust +let mut reader = vcf::io::indexed_reader::IndexedReader::new(file, index); +// ... +let query = reader.query(&header, ®ion_obj)?; +``` + +This allows for efficient retrieval of variants at specific genomic positions. + +### Chunked Downloads + +The `download_file` function implements parallel chunk downloads for faster retrieval of reference data: + +```rust +let num_chunks = num_cpus::get(); +let chunk_size = total_size / num_chunks as u64; +// ... +for (i, (start, end)) in ranges.into_iter().enumerate() { + // Download chunk in parallel +} +``` + +This approach utilizes available bandwidth more efficiently. + +## File Format Handling + +### VCF Processing + +The tool uses a combination of custom parsing and Noodles for VCF handling: + +1. Custom parsing for the ClinVar VCF and user input VCF to extract relevant fields quickly +2. Noodles for indexed access to the 1000 Genomes VCF + +### Gzip Handling + +The tool handles compressed files using Flate2: + +```rust +let decoder = MultiGzDecoder::new(file); +let reader = BufReader::new(decoder); +``` + +This allows processing of gzipped VCF files without manual decompression. 
+ +### CSV Output Generation + +The tool uses the CSV crate for structured output: + +```rust +let mut wtr = csv::Writer::from_writer(std::io::stdout()); +wtr.write_record(&[ + "Chromosome", + "Position", + // More fields... +])?; +``` + +This ensures standards-compliant CSV generation with proper escaping of special characters. + +## Future Enhancement Opportunities + +1. **User-specified filters**: Allow filtering variants by clinical significance, allele frequency, etc. +2. **Integration with additional databases**: Add support for other variant databases like OMIM, ClinGen, etc. +3. **Cache management**: Implement version checking and automatic updates for reference data. +4. **Advanced variant annotation**: Add support for annotating variants with functional predictions, conservation scores, etc. +5. **API integration**: Provide a library interface for integration into other tools. +6. **Configurable output formats**: Support additional output formats like JSON, VCF, etc. +7. **Genome build conversion**: Add tools to convert variants between different genome builds. +8. **Unit and integration tests**: Improve test coverage for more robust development. + +By understanding the implementation details outlined in this document, developers can more effectively maintain and extend the Pathogenic Variant Finder tool to meet evolving requirements in genetic variant analysis. \ No newline at end of file diff --git a/docs/noodles_integration.md b/docs/noodles_integration.md new file mode 100644 index 0000000..f751a97 --- /dev/null +++ b/docs/noodles_integration.md @@ -0,0 +1,234 @@ +# Noodles Integration in Pathogenic Variant Finder + +This document provides a detailed explanation of how the [Noodles](https://github.com/zaeleus/noodles) Rust libraries are integrated into the Pathogenic Variant Finder tool for efficient bioinformatics file handling. 
+ +## Overview of Noodles + +Noodles is a collection of Rust libraries for handling various bioinformatics file formats, including VCF (Variant Call Format), CSI (Coordinate-Sorted Index), and many others. It provides both high-level operations and low-level parsers and writers for these formats. + +In the Pathogenic Variant Finder, we specifically use three core components of Noodles: + +1. **noodles-vcf**: For working with Variant Call Format files +2. **noodles-csi**: For handling Coordinate-Sorted Indices +3. **noodles-core**: For common data structures like genomic regions + +```mermaid +flowchart TD + subgraph "Noodles Components" + A1[noodles-core] --- A2[Region] + B1[noodles-csi] --- B2[CSI Index] + C1[noodles-vcf] --- C2[VCF Records] + C1 --- C3[Headers] + C1 --- C4[INFO Fields] + end + + subgraph "Indexed VCF Access" + D[1000G VCF File] --- E[CSI Index File] + E --> F[IndexedReader] + D --> F + + F --> G[Read Header] + G --> H[Query Region] + + subgraph "Query Processing" + H --> I[Create Region] + I --> J[Query by Position] + J --> K[Process Records] + end + end + + subgraph "Application Integration" + L[Genomic Position] --> I + K --> M[Extract Frequencies] + M --> N[Variant Annotation] + end + + style B2 fill:#ad02ad,stroke:#333,stroke-width:2px + style F fill:#ad02ad,stroke:#333,stroke-width:2px + style J fill:#ad02ad,stroke:#333,stroke-width:2px +``` + +## Dependency Management + +The tool manages Noodles dependencies through the umbrella crate with specific features enabled: + +```toml +# Meta-crate with features enabled +noodles = { version = "0.95.0", features = ["vcf", "csi", "core"] } +``` + +This approach ensures we have access to all needed functionality through a single dependency while only including the components we actually need. 
+ +## Module Imports + +The relevant Noodles modules are imported at the top of the file: + +```rust +// Using the individual crates +use noodles_vcf as vcf; +use noodles_csi as csi; +use noodles_core::region::Region; +``` + +## Indexed VCF Access + +### Loading CSI Index + +Noodles is primarily used for indexed access to the 1000 Genomes VCF file. This begins with loading the CSI index: + +```rust +let index_file = File::open(&onekg_index_path)?; +let mut csi_reader = csi::io::Reader::new(index_file); +let index = csi_reader.read_index()?; +``` + +The CSI index provides a mapping from genomic coordinates to file positions, enabling efficient random access to specific regions within a potentially large VCF file. + +### Creating an Indexed Reader + +Once the index is loaded, an indexed reader is created: + +```rust +let file = File::open(&onekg_file_path)?; +let mut reader = vcf::io::indexed_reader::IndexedReader::new(file, index); +``` + +This reader combines the raw VCF file with its index to allow position-based queries. + +### Reading the VCF Header + +The header contains important metadata about the VCF file, including descriptions of INFO fields that will be needed later: + +```rust +let header = reader.read_header()?; +``` + +This header is required both for understanding the structure of the records and for subsequent queries. + +### Querying by Region + +A key feature of indexed access is the ability to query specific genomic regions: + +```rust +let region_str = format!("{}:{}-{}", chr, pos, pos); +let region_obj: Region = region_str.parse()?; +if let Ok(mut query) = reader.query(&header, &region_obj) { + // Process results +} +``` + +This creates a Region object from a string representation (e.g., "1:12345-12345") and uses it to query the indexed VCF. 
+ +### Iterating Through Query Results + +The query returns an iterator over records in the specified region: + +```rust +while let Some(record) = query.next() { + let record = record?; + // Process the record +} +``` + +## VCF Record Processing + +### Accessing Record Fields + +Noodles provides structured access to VCF record fields: + +```rust +let record_chr = record.reference_sequence_name().to_string(); +let record_pos = record.variant_start().expect("Failed to get variant start")?; +let record_ref = record.reference_bases().to_string(); +``` + +This approach is type-safe and handles the complexities of the VCF format. + +### Working with Alternate Alleles + +Multiple alternate alleles are handled through Noodles' iterator interface: + +```rust +let record_alts: Vec<String> = record + .alternate_bases() + .iter() + .filter_map(Result::ok) + .map(|a| a.to_string()) + .collect(); +``` + +### Accessing INFO Fields + +INFO fields contain additional annotations, including allele frequencies. Noodles provides structured access to these fields: + +```rust +let info = record.info(); +``` + +### Extracting Typed Values + +Noodles handles the type conversion from INFO fields: + +```rust +match info.get(&header, key) { + Some(Ok(Some(Value::Array(array)))) => { + match array { + vcf::record::info::field::value::Array::Float(values) => { + let mut iter = values.iter(); + if let Some(Ok(Some(f))) = iter.nth(alt_idx) { + Some(f64::from(*f)) + } else { + None + } + }, + _ => None, // Handle other Array variants + } + }, + _ => None, +} +``` + +This code checks if a specific INFO field exists, verifies it's an array of floats, and extracts the value at a specific index (corresponding to the alternate allele). + +## Benefits of Using Noodles + +1. **Random Access**: Enables efficient querying of specific regions without scanning the entire file +2. **Type Safety**: Provides strongly-typed interfaces to VCF data structures +3. 
**Error Handling**: Uses Rust's Result type for robust error handling +4. **Standard Compliance**: Implements proper VCF specification handling +5. **Performance**: Optimized for efficiency in both memory and CPU usage + +## Custom Parsing vs. Noodles + +The Pathogenic Variant Finder uses custom parsing for some VCF files (ClinVar VCF and user input) while using Noodles for others (1000 Genomes). This design decision was made for specific reasons: + +1. **Custom Parsing**: Used for full file scans where specific fields need to be extracted and where Rayon's parallel processing can be applied line-by-line. + +2. **Noodles**: Used when random access is required, particularly for the 1000 Genomes VCF where we only need to access specific positions rather than scanning the entire file. + +This hybrid approach combines the flexibility of custom parsing with the efficient indexed access of Noodles. + +## Error Handling with Noodles + +Noodles uses Rust's Result type for error handling. The Pathogenic Variant Finder properly propagates these errors: + +```rust +let region_obj: Region = region_str.parse()?; +if let Ok(mut query) = reader.query(&header, &region_obj) { + while let Some(record) = query.next() { + let record = record?; + // Process record + } +} +``` + +The `?` operator is used to propagate errors, ensuring robust error handling. + +## Future Noodles Integration Opportunities + +1. **Replace Custom Parsing**: Consider using Noodles for all VCF parsing to ensure standard compliance. +2. **Async Support**: Leverage Noodles' async features for non-blocking I/O. +3. **Writer Integration**: Use Noodles' writer functionality for outputting results in VCF format. +4. **Additional Formats**: Integrate other Noodles-supported formats like BAM or CRAM for future extensions. 
+ +By understanding the Noodles integration, developers can leverage its capabilities to enhance the Pathogenic Variant Finder with additional features while maintaining efficiency and standard compliance. \ No newline at end of file diff --git a/docs/parallel_processing.md b/docs/parallel_processing.md new file mode 100644 index 0000000..34ace91 --- /dev/null +++ b/docs/parallel_processing.md @@ -0,0 +1,322 @@ +# Parallel Processing in Pathogenic Variant Finder + +This document details how the Pathogenic Variant Finder leverages Rust's parallel processing capabilities, primarily through the Rayon library, to achieve high performance when processing genomic data. + +## Overview of Parallel Processing Strategy + +The Pathogenic Variant Finder employs parallel processing at multiple stages: + +1. **Data Downloading**: Uses parallel chunk downloads for reference databases +2. **VCF Parsing**: Processes VCF lines in parallel using Rayon +3. **Variant Matching**: Conducts parallel lookups against reference databases +4. **Frequency Extraction**: Processes genomic regions concurrently + +These parallel approaches significantly improve performance, especially on multi-core systems. 
+ +```mermaid +flowchart TD + subgraph "Input Processing" + A[Input Data] --> B{Split Work} + B -->|Chunk 1| C1[Worker Thread 1] + B -->|Chunk 2| C2[Worker Thread 2] + B -->|Chunk 3| C3[Worker Thread 3] + B -->|Chunk N| C4[Worker Thread N] + C1 --> D{Combine Results} + C2 --> D + C3 --> D + C4 --> D + D --> E[Final Result] + end + + subgraph "Parallel Components" + PA[VCF Parsing] --> PB[Variant Matching] + PB --> PC[Frequency Extraction] + + PA -.-> R[Rayon Thread Pool] + PB -.-> R + PC -.-> R + end + + subgraph "Download Parallelism" + Z[File Download] --> Y{Split File} + Y -->|Range 1| X1[Download Thread 1] + Y -->|Range 2| X2[Download Thread 2] + Y -->|Range N| X3[Download Thread N] + X1 --> W[Reassemble File] + X2 --> W + X3 --> W + end + + style R fill:#ba00ba,stroke:#333,stroke-width:2px + style B fill:#0000b5,stroke:#333,stroke-width:1px + style D fill:#0000b5,stroke:#333,stroke-width:1px + style Y fill:#0000b5,stroke:#333,stroke-width:1px + style W fill:#0000b5,stroke:#333,stroke-width:1px +``` + +## Rayon Integration + +[Rayon](https://github.com/rayon-rs/rayon) is a data parallelism library for Rust that makes it easy to convert sequential computations into parallel ones. It's particularly well-suited for this application due to its work-stealing scheduler and simple API. + +### Dependencies + +Rayon is included as a dependency in Cargo.toml: + +```toml +# Rayon for parallel iteration (multi-threaded processing) +rayon = "1.10" +``` + +### Imports + +Rayon's parallel iterators are imported at the top of the file: + +```rust +use rayon::prelude::*; +``` + +This brings `par_iter()`, `par_iter_mut()`, and related extension methods into scope. + +## Parallel VCF Parsing + +One of the most computationally intensive tasks is parsing the ClinVar VCF file. 
Rayon is used to parallelize this process: + +```rust +let chunk_maps: Vec = lines + .par_iter() + .map(|line| { + pb.inc(1); + match parse_clinvar_line(line) { + None => HashMap::new(), + Some(records) => { + let mut local_map = HashMap::with_capacity(records.len()); + for r in records { + let key = (r.chr.clone(), r.pos, r.ref_allele.clone(), r.alt_allele.clone()); + local_map.insert(key, r); + } + local_map + } + } + }) + .collect(); +``` + +Key aspects of this implementation: + +1. **Local Maps**: Each thread builds its own local HashMap +2. **Collection Step**: Results are combined at the end +3. **Progress Tracking**: Atomic counter updates track progress +4. **Thread Safety**: Rayon ensures thread-safe parallel execution + +This approach allows the parser to efficiently utilize all available CPU cores. + +### Similar Pattern for User VCF + +The same pattern is used for parsing the user's input VCF: + +```rust +let chunk_variants: Vec> = lines + .par_iter() + .map(|line| { + pb.inc(1); + match parse_input_line(line) { + None => vec![], + Some((l, iv)) => vec![(l, iv)], + } + }) + .collect(); +``` + +## Parallel Variant Matching + +When matching user variants against ClinVar data, Rayon's `par_iter()` is combined with `flat_map_iter()` for efficient parallel processing: + +```rust +let temp_results: Vec = input_variants + .par_iter() + .flat_map_iter(|(_, iv)| { + let mut local_found = Vec::new(); + for (alt_a, is_present) in &iv.alts { + if !is_present { + continue; + } + let key = (iv.chr.clone(), iv.pos, iv.ref_allele.clone(), alt_a.clone()); + if let Some(cv) = clinvar_map.get(&key) { + if !cv.is_alt_pathogenic { + continue; + } + // Add to local results + local_found.push(TempRecord { /* ... */ }); + } + } + pb.inc(1); + local_found + }) + .collect(); +``` + +This pattern: +1. Processes variants in parallel +2. Returns multiple results per input variant (via `flat_map_iter`) +3. 
Combines results from all threads automatically + +## Thread-Safe Progress Reporting + +The tool uses progress bars from the `indicatif` crate which are designed to be thread-safe: + +```rust +let pb = ProgressBar::new(total); +pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} ({eta})") + .unwrap() + .progress_chars("=>-"), +); +``` + +Progress is updated from multiple threads: + +```rust +.par_iter() +.map(|line| { + pb.inc(1); + // Process line +}) +``` + +The `ProgressBar` handles thread safety internally, ensuring accurate progress reporting. + +## Parallel Downloads + +Beyond Rayon, parallel processing is also used for downloading reference databases: + +```rust +fn download_file( + url: &str, + out_path: &Path, + log_file: &mut File, +) -> Result<(), DownloadError> { + // ... + let num_chunks = num_cpus::get(); + let chunk_size = total_size / num_chunks as u64; + // ... + + for (i, (start, end)) in ranges.into_iter().enumerate() { + let client = client.clone(); + let url = url.to_string(); + let progress = Arc::clone(&progress); + let handle = thread::spawn(move || -> Result<(usize, Vec), DownloadError> { + // Download chunk + }); + handles.push(handle); + } + // ... +} +``` + +This approach: +1. Uses standard Rust threads rather than Rayon +2. Divides the download into chunks based on CPU count +3. Downloads chunks in parallel +4. Uses atomic counters for combined progress tracking +5. Reassembles the chunks in the correct order + +## Coordination and Data Sharing + +### Atomic Progress Tracking + +For parallel downloads, atomic counters track progress across threads: + +```rust +let progress = Arc::new(AtomicU64::new(0)); +// ... +progress.fetch_add(n as u64, Ordering::Relaxed); +// ... 
+while progress.load(Ordering::Relaxed) < total_size { + pb.set_position(progress.load(Ordering::Relaxed)); + std::thread::sleep(std::time::Duration::from_millis(100)); +} +``` + +This ensures thread-safe coordination without locks. + +### Shared Immutable State + +The tool follows Rust's ownership principles, sharing immutable state across threads: + +```rust +let temp_results: Vec = input_variants + .par_iter() // Shared immutable access to input_variants + .flat_map_iter(|(_, iv)| { + // ... + if let Some(cv) = clinvar_map.get(&key) { // Shared immutable access to clinvar_map + // ... + } + }) + .collect(); +``` + +This approach avoids locks while ensuring thread safety. + +## Performance Considerations + +### Dynamic Thread Pool + +Rayon automatically creates a thread pool optimized for the current system: + +```rust +println!( + " -> Loaded user VCF ({} lines). Using {} CPU cores.", + lines.len(), + num_cpus::get() +); +``` + +The thread pool size adapts to the available cores, maximizing CPU utilization. + +### Work Stealing + +Rayon's work-stealing scheduler dynamically balances workload across threads. This is particularly beneficial when processing VCF files with varying line complexities. + +### Combining Results + +Results from parallel operations are efficiently combined: + +```rust +// Combine ClinVar maps from parallel processing +let mut final_map = HashMap::new(); +for cm in chunk_maps { + final_map.extend(cm); +} +``` + +This approach minimizes lock contention during the parallel processing phase. + +## CPU Utilization + +The parallel approach ensures high CPU utilization: + +1. **Auto-detection**: Uses `num_cpus` to detect available cores +2. **Adaptive Chunking**: Divides work based on available parallelism +3. **Work Balancing**: Rayon's work-stealing adjusts to varying workloads +4. 
**Minimal Contention**: Local result aggregation minimizes lock contention + +## Performance Gains + +The parallel approach provides significant performance benefits: + +1. **Near-linear Scaling**: Performance improves with additional cores +2. **Reduced I/O Bottlenecks**: Parallel downloads maximize bandwidth utilization +3. **Balanced CPU Usage**: Work-stealing ensures all cores remain busy +4. **Memory Efficiency**: Local data aggregation minimizes shared state + +## Future Enhancements + +Potential improvements to parallel processing: + +1. **Configurable Parallelism**: Allow users to specify thread count +2. **I/O-Bound Optimizations**: Further optimize for I/O-bound operations +3. **Better Work Distribution**: Implement smarter task division strategies +4. **Pipeline Parallelism**: Implement producer-consumer patterns for streaming processing + +By leveraging Rayon and other parallel processing techniques, the Pathogenic Variant Finder achieves high performance while maintaining code simplicity and safety through Rust's ownership system. \ No newline at end of file diff --git a/docs/reporting_features.md b/docs/reporting_features.md new file mode 100644 index 0000000..e3638b4 --- /dev/null +++ b/docs/reporting_features.md @@ -0,0 +1,215 @@ +# Reporting Features in Pathogenic Variant Finder + +This document details the output reporting features of the Pathogenic Variant Finder tool, including the naming conventions, file formats, and statistical information provided in the output files. + +## Output Files Overview + +The Pathogenic Variant Finder generates three primary output files for each analysis run: + +1. **CSV Report File**: Contains detailed information about all variants detected that match the filtering criteria. +2. **Statistics Text File**: Provides a summary of the analysis settings, results, and distribution statistics. +3. **Markdown Report File**: Offers a comprehensive, human-readable report with organized variant information and explanations. 
+ +All files are automatically generated in the `reports/` directory upon completion of an analysis. The markdown report can be disabled using the `--markdown-report=false` command-line option. + +## Naming Conventions + +The output files follow a specific naming convention to help with organization and identification: + +``` +[input_filename]_[analysis_type]_[timestamp].(csv|txt|md) +``` + +Where: +- `input_filename`: The base name of the input VCF file (without extension) +- `analysis_type`: Indicates the types of variants included in the analysis + - `pathogenic`: Only pathogenic variants + - `pathogenic_vus`: Pathogenic and variants of uncertain significance + - `pathogenic_benign`: Pathogenic and benign variants + - `pathogenic_vus_benign`: All variant types +- `timestamp`: Date and time of the analysis in format `YYYYMMDD_HHMMSS` +- File extensions: `.csv` for the variant report, `_stats.txt` for the statistics file, and `.md` for the markdown report + +## CSV Report Format + +The CSV report contains the following columns: + +| Column | Description | +|--------|-------------| +| Chromosome | Chromosome identifier | +| Position | Genomic position of the variant | +| Reference Allele | The reference allele | +| Alternate Allele | The alternate allele | +| Clinical Significance | ClinVar classification of clinical significance | +| Is Alt Pathogenic | Boolean indicating if the alternate allele is pathogenic | +| Significance Category | Simplified category (pathogenic, benign, vus, conflicting) | +| Gene | Gene symbol | +| ClinVar Allele ID | ClinVar identifier for the allele | +| CLNDN | Clinical disease name associated with the variant | +| Genotype | Genotype of the sample (e.g., 0/1, 1/1) | +| Review Stars | ClinVar review status represented as stars (0-4) | +| AF_ESP | Allele frequency from Exome Sequencing Project | +| AF_EXAC | Allele frequency from Exome Aggregation Consortium | +| AF_TGP | Allele frequency from 1000 Genomes Project | +| AF_AFR | African 
population allele frequency (1000 Genomes) | +| AF_AMR | American population allele frequency (1000 Genomes) | +| AF_EAS | East Asian population allele frequency (1000 Genomes) | +| AF_EUR | European population allele frequency (1000 Genomes) | +| AF_SAS | South Asian population allele frequency (1000 Genomes) | +| Molecular Consequence | Effect on DNA/RNA | +| Functional Consequence | Effect on protein function | +| Mode of Inheritance | Pattern of inheritance | +| Preferred Values | Preferred variants | +| Citations | Academic citations | +| Comments | Additional notes | +| Family Data | Information about family genetics | +| Record Status | Status of the record | +| Description | Detailed description of the variant | +| Date Last Evaluated | Date of last evaluation in ClinVar | + +Variants in the CSV file are sorted by chromosome and position for ease of analysis. + +## Statistics File Format + +The statistics file provides a comprehensive summary of the analysis in a readable text format. 
It includes: + +### Analysis Settings Section +- Input file path +- Genome build used (e.g., GRCH38) +- Which variant types were included (VUS, benign) +- Command used to run the analysis +- Date and time of analysis + +### Analysis Results Section +- Total number of variants processed +- Total number of variants reported +- Number of unique genes covered + +### Variant Classifications Section +- Count of each variant classification: + - Pathogenic + - Benign + - VUS (Variants of Uncertain Significance) + +### Allele Frequency Distribution Section +- Distribution of variants across different allele frequency ranges for each population: + - < 0.1% (rare variants) + - 0.1% - 1% (uncommon variants) + - 1% - 5% (low-frequency variants) + - 5% - 10% (common variants) + - > 10% (high-frequency variants) +- Population codes: + - AFR: African + - AMR: American + - EAS: East Asian + - EUR: European + - SAS: South Asian + +## Markdown Report Format + +The markdown report presents a comprehensive, human-readable summary of the analysis with the following sections: + +### Header and Metadata +- Report title and generation timestamp +- Analysis settings: input file, genome build, variant types analyzed +- Command used to generate the report + +### Table of Contents +- Dynamic navigation links to all sections present in the report + +### Disclaimer +- Clear statement about the limitations of the tool and appropriate usage + +### Summary Section +- Overview of total variants processed and reported +- Count of unique genes with identified variants +- Summary table with counts for each variant classification type + +### Variant Sections (Dynamically Generated) +- Separate sections for each variant classification present in the results: + - Pathogenic Variants + - Likely Pathogenic Variants + - Variants of Uncertain Significance (if `--include-vus` is specified) + - Variants with Conflicting Interpretations (if `--include-vus` is specified) + - Benign Variants (if `--include-benign` 
is specified) + - Likely Benign Variants (if `--include-benign` is specified) + +### Variant Organization +- Variants are grouped by gene and sorted by chromosome and position +- Each variant section includes: + - Summary table with key information + - Detailed information for each variant + +### Understanding Section +- Explanations of key terms and classifications +- Information about data sources and interpretation guidelines +- Guidance on how to use the report + +The markdown report is designed to be both readable as plain text and rendered properly with markdown viewers for enhanced visualization. + +## Example Output + +### Example Statistics File +``` +=== Pathogenic Variant Finder: Analysis Report === +Date/Time: 2025-03-25T23:40:18.591768597+00:00 + +=== Analysis Settings === +Input File: /workspaces/pathogenic/data/TrevorCampbell-SQ63A788-30x-WGS-Sequencing_com-02-22-25.snp-indel.genome.vcf.gz +Genome Build: GRCH38 +Include VUS: true +Include Benign: true +Command Used: pathogenic -b GRCh38 -i data/TrevorCampbell-SQ63A788-30x-WGS-Sequencing_com-02-22-25.snp-indel.genome.vcf.gz -v -n + +=== Analysis Results === +Total Variants Processed: 13027212 +Total Variants Reported: 36853 +Unique Genes: 4882 + +=== Variant Classifications === +benign: 36647 +pathogenic: 11 +vus: 195 + +=== Allele Frequency Distribution === +EAS_0.1% - 1%: 1339 +AMR_1% - 5%: 1165 +AFR_5% - 10%: 2162 +... +``` + +## Usage Recommendations + +1. **Archiving Reports**: All reports have timestamps to prevent overwriting, making it safe to run multiple analyses on the same file with different settings. + +2. **Comparing Analyses**: The naming convention makes it easy to compare different analyses of the same input file. + +3. **Filtering Results**: The CSV format allows for easy filtering and further analysis in spreadsheet software or programming languages. + +4. **Statistical Analysis**: The statistics file provides a quick overview without having to parse the entire CSV file. 
+ +## Implementation Details + +The reporting functionality is implemented in the `main.rs` file. Key components include: + +1. **Filename Generation**: Input filename extraction, analysis type determination, and timestamp generation. + +2. **CSV Writing**: Sorted variant data is written to a CSV file with appropriate headers. + +3. **Statistics Collection**: Counts of various metrics are collected during processing and summarized. + +4. **Statistics File Generation**: A formatted text file is created with sections for different statistical categories. + +## Future Enhancements + +Potential improvements to reporting features: + +1. **HTML Report Generation**: Interactive HTML reports with visualization of variant distributions. + +2. **Advanced Filtering Options**: Ability to specify custom filters for report generation. + +3. **Integration with External Resources**: Links to external databases for each variant. + +4. **Comparison Reports**: Automated comparison between different analysis runs. + +5. **Exportable Visualizations**: Generation of charts and graphs for key statistics. \ No newline at end of file diff --git a/docs/tool_overview.md b/docs/tool_overview.md new file mode 100644 index 0000000..3e593cb --- /dev/null +++ b/docs/tool_overview.md @@ -0,0 +1,106 @@ +# Technical Overview: Pathogenic Variant Finder + +This document provides a high-level technical overview of the Pathogenic Variant Finder's architecture and components. For usage instructions and general information, refer to the [main README](../README.md) in the root directory. + +## Documentation Index + +1. [Implementation Details](implementation_details.md) - Core architecture, data flow, data structures +2. [Noodles Integration](noodles_integration.md) - Integration with bioinformatics libraries +3. [Parallel Processing](parallel_processing.md) - Performance optimization techniques +4. [Reporting Features](reporting_features.md) - Output formats and reporting capabilities +5. 
[1000 Genome Frequency Extraction](1000genome_frequency_extraction.md) - Population allele frequency integration + +## Technical Architecture + +### Core Components + +The Pathogenic Variant Finder follows a modular pipeline architecture with these key technical components: + +1. **Command Parsing** + - Uses Clap for ergonomic command-line argument parsing + - Supports genome build selection, input files, variant filtering, and report options + +2. **Reference Data Management** + - Downloads necessary reference databases via HTTP(S) + - Implements multi-threaded download with chunking for performance + - Handles HTTP range requests with graceful fallback + - Manages database versions based on genome builds + +3. **VCF Processing** + - Custom parsers for user VCF and ClinVar VCF input + - Memory-efficient line-by-line processing of large files + - Normalizes genomic coordinates and variant representations + - Handles both gzipped and uncompressed files + +4. **Variant Matching Engine** + - Efficient genomic variant matching with HashMap-based lookups + - Parallel processing with Rayon for multi-core utilization + - Classification of variants into pathogenic/benign/VUS categories + +5. **Frequency Extraction System** + - Random access to 1000 Genomes VCF using Noodles CSI indexing + - Population-specific allele frequency extraction + - Targeted queries to minimize memory usage + +6. 
**Report Generation** + - CSV, text statistics, and markdown report generation + - Chromosome-based sorting algorithms for organized output + - Category-based variant organization + +### Technology Stack + +| Component | Technologies Used | +|-----------|------------------| +| Core Language | Rust | +| Concurrency | Rayon, std::thread, atomic counters | +| File Formats | Noodles (VCF, CSI), flate2 (gzip) | +| Network | reqwest for HTTP requests | +| CLI | clap for argument parsing | +| Progress Tracking | indicatif for progress visualization | +| Output | csv crate, std::fs for file I/O | + +## Pipeline Flow + +```mermaid +flowchart LR + A[Command Parsing] --> B[Reference Data Management] + B --> C[VCF Processing] + C --> D[Variant Matching] + D --> E[Frequency Extraction] + E --> F[Report Generation] + + style A fill:#c68049,stroke:#333,stroke-width:2px + style B fill:#c68049,stroke:#333,stroke-width:2px + style C fill:#c68049,stroke:#333,stroke-width:2px + style D fill:#c68049,stroke:#333,stroke-width:2px + style E fill:#c68049,stroke:#333,stroke-width:2px + style F fill:#c68049,stroke:#333,stroke-width:2px +``` + +## Performance Considerations + +- **Memory Efficiency**: Optimized to handle large genomic datasets with minimal memory usage through targeted data structures and streaming processing. +- **CPU Utilization**: Implements parallel processing at multiple stages to efficiently use available cores. +- **I/O Optimization**: Minimizes disk access through targeted indexed lookups and buffered reading. + +For detailed performance optimization techniques, see [Parallel Processing](parallel_processing.md). 
+ +## Error Handling Strategy + +The application implements robust error handling with: +- Custom error types for different failure modes +- Proper propagation of errors with the `?` operator +- Detailed logging of errors and processing steps +- Graceful fallbacks for non-critical failures + +## Future Technical Directions + +Priority areas for technical enhancement: + +1. **Indexed Input Processing**: Support for indexed access to input VCF files +2. **Cache Management System**: Reference data versioning and automated updates +3. **Configuration Framework**: External configuration files for customization +4. **Testing Infrastructure**: Comprehensive unit and integration tests +5. **Performance Profiling**: Identification of additional optimization opportunities + +For implementation details of specific components, refer to the specialized documentation files listed in the Documentation Index. \ No newline at end of file diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..6ddbbf8 --- /dev/null +++ b/install.sh @@ -0,0 +1,73 @@ +#!/bin/bash +set -e + +# Print banner +echo "=====================================" +echo "Pathogenic Variant Finder Installer" +echo "=====================================" +echo + +# Check if Rust is installed +if ! command -v cargo &> /dev/null; then + echo "Error: Rust and Cargo are required but not installed." + echo "Please install Rust from https://rustup.rs/ and try again." + exit 1 +fi + +echo "Building release version..." +cargo build --release + +BINARY_PATH="$(pwd)/target/release/pathogenic_variant_finder" +echo "Binary built at: $BINARY_PATH" + +# Ask user for installation preference +echo +echo "Installation options:" +echo "1) Create symlink in /usr/local/bin (requires sudo, recommended)" +echo "2) Create symlink in ~/.local/bin (no sudo required)" +echo "3) Skip installation (just build)" +echo +read -p "Select an option [1-3]: " choice + +case $choice in + 1) + if [ ! 
-d "/usr/local/bin" ]; then + sudo mkdir -p /usr/local/bin + fi + echo "Creating symlink in /usr/local/bin..." + sudo ln -sf "$BINARY_PATH" /usr/local/bin/pathogenic + echo "Symlink created: /usr/local/bin/pathogenic" + echo "You can now run the tool using the 'pathogenic' command." + ;; + 2) + if [ ! -d "$HOME/.local/bin" ]; then + mkdir -p "$HOME/.local/bin" + fi + echo "Creating symlink in ~/.local/bin..." + ln -sf "$BINARY_PATH" "$HOME/.local/bin/pathogenic" + echo "Symlink created: $HOME/.local/bin/pathogenic" + + # Check if ~/.local/bin is in PATH + if [[ ":$PATH:" != *":$HOME/.local/bin:"* ]]; then + echo "Note: $HOME/.local/bin is not in your PATH." + echo "Add the following line to your ~/.bashrc or ~/.zshrc:" + echo " export PATH=\"\$HOME/.local/bin:\$PATH\"" + echo "Then restart your terminal or run 'source ~/.bashrc' (or ~/.zshrc)" + else + echo "You can now run the tool using the 'pathogenic' command." + fi + ;; + 3) + echo "Installation skipped. To run the tool, use:" + echo " $BINARY_PATH [arguments]" + ;; + *) + echo "Invalid option. Installation skipped." + echo "You can manually run the tool with:" + echo " $BINARY_PATH [arguments]" + ;; +esac + +echo +echo "Installation complete!" +echo "Run 'pathogenic --help' for usage information." 
\ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 09e7e4f..853cb68 100644 --- a/src/main.rs +++ b/src/main.rs @@ -31,6 +31,18 @@ struct Args { /// Path to the input VCF (can be uncompressed or gzipped) #[arg(short, long, value_name = "FILE")] input: PathBuf, + + /// Include variants of uncertain significance (VUS) + #[arg(short = 'v', long, visible_alias = "vus")] + include_vus: bool, + + /// Include benign variants + #[arg(short = 'n', long, visible_alias = "benign")] + include_benign: bool, + + /// Generate markdown report (enabled by default, use --markdown-report=false to disable) + #[arg(long, visible_alias = "md-report", default_value_t = true, action = clap::ArgAction::Set)] + markdown_report: bool, } /// Custom error type for downloads @@ -248,13 +260,14 @@ struct ClinVarRecord { af_esp: Option, af_exac: Option, af_tgp: Option, + clnsig_category: String, // "pathogenic", "benign", "vus", or "conflicting" } /// Container for ClinVar variants keyed by (chr, pos, ref, alt) type ClinVarMap = HashMap<(String, u32, String, String), ClinVarRecord>; /// Parse a single line from the ClinVar VCF -fn parse_clinvar_line(line: &str) -> Option> { +fn parse_clinvar_line(line: &str, include_vus: bool, include_benign: bool) -> Option> { if line.starts_with('#') || line.trim().is_empty() { return None; } @@ -288,20 +301,35 @@ fn parse_clinvar_line(line: &str) -> Option> { } let clnsig_str = clnsig_opt.unwrap(); - // Only include variants where REF->ALT is pathogenic or likely pathogenic - if !clnsig_str.contains("Pathogenic") && !clnsig_str.contains("Likely_pathogenic") { - return None; - } - - // Exclude ambiguous or conflicting classifications - if clnsig_str.contains("Conflicting_interpretations_of_pathogenicity") - || clnsig_str.contains("Uncertain_significance") + // Filter variants based on clinical significance and command-line flags + let contains_pathogenic = clnsig_str.contains("Pathogenic") || clnsig_str.contains("Likely_pathogenic"); + let 
contains_benign = clnsig_str.contains("Benign") || clnsig_str.contains("Likely_benign"); + let contains_vus = clnsig_str.contains("Uncertain_significance"); + let contains_conflicting = clnsig_str.contains("Conflicting_interpretations_of_pathogenicity"); + + // Determine the variant classification category + let clnsig_category = if contains_pathogenic && !contains_benign { + "pathogenic" + } else if contains_benign && !contains_pathogenic { + "benign" + } else if contains_vus { + "vus" + } else if contains_conflicting { + "conflicting" + } else { + "other" + }; + + // Skip variants that don't match our inclusion criteria + if !((contains_pathogenic) || + (include_vus && (contains_vus || contains_conflicting)) || + (include_benign && contains_benign)) { return None; } - // If CLNSIG suggests benign, alt is not pathogenic - let is_alt_pathogenic = !(clnsig_str.contains("Benign") || clnsig_str.contains("Protective")); + // Determine if the variant is pathogenic for filtering in the match phase + let is_alt_pathogenic = contains_pathogenic && !(contains_benign); let gene_opt = info_map .get("GENEINFO") @@ -329,6 +357,7 @@ fn parse_clinvar_line(line: &str) -> Option> { af_esp, af_exac, af_tgp, + clnsig_category: clnsig_category.to_string(), }); } Some(recs) @@ -337,34 +366,46 @@ fn parse_clinvar_line(line: &str) -> Option> { /// Parse the ClinVar VCF in parallel fn parse_clinvar_vcf_gz( path_gz: &Path, + include_vus: bool, + include_benign: bool, log_file: &mut File, ) -> Result<(ClinVarMap, String), Box> { println!("[STEP] Parsing ClinVar VCF: {}", path_gz.display()); writeln!(log_file, "[STEP] Parsing ClinVar VCF: {}", path_gz.display())?; - let file = File::open(path_gz)?; + // Extract file date from header if present + let mut file_date = String::new(); + let file = File::open(&path_gz)?; let decoder = MultiGzDecoder::new(file); let reader = BufReader::new(decoder); - let lines: Vec = reader.lines().collect::>()?; - let total = lines.len() as u64; + let mut lines = 
Vec::new(); - println!(" -> Streaming ClinVar VCF, total lines: {}", total); - writeln!( - log_file, - " -> Streaming ClinVar VCF, total lines: {}", - total - )?; + for line_result in reader.lines() { + let line = line_result?; - let mut file_date = String::new(); - for line in &lines { - if line.starts_with("##fileDate") { - file_date = line.trim_start_matches("##fileDate=").to_string(); - break; + if line.starts_with("##fileDate=") { + file_date = line.trim().replace("##fileDate=", ""); + } else if line.starts_with('#') { + // Skip other header lines + continue; + } else { + // Non-header line, add to batch for parallel processing + lines.push(line); } } - let pb = ProgressBar::new(total); + println!(" -> Found {} non-header lines.", lines.len()); + writeln!(log_file, " -> Found {} non-header lines.", lines.len())?; + + println!(" -> Parsing in parallel with {} threads...", num_cpus::get()); + writeln!( + log_file, + " -> Parsing in parallel with {} threads...", + num_cpus::get() + )?; + + let pb = ProgressBar::new(lines.len() as u64); pb.set_style( ProgressStyle::default_bar() .template("{spinner:.green} [{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} ({eta})") @@ -376,7 +417,7 @@ fn parse_clinvar_vcf_gz( .par_iter() .map(|line| { pb.inc(1); - match parse_clinvar_line(line) { + match parse_clinvar_line(line, include_vus, include_benign) { None => HashMap::new(), Some(records) => { let mut local_map = HashMap::with_capacity(records.len()); @@ -390,8 +431,12 @@ fn parse_clinvar_vcf_gz( }) .collect(); - pb.finish_with_message("ClinVar parse complete."); + pb.finish_with_message("Parsing complete."); + + println!(" -> Merging {} map chunks...", chunk_maps.len()); + writeln!(log_file, " -> Merging {} map chunks...", chunk_maps.len())?; + // Combine maps into a single map let mut final_map = HashMap::new(); for cm in chunk_maps { final_map.extend(cm); @@ -781,6 +826,22 @@ struct FinalRecord { af_eas: Option, af_eur: Option, af_sas: Option, + clnsig_category: 
String, +} + +/// Get the numeric order for sorting chromosomes +fn get_chromosome_order(chr: &str) -> usize { + if chr == "X" { + return 23; + } else if chr == "Y" { + return 24; + } else if chr == "MT" { + return 25; + } else if let Ok(num) = chr.parse::() { + return num; + } else { + return 100; // Any other chromosome types will be sorted last + } } fn main() -> Result<(), Box> { @@ -799,11 +860,17 @@ fn main() -> Result<(), Box> { let args = Args::parse(); let build = args.build.to_uppercase(); let input_path = args.input.clone(); + let include_vus = args.include_vus; + let include_benign = args.include_benign; println!("[LOG] Genome Build: {}", build); writeln!(log_file, "[LOG] Genome Build: {}", build)?; println!("[LOG] Input File: {}", input_path.display()); writeln!(log_file, "[LOG] Input File: {}", input_path.display())?; + println!("[LOG] Include VUS: {}", include_vus); + writeln!(log_file, "[LOG] Include VUS: {}", include_vus)?; + println!("[LOG] Include Benign: {}", include_benign); + writeln!(log_file, "[LOG] Include Benign: {}", include_benign)?; println!("[STEP] Checking arguments..."); writeln!(log_file, "[STEP] Checking arguments...")?; @@ -814,6 +881,12 @@ fn main() -> Result<(), Box> { return Err(format!("Input VCF not found: {}", input_path.display()).into()); } + // Create reports directory if it doesn't exist + let reports_dir = Path::new("reports"); + fs::create_dir_all(reports_dir)?; + println!("[STEP] Ensuring reports directory exists: {}", reports_dir.display()); + writeln!(log_file, "[STEP] Ensuring reports directory exists: {}", reports_dir.display())?; + // 1) CLINVAR VCF / TBI let (clinvar_url, clinvar_tbi_url) = match build.as_str() { "GRCH37" => ( @@ -923,11 +996,9 @@ fn main() -> Result<(), Box> { )?; } - // Parse the ClinVar VCF - let (clinvar_map, clinvar_file_date) = parse_clinvar_vcf_gz(&clinvar_vcf_path, &mut log_file)?; - println!("[LOG] ClinVar date: {}", clinvar_file_date); - writeln!(log_file, "[LOG] ClinVar date: {}", 
clinvar_file_date)?; - + // Parse ClinVar VCF for pathogenic variants + let (clinvar_map, _file_date) = parse_clinvar_vcf_gz(&clinvar_vcf_path, include_vus, include_benign, &mut log_file)?; + // Parse the ClinVar TSV for deeper annotation let tsv_map = parse_clinvar_tsv(&tsv_path, &build, &mut log_file)?; @@ -940,9 +1011,7 @@ fn main() -> Result<(), Box> { "[STEP] Matching user variants with ClinVar and 1000G..." )?; - // We'll store partial results first (from user input matched to ClinVar), - // then combine with 1000G data, and finally combine with TSV info. - #[derive(Debug)] + // Structure to hold temporary records during matching process struct TempRecord { chr: String, pos: u32, @@ -958,8 +1027,13 @@ fn main() -> Result<(), Box> { af_exac: Option, af_tgp: Option, clndn: Option, + clnsig_category: String, } + // Match user variants with ClinVar + println!("[STEP] Matching user variants to ClinVar..."); + writeln!(log_file, "[STEP] Matching user variants to ClinVar...")?; + let pb = ProgressBar::new(input_variants.len() as u64); pb.set_style( ProgressStyle::default_bar() @@ -978,9 +1052,8 @@ fn main() -> Result<(), Box> { } let key = (iv.chr.clone(), iv.pos, iv.ref_allele.clone(), alt_a.clone()); if let Some(cv) = clinvar_map.get(&key) { - if !cv.is_alt_pathogenic { - continue; - } + // Include all variants that match our criteria (controlled by flags when parsing) + // rather than checking is_alt_pathogenic here, as we did before let genotype = iv.genotype.clone(); let review_stars = review_status_to_stars(cv.clnrevstat.as_deref()); local_found.push(TempRecord { @@ -998,6 +1071,7 @@ fn main() -> Result<(), Box> { af_exac: cv.af_exac, af_tgp: cv.af_tgp, clndn: cv.clndn.clone(), + clnsig_category: cv.clnsig_category.clone(), }); } } @@ -1009,6 +1083,12 @@ fn main() -> Result<(), Box> { pb.finish_with_message("ClinVar matching complete."); println!(" -> Matched {} variants in ClinVar", temp_results.len()); + writeln!( + log_file, + " -> Matched {} variants in 
ClinVar", + temp_results.len() + )?; + // Collect keys of interest from temp_results for 1000 Genomes frequency lookup let keys_of_interest: HashSet<(String, u32, String, String)> = temp_results .iter() @@ -1033,78 +1113,172 @@ fn main() -> Result<(), Box> { )?; } - let file = File::open(&onekg_file_path)?; - let index_file = File::open(&onekg_index_path)?; + // Improved error handling + println!(" -> Opening 1000 Genomes file: {}", onekg_file_path.display()); + let file = match File::open(&onekg_file_path) { + Ok(f) => f, + Err(e) => { + let error_msg = format!("Failed to open 1000 Genomes file: {}", e); + println!("ERROR: {}", error_msg); + writeln!(log_file, "ERROR: {}", error_msg)?; + return Err(error_msg.into()); + } + }; + + println!(" -> Opening 1000 Genomes CSI index: {}", onekg_index_path.display()); + let index_file = match File::open(&onekg_index_path) { + Ok(f) => f, + Err(e) => { + let error_msg = format!("Failed to open 1000 Genomes CSI index: {}", e); + println!("ERROR: {}", error_msg); + writeln!(log_file, "ERROR: {}", error_msg)?; + return Err(error_msg.into()); + } + }; + + println!(" -> Reading CSI index..."); let mut csi_reader = csi::io::Reader::new(index_file); - let index = csi_reader.read_index()?; + let index = match csi_reader.read_index() { + Ok(idx) => idx, + Err(e) => { + let error_msg = format!("Failed to read CSI index: {}", e); + println!("ERROR: {}", error_msg); + writeln!(log_file, "ERROR: {}", error_msg)?; + return Err(error_msg.into()); + } + }; + + println!(" -> Creating indexed reader..."); let mut reader = IndexedReader::new(file, index); - let header = reader.read_header()?; + println!(" -> Reading VCF header..."); + let header = match reader.read_header() { + Ok(h) => h, + Err(e) => { + let error_msg = format!("Failed to read VCF header: {}", e); + println!("ERROR: {}", error_msg); + writeln!(log_file, "ERROR: {}", error_msg)?; + return Err(error_msg.into()); + } + }; + + println!(" -> Setting up frequency mapping..."); let 
mut onekg_freqs: HashMap<(String, u32, String, String), (Option, Option, Option, Option, Option)> = HashMap::with_capacity(keys_of_interest.len()); let mut unique_regions: HashSet<(String, u32)> = HashSet::new(); for (chr, pos, _ref, _alt) in keys_of_interest.iter() { unique_regions.insert((chr.clone(), *pos)); } + println!(" -> Will query {} unique genomic positions", unique_regions.len()); + writeln!(log_file, " -> Will query {} unique genomic positions", unique_regions.len())?; + + // Store the size before moving unique_regions + let unique_regions_count = unique_regions.len(); + + // Progress bar for region querying + let pb = ProgressBar::new(unique_regions_count as u64); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} ({eta})") + .unwrap() + .progress_chars("=>-"), + ); + + // Add success/error counters + let mut successful_queries = 0; + let mut failed_queries = 0; + for (chr, pos) in unique_regions { + pb.inc(1); let region_str = format!("{}:{}-{}", chr, pos, pos); - let region_obj: Region = region_str.parse()?; - if let Ok(mut query) = reader.query(&header, ®ion_obj) { - while let Some(record) = query.next() { - let record = record?; - let record_chr = record.reference_sequence_name().to_string(); - let record_pos = record - .variant_start() - .expect("Failed to get variant start")?; - let record_ref = record.reference_bases().to_string(); - let record_alts: Vec = record - .alternate_bases() - .iter() - .filter_map(Result::ok) - .map(|a| a.to_string()) - .collect(); - let info = record.info(); - for (alt_idx, alt) in record_alts.iter().enumerate() { - let key = (record_chr.clone(), record_pos.get() as u32, record_ref.clone(), alt.clone()); - if keys_of_interest.contains(&key) { - // Helper function to extract the frequency for a specific alternate allele from the INFO field - let get_freq = |key: &str, alt_idx: usize| -> Option { - match info.get(&header, key) { - 
Some(Ok(Some(Value::Array(array)))) => { - match array { - noodles::vcf::variant::record::info::field::value::Array::Float(values) => { - let mut iter = values.iter(); - if let Some(Ok(Some(f))) = iter.nth(alt_idx) { - Some(f as f64) - } else { - None - } - }, - _ => None, // Handle other Array variants (e.g., Integer, String) - } - }, - _ => None, + let region_obj = match region_str.parse::() { + Ok(r) => r, + Err(e) => { + // Just log and continue to the next region + writeln!(log_file, " -> Failed to parse region {}: {}", region_str, e)?; + failed_queries += 1; + continue; + } + }; + + match reader.query(&header, ®ion_obj) { + Ok(mut query) => { + successful_queries += 1; + while let Some(record_result) = query.next() { + match record_result { + Ok(record) => { + // Process the record as before + let record_chr = record.reference_sequence_name().to_string(); + let record_pos = match record.variant_start() { + Some(Ok(pos)) => pos, + _ => continue, // Skip if we can't get position + }; + let record_ref = record.reference_bases().to_string(); + + // Get alternative alleles + let record_alts: Vec = record.alternate_bases() + .iter() + .filter_map(Result::ok) + .map(|alt| alt.to_string()) + .collect(); + + let info = record.info(); + + for (alt_idx, alt) in record_alts.iter().enumerate() { + let key = (record_chr.clone(), record_pos.get() as u32, record_ref.clone(), alt.clone()); + if keys_of_interest.contains(&key) { + // Helper function to extract the frequency for a specific alternate allele from the INFO field + let get_freq = |key: &str, alt_idx: usize| -> Option { + match info.get(&header, key) { + Some(Ok(Some(Value::Array(array)))) => { + match array { + noodles::vcf::variant::record::info::field::value::Array::Float(values) => { + let mut iter = values.iter(); + if let Some(Ok(Some(f))) = iter.nth(alt_idx) { + // Convert f32 to f64 without dereferencing + Some(f64::from(f)) + } else { + None + } + }, + _ => None, // Handle other Array variants (e.g., 
Integer, String) + } + }, + _ => None, + } + }; + let afr = get_freq("AFR", alt_idx); + let amr = get_freq("AMR", alt_idx); + let eas = get_freq("EAS", alt_idx); + let eur = get_freq("EUR", alt_idx); + let sas = get_freq("SAS", alt_idx); + onekg_freqs.insert(key, (afr, amr, eas, eur, sas)); + } } - }; - let afr = get_freq("AFR", alt_idx); - let amr = get_freq("AMR", alt_idx); - let eas = get_freq("EAS", alt_idx); - let eur = get_freq("EUR", alt_idx); - let sas = get_freq("SAS", alt_idx); - onekg_freqs.insert(key, (afr, amr, eas, eur, sas)); + }, + Err(e) => { + // Just log error and continue + writeln!(log_file, " -> Error processing record for region {}: {}", region_str, e)?; + } } } + }, + Err(e) => { + // Log the error and continue + writeln!(log_file, " -> Failed to query region {}: {}", region_str, e)?; + failed_queries += 1; } } } + + pb.finish_with_message("1000 Genomes querying complete"); + + println!(" -> Successfully queried {}/{} regions", successful_queries, unique_regions_count); + println!(" -> Failed to query {} regions", failed_queries); + writeln!(log_file, " -> Successfully queried {}/{} regions", successful_queries, unique_regions_count)?; + writeln!(log_file, " -> Failed to query {} regions", failed_queries)?; println!(" -> Extracted frequencies for {} variants", onekg_freqs.len()); writeln!(log_file, " -> Extracted frequencies for {} variants", onekg_freqs.len())?; - - writeln!( - log_file, - " -> Matched {} variants in ClinVar", - temp_results.len() - )?; // Next, we combine with 1000G frequency data let mut final_records: Vec = Vec::new(); @@ -1145,24 +1319,62 @@ fn main() -> Result<(), Box> { af_eas, af_eur, af_sas, + clnsig_category: r.clnsig_category.clone(), }; final_records.push(final_rec); } // Sort final records by chromosome, then by position final_records.sort_by(|a, b| { - let c = a.chr.cmp(&b.chr); - if c == std::cmp::Ordering::Equal { - a.pos.cmp(&b.pos) - } else { - c + let a_order = get_chromosome_order(&a.chr); + let b_order = 
get_chromosome_order(&b.chr); + + match a_order.cmp(&b_order) { + std::cmp::Ordering::Equal => a.pos.cmp(&b.pos), + other => other } }); - // Output CSV - println!("[STEP] Writing final CSV to stdout..."); - writeln!(log_file, "[STEP] Writing final CSV to stdout...")?; - let mut wtr = csv::Writer::from_writer(std::io::stdout()); + // Get input filename without extension for output naming + let input_filename = input_path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("input") + .to_string() + .replace(".vcf", "") // Remove any additional .vcf if present in the stem + .replace(".gz", ""); // Remove any additional .gz if present in the stem + + // Determine the analysis type for the filename + let analysis_type = match (include_vus, include_benign) { + (false, false) => "pathogenic_only", + (true, false) => "pathogenic_vus", + (false, true) => "pathogenic_benign", + (true, true) => "pathogenic_vus_benign", + }; + + // Create the output filename using the input file name, analysis type, and timestamp + let timestamp = now.format("%Y%m%d_%H%M%S"); + let output_filename = format!( + "{}_{}_{}.csv", + input_filename, + analysis_type, + timestamp + ); + + let output_path = reports_dir.join(output_filename); + + // Create a statistics file with the same base name + let stats_filename = format!( + "{}_{}_{}_stats.txt", + input_filename, + analysis_type, + timestamp + ); + let stats_path = reports_dir.join(stats_filename); + + println!("[STEP] Writing final CSV to: {}", output_path.display()); + writeln!(log_file, "[STEP] Writing final CSV to: {}", output_path.display())?; + let mut wtr = csv::Writer::from_path(&output_path)?; wtr.write_record(&[ "Chromosome", @@ -1171,6 +1383,7 @@ fn main() -> Result<(), Box> { "Alternate Allele", "Clinical Significance", "Is Alt Pathogenic", + "Significance Category", "Gene", "ClinVar Allele ID", "CLNDN", @@ -1204,6 +1417,7 @@ fn main() -> Result<(), Box> { &rec.alt_allele, &rec.clnsig, &rec.is_alt_pathogenic.to_string(), + 
&rec.clnsig_category, rec.gene.as_deref().unwrap_or(""), &rec.allele_id.map(|id| id.to_string()).unwrap_or_default(), rec.clndn.as_deref().unwrap_or(""), @@ -1232,14 +1446,809 @@ fn main() -> Result<(), Box> { wtr.flush()?; println!( - "Done. Wrote {} variants to CSV with ClinVar VCF data, ClinVar TSV data, and 1000G allele frequencies.", - final_records.len() + "Done. Wrote {} variants to {}", + final_records.len(), + output_path.display() + ); + writeln!( + log_file, + "Done. Wrote {} variants to {}", + final_records.len(), + output_path.display() + )?; + + // Generate and write statistics file + println!("[STEP] Writing statistics file to: {}", stats_path.display()); + writeln!(log_file, "[STEP] Writing statistics file to: {}", stats_path.display())?; + + // Collect statistics + let mut unique_genes = HashSet::new(); + let mut category_counts = HashMap::new(); + let mut af_ranges = HashMap::new(); + + for rec in &final_records { + // Count unique genes + if let Some(gene) = &rec.gene { + unique_genes.insert(gene.clone()); + } + + // Count by classification category + *category_counts.entry(rec.clnsig_category.clone()).or_insert(0) += 1; + + // Group by allele frequency range (for EAS, EUR, AFR, AMR, SAS) + let add_af_range = |af: Option, population: &str, ranges: &mut HashMap| { + if let Some(freq) = af { + let range = if freq < 0.001 { + "< 0.1%" + } else if freq < 0.01 { + "0.1% - 1%" + } else if freq < 0.05 { + "1% - 5%" + } else if freq < 0.10 { + "5% - 10%" + } else { + "> 10%" + }; + + let key = format!("{}_{}", population, range); + *ranges.entry(key).or_insert(0) += 1; + } + }; + + add_af_range(rec.af_afr, "AFR", &mut af_ranges); + add_af_range(rec.af_amr, "AMR", &mut af_ranges); + add_af_range(rec.af_eas, "EAS", &mut af_ranges); + add_af_range(rec.af_eur, "EUR", &mut af_ranges); + add_af_range(rec.af_sas, "SAS", &mut af_ranges); + } + + // Write statistics file + let stats_path_clone = stats_path.clone(); + let mut stats_file = 
File::create(&stats_path)?; + + // Collect the command run for inclusion in the report and statistics + let args_vec: Vec = std::env::args().collect(); + + // Create a cleaned command for display that uses "pathogenic" as the command name + let mut clean_args = Vec::new(); + + // Skip the binary path (first argument) and replace with "pathogenic" + clean_args.push("pathogenic".to_string()); + + // Add all command line arguments + if args_vec.len() > 1 { + for arg in &args_vec[1..] { + clean_args.push(arg.clone()); + } + } + + let command_run = clean_args.join(" "); + + writeln!(stats_file, "=== Pathogenic Variant Finder: Analysis Report ===")?; + writeln!(stats_file, "Date/Time: {}", now.to_rfc3339())?; + writeln!(stats_file, "\n=== Analysis Settings ===")?; + writeln!(stats_file, "Input File: {}", input_path.display())?; + writeln!(stats_file, "Genome Build: {}", build)?; + writeln!(stats_file, "Include VUS: {}", include_vus)?; + writeln!(stats_file, "Include Benign: {}", include_benign)?; + writeln!(stats_file, "Command Used: {}", command_run)?; + + writeln!(stats_file, "\n=== Analysis Results ===")?; + writeln!(stats_file, "Total Variants Processed: {}", input_variants.len())?; + writeln!(stats_file, "Total Variants Reported: {}", final_records.len())?; + writeln!(stats_file, "Unique Genes: {}", unique_genes.len())?; + + writeln!(stats_file, "\n=== Variant Classifications ===")?; + for (category, count) in category_counts.iter() { + writeln!(stats_file, "{}: {}", category, count)?; + } + + writeln!(stats_file, "\n=== Allele Frequency Distribution ===")?; + for (range, count) in af_ranges.iter() { + writeln!(stats_file, "{}: {}", range, count)?; + } + + writeln!(stats_file, "\n=== Population Coverage ===")?; + let mut pop_coverage = HashMap::new(); + // Count variants with any frequency data for each population + for rec in &final_records { + if rec.af_afr.is_some() { *pop_coverage.entry("AFR").or_insert(0) += 1; } + if rec.af_amr.is_some() { 
*pop_coverage.entry("AMR").or_insert(0) += 1; } + if rec.af_eas.is_some() { *pop_coverage.entry("EAS").or_insert(0) += 1; } + if rec.af_eur.is_some() { *pop_coverage.entry("EUR").or_insert(0) += 1; } + if rec.af_sas.is_some() { *pop_coverage.entry("SAS").or_insert(0) += 1; } + } + + for (pop, count) in pop_coverage.iter() { + let percentage = if final_records.is_empty() { + 0.0 + } else { + (*count as f64 / final_records.len() as f64) * 100.0 + }; + writeln!(stats_file, "{}: {} variants ({:.1}%)", pop, count, percentage)?; + } + + println!( + "Done. Wrote statistics to {}", + stats_path_clone.display() ); writeln!( log_file, - "Done. Wrote {} variants to CSV with ClinVar VCF data, ClinVar TSV data, and 1000G allele frequencies.", - final_records.len() + "Done. Wrote statistics to {}", + stats_path_clone.display() + )?; + + // Generate markdown report if enabled + if args.markdown_report { + // Create the markdown filename + let markdown_filename = format!( + "{}_{}_{}_report.md", + input_filename, + analysis_type, + timestamp + ); + let markdown_path = reports_dir.join(markdown_filename); + + // Generate the markdown report + println!("[STEP] Generating markdown report: {}", markdown_path.display()); + writeln!(log_file, "[STEP] Generating markdown report: {}", markdown_path.display())?; + + generate_markdown_report( + &final_records, + &markdown_path, + &args, + &category_counts, + &unique_genes, + &input_path, + &build, + input_variants.len(), + ×tamp.to_string(), + &command_run, + &mut log_file, + )?; + + println!( + "Done. Wrote markdown report to {}", + markdown_path.display() + ); + writeln!( + log_file, + "Done. 
Wrote markdown report to {}", + markdown_path.display() + )?; + } + + Ok(()) +} + +/// Generate markdown report of found variants with dynamic sections based on analysis settings +fn generate_markdown_report( + final_records: &[FinalRecord], + out_path: &Path, + args: &Args, + category_counts: &HashMap, + unique_genes: &HashSet, + input_path: &Path, + build: &str, + total_variants: usize, + _timestamp: &str, + command_run: &str, + log_file: &mut File, +) -> Result<(), Box> { + println!("[STEP] Generating markdown report: {}", out_path.display()); + writeln!(log_file, "[STEP] Generating markdown report: {}", out_path.display())?; + + let mut md_file = File::create(out_path)?; + + // Current date/time for report + let now: DateTime = Utc::now(); + let datetime = now.format("%B %d, %Y @ %H:%M:%S").to_string(); + + // Group variants by classification and gene for better organization + let mut gene_pathogenic: HashMap> = HashMap::new(); // Strictly Pathogenic + let mut gene_likely_pathogenic: HashMap> = HashMap::new(); // Likely Pathogenic + let mut gene_vus: HashMap> = HashMap::new(); // Uncertain Significance + let mut gene_conflicting: HashMap> = HashMap::new(); // Conflicting Interpretations + let mut gene_benign: HashMap> = HashMap::new(); // Strictly Benign + let mut gene_likely_benign: HashMap> = HashMap::new(); // Likely Benign + + // Track which variant types are actually present in the results + let mut has_pathogenic = false; + let mut has_likely_pathogenic = false; + let mut has_vus = false; + let mut has_conflicting = false; + let mut has_benign = false; + let mut has_likely_benign = false; + + // Organize variants by type and gene + for record in final_records { + let gene_name = record.gene.clone().unwrap_or_else(|| "Unknown".to_string()); + let clnsig = &record.clnsig; + + if record.clnsig_category == "pathogenic" { + // Split pathogenic variants based on their specific clnsig value + if clnsig.contains("Likely_pathogenic") && 
!clnsig.contains("Pathogenic") { + has_likely_pathogenic = true; + gene_likely_pathogenic.entry(gene_name).or_default().push(record); + } else { + // Either strictly Pathogenic or Pathogenic/Likely_pathogenic + has_pathogenic = true; + gene_pathogenic.entry(gene_name).or_default().push(record); + } + } else if record.clnsig_category == "vus" && args.include_vus { + has_vus = true; + gene_vus.entry(gene_name).or_default().push(record); + } else if record.clnsig_category == "benign" && args.include_benign { + // Split benign variants based on their specific clnsig value + if clnsig.contains("Likely_benign") && !clnsig.contains("Benign") { + has_likely_benign = true; + gene_likely_benign.entry(gene_name).or_default().push(record); + } else { + // Either strictly Benign or Benign/Likely_benign + has_benign = true; + gene_benign.entry(gene_name).or_default().push(record); + } + } else if record.clnsig_category == "conflicting" && args.include_vus { + has_conflicting = true; + gene_conflicting.entry(gene_name).or_default().push(record); + } + } + + // Create a list of sections for Table of Contents and report generation + struct Section<'a> { + id: String, + title: String, + present: bool, + variants: HashMap>, + } + + let mut sections = vec![ + Section { + id: "disclaimer".to_string(), + title: "Disclaimer".to_string(), + present: true, // Always include disclaimer + variants: HashMap::new(), + }, + Section { + id: "summary".to_string(), + title: "Summary".to_string(), + present: true, // Always include summary + variants: HashMap::new(), + } + ]; + + // Add pathogenic sections if present + if has_pathogenic { + sections.push(Section { + id: "pathogenic-variants".to_string(), + title: "Pathogenic Variants".to_string(), + present: true, + variants: gene_pathogenic, + }); + } + + if has_likely_pathogenic { + sections.push(Section { + id: "likely-pathogenic-variants".to_string(), + title: "Likely Pathogenic Variants".to_string(), + present: true, + variants: 
gene_likely_pathogenic, + }); + } + + // Dynamically add sections based on what's included and present + if has_vus && args.include_vus { + sections.push(Section { + id: "uncertain-significance-variants".to_string(), + title: "Variants of Uncertain Significance".to_string(), + present: true, + variants: gene_vus, + }); + } + + if has_conflicting && args.include_vus { + sections.push(Section { + id: "conflicting-variants".to_string(), + title: "Variants with Conflicting Interpretations".to_string(), + present: true, + variants: gene_conflicting, + }); + } + + // Add benign sections if present and included + if has_benign && args.include_benign { + sections.push(Section { + id: "benign-variants".to_string(), + title: "Benign Variants".to_string(), + present: true, + variants: gene_benign, + }); + } + + if has_likely_benign && args.include_benign { + sections.push(Section { + id: "likely-benign-variants".to_string(), + title: "Likely Benign Variants".to_string(), + present: true, + variants: gene_likely_benign, + }); + } + + // Always add understanding section at the end + sections.push(Section { + id: "understanding-this-report".to_string(), + title: "Understanding This Report".to_string(), + present: true, + variants: HashMap::new(), + }); + + // Write the header + writeln!(md_file, "# Genetic Variant Report")?; + writeln!(md_file)?; + writeln!(md_file, "**Date / Time Generated:** {}", datetime)?; + + // Write the variant types analyzed + let mut variant_types = Vec::new(); + if has_pathogenic { + variant_types.push("Pathogenic"); + } + if has_likely_pathogenic { + variant_types.push("Likely Pathogenic"); + } + + if args.include_vus { + variant_types.push("Uncertain Significance"); + variant_types.push("Conflicting Interpretations"); + } + + if args.include_benign { + if has_benign { + variant_types.push("Benign"); + } + if has_likely_benign { + variant_types.push("Likely Benign"); + } + } + + writeln!(md_file)?; + writeln!(md_file, "**Variant Types Analyzed:** {}", 
variant_types.join(", "))?; + writeln!(md_file)?; + + // Write analysis settings + writeln!(md_file, "### Analysis Settings:")?; + writeln!(md_file, "- **Input File:** `{}`", input_path.display())?; + writeln!(md_file, "- **Genome Build:** `{}`", build)?; + writeln!(md_file, "- **Include VUS:** `{}`", args.include_vus)?; + writeln!(md_file, "- **Include Benign:** `{}`", args.include_benign)?; + writeln!(md_file)?; + + // Write command run + writeln!(md_file, "**Command Used:**")?; + writeln!(md_file, "```bash")?; + writeln!(md_file, "{}", command_run)?; + writeln!(md_file, "```")?; + + // Generate dynamically correct table of contents + writeln!(md_file)?; + writeln!(md_file, "---")?; + writeln!(md_file)?; + writeln!(md_file, "## Table of Contents")?; + writeln!(md_file)?; + + // Only include sections that are present in the results + let present_sections: Vec<&Section> = sections.iter() + .filter(|s| s.present) + .collect(); + + for (i, section) in present_sections.iter().enumerate() { + writeln!(md_file, "{}. 
[{}](#{})", i + 1, section.title, section.id)?; + } + + writeln!(md_file)?; + writeln!(md_file, "---")?; + + // Generate each section + for section in sections.iter().filter(|s| s.present) { + if section.id == "disclaimer" { + write_disclaimer_section(&mut md_file)?; + } else if section.id == "summary" { + write_summary_section(&mut md_file, category_counts, args, final_records.len(), total_variants, unique_genes.len())?; + } else if section.id == "understanding-this-report" { + write_understanding_section(&mut md_file)?; + } else { + // This is a variant section - use appropriate handler + write_variant_section(&mut md_file, §ion.variants, §ion.id, §ion.title)?; + } + } + + println!(" -> Markdown report generated successfully."); + writeln!(log_file, " -> Markdown report generated successfully.")?; + + Ok(()) +} + +/// Write the disclaimer section +fn write_disclaimer_section(md_file: &mut File) -> Result<(), Box> { + writeln!(md_file, "")?; + writeln!(md_file, "# Disclaimer")?; + writeln!(md_file)?; + writeln!(md_file, "> **Important:** This report is provided for educational purposes only. Results generated or reported by the Pathogenic Variant Finder should not be used for medical diagnosis or to inform clinical decisions without proper validation by qualified healthcare professionals. The developers make no warranties, express or implied, regarding the accuracy, completeness, or reliability of the information provided by this tool. Users should be aware that variant classifications and pathogenicity assessments may change over time as new evidence emerges. 
Always consult with certified genetic counselors, clinical geneticists, or other appropriate healthcare providers for the interpretation of genetic variants in a medical context.")?; + writeln!(md_file)?; + writeln!(md_file, "---")?; + + Ok(()) +} + +/// Write the summary section +fn write_summary_section( + md_file: &mut File, + category_counts: &HashMap, + args: &Args, + variants_reported: usize, + variants_processed: usize, + unique_genes_count: usize, +) -> Result<(), Box> { + writeln!(md_file, "")?; + writeln!(md_file, "## Summary")?; + writeln!(md_file)?; + writeln!(md_file, "This report contains genetic variants found in your genome that match clinical databases. The variants are categorized by clinical significance:")?; + writeln!(md_file)?; + + // Additional summary stats + writeln!(md_file)?; + writeln!(md_file, "**Total Number of Genetic Variants Processed:** {}", variants_processed)?; + writeln!(md_file)?; + writeln!(md_file, "**Number of Unique Genes with Identified Variants:** {}", unique_genes_count)?; + writeln!(md_file)?; + + // Summary table + writeln!(md_file, "| Category | Count |")?; + writeln!(md_file, "|---------|-------|")?; + + let pathogenic_count = category_counts.get("pathogenic").unwrap_or(&0); + let likely_pathogenic_count = category_counts.get("likely_pathogenic").unwrap_or(&0); + let conflicting_count = category_counts.get("conflicting").unwrap_or(&0); + let vus_count = category_counts.get("vus").unwrap_or(&0); + let likely_benign_count = category_counts.get("likely_benign").unwrap_or(&0); + let benign_count = category_counts.get("benign").unwrap_or(&0); + + writeln!(md_file, "| Pathogenic | {} |", pathogenic_count)?; + writeln!(md_file, "| Likely Pathogenic | {} |", likely_pathogenic_count)?; + + if args.include_vus { + writeln!(md_file, "| Conflicting | {} |", conflicting_count)?; + writeln!(md_file, "| Uncertain Significance | {} |", vus_count)?; + } else { + writeln!(md_file, "| Conflicting | Not Analyzed |")?; + 
writeln!(md_file, "| Uncertain Significance | Not Analyzed |")?; + } + + if args.include_benign { + writeln!(md_file, "| Likely Benign | {} |", likely_benign_count)?; + writeln!(md_file, "| Benign | {} |", benign_count)?; + } else { + writeln!(md_file, "| Likely Benign | Not Analyzed |")?; + writeln!(md_file, "| Benign | Not Analyzed |")?; + } + + writeln!(md_file, "| **Total Variants Reported** | **{}** |", variants_reported)?; + writeln!(md_file)?; + writeln!(md_file, "---")?; + + Ok(()) +} + +/// Write the variant section (pathogenic, VUS, conflicting, or benign) +fn write_variant_section( + md_file: &mut File, + gene_variants: &HashMap>, + section_id: &str, + section_title: &str, +) -> Result<(), Box> { + writeln!(md_file, "", section_id)?; + writeln!(md_file, "## {}", section_title)?; + writeln!(md_file)?; + + if gene_variants.is_empty() { + writeln!(md_file, "No {} were found in this analysis.", section_title.to_lowercase())?; + writeln!(md_file)?; + writeln!(md_file, "---")?; + return Ok(()); + } + + // Summary table + writeln!(md_file, "### Summary Table")?; + writeln!(md_file)?; + writeln!(md_file, "| Gene | Variant Location | DNA Change | Condition | Genotype, Zygosity | Clinical Significance | African Population Frequency | American Population Frequency | East Asian Population Frequency | European Population Frequency | South Asian Population Frequency |")?; + writeln!(md_file, "|:-----|:-------|:----------|:----------|:---------|:---------|:---------------------|:---------------------|:---------------------|:---------------------|:---------------------|")?; + + // Track variant IDs for the details section + let mut variant_id = 1; + let mut summary_entries = Vec::new(); + + // Collect all variants from all genes into a single list + let mut all_variants: Vec<(&String, &FinalRecord)> = Vec::new(); + for (gene, variants) in gene_variants { + for variant in variants { + all_variants.push((gene, *variant)); + } + } + + // Sort all variants by chromosome and 
position first + all_variants.sort_by(|(_, a), (_, b)| { + let a_order = get_chromosome_order(&a.chr); + let b_order = get_chromosome_order(&b.chr); + + match a_order.cmp(&b_order) { + std::cmp::Ordering::Equal => a.pos.cmp(&b.pos), + other => other + } + }); + + // Process the sorted variants + for (gene, variant) in all_variants { + let chr = &variant.chr; + let pos = variant.pos; + let ref_allele = &variant.ref_allele; + let alt_allele = &variant.alt_allele; + + let dna_change = format!("{} -> {}", ref_allele, alt_allele); + + // Clean condition field by replacing pipe characters with commas and underscores with spaces + let condition = match &variant.clndn { + Some(c) => c.replace('|', ", ").replace('_', " "), + None => "Not specified".to_string(), + }; + + // Determine zygosity from genotype + let genotype = &variant.genotype; + let zygosity = if genotype == "1/1" { + "Homozygous" + } else if genotype == "0/1" || genotype == "1/0" { + "Heterozygous" + } else { + "Unknown" + }; + + let genotype_text = format!("{}, {}", genotype, zygosity); + + // Clean clinical significance - just use the classification type without extra data + let clean_clnsig = variant.clnsig.replace('_', " "); + + // Format frequencies as percentages with 2 decimal places + let format_freq = |f: Option| -> String { + match f { + Some(val) => format!("{:.2}%", val * 100.0), + None => "N/A".to_string(), + } + }; + + let afr_freq = format_freq(variant.af_afr); + let amr_freq = format_freq(variant.af_amr); + let eas_freq = format_freq(variant.af_eas); + let eur_freq = format_freq(variant.af_eur); + let sas_freq = format_freq(variant.af_sas); + + writeln!( + md_file, + "| [{}](#variant-{}) | Chr {}:{} | {} | {} | {} | {} | {} | {} | {} | {} | {} |", + gene, variant_id, chr, pos, dna_change, condition, genotype_text, + clean_clnsig, afr_freq, amr_freq, eas_freq, eur_freq, sas_freq + )?; + + summary_entries.push((gene.clone(), variant, variant_id)); + variant_id += 1; + } + + // Detailed 
descriptions + writeln!(md_file)?; + writeln!(md_file, "### Detailed Descriptions")?; + writeln!(md_file)?; + + for (gene, variant, id) in summary_entries { + write_variant_details(md_file, &gene, variant, id)?; + writeln!(md_file)?; + writeln!(md_file, "---")?; + writeln!(md_file)?; + } + + Ok(()) +} + +/// Write details for a single variant +fn write_variant_details( + md_file: &mut File, + gene: &str, + variant: &FinalRecord, + variant_id: usize, +) -> Result<(), Box> { + writeln!(md_file, "", variant_id)?; + writeln!(md_file, "### {}. {} ", variant_id, gene)?; + writeln!(md_file)?; + writeln!(md_file, "
")?; + writeln!(md_file, "Variant Details")?; + writeln!(md_file)?; + + // Clean condition field by replacing pipe characters with commas and underscores with spaces + let condition = match &variant.clndn { + Some(c) => c.replace('|', ", ").replace('_', " "), + None => "Not specified".to_string(), + }; + + writeln!(md_file, "**Associated Condition:** {}", condition)?; + writeln!(md_file)?; + + // Location + writeln!(md_file, "**Location:** Chromosome {}, Position {}", variant.chr, variant.pos)?; + writeln!(md_file)?; + + // DNA Change + writeln!( + md_file, + "**DNA Change:** Your DNA has `{}` where the reference genome has `{}`", + variant.alt_allele, variant.ref_allele + )?; + writeln!(md_file)?; + + // Genotype / Zygosity + let zygosity = if variant.genotype == "1/1" { + "Homozygous (both copies of the gene have this variant)" + } else if variant.genotype == "0/1" || variant.genotype == "1/0" { + "Heterozygous (one copy of the gene has this variant)" + } else { + "" + }; + + writeln!( + md_file, + "**Genotype / Zygosity:** {}, {}", + variant.genotype, zygosity )?; + writeln!(md_file)?; + + // Clinical significance - replace underscores with spaces + let clean_clnsig = variant.clnsig.replace('_', " "); + writeln!(md_file, "**Clinical Significance of Variant:** {}", clean_clnsig)?; + writeln!(md_file)?; + + // Molecular effects (if available) + if let Some(mol_effect) = &variant.molecular_consequence { + if !mol_effect.is_empty() { + writeln!(md_file, "**Molecular Effects:**")?; + writeln!(md_file, "- Type: {}", mol_effect.replace('_', " "))?; + writeln!(md_file)?; + } + } + + // Population frequencies + let has_any_freq = variant.af_afr.is_some() || variant.af_amr.is_some() || + variant.af_eas.is_some() || variant.af_eur.is_some() || + variant.af_sas.is_some(); + + if has_any_freq { + writeln!(md_file, "**Population Frequencies:** (How common this variant is in different populations)")?; + writeln!(md_file, "| Population | Frequency |")?; + writeln!(md_file, 
"|:-----------|:----------|")?; + + // Format frequency helper + let format_freq = |f: Option| -> String { + match f { + Some(val) => format!("{:.2}%", val * 100.0), + None => "Not available".to_string(), + } + }; + + if variant.af_afr.is_some() { + writeln!(md_file, "| African | {} |", format_freq(variant.af_afr))?; + } + if variant.af_amr.is_some() { + writeln!(md_file, "| American | {} |", format_freq(variant.af_amr))?; + } + if variant.af_eas.is_some() { + writeln!(md_file, "| East Asian | {} |", format_freq(variant.af_eas))?; + } + if variant.af_eur.is_some() { + writeln!(md_file, "| European | {} |", format_freq(variant.af_eur))?; + } + if variant.af_sas.is_some() { + writeln!(md_file, "| South Asian | {} |", format_freq(variant.af_sas))?; + } + writeln!(md_file)?; + } + + // Review status + let stars = variant.review_stars; + let review_status = if stars == 0 { + " - No assertion criteria provided" + } else if stars == 1 { + " ★ - Criteria provided, single submitter" + } else if stars == 2 { + " ★★ - Criteria provided, multiple submitters, no conflicts" + } else if stars == 3 { + " ★★★ - Criteria provided, reviewed by expert panel" + } else { + " ★★★★ - Practice guideline" + }; + + writeln!(md_file, "> **Review Status:** {}", review_status)?; + writeln!(md_file)?; + + // Additional fields if available + if let Some(preferred) = &variant.preferred_values { + if !preferred.is_empty() { + writeln!(md_file, "> **Preferred Values:** {}", preferred.replace('_', " "))?; + writeln!(md_file)?; + } + } + + if let Some(desc) = &variant.description { + if !desc.is_empty() { + writeln!(md_file, "> **Description:** {}", desc.replace('_', " "))?; + writeln!(md_file)?; + } + } + + if let Some(citations) = &variant.citations { + if !citations.is_empty() { + writeln!(md_file, "> **Research Citations:** {}", citations.replace('_', " "))?; + writeln!(md_file)?; + } + } + + if let Some(status) = &variant.record_status { + if !status.is_empty() { + writeln!(md_file, "> **Record 
Status:** {}", status.replace('_', " "))?; + writeln!(md_file)?; + } + } + + if let Some(evaluated) = &variant.date_last_evaluated { + if !evaluated.is_empty() { + writeln!(md_file, "> **Date Last Evaluated:** {}", evaluated.replace('_', " "))?; + writeln!(md_file)?; + } + } + + writeln!(md_file, "
")?; + + Ok(()) +} +/// Write the understanding section of the markdown report +fn write_understanding_section(md_file: &mut File) -> Result<(), Box> { + writeln!(md_file, "")?; + writeln!(md_file, "## Understanding This Report")?; + writeln!(md_file)?; + + writeln!(md_file, "**Clinical Significance Categories:**")?; + writeln!(md_file)?; + writeln!(md_file, "- **Pathogenic variants** are genetic changes that have strong evidence for causing disease or health conditions.")?; + writeln!(md_file)?; + writeln!(md_file, "- **Likely Pathogenic variants** have good but not definitive evidence suggesting they cause disease.")?; + writeln!(md_file)?; + writeln!(md_file, "- **Conflicting Interpretations** are variants where different labs or researchers have come to different conclusions about their significance.")?; + writeln!(md_file)?; + writeln!(md_file, "- **Variants of Uncertain Significance (VUS)** are genetic changes where there is currently not enough evidence to determine if they are potentially harmful or harmless.")?; + writeln!(md_file)?; + writeln!(md_file, "- **Likely Benign variants** have good evidence suggesting they do not cause disease.")?; + writeln!(md_file)?; + writeln!(md_file, "- **Benign variants** are genetic changes that are known to be harmless based on strong evidence.")?; + writeln!(md_file)?; + + writeln!(md_file, "**Important Note:** Having a pathogenic or likely pathogenic variant doesn't necessarily mean you will develop the condition, as other factors like environment, lifestyle, and additional genetic factors also play important roles.")?; + writeln!(md_file)?; + + writeln!(md_file, "**Other Terms:**")?; + writeln!(md_file)?; + writeln!(md_file, "- **Genotype / Zygosity:**")?; + writeln!(md_file)?; + writeln!(md_file, " - **1/1, Homozygous** means the variant is present on both copies of the gene.")?; + writeln!(md_file)?; + writeln!(md_file, " - **0/1, Heterozygous** means the variant is present on only one copy of the gene.")?; + 
writeln!(md_file)?; + writeln!(md_file, "- **Population Frequencies** show how common the variant is in different populations. Rare variants (low percentage) may be more significant than common ones.")?; + writeln!(md_file)?; + writeln!(md_file, "- **Review Stars** indicate the level of review in ClinVar, with more stars representing more thorough evaluation.")?; + Ok(()) }