Skip to content

Commit 7ff1ddf

Browse files
authored
Use mutation_derived table in mutation mapper to avoid joins (#11786)
Fix SQL issues in Mutation Mapper migrated to mutation_derived table. Fix issues with clickhouse.sql so it runs in test context Fix CH problem with getMutationCountByPosition query update pom.xml with new derived table version
1 parent 1ce3cf8 commit 7ff1ddf

File tree

3 files changed

+324
-134
lines changed

3 files changed

+324
-134
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
<frontend.version>v6.3.6</frontend.version>
3232
<!-- THIS SHOULD BE KEPT IN SYNC TO VERSION IN CGDS.SQL -->
3333
<db.version>2.14.2</db.version>
34-
<derived_table.version>1.0.2</derived_table.version>
34+
<derived_table.version>1.0.3</derived_table.version>
3535

3636
<!-- Version properties for dependencies that should have same version. -->
3737
<!-- The rest can be set in the dependencyManagement section -->

src/main/resources/db-scripts/clickhouse/clickhouse.sql

Lines changed: 215 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
-- version 1.0.2 of derived table schema and data definition
1+
-- version 1.0.3 of derived table schema and data definition
22
-- when making updates:
33
-- increment the version number here
44
-- update pom.xml with the new version number
@@ -11,6 +11,7 @@ DROP TABLE IF EXISTS clinical_data_derived;
1111
DROP TABLE IF EXISTS clinical_event_derived;
1212
DROP TABLE IF EXISTS genetic_alteration_derived;
1313
DROP TABLE IF EXISTS generic_assay_data_derived;
14+
DROP TABLE IF EXISTS mutation_derived;
1415

1516
-- the following query "fixes" the sample_profile table by adding entries for "missing" samples -- those which appear in mutated case list but not in the MySQL sample_profile table
1617
-- this problem was handled in java at run time in legacy codebase
@@ -172,8 +173,9 @@ CREATE TABLE IF NOT EXISTS genomic_event_derived
172173
sv_event_info String,
173174
patient_unique_id String,
174175
off_panel Boolean DEFAULT FALSE
175-
) ENGINE = MergeTree
176-
ORDER BY (variant_type, entrez_gene_id, hugo_gene_symbol, genetic_profile_stable_id, sample_unique_id);
176+
) ENGINE = MergeTree
177+
ORDER BY (genetic_profile_stable_id, cancer_study_identifier, variant_type, entrez_gene_id, hugo_gene_symbol, sample_unique_id);
178+
177179

178180
INSERT INTO genomic_event_derived
179181
-- Insert Mutations
@@ -373,10 +375,11 @@ SELECT
373375
ifNull(ce.stop_date, 0) AS stop_date,
374376
ce.event_type AS event_type,
375377
cs.cancer_study_identifier
376-
FROM clinical_event ce
377-
LEFT JOIN clinical_event_data ced ON ce.clinical_event_id = ced.clinical_event_id
378-
INNER JOIN patient p ON ce.patient_id = p.internal_id
379-
INNER JOIN cancer_study cs ON p.cancer_study_id = cs.cancer_study_id;
378+
379+
FROM clinical_event_data ced
380+
RIGHT JOIN clinical_event ce ON ced.clinical_event_id = ce.clinical_event_id
381+
INNER JOIN patient p ON ce.patient_id = p.internal_id
382+
INNER JOIN cancer_study cs ON p.cancer_study_id = cs.cancer_study_id;
380383

381384
CREATE TABLE IF NOT EXISTS genetic_alteration_derived
382385
(
@@ -485,6 +488,211 @@ FROM
485488
JOIN cancer_study cs ON cs.cancer_study_id = subquery.cancer_study_id
486489
JOIN sample_derived sd ON sd.internal_id = subquery.sample_id;
487490

491+
492+
493+
DROP TABLE IF EXISTS mutation_derived;
494+
CREATE TABLE mutation_derived
495+
(
496+
molecularProfileId String COMMENT 'Stable ID of the genetic profile',
497+
sampleId String COMMENT 'Stable ID of the sample',
498+
sampleInternalId Int64,
499+
patientId String COMMENT 'Stable ID of the patient',
500+
entrezGeneId Int64 COMMENT 'Entrez Gene ID from mutation table (NOT NULL)',
501+
studyId String COMMENT 'Cancer study identifier',
502+
center Nullable(String) COMMENT 'Sequencing center',
503+
mutationStatus Nullable(String) COMMENT 'Mutation status (e.g., Somatic, Germline)',
504+
validationStatus Nullable(String) COMMENT 'Validation status',
505+
tumorAltCount Nullable(Int64) COMMENT 'Tumor alternate allele count',
506+
tumorRefCount Nullable(Int64) COMMENT 'Tumor reference allele count',
507+
normalAltCount Nullable(Int64) COMMENT 'Normal alternate allele count',
508+
normalRefCount Nullable(Int64) COMMENT 'Normal reference allele count',
509+
aminoAcidChange Nullable(String) COMMENT 'Amino acid change',
510+
chr Nullable(String) COMMENT 'Chromosome',
511+
startPosition Nullable(Int64) COMMENT 'Start position',
512+
endPosition Nullable(Int64) COMMENT 'End position',
513+
referenceAllele Nullable(String) COMMENT 'Reference allele',
514+
tumorSeqAllele Nullable(String) COMMENT 'Tumor sequence allele',
515+
proteinChange Nullable(String) COMMENT 'Protein change',
516+
mutationType Nullable(String) COMMENT 'Type of mutation',
517+
ncbiBuild Nullable(String) COMMENT 'NCBI build version',
518+
variantType Nullable(String) COMMENT 'Variant type',
519+
refseqMrnaId Nullable(String) COMMENT 'RefSeq mRNA ID',
520+
proteinPosStart Nullable(Int64) COMMENT 'Protein position start',
521+
proteinPosEnd Nullable(Int64) COMMENT 'Protein position end',
522+
keyword Nullable(String) COMMENT 'Keyword',
523+
annotationJSON Nullable(String) COMMENT 'Annotation JSON',
524+
driverFilter Nullable(String) COMMENT 'Driver filter',
525+
driverFilterAnnotation Nullable(String) COMMENT 'Driver filter annotation',
526+
driverTiersFilter Nullable(String) COMMENT 'Driver tiers filter',
527+
driverTiersFilterAnnotation Nullable(String) COMMENT 'Driver tiers filter annotation',
528+
`GENE.entrezGeneId` Nullable(Int64) COMMENT 'Gene entrez ID',
529+
`GENE.hugoGeneSymbol` Nullable(String) COMMENT 'HUGO gene symbol',
530+
`GENE.type` Nullable(String) COMMENT 'Gene type',
531+
`alleleSpecificCopyNumber.ascnIntegerCopyNumber` Nullable(Int64) COMMENT 'ASCN integer copy number',
532+
`alleleSpecificCopyNumber.ascnMethod` Nullable(String) COMMENT 'ASCN method',
533+
`alleleSpecificCopyNumber.ccfExpectedCopiesUpper` Nullable(Float64) COMMENT 'CCF expected copies upper bound',
534+
`alleleSpecificCopyNumber.ccfExpectedCopies` Nullable(Float64) COMMENT 'CCF expected copies',
535+
`alleleSpecificCopyNumber.clonal` Nullable(String) COMMENT 'Clonality annotation',
536+
`alleleSpecificCopyNumber.minorCopyNumber` Nullable(Int64) COMMENT 'Minor copy number',
537+
`alleleSpecificCopyNumber.expectedAltCopies` Nullable(Int64) COMMENT 'Expected alternate copies',
538+
`alleleSpecificCopyNumber.totalCopyNumber` Nullable(Int64) COMMENT 'Total copy number'
539+
)
540+
ENGINE = MergeTree()
541+
ORDER BY (molecularProfileId, sampleId, entrezGeneId)
542+
COMMENT 'Mutation query results with detailed annotations including driver status and allele-specific copy numbers';
543+
544+
INSERT INTO mutation_derived
545+
SELECT
546+
genetic_profile.stable_id AS molecularProfileId,
547+
sample.stable_id AS sampleId,
548+
sample.internal_id As sampleInternalId,
549+
patient.stable_id AS patientId,
550+
mutation.entrez_gene_id AS entrezGeneId,
551+
cancer_study.cancer_study_identifier AS studyId,
552+
mutation.center AS center,
553+
mutation.mutation_status AS mutationStatus,
554+
mutation.validation_status AS validationStatus,
555+
mutation.tumor_alt_count AS tumorAltCount,
556+
mutation.tumor_ref_count AS tumorRefCount,
557+
mutation.normal_alt_count AS normalAltCount,
558+
mutation.normal_ref_count AS normalRefCount,
559+
mutation.amino_acid_change AS aminoAcidChange,
560+
mutation_event.chr AS chr,
561+
mutation_event.start_position AS startPosition,
562+
mutation_event.end_position AS endPosition,
563+
mutation_event.reference_allele AS referenceAllele,
564+
mutation_event.tumor_seq_allele AS tumorSeqAllele,
565+
mutation_event.protein_change AS proteinChange,
566+
mutation_event.mutation_type AS mutationType,
567+
mutation_event.ncbi_build AS ncbiBuild,
568+
mutation_event.variant_type AS variantType,
569+
mutation_event.refseq_mrna_id AS refseqMrnaId,
570+
mutation_event.protein_pos_start AS proteinPosStart,
571+
mutation_event.protein_pos_end AS proteinPosEnd,
572+
mutation_event.keyword AS keyword,
573+
mutation.annotation_json AS annotationJSON,
574+
alteration_driver_annotation.driver_filter AS driverFilter,
575+
alteration_driver_annotation.driver_filter_annotation AS driverFilterAnnotation,
576+
alteration_driver_annotation.driver_tiers_filter AS driverTiersFilter,
577+
alteration_driver_annotation.driver_tiers_filter_annotation AS driverTiersFilterAnnotation,
578+
gene.entrez_gene_id AS `GENE.entrezGeneId`,
579+
gene.hugo_gene_symbol AS `GENE.hugoGeneSymbol`,
580+
gene.type AS `GENE.type`,
581+
allele_specific_copy_number.ascn_integer_copy_number AS `alleleSpecificCopyNumber.ascnIntegerCopyNumber`,
582+
allele_specific_copy_number.ascn_method AS `alleleSpecificCopyNumber.ascnMethod`,
583+
allele_specific_copy_number.ccf_expected_copies_upper AS `alleleSpecificCopyNumber.ccfExpectedCopiesUpper`,
584+
allele_specific_copy_number.ccf_expected_copies AS `alleleSpecificCopyNumber.ccfExpectedCopies`,
585+
allele_specific_copy_number.clonal AS `alleleSpecificCopyNumber.clonal`,
586+
allele_specific_copy_number.minor_copy_number AS `alleleSpecificCopyNumber.minorCopyNumber`,
587+
allele_specific_copy_number.expected_alt_copies AS `alleleSpecificCopyNumber.expectedAltCopies`,
588+
allele_specific_copy_number.total_copy_number AS `alleleSpecificCopyNumber.totalCopyNumber`
589+
FROM mutation
590+
INNER JOIN genetic_profile ON mutation.genetic_profile_id = genetic_profile.genetic_profile_id
591+
INNER JOIN sample ON mutation.sample_id = sample.internal_id
592+
INNER JOIN patient ON sample.patient_id = patient.internal_id
593+
INNER JOIN cancer_study ON patient.cancer_study_id = cancer_study.cancer_study_id
594+
LEFT JOIN alteration_driver_annotation ON (mutation.genetic_profile_id = alteration_driver_annotation.genetic_profile_id) AND (mutation.sample_id = alteration_driver_annotation.sample_id) AND (mutation.mutation_event_id = alteration_driver_annotation.alteration_event_id)
595+
INNER JOIN mutation_event ON mutation.mutation_event_id = mutation_event.mutation_event_id
596+
INNER JOIN gene ON mutation.entrez_gene_id = gene.entrez_gene_id
597+
LEFT JOIN allele_specific_copy_number ON (mutation.mutation_event_id = allele_specific_copy_number.mutation_event_id) AND (mutation.genetic_profile_id = allele_specific_copy_number.genetic_profile_id) AND (mutation.sample_id = allele_specific_copy_number.sample_id);
598+
599+
600+
601+
602+
-- START: PRIMARY KEY ADDITIONS
603+
-- THE FOLLOWING SCRIPTS EXIST TO ADD PRIMARY KEYS TO LEGACY TABLES THAT ARE MISSING THEM. YOU
604+
-- CANNOT CHANGE THE PRIMARY KEY ON A TABLE IN CLICKHOUSE, SO WE NEED TO CREATE A NEW TABLE WITH THE
605+
-- PRIMARY KEY AND THEN COPY THE DATA OVER.
606+
607+
608+
--Adds primary key to the sample_cna_event table for Clickhouse-only
609+
DROP TABLE IF EXISTS sample_cna_event_BACKUP;
610+
CREATE TABLE sample_cna_event_BACKUP
611+
(
612+
`cna_event_id` Int64 COMMENT 'References cna_event.cna_event_id.',
613+
`sample_id` Int64 COMMENT 'References sample.internal_id.',
614+
`genetic_profile_id` Int64 COMMENT 'References genetic_profile.genetic_profile_id.',
615+
`annotation_json` Nullable(String) COMMENT 'JSON-formatted annotation details.'
616+
)
617+
ENGINE = MergeTree()
618+
PRIMARY KEY (genetic_profile_id, cna_event_id, sample_id)
619+
ORDER BY (genetic_profile_id, cna_event_id, sample_id)
620+
SETTINGS index_granularity = 8192
621+
COMMENT 'Observed CNA events per sample and profile. References cna_event, sample, and genetic_profile.';
622+
623+
-- Copy the data
624+
INSERT INTO sample_cna_event_BACKUP
625+
SELECT * FROM sample_cna_event;
626+
627+
-- SWITCH THE TABLES
628+
EXCHANGE TABLES sample_cna_event_BACKUP AND sample_cna_event;
629+
630+
DROP TABLE IF EXISTS mutation_BACKUP;
631+
CREATE TABLE mutation_BACKUP
632+
(
633+
`mutation_event_id` Int64 COMMENT 'References mutation_event.mutation_event_id.',
634+
`genetic_profile_id` Int64 COMMENT 'References genetic_profile.genetic_profile_id.',
635+
`sample_id` Int64 COMMENT 'References sample.internal_id.',
636+
`entrez_gene_id` Int64 COMMENT 'References gene.entrez_gene_id.',
637+
`center` Nullable(String) COMMENT 'Center where sequencing was performed.',
638+
`sequencer` Nullable(String) COMMENT 'Sequencing platform used.',
639+
`mutation_status` Nullable(String) COMMENT 'Mutation status: Germline, Somatic, or LOH.',
640+
`validation_status` Nullable(String) COMMENT 'Validation status.',
641+
`tumor_seq_allele1` Nullable(String) COMMENT 'Tumor allele 1 sequence.',
642+
`tumor_seq_allele2` Nullable(String) COMMENT 'Tumor allele 2 sequence.',
643+
`matched_norm_sample_barcode` Nullable(String) COMMENT 'Matched normal sample barcode.',
644+
`match_norm_seq_allele1` Nullable(String) COMMENT 'Matched normal allele 1 sequence.',
645+
`match_norm_seq_allele2` Nullable(String) COMMENT 'Matched normal allele 2 sequence.',
646+
`tumor_validation_allele1` Nullable(String) COMMENT 'Tumor validation allele 1 sequence.',
647+
`tumor_validation_allele2` Nullable(String) COMMENT 'Tumor validation allele 2 sequence.',
648+
`match_norm_validation_allele1` Nullable(String) COMMENT 'Matched normal validation allele 1.',
649+
`match_norm_validation_allele2` Nullable(String) COMMENT 'Matched normal validation allele 2.',
650+
`verification_status` Nullable(String) COMMENT 'Verification status.',
651+
`sequencing_phase` Nullable(String) COMMENT 'Sequencing phase.',
652+
`sequence_source` Nullable(String) COMMENT 'Source of sequencing data.',
653+
`validation_method` Nullable(String) COMMENT 'Validation method used.',
654+
`score` Nullable(String) COMMENT 'Score or quality metric.',
655+
`bam_file` Nullable(String) COMMENT 'Associated BAM file.',
656+
`tumor_alt_count` Nullable(Int64) COMMENT 'Tumor alternate allele count.',
657+
`tumor_ref_count` Nullable(Int64) COMMENT 'Tumor reference allele count.',
658+
`normal_alt_count` Nullable(Int64) COMMENT 'Normal alternate allele count.',
659+
`normal_ref_count` Nullable(Int64) COMMENT 'Normal reference allele count.',
660+
`amino_acid_change` Nullable(String) COMMENT 'Amino acid change from mutation.',
661+
`annotation_json` Nullable(String) COMMENT 'JSON-formatted annotations.'
662+
)
663+
ENGINE = MergeTree()
664+
ORDER BY (genetic_profile_id,entrez_gene_id)
665+
COMMENT 'Mutation observations in specific samples and profiles. References mutation_event, gene, genetic_profile, and sample.';
666+
667+
-- copy data into new table
668+
INSERT INTO mutation_BACKUP
669+
SELECT * FROM mutation;
670+
671+
-- switch the tables
672+
EXCHANGE TABLES mutation_BACKUP AND mutation;
673+
674+
675+
-- Adds primary key genetic_alteration table for Clickhouse-only
676+
DROP TABLE IF EXISTS genetic_alteration_BACKUP;
677+
CREATE TABLE genetic_alteration_BACKUP
678+
(
679+
`genetic_profile_id` Int64,
680+
`genetic_entity_id` Int64,
681+
`values` String
682+
)
683+
ENGINE = MergeTree()
684+
ORDER BY (genetic_profile_id, genetic_entity_id);
685+
686+
-- Copy the data
687+
INSERT INTO genetic_alteration_BACKUP
688+
SELECT * FROM genetic_alteration;
689+
690+
-- SWITCH THE TABLES
691+
EXCHANGE TABLES genetic_alteration_BACKUP AND genetic_alteration;
692+
693+
--END: PRIMARY KEY ADDITIONS
694+
695+
488696
OPTIMIZE TABLE sample_to_gene_panel_derived;
489697
OPTIMIZE TABLE gene_panel_to_gene_derived;
490698
OPTIMIZE TABLE sample_derived;

0 commit comments

Comments
 (0)