-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathschema.sql
More file actions
349 lines (329 loc) · 12.5 KB
/
schema.sql
File metadata and controls
349 lines (329 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
CREATE SCHEMA IF NOT EXISTS core;
CREATE SCHEMA IF NOT EXISTS evidence;
CREATE SCHEMA IF NOT EXISTS literature;
CREATE SCHEMA IF NOT EXISTS provenance;
CREATE TABLE IF NOT EXISTS provenance.ingestion_runs (
id text PRIMARY KEY,
source_repo text NOT NULL,
source_path text NOT NULL,
source_release text,
started_at timestamptz,
completed_at timestamptz,
transform_version text NOT NULL,
checksums jsonb NOT NULL DEFAULT '{}'::jsonb,
row_counts jsonb NOT NULL DEFAULT '{}'::jsonb,
notes text
);
CREATE TABLE IF NOT EXISTS core.mirnas (
id text PRIMARY KEY,
canonical_name text NOT NULL,
species text NOT NULL,
sequence text,
id_namespace text NOT NULL,
canonical_accession text,
family text,
arm text,
aliases text[] NOT NULL DEFAULT ARRAY[]::text[],
normalization jsonb NOT NULL DEFAULT '{}'::jsonb,
source_refs jsonb NOT NULL DEFAULT '[]'::jsonb
);
CREATE TABLE IF NOT EXISTS core.genes (
id text PRIMARY KEY,
canonical_symbol text NOT NULL,
species text NOT NULL,
id_namespace text NOT NULL,
canonical_accession text NOT NULL,
aliases text[] NOT NULL DEFAULT ARRAY[]::text[],
normalization jsonb NOT NULL DEFAULT '{}'::jsonb,
source_refs jsonb NOT NULL DEFAULT '[]'::jsonb
);
CREATE TABLE IF NOT EXISTS core.transcripts (
id text PRIMARY KEY,
gene_id text NOT NULL REFERENCES core.genes(id),
id_namespace text NOT NULL,
canonical_accession text NOT NULL,
gene_symbol text,
species text NOT NULL,
sequence text,
sequence_scope text NOT NULL,
genome_build text,
tx_start bigint,
tx_end bigint,
strand text,
localization_labels jsonb,
feature_track_summary jsonb,
sequence_length integer,
normalization jsonb NOT NULL DEFAULT '{}'::jsonb,
source_refs jsonb NOT NULL DEFAULT '[]'::jsonb
);
CREATE TABLE IF NOT EXISTS core.mirna_gene_pairs (
id text PRIMARY KEY,
mirna_id text NOT NULL REFERENCES core.mirnas(id),
gene_id text NOT NULL REFERENCES core.genes(id),
species text NOT NULL,
evidence_counts jsonb NOT NULL DEFAULT '{}'::jsonb,
best_supported_transcript_ids text[] NOT NULL DEFAULT ARRAY[]::text[],
support_summary jsonb NOT NULL DEFAULT '{}'::jsonb,
source_refs jsonb NOT NULL DEFAULT '[]'::jsonb,
UNIQUE (mirna_id, gene_id)
);
CREATE TABLE IF NOT EXISTS core.transcript_feature_tracks (
id text PRIMARY KEY,
transcript_id text NOT NULL REFERENCES core.transcripts(id),
track_name text NOT NULL,
source text NOT NULL,
kind text NOT NULL,
count integer NOT NULL CHECK (count >= 0),
payload_ref jsonb,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS evidence.experiments (
id text PRIMARY KEY,
dataset_id text NOT NULL,
mirna_id text NOT NULL REFERENCES core.mirnas(id),
mirna_name_raw text NOT NULL,
mirna_sequence text NOT NULL,
article_pubmed_id text,
geo_accession text,
organism text NOT NULL,
tested_cell_line text,
tissue text,
method text,
experiment_type text NOT NULL,
treatment text,
de_table_path text NOT NULL,
source_repo text NOT NULL,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS evidence.experiment_gene_effects (
id text PRIMARY KEY,
experiment_id text NOT NULL REFERENCES evidence.experiments(id),
pair_id text NOT NULL REFERENCES core.mirna_gene_pairs(id),
mirna_id text NOT NULL REFERENCES core.mirnas(id),
gene_id text NOT NULL REFERENCES core.genes(id),
logfc double precision,
fdr double precision,
pvalue double precision,
logcpm double precision,
f_statistic double precision,
effect_direction text NOT NULL,
passes_default_threshold boolean,
raw_row jsonb NOT NULL DEFAULT '{}'::jsonb,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS evidence.predictors (
id text PRIMARY KEY,
tool_id text NOT NULL,
official_name text NOT NULL,
organism text NOT NULL,
score_type text NOT NULL,
score_direction text NOT NULL,
score_range text,
input_id_gene_type text NOT NULL,
canonical_id_gene_type text NOT NULL,
input_id_mirna_type text NOT NULL,
canonical_id_mirna_type text NOT NULL,
predictor_output_path text NOT NULL,
source_repo text NOT NULL
);
CREATE TABLE IF NOT EXISTS evidence.predictor_scores (
id text PRIMARY KEY,
predictor_id text NOT NULL REFERENCES evidence.predictors(id),
pair_id text NOT NULL REFERENCES core.mirna_gene_pairs(id),
mirna_id text NOT NULL REFERENCES core.mirnas(id),
gene_id text NOT NULL REFERENCES core.genes(id),
score_raw double precision NOT NULL,
score_direction text NOT NULL,
score_rank_within_mirna integer,
source_row jsonb NOT NULL DEFAULT '{}'::jsonb,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS evidence.site_observations (
id text PRIMARY KEY,
source_dataset text NOT NULL,
source_repo text NOT NULL,
mirna_id text REFERENCES core.mirnas(id),
mirna_name_raw text,
mirna_family_raw text,
mirna_sequence_raw text NOT NULL,
target_sequence_raw text,
feature_label_raw text,
label integer,
chr text,
start_pos bigint,
end_pos bigint,
strand text,
read_len integer,
gene_cluster_id text,
gene_phylop_ref jsonb,
gene_phastcons_ref jsonb,
raw_row jsonb NOT NULL DEFAULT '{}'::jsonb,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS evidence.mirna_recognition_elements (
id text PRIMARY KEY,
source_dataset text NOT NULL,
source_repo text NOT NULL,
source_split text NOT NULL,
source_row_index integer NOT NULL CHECK (source_row_index >= 1),
label integer NOT NULL,
mirna_id text NOT NULL REFERENCES core.mirnas(id),
mirna_name_raw text NOT NULL,
mirna_family_raw text,
mirna_sequence_raw text NOT NULL,
gene_id text REFERENCES core.genes(id),
gene_name_raw text,
gene_cluster_id text,
target_sequence_raw text NOT NULL,
feature_label_raw text,
chr text,
start_pos bigint,
end_pos bigint,
strand text,
read_len integer,
experiment_type text,
organism text,
raw_row jsonb NOT NULL DEFAULT '{}'::jsonb,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS evidence.mre_predictor_scores (
id text PRIMARY KEY,
predictor_id text NOT NULL REFERENCES evidence.predictors(id),
mre_id text NOT NULL REFERENCES evidence.mirna_recognition_elements(id),
mirna_id text NOT NULL REFERENCES core.mirnas(id),
gene_id text REFERENCES core.genes(id),
score_raw double precision NOT NULL,
score_direction text NOT NULL,
score_rank_within_mre integer,
label integer,
source_dataset text,
source_row jsonb NOT NULL DEFAULT '{}'::jsonb,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS evidence.site_transcript_overlaps (
id text PRIMARY KEY,
observation_id text NOT NULL REFERENCES evidence.site_observations(id),
transcript_id text NOT NULL REFERENCES core.transcripts(id),
gene_id text NOT NULL REFERENCES core.genes(id),
gene_name_raw text,
contained_100pct boolean NOT NULL,
overlap_tx_bp integer NOT NULL CHECK (overlap_tx_bp >= 0),
overlap_exon_bp integer NOT NULL CHECK (overlap_exon_bp >= 0),
overlap_cds_bp integer NOT NULL CHECK (overlap_cds_bp >= 0),
overlap_utr5_bp integer NOT NULL CHECK (overlap_utr5_bp >= 0),
overlap_utr3_bp integer NOT NULL CHECK (overlap_utr3_bp >= 0),
read_start_in_tx_1based integer,
read_end_in_tx_1based integer,
overlap_start_genome_1based bigint,
overlap_end_genome_1based bigint,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS evidence.mre_sites (
id text PRIMARY KEY,
pair_id text NOT NULL REFERENCES core.mirna_gene_pairs(id),
mirna_id text NOT NULL REFERENCES core.mirnas(id),
gene_id text NOT NULL REFERENCES core.genes(id),
transcript_id text NOT NULL REFERENCES core.transcripts(id),
observation_id text NOT NULL REFERENCES evidence.site_observations(id),
chr text NOT NULL,
start_pos bigint NOT NULL,
end_pos bigint NOT NULL,
strand text NOT NULL,
read_len integer NOT NULL CHECK (read_len >= 0),
selection_policy text NOT NULL,
dominance_mode text NOT NULL,
selected_gene_name text,
dominant_region_selected text NOT NULL,
regions_present_selected text NOT NULL,
dominant_region_union text NOT NULL,
regions_present_union text NOT NULL,
bp_utr3_selected integer,
bp_cds_selected integer,
bp_utr5_selected integer,
bp_exon_other_selected integer,
bp_intron_selected integer,
bp_intergenic_selected integer,
bp_utr3_union integer,
bp_cds_union integer,
bp_utr5_union integer,
bp_exon_other_union integer,
bp_intron_union integer,
bp_intergenic_union integer,
ambiguous_union_vs_selected boolean NOT NULL,
n_passing_transcripts integer NOT NULL CHECK (n_passing_transcripts >= 0),
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS evidence.nucleotide_profiles (
id text PRIMARY KEY,
entity_type text NOT NULL,
entity_id text NOT NULL,
profile_type text NOT NULL,
length integer NOT NULL CHECK (length >= 0),
storage_mode text NOT NULL,
values_json jsonb,
payload_ref jsonb,
metadata jsonb NOT NULL DEFAULT '{}'::jsonb,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS literature.literature_documents (
id text PRIMARY KEY,
document_id text NOT NULL,
pmid text,
title text NOT NULL,
abstract text NOT NULL,
full_text text,
source text NOT NULL,
year text,
metadata jsonb NOT NULL DEFAULT '{}'::jsonb,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS literature.literature_mentions (
id text PRIMARY KEY,
document_id text NOT NULL REFERENCES literature.literature_documents(id),
annotation_id text NOT NULL,
source text NOT NULL,
span_text text NOT NULL,
start_pos integer,
end_pos integer,
entity_type text NOT NULL,
canonical_id_raw text,
canonical_name_raw text,
resolved_mirna_id text REFERENCES core.mirnas(id),
resolved_gene_id text REFERENCES core.genes(id),
confidence double precision,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE TABLE IF NOT EXISTS literature.literature_assertions (
id text PRIMARY KEY,
document_id text NOT NULL REFERENCES literature.literature_documents(id),
mirna_id text NOT NULL REFERENCES core.mirnas(id),
gene_id text NOT NULL REFERENCES core.genes(id),
assertion_type text NOT NULL,
direction text,
evidence_text text NOT NULL,
curation_status text NOT NULL,
curator text,
run_id text NOT NULL REFERENCES provenance.ingestion_runs(id)
);
CREATE INDEX IF NOT EXISTS mirnas_name_idx ON core.mirnas (canonical_name);
CREATE INDEX IF NOT EXISTS mirnas_accession_idx ON core.mirnas (canonical_accession);
CREATE INDEX IF NOT EXISTS genes_symbol_idx ON core.genes (canonical_symbol);
CREATE INDEX IF NOT EXISTS genes_accession_idx ON core.genes (canonical_accession);
CREATE INDEX IF NOT EXISTS transcripts_gene_idx ON core.transcripts (gene_id);
CREATE INDEX IF NOT EXISTS pairs_mirna_idx ON core.mirna_gene_pairs (mirna_id);
CREATE INDEX IF NOT EXISTS pairs_gene_idx ON core.mirna_gene_pairs (gene_id);
CREATE INDEX IF NOT EXISTS effects_pair_idx ON evidence.experiment_gene_effects (pair_id);
CREATE INDEX IF NOT EXISTS scores_pair_idx ON evidence.predictor_scores (pair_id);
CREATE INDEX IF NOT EXISTS mre_scores_mre_idx ON evidence.mre_predictor_scores (mre_id);
CREATE INDEX IF NOT EXISTS mre_scores_predictor_idx ON evidence.mre_predictor_scores (predictor_id);
CREATE INDEX IF NOT EXISTS mre_scores_mirna_idx ON evidence.mre_predictor_scores (mirna_id);
CREATE INDEX IF NOT EXISTS observations_locus_idx ON evidence.site_observations (chr, start_pos, end_pos);
CREATE INDEX IF NOT EXISTS mre_reads_dataset_split_idx ON evidence.mirna_recognition_elements (source_dataset, source_split);
CREATE INDEX IF NOT EXISTS mre_reads_mirna_idx ON evidence.mirna_recognition_elements (mirna_id);
CREATE INDEX IF NOT EXISTS mre_reads_gene_idx ON evidence.mirna_recognition_elements (gene_id);
CREATE INDEX IF NOT EXISTS mre_reads_locus_idx ON evidence.mirna_recognition_elements (chr, start_pos, end_pos);
CREATE INDEX IF NOT EXISTS overlaps_observation_idx ON evidence.site_transcript_overlaps (observation_id);
CREATE INDEX IF NOT EXISTS overlaps_transcript_idx ON evidence.site_transcript_overlaps (transcript_id);
CREATE INDEX IF NOT EXISTS mre_pair_idx ON evidence.mre_sites (pair_id);
CREATE INDEX IF NOT EXISTS mre_transcript_idx ON evidence.mre_sites (transcript_id);
CREATE INDEX IF NOT EXISTS nucleotide_entity_idx ON evidence.nucleotide_profiles (entity_type, entity_id);