|
1 | | -# frozen_string_literal: true |
2 | | - |
# service that handles preprocessing expression/annotation data to speed up dot plot rendering
class DotPlotService
  # main handler for launching ingest job to process expression data into DotPlotGene objects
  # since the study can have only one processed matrix/metadata file, this will only run if the study is eligible
  #
  # * *params*
  #   - +study+ (Study) => the study that owns the data
  #   - +cluster_group+ (ClusterGroup) => the cluster to set associations for
  #   - +user+ (User) => the user that will run the job
  #
  # * *returns*
  #   - (Boolean) => true once the IngestJob has been queued for launch
  #
  # * *raises*
  #   - (ArgumentError) => if the study, source files, or job parameters fail validation
  def self.run_process_dot_plot_genes(study, cluster_group, user)
    validate_study(study, cluster_group)
    # validate_study guarantees exactly one processed matrix, so the first entry is the only one
    expression_file = study_processed_matrices(study).first
    metadata_file = study.metadata_file
    validate_source_data(expression_file, metadata_file)
    params_object = create_params_object(cluster_group, expression_file, metadata_file)
    unless params_object.valid?
      raise ArgumentError, "job parameters failed to validate: #{params_object.errors.full_messages.join(', ')}"
    end

    job = IngestJob.new(
      study:, study_file: expression_file, user:, action: :ingest_dot_plot_genes, params_object:
    )
    # queue the launch in the background rather than blocking the caller
    job.delay.push_remote_and_launch_ingest
    true
  end

  # create DotPlotGeneIngestParameters object based on the provided files
  #
  # * *params*
  #   - +cluster_group+ (ClusterGroup) => the cluster group to associate with
  #   - +expression_file+ (StudyFile) => the expression matrix file to process
  #   - +metadata_file+ (StudyFile) => the metadata file to source annotations
  #
  # * *returns*
  #   - (DotPlotGeneIngestParameters) => a parameters object with the necessary file paths and metadata
  #
  # * *raises*
  #   - (ArgumentError) => if an MTX matrix is missing its bundled genes or barcodes file
  def self.create_params_object(cluster_group, expression_file, metadata_file)
    params = {
      cluster_group_id: cluster_group.id,
      cluster_file: RequestUtils.cluster_file_url(cluster_group)
    }
    # any other file type falls through with only the cluster params set
    case expression_file.file_type
    when 'Expression Matrix'
      params[:matrix_file_type] = 'dense'
      params[:matrix_file_path] = expression_file.gs_url
      params[:cell_metadata_file] = metadata_file.gs_url
    when 'MM Coordinate Matrix'
      genes_file = expression_file.bundled_files.detect { |f| f.file_type == '10X Genes File' }
      barcodes_file = expression_file.bundled_files.detect { |f| f.file_type == '10X Barcodes File' }
      # fail with the same error class as the rest of this service, rather than a NoMethodError on nil
      if genes_file.nil? || barcodes_file.nil?
        raise ArgumentError, 'MTX bundle is missing the genes or barcodes file'
      end

      params[:matrix_file_type] = 'mtx'
      params[:matrix_file_path] = expression_file.gs_url
      params[:cell_metadata_file] = metadata_file.gs_url
      params[:gene_file] = genes_file.gs_url
      params[:barcode_file] = barcodes_file.gs_url
    when 'AnnData'
      params[:matrix_file_type] = 'mtx' # extracted expression for AnnData is in MTX format
      params[:cell_metadata_file] = RequestUtils.data_fragment_url(metadata_file, 'metadata')
      params[:matrix_file_path] = RequestUtils.data_fragment_url(
        expression_file, 'matrix', file_type_detail: 'processed'
      )
      params[:gene_file] = RequestUtils.data_fragment_url(
        expression_file, 'features', file_type_detail: 'processed'
      )
      params[:barcode_file] = RequestUtils.data_fragment_url(
        expression_file, 'barcodes', file_type_detail: 'processed'
      )
    end
    DotPlotGeneIngestParameters.new(**params)
  end

  # determine study eligibility - can only have one processed matrix and be able to visualize clusters
  #
  # * *params*
  #   - +study+ (Study) => the study that owns the data
  #
  # * *returns*
  #   - (Boolean) => true if the study is eligible for dot plot visualization
  def self.study_eligible?(study)
    processed_matrices = study_processed_matrices(study)
    study.can_visualize_clusters? && study.has_expression_data? && processed_matrices.size == 1
  end

  # check if the given study/cluster has already been preprocessed
  #
  # * *params*
  #   - +study+ (Study) => the study that owns the data
  #   - +cluster_group+ (ClusterGroup) => the cluster to check for processed data
  #
  # * *returns*
  #   - (Boolean) => true if the study/cluster has already been processed
  def self.cluster_processed?(study, cluster_group)
    DotPlotGene.where(study:, cluster_group:).exists?
  end

  # get processed expression matrices for a study
  # a matrix counts as processed if it is a visualization AnnData file or is not a raw counts file
  #
  # * *params*
  #   - +study+ (Study) => the study to get matrices for
  #
  # * *returns*
  #   - (Array<StudyFile>) => an array of processed expression matrices for the study
  def self.study_processed_matrices(study)
    study.expression_matrices.select do |matrix|
      matrix.is_viz_anndata? || !matrix.is_raw_counts_file?
    end
  end

  # validate the study and cluster for dot plot preprocessing
  #
  # * *params*
  #   - +study+ (Study) => the study to validate
  #   - +cluster_group+ (ClusterGroup) => the cluster to check for existing processed data
  #
  # * *raises*
  #   - (ArgumentError) => if the study is invalid, does not qualify for dot plot visualization,
  #     or the study/cluster pair has already been processed
  def self.validate_study(study, cluster_group)
    raise ArgumentError, 'Invalid study' unless study.present? && study.is_a?(Study)
    raise ArgumentError, 'Study does not qualify for dot plot visualization' unless study_eligible?(study)
    raise ArgumentError, 'Study has already been processed' if cluster_processed?(study, cluster_group)
  end

  # validate required data is present for processing
  #
  # * *params*
  #   - +expression_file+ (StudyFile) => the expression matrix file to process
  #   - +metadata_file+ (StudyFile) => the metadata file to source annotations
  #
  # * *raises*
  #   - (ArgumentError) => if a file is missing, the source data is not fully parsed,
  #     or the MTX bundle is not complete
  def self.validate_source_data(expression_file, metadata_file)
    raise ArgumentError, 'Missing required files' unless expression_file.present? && metadata_file.present?
    raise ArgumentError, 'Source data not fully parsed' unless expression_file.parsed? && metadata_file.parsed?

    bundle_incomplete = expression_file.should_bundle? && !expression_file.has_completed_bundle?
    raise ArgumentError, 'MTX bundled not completed' if bundle_incomplete
  end
end
0 commit comments