|
| 1 | +# frozen_string_literal: true |
| 2 | + |
# service that handles preprocessing expression/annotation data to speed up dot plot rendering
class DotPlotService
  # number of DotPlotGene documents written per bulk insert while seeding
  SEED_BATCH_SIZE = 1000

  # main handler for launching ingest job to process expression data
  # NOTE: currently a stub that only reports study eligibility; no job is created or launched yet
  #
  # * *params*
  #   - +study+ (Study) => the study that owns the data
  #   - +cluster_group+ (ClusterGroup) => the cluster to source cell names from (unused by the stub)
  #   - +annotation_file+ (StudyFile) => the StudyFile containing annotation data (unused by the stub)
  #   - +expression_file+ (StudyFile) => the StudyFile to source data from (unused by the stub)
  #
  # * *returns*
  #   - (Boolean) result of study_eligible? for the given study
  def self.run_preprocess_expression_job(study, cluster_group, annotation_file, expression_file)
    study_eligible?(study) # method stub, waiting for scp-ingest-pipeline implementation
  end

  # determine study eligibility - can only have one processed matrix and be able to visualize clusters
  #
  # * *params*
  #   - +study+ (Study) the study that owns the data
  # * *returns*
  #   - (Boolean) true if the study is eligible for dot plot visualization
  def self.study_eligible?(study)
    # cheap boolean checks first so the matrix list is only built when it can matter
    study.can_visualize_clusters? && study.has_expression_data? && study_processed_matrices(study).size == 1
  end

  # check if the given study/cluster has already been preprocessed
  # * *params*
  #   - +study+ (Study) the study that owns the data
  #   - +cluster_group+ (ClusterGroup) the cluster to check for processed data
  #
  # * *returns*
  #   - (Boolean) true if the study/cluster has already been processed
  def self.cluster_processed?(study, cluster_group)
    DotPlotGene.where(study:, cluster_group:).exists?
  end

  # get processed expression matrices for a study
  # a matrix counts as processed if it is a visualization AnnData file or is not a raw counts file
  #
  # * *params*
  #   - +study+ (Study) the study to get matrices for
  #
  # * *returns*
  #   - (Array<StudyFile>) an array of processed expression matrices for the study
  def self.study_processed_matrices(study)
    study.expression_matrices.select do |matrix|
      matrix.is_viz_anndata? || !matrix.is_raw_counts_file?
    end
  end

  # seeding method for testing purposes, will be removed once pipeline is in place
  # data is random and not representative of actual expression data
  # existing DotPlotGene documents for the study are deleted before seeding
  #
  # * *params*
  #   - +study+ (Study) the study to seed DotPlotGene documents for
  #
  # * *returns*
  #   - (Boolean) true if seeding completed, false if the study is ineligible or an error was raised
  def self.seed_dot_plot_genes(study)
    return false unless study_eligible?(study)

    DotPlotGene.where(study_id: study.id).delete_all
    puts "Seeding dot plot genes for #{study.accession}"
    # eligibility guarantees exactly one processed matrix; use it rather than whichever expression
    # matrix happens to be listed first, since that one may be a raw counts file
    expression_matrix = study_processed_matrices(study).first
    print 'assembling genes and annotations...'
    genes = Gene.where(study:, study_file: expression_matrix).pluck(:name)
    annotations = valid_seed_annotations(
      AnnotationVizService.available_metadata_annotations(study, annotation_type: 'group')
    )
    puts " done. Found #{genes.size} genes and #{annotations.size} study-wide annotations."
    study.cluster_groups.each do |cluster_group|
      next if cluster_processed?(study, cluster_group)

      cluster_annotations = valid_seed_annotations(
        ClusterVizService.available_annotations_by_cluster(cluster_group, 'group')
      )
      all_annotations = annotations + cluster_annotations
      puts "Processing #{cluster_group.name} with #{all_annotations.size} annotations."
      # each_slice never yields an empty batch, so an empty bulk insert is never attempted
      genes.each_slice(SEED_BATCH_SIZE) do |gene_batch|
        documents = gene_batch.map do |gene|
          DotPlotGene.new(
            study:, study_file: expression_matrix, cluster_group:, gene_symbol: gene, searchable_gene: gene.downcase,
            exp_scores: random_seed_scores(all_annotations)
          ).attributes
        end
        DotPlotGene.collection.insert_many(documents)
        count = DotPlotGene.where(study_id: study.id, cluster_group_id: cluster_group.id).size
        puts "Inserted #{count}/#{genes.size} DotPlotGenes for #{cluster_group.name}."
      end
      puts "Finished processing #{cluster_group.name}"
    end
    puts "Seeding complete for #{study.accession}, #{DotPlotGene.where(study_id: study.id).size} DotPlotGenes created."
    true
  rescue StandardError => e
    puts "Error seeding DotPlotGenes in #{study.accession}: #{e.message}"
    false
  end

  # drop annotations whose scope is marked 'invalid'
  #
  # * *params*
  #   - +annotations+ (Array<Hash>) annotation hashes with a :scope key
  #
  # * *returns*
  #   - (Array<Hash>) annotations whose :scope is not 'invalid'
  def self.valid_seed_annotations(annotations)
    annotations.reject { |annotation| annotation[:scope] == 'invalid' }
  end

  # build random placeholder scores for one gene, keyed by "name--type--scope" and then by annotation value
  #
  # * *params*
  #   - +annotations+ (Array<Hash>) annotation hashes with :name, :type, :scope and :values keys
  #
  # * *returns*
  #   - (Hash) annotation identifier => { value => [Float, Float] }, each float rounded to 3 places
  def self.random_seed_scores(annotations)
    annotations.to_h do |annotation|
      identifier = "#{annotation[:name]}--#{annotation[:type]}--#{annotation[:scope]}"
      [identifier, annotation[:values].to_h { |value| [value, [rand.round(3), rand.round(3)]] }]
    end
  end

  private_class_method :valid_seed_annotations, :random_seed_scores
end
0 commit comments