Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 1 addition & 18 deletions app/lib/differential_expression_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def self.run_differential_expression_job(cluster_group, study, user, annotation_
de_type: 'rest', group1: nil, group2: nil, machine_type: nil, dry_run: nil)
validate_study(study)
validate_annotation(cluster_group, study, annotation_name, annotation_scope, group1:, group2:)
cluster_url = cluster_file_url(cluster_group)
cluster_url = RequestUtils.cluster_file_url(cluster_group)
study_file = cluster_group.study_file
metadata_url = study_file.is_viz_anndata? ?
RequestUtils.data_fragment_url(study_file, 'metadata') :
Expand Down Expand Up @@ -480,23 +480,6 @@ def self.encode_filename(values)
values.map { |val| val.gsub(/\+/, 'pos').gsub(/\W/, '_') }.join('--')
end

# return a GS URL for a requested ClusterGroup, depending on file type
#
# * *params*
#   - +cluster_group+ (ClusterGroup) => Clustering object to source name/file from
#
# * *returns*
#   - (String) => result of RequestUtils.data_fragment_url for the 'cluster' fragment when the
#     source file reports +is_viz_anndata?+, otherwise the study file's own +gs_url+
def self.cluster_file_url(cluster_group)
  study_file = cluster_group.study_file
  return study_file.gs_url unless study_file.is_viz_anndata?

  data_frag = study_file.ann_data_file_info.find_fragment(data_type: :cluster, name: cluster_group.name)
  # fragment hashes may be keyed by symbols or by strings, so look the value up under both forms
  obsm_key_name = data_frag[:obsm_key_name] || data_frag['obsm_key_name']
  RequestUtils.data_fragment_url(study_file, 'cluster', file_type_detail: obsm_key_name)
end

# retrieve the weekly user quota value
#
# * *returns*
Expand Down
160 changes: 94 additions & 66 deletions app/lib/dot_plot_service.rb
Original file line number Diff line number Diff line change
@@ -1,107 +1,135 @@
# frozen_string_literal: true

# service that handles preprocessing expression/annotation data to speed up dot plot rendering
class DotPlotService
# main handler for launching ingest job to process expression data
# main handler for launching ingest job to process expression data into DotPlotGene objects
# since the study can have only one processed matrix/metadata file, this will only run if the study is eligible
#
# * *params*
# - +study+ (Study) => the study that owns the data
# - +cluster_group+ (ClusterGroup) => the cluster to source cell names from
# - +annotation_file+ (StudyFile) => the StudyFile containing annotation data
# - +expression_file+ (StudyFile) => the StudyFile to source data from
# - +cluster_group+ (ClusterGroup) => the cluster to set associations for
# - +user+ (User) => the user that will run the job
#
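# * *returns*
# - (Boolean) => true once the ingest job launch has been scheduled via +delay+
#
# * *raises*
# - (ArgumentError) => if the study, source files, or job parameters fail validation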
def self.run_preprocess_expression_job(study, cluster_group, annotation_file, expression_file)
study_eligible?(study) # method stub, waiting for scp-ingest-pipeline implementation
def self.run_process_dot_plot_genes(study, cluster_group, user)
  # both validators raise ArgumentError, so nothing below runs for ineligible studies or unready files
  validate_study(study, cluster_group)
  expression_file = study_processed_matrices(study)&.first
  metadata_file = study.metadata_file
  validate_source_data(expression_file, metadata_file)

  ingest_params = create_params_object(cluster_group, expression_file, metadata_file)
  unless ingest_params.valid?
    raise ArgumentError, "job parameters failed to validate: #{ingest_params.errors.full_messages.join(', ')}"
  end

  # the launch is invoked through +delay+ rather than directly; the method itself only reports success
  IngestJob.new(
    study:, study_file: expression_file, user:, action: :ingest_dot_plot_genes, params_object: ingest_params
  ).delay.push_remote_and_launch_ingest
  true
end

# create DotPlotGeneIngestParameters object based on the provided files
#
# * *params*
#   - +cluster_group+ (ClusterGroup) => the cluster group to associate with
#   - +expression_file+ (StudyFile) => the expression matrix file to process
#   - +metadata_file+ (StudyFile) => the metadata file to source annotations
#
# * *returns*
#   - (DotPlotGeneIngestParameters) => a parameters object with the necessary file paths and metadata;
#     it is not validated here, and a +file_type+ other than the three handled below leaves the
#     matrix-related values unset
def self.create_params_object(cluster_group, expression_file, metadata_file)
  params = {
    cluster_group_id: cluster_group.id,
    cluster_file: RequestUtils.cluster_file_url(cluster_group)
  }
  case expression_file.file_type
  when 'Expression Matrix'
    params[:matrix_file_type] = 'dense'
    params[:matrix_file_path] = expression_file.gs_url
    params[:cell_metadata_file] = metadata_file.gs_url
  when 'MM Coordinate Matrix'
    bundled_files = expression_file.bundled_files
    genes_file = bundled_files.detect { |f| f.file_type == '10X Genes File' }
    barcodes_file = bundled_files.detect { |f| f.file_type == '10X Barcodes File' }
    params[:matrix_file_type] = 'mtx'
    params[:matrix_file_path] = expression_file.gs_url
    params[:cell_metadata_file] = metadata_file.gs_url
    # safe navigation: a missing bundled file leaves the value nil, so the parameters object can flag it
    # through its own presence validation instead of this method failing with NoMethodError
    params[:gene_file] = genes_file&.gs_url
    params[:barcode_file] = barcodes_file&.gs_url
  when 'AnnData'
    params[:matrix_file_type] = 'mtx' # extracted expression for AnnData is in MTX format
    params[:cell_metadata_file] = RequestUtils.data_fragment_url(metadata_file, 'metadata')
    params[:matrix_file_path] = RequestUtils.data_fragment_url(
      expression_file, 'matrix', file_type_detail: 'processed'
    )
    params[:gene_file] = RequestUtils.data_fragment_url(
      expression_file, 'features', file_type_detail: 'processed'
    )
    params[:barcode_file] = RequestUtils.data_fragment_url(
      expression_file, 'barcodes', file_type_detail: 'processed'
    )
  end
  DotPlotGeneIngestParameters.new(**params)
end

# determine study eligibility - can only have one processed matrix and be able to visualize clusters
#
# * *params*
# - +study+ (Study) the study that owns the data
# - +study+ (Study) => the study that owns the data
# * *returns*
# - (Boolean) true if the study is eligible for dot plot visualization
# - (Boolean) => true if the study is eligible for dot plot visualization
def self.study_eligible?(study)
  # eligible only when clusters can be visualized, expression data exists, and exactly one
  # processed matrix is available
  processed_count = study_processed_matrices(study).size
  study.can_visualize_clusters? && study.has_expression_data? && processed_count == 1
end

# check if the given study/cluster has already been preprocessed
# * *params*
# - +study+ (Study) the study that owns the data
# - +cluster_group+ (ClusterGroup) the cluster to check for processed data
# - +study+ (Study) => the study that owns the data
# - +cluster_group+ (ClusterGroup) => the cluster to check for processed data
#
# * *returns*
# - (Boolean) true if the study/cluster has already been processed
# - (Boolean) => true if the study/cluster has already been processed
def self.cluster_processed?(study, cluster_group)
  # processed means at least one DotPlotGene exists for this study and cluster combination
  DotPlotGene.where(study: study, cluster_group: cluster_group).exists?
end

# get processed expression matrices for a study
#
# * *params*
# - +study+ (Study) the study to get matrices for
# - +study+ (Study) => the study to get matrices for
#
# * *returns*
# - (Array<StudyFile>) an array of processed expression matrices for the study
# - (Array<StudyFile>) => an array of processed expression matrices for the study
def self.study_processed_matrices(study)
  # discard raw counts matrices, except files that report +is_viz_anndata?+, which are always kept
  study.expression_matrices.reject do |expression_file|
    !expression_file.is_viz_anndata? && expression_file.is_raw_counts_file?
  end
end

# seeding method for testing purposes, will be removed once pipeline is in place
# data is random and not representative of actual expression data
def self.seed_dot_plot_genes(study)
return false unless study_eligible?(study)

DotPlotGene.where(study_id: study.id).delete_all
puts "Seeding dot plot genes for #{study.accession}"
expression_matrix = study.expression_matrices.first
print 'assembling genes and annotations...'
genes = Gene.where(study:, study_file: expression_matrix).pluck(:name)
annotations = AnnotationVizService.available_metadata_annotations(
study, annotation_type: 'group'
).reject { |a| a[:scope] == 'invalid' }
puts " done. Found #{genes.size} genes and #{annotations.size} study-wide annotations."
study.cluster_groups.each do |cluster_group|
next if cluster_processed?(study, cluster_group)
# confirm a study/cluster pair may be submitted for dot plot preprocessing
#
# * *params*
#   - +study+ (Study) => the study to validate
#   - +cluster_group+ (ClusterGroup) => the cluster checked for existing processed data
#
# * *raises*
#   - (ArgumentError) => if the study is missing or not a Study, does not qualify for dot plot
#     visualization, or the study/cluster has already been processed
def self.validate_study(study, cluster_group)
  # checks run in order and stop at the first failure, so later checks never see an invalid study
  failure =
    if !(study.present? && study.is_a?(Study))
      'Invalid study'
    elsif !study_eligible?(study)
      'Study does not qualify for dot plot visualization'
    elsif cluster_processed?(study, cluster_group)
      'Study has already been processed'
    end
  raise ArgumentError, failure if failure
end

cluster_annotations = ClusterVizService.available_annotations_by_cluster(
cluster_group, 'group'
).reject { |a| a[:scope] == 'invalid' }
all_annotations = annotations + cluster_annotations
puts "Processing #{cluster_group.name} with #{all_annotations.size} annotations."
documents = []
genes.each do |gene|
exp_scores = all_annotations.map do |annotation|
{
"#{annotation[:name]}--#{annotation[:type]}--#{annotation[:scope]}" => annotation[:values].map do |value|
{ value => [rand.round(3), rand.round(3)] }
end.reduce({}, :merge)
}
end.reduce({}, :merge)
documents << DotPlotGene.new(
study:, study_file: expression_matrix, cluster_group:, gene_symbol: gene, searchable_gene: gene.downcase,
exp_scores:
).attributes
if documents.size == 1000
DotPlotGene.collection.insert_many(documents)
count = DotPlotGene.where(study_id: study.id, cluster_group_id: cluster_group.id).size
puts "Inserted #{count}/#{genes.size} DotPlotGenes for #{cluster_group.name}."
documents.clear
end
end
DotPlotGene.collection.insert_many(documents)
count = DotPlotGene.where(study_id: study.id, cluster_group_id: cluster_group.id).size
puts "Inserted #{count}/#{genes.size} DotPlotGenes for #{cluster_group.name}."
puts "Finished processing #{cluster_group.name}"
end
puts "Seeding complete for #{study.accession}, #{DotPlotGene.where(study_id: study.id).size} DotPlotGenes created."
true
rescue StandardError => e
puts "Error seeding DotPlotGenes in #{study.accession}: #{e.message}"
false
# confirm the source files needed for processing are present and ready
#
# * *params*
#   - +expression_file+ (StudyFile) => the expression matrix file to process
#   - +metadata_file+ (StudyFile) => the metadata file to source annotations
#
# * *raises*
#   - (ArgumentError) => if either file is missing, either file is not parsed, or the expression file
#     should be bundled but its bundle is not completed
def self.validate_source_data(expression_file, metadata_file)
  # presence is checked first so the later predicate calls are never sent to a missing file
  problem =
    if !(expression_file.present? && metadata_file.present?)
      'Missing required files'
    elsif !(expression_file.parsed? && metadata_file.parsed?)
      'Source data not fully parsed'
    elsif expression_file.should_bundle? && !expression_file.has_completed_bundle?
      'MTX bundled not completed'
    end
  raise ArgumentError, problem if problem
end
end
18 changes: 18 additions & 0 deletions app/lib/request_utils.rb
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,24 @@ def self.data_fragment_url(ann_data_file, fragment_type, gs_url: true, file_type
"#{url}.#{ext}.gz"
end

# resolve the GS URL of the file backing a ClusterGroup, depending on file type
#
# * *params*
#   - +cluster_group+ (ClusterGroup) => Clustering object to source name/file from
#
# * *returns*
#   - (String) => result of +data_fragment_url+ for the 'cluster' fragment when the source file
#     reports +is_viz_anndata?+, otherwise the study file's own +gs_url+
def self.cluster_file_url(cluster_group)
  study_file = cluster_group.study_file
  return study_file.gs_url unless study_file.is_viz_anndata?

  fragment = study_file.ann_data_file_info.find_fragment(data_type: :cluster, name: cluster_group.name)
  # indifferent access lets the symbol lookup below succeed whichever key type the fragment uses
  obsm_key_name = fragment.with_indifferent_access[:obsm_key_name]
  data_fragment_url(study_file, 'cluster', file_type_detail: obsm_key_name)
end

# extracts an array of genes from a comma-delimited string list of gene names
def self.get_genes_from_param(study, gene_param)
terms = RequestUtils.sanitize_search_terms(gene_param).split(',')
Expand Down
9 changes: 8 additions & 1 deletion app/models/ann_data_file_info.rb
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,14 @@ def merge_form_fragments(form_data, fragments)
# also supports finding values as both strings and symbols (for data_type values)
def find_fragment(**attrs)
  # returns the first entry of data_fragments matching every requested attribute, or nil if none does
  data_fragments.detect do |fragment|
    # an attribute matches when the fragment holds the value, either as given or as converted by
    # transform_for, under the key either as given or as converted by transform_for
    attrs.all? do |key, value|
      alt_value = value.send(transform_for(value))
      alt_key = key.send(transform_for(key))
      [fragment[key], fragment[alt_key]].any? { |stored| stored == value || stored == alt_value }
    end
  end
end

Expand Down
3 changes: 2 additions & 1 deletion app/models/batch_api_client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ class BatchApiClient
ingest_differential_expression: ['Differential Expression'],
render_expression_arrays: %w[Cluster],
image_pipeline: %w[Cluster],
ingest_anndata: %w[AnnData]
ingest_anndata: %w[AnnData],
ingest_dot_plot_genes: ['Expression Matrix', 'MM Coordinate Matrix', 'AnnData']
}.freeze

# default GCE machine_type
Expand Down
52 changes: 52 additions & 0 deletions app/models/dot_plot_gene_ingest_parameters.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# class to hold parameters specific to ingest job for computing dot plot gene metrics
# attributes are validated through ActiveModel; additional behavior comes from Parameterizable,
# which is not visible here
class DotPlotGeneIngestParameters
include ActiveModel::Model
include Parameterizable

# NOTE(review): presumably the command line flag that selects this ingest action - confirm in Parameterizable
PARAMETER_NAME = '--ingest-dot-plot-genes'

# cell_metadata_file: metadata file to source annotations
# cluster_file: clustering file with cells to use as control list for filtering and optional annotations
# cluster_group_id: BSON ID of ClusterGroup object for associations
# matrix_file_path: expression matrix with source data
# matrix_file_type: type of expression matrix (dense, mtx)
# gene_file: genes/features file, required when matrix_file_type is 'mtx'
# barcode_file: barcodes file, required when matrix_file_type is 'mtx'
# machine_type (optional): override for default ingest machine type (defaults to 'n2d-highmem-8')
PARAM_DEFAULTS = {
cell_metadata_file: nil,
cluster_file: nil,
cluster_group_id: nil,
matrix_file_path: nil,
matrix_file_type: nil,
gene_file: nil,
barcode_file: nil,
machine_type: 'n2d-highmem-8'
}.freeze

# values that are available as methods but not as attributes (and not passed to command line)
NON_ATTRIBUTE_PARAMS = %i[machine_type].freeze

# one reader/writer pair per key in PARAM_DEFAULTS
attr_accessor(*PARAM_DEFAULTS.keys)

# core parameters must always be present
validates :cell_metadata_file, :cluster_file, :cluster_group_id, :matrix_file_path, :matrix_file_type, presence: true
# file locations must match Parameterizable::GS_URL_REGEXP
validates :cell_metadata_file, :cluster_file, :matrix_file_path,
format: { with: Parameterizable::GS_URL_REGEXP, message: 'is not a valid GS url' }
validates :matrix_file_type, inclusion: %w[dense mtx]
validates :machine_type, inclusion: Parameterizable::GCE_MACHINE_TYPES
# gene and barcode files are only checked (presence and GS url format) for 'mtx' matrices
validates :gene_file, :barcode_file,
presence: true,
format: {
with: Parameterizable::GS_URL_REGEXP,
message: 'is not a valid GS url'
},
if: -> { matrix_file_type == 'mtx' }

# delegates entirely to super; attribute assignment is handled by the included modules
#
# * *params*
# - +attributes+ (Hash) => parameter values keyed by name, may be nil
def initialize(attributes = nil)
super
end

# look up the associated cluster via ClusterGroup.find using cluster_group_id
#
# * *returns*
# - (ClusterGroup) => the result of ClusterGroup.find for the stored ID
def cluster_group
ClusterGroup.find(cluster_group_id)
end
end
Loading
Loading