Merged

Commits (19)
3cd7ae8
Extending CSFV ontology validation to classic metadata files
bistline Apr 15, 2025
ae887a6
adding tests
bistline Apr 15, 2025
1e4f7e3
Bump vite from 4.5.10 to 4.5.13
dependabot[bot] Apr 15, 2025
d0abca5
addressing test regressions
bistline Apr 15, 2025
cee65c9
fixing issue with column names for AnnData validation
bistline Apr 15, 2025
e9c1f00
final remaining test regressions
bistline Apr 15, 2025
ae65080
Handle optional metadata, consolidating & removing debugs
bistline Apr 16, 2025
64dd749
Deal with timeout issue re: validation taking too long
bistline Apr 16, 2025
078bf2b
Simplifying knownErrors usage, fixing declaration timeout error
bistline Apr 16, 2025
92c8ce9
Fix error label
bistline Apr 16, 2025
b792ae9
still working on timeout error, making metadata file smaller
bistline Apr 16, 2025
894d411
Merge remote-tracking branch 'origin/jb-csfv-consolidation' into jb-c…
bistline Apr 16, 2025
9381539
Merge pull request #2242 from broadinstitute/jb-csfv-consolidation
bistline Apr 17, 2025
07ce597
Report ingestSummary even if AnnData extraction fails
bistline Apr 17, 2025
a464021
Adding tests
bistline Apr 17, 2025
171403a
Merge pull request #2240 from broadinstitute/dependabot/npm_and_yarn/…
eweitz Apr 17, 2025
81f177c
Bump nokogiri from 1.18.4 to 1.18.8
dependabot[bot] Apr 22, 2025
f216444
Merge pull request #2244 from broadinstitute/dependabot/bundler/nokog…
eweitz Apr 22, 2025
d74fe21
Merge pull request #2243 from broadinstitute/jb-anndata-summary-fail
bistline Apr 22, 2025
6 changes: 3 additions & 3 deletions Gemfile.lock
@@ -339,11 +339,11 @@ GEM
net-protocol
netrc (0.11.0)
nio4r (2.7.4)
nokogiri (1.18.4-arm64-darwin)
nokogiri (1.18.8-arm64-darwin)
racc (~> 1.4)
nokogiri (1.18.4-x86_64-darwin)
nokogiri (1.18.8-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.18.4-x86_64-linux-gnu)
nokogiri (1.18.8-x86_64-linux-gnu)
racc (~> 1.4)
oauth2 (1.4.7)
faraday (>= 0.8, < 2.0)
46 changes: 26 additions & 20 deletions app/javascript/lib/validation/ontology-validation.js
@@ -12,9 +12,7 @@

import { decompressSync, strFromU8 } from 'fflate'

import {
metadataSchema, REQUIRED_CONVENTION_COLUMNS
} from './shared-validation'
import { metadataSchema } from './shared-validation'

// TODO: Replace "development" with "main" after next ingest release
const ONTOLOGY_BASE_URL =
@@ -136,24 +134,32 @@ export async function fetchOntologies() {
return ontologies
}

/** Get lowercase shortnames for all required ontologies */
function getOntologyShortNames() {
let requiredOntologies = []

// Validate IDs for species, organ, disease, and library preparation protocol
for (let i = 0; i < REQUIRED_CONVENTION_COLUMNS.length; i++) {
const column = REQUIRED_CONVENTION_COLUMNS[i]
if (!column.endsWith('__ontology_label')) {continue}
const key = column.split('__ontology_label')[0]
const ontologies = getAcceptedOntologies(key, metadataSchema)
requiredOntologies = requiredOntologies.concat(ontologies)
}
/** Get lowercase shortnames for all supported ontologies */
export function getOntologyShortNames() {
let supportedOntologies = []

requiredOntologies = Array.from(
new Set(requiredOntologies.map(o => o.toLowerCase()))
)
// get all ontology-based properties, ignoring organ_region as it isn't minified
const properties = getOntologyBasedProps()
for (let i = 0; i < properties.length; i++) {
const prop = properties[i]
const ontologies = getAcceptedOntologies(prop, metadataSchema)
supportedOntologies = supportedOntologies.concat(ontologies)
}
return Array.from(new Set(supportedOntologies.map(o => o.toLowerCase())))
}

return requiredOntologies
/** get all metadata properties that are ontology-based */
export function getOntologyBasedProps() {
const ontologyProps = []
// ignore organ_region as it isn't a supported minified ontology
const properties = Object.keys(metadataSchema.properties).filter(p => p !== 'organ_region')
for (let i = 0; i < properties.length; i++) {
const prop = properties[i]
if (metadataSchema.properties[prop].ontology) {
ontologyProps.push(prop)
}
}
return ontologyProps
}

/**
@@ -168,7 +174,7 @@ export function getAcceptedOntologies(key, metadataSchema) {
const acceptedOntologies =
olsUrls?.split(',').map(url => url.split('/').slice(-1)[0].toUpperCase())

if (acceptedOntologies.includes('NCBITAXON')) {
if (acceptedOntologies && acceptedOntologies.includes('NCBITAXON')) {
acceptedOntologies.push('NCBITaxon')
}

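Together, these changes widen shortname collection from the four required ontology columns to every ontology-backed property in the convention schema, skipping organ_region since its ontology is not shipped in minified form. A minimal sketch of the resulting flow, assuming (as getAcceptedOntologies implies) that each ontology-backed property stores comma-separated OLS URLs under an `ontology` key; the schema fixture is illustrative, not the real convention file:

// Illustrative fixture; the real metadataSchema comes from the SCP metadata convention
const schema = {
  properties: {
    disease: { ontology: 'https://www.ebi.ac.uk/ols/api/ontologies/mondo,https://www.ebi.ac.uk/ols/api/ontologies/pato' },
    species: { ontology: 'https://www.ebi.ac.uk/ols/api/ontologies/ncbitaxon' },
    organ_region: { ontology: 'https://www.ebi.ac.uk/ols/api/ontologies/mba' }, // skipped: not minified
    sex: {} // no `ontology` key, so not ontology-based
  }
}

/** Sketch of what getOntologyShortNames computes over the fixture above */
function shortNamesFromSchema(schema) {
  const names = new Set()
  for (const [prop, def] of Object.entries(schema.properties)) {
    if (prop === 'organ_region' || !def.ontology) { continue }
    for (const url of def.ontology.split(',')) {
      names.add(url.split('/').slice(-1)[0].toLowerCase())
    }
  }
  return Array.from(names)
}

shortNamesFromSchema(schema) // => ['mondo', 'pato', 'ncbitaxon']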
10 changes: 10 additions & 0 deletions app/javascript/lib/validation/shared-validation.js
@@ -313,3 +313,13 @@ export function timeOutCSFV(chunker) {
}
return issues
}

/** get ontology shortname from an identifier */
export function getOntologyShortNameLc(identifier) {
return identifier.split(/[_:]/)[0].toLowerCase()
}

/** get the label column suffix for a given ontology identifier */
export function getLabelSuffixForOntology(identifier) {
const shortName = getOntologyShortNameLc(identifier)
return shortName === 'uo' ? '_label' : '__ontology_label'
}
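
These helpers centralize identifier parsing that validate-anndata.js and validate-file-content.js previously each did inline. Expected behavior, with illustrative identifiers; the UO case reflects that unit-ontology columns appear to use the shorter `_label` suffix rather than `__ontology_label`:

getOntologyShortNameLc('NCBITaxon_9606')   // => 'ncbitaxon'
getOntologyShortNameLc('MONDO:0005109')    // => 'mondo' (handles ':' and '_' delimiters)
getLabelSuffixForOntology('MONDO_0005109') // => '__ontology_label'
getLabelSuffixForOntology('UO_0000036')    // => '_label'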
52 changes: 29 additions & 23 deletions app/javascript/lib/validation/validate-anndata.js
@@ -3,10 +3,12 @@ import { openH5File } from 'hdf5-indexed-reader'
import { getOAuthToken } from '~/lib/scp-api'
import {
validateUnique, validateRequiredMetadataColumns,
validateAlphanumericAndUnderscores,
validateAlphanumericAndUnderscores, getOntologyShortNameLc,
metadataSchema, REQUIRED_CONVENTION_COLUMNS
} from './shared-validation'
import { getAcceptedOntologies, fetchOntologies } from './ontology-validation'
import { fetchOntologies, getOntologyBasedProps, getAcceptedOntologies } from './ontology-validation'

const ONTOLOGY_PROPS = getOntologyBasedProps()

/** Get ontology ID values for key in AnnData file */
async function getOntologyIds(key, hdf5File) {
@@ -26,7 +28,7 @@ async function getOntologyIds(key, hdf5File) {
if (internalCategories) {
resolvedCategories = await Promise.all(internalCategories.values)
}
const group = resolvedCategories.find(o => o.name.endsWith(key))
const group = resolvedCategories.find(o => findMatchingGroup(o, key))
if (group) {
let categories
if (internalCategories) {
@@ -40,6 +42,11 @@
return ontologyIds
}

/** find a group in /obs based on exact name match */
export function findMatchingGroup(category, key) {
return category.name.split('/').slice(-1)[0] === key
}

/** Get annotation headers for a key (e.g. obs) from an HDF5 file */
async function getAnnotationHeaders(key, hdf5File) {
const obsGroup = await hdf5File.get(key)
@@ -122,7 +129,7 @@ export function checkOntologyIdFormat(key, ontologyIds) {
}

/** Validate author's annotation labels and IDs match those in ontologies */
async function checkOntologyLabelsAndIds(key, ontologies, groups) {
export async function checkOntologyLabelsAndIds(key, ontologies, groups) {
const [ids, idIndexes, labels, labelIndexes] = groups

const issues = []
@@ -138,15 +145,14 @@

rawUniques.map(r => {
let [id, label] = r.split(' || ')
const ontologyShortNameLc = id.split(/[_:]/)[0].toLowerCase()
const ontologyShortNameLc = getOntologyShortNameLc(id)
const ontology = ontologies[ontologyShortNameLc]

if (id.includes(':')) {
// Convert colon to underscore for ontology lookup
const idParts = id.split(':')
id = `${idParts[0]}_${idParts[1]}`
}

if (!(id in ontology)) {
// Register invalid ontology ID
const msg = `Invalid ontology ID: ${id}`
@@ -174,9 +180,10 @@ }
}

/** Get ontology ID values for key in AnnData file */
async function getOntologyIdsAndLabels(requiredName, hdf5File) {
export async function getOntologyIdsAndLabels(columnName, hdf5File) {
const obs = await hdf5File.get('obs')
const obsValues = await Promise.all(obs.values)
const isRequired = REQUIRED_CONVENTION_COLUMNS.includes(columnName)

// Old versions of the AnnData spec used __categories as an obs.
// However, in new versions (since before 2023-01-23) of AnnData spec,
@@ -192,11 +199,14 @@
return null
}

const idKey = requiredName
const labelKey = `${requiredName}__ontology_label`
const idKey = columnName
const labelKey = `${columnName}__ontology_label`

const idGroup = obsValues.find(o => findMatchingGroup(o, idKey))
const labelGroup = obsValues.find(o => findMatchingGroup(o, labelKey))

const idGroup = obsValues.find(o => o.name.endsWith(idKey))
const labelGroup = obsValues.find(o => o.name.endsWith(labelKey))
// exit when optional metadata isn't found, like cell_type
if (!idGroup && !isRequired) { return }

// AnnData organizes each "obs" annotation (e.g. disease__ontology_label,
// sex) into a container with a `categories` frame and a `code` frame.
@@ -231,15 +241,13 @@ async function validateOntologyLabelsAndIds(hdf5File) {
const ontologies = await fetchOntologies()

// Validate IDs and labels for all ontology-based properties
for (let i = 0; i < REQUIRED_CONVENTION_COLUMNS.length; i++) {
const column = REQUIRED_CONVENTION_COLUMNS[i]
if (!column.endsWith('__ontology_label')) {continue}
const key = column.split('__ontology_label')[0]
const groups = await getOntologyIdsAndLabels(key, hdf5File)
for (let i = 0; i < ONTOLOGY_PROPS.length; i++) {
const column = ONTOLOGY_PROPS[i]
const groups = await getOntologyIdsAndLabels(column, hdf5File)

if (groups) {
issues = issues.concat(
await checkOntologyLabelsAndIds(key, ontologies, groups)
await checkOntologyLabelsAndIds(column, ontologies, groups)
)
}
}
@@ -253,14 +261,12 @@ async function validateOntologyIdFormat(hdf5File) {
let issues = []

// Validate ID formats for all ontology-based properties
for (let i = 0; i < REQUIRED_CONVENTION_COLUMNS.length; i++) {
const column = REQUIRED_CONVENTION_COLUMNS[i]
if (!column.endsWith('__ontology_label')) {continue}
const key = column.split('__ontology_label')[0]
const ontologyIds = await getOntologyIds(key, hdf5File)
for (let i = 0; i < ONTOLOGY_PROPS.length; i++) {
const column = ONTOLOGY_PROPS[i]
const ontologyIds = await getOntologyIds(column, hdf5File)

issues = issues.concat(
checkOntologyIdFormat(key, ontologyIds)
checkOntologyIdFormat(column, ontologyIds)
)
}

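Switching from `name.endsWith(key)` to an exact match on the final path segment keeps one obs column from shadowing another whose name it happens to end with. A sketch of the failure mode the old lookup allowed (the group names here are hypothetical):

const groups = [{ name: '/obs/organ' }, { name: '/obs/donor_organ' }]

// Old lookup: 'donor_organ'.endsWith('organ') is also true, so the result is ambiguous
groups.filter(o => o.name.endsWith('organ')).length // => 2

// New lookup: only the exact final path segment matches
groups.find(o => findMatchingGroup(o, 'organ')) // => { name: '/obs/organ' }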
84 changes: 82 additions & 2 deletions app/javascript/lib/validation/validate-file-content.js
@@ -22,7 +22,8 @@ import {
} from './shared-validation'
import { parseDifferentialExpressionFile } from './validate-differential-expression'
import { parseAnnDataFile } from './validate-anndata'

import { fetchOntologies, getOntologyBasedProps } from '~/lib/validation/ontology-validation'
import { getOntologyShortNameLc, getLabelSuffixForOntology } from './shared-validation'

/**
* Gzip decompression requires reading the whole file, given the current
Expand All @@ -36,7 +37,7 @@ const MAX_GZIP_FILESIZE = 50 * oneMiB

/** File extensions / suffixes that indicate content must be gzipped */
const EXTENSIONS_MUST_GZIP = ['gz', 'bam', 'tbi', 'csi']

const ONTOLOGY_PROPS = getOntologyBasedProps()

/**
* Helper function to verify first pair of headers is NAME or TYPE
@@ -217,7 +218,11 @@ export async function parseMetadataFile(chunker, mimeType, fileOptions) {
const { headers, delimiter } = await getParsedHeaderLines(chunker, mimeType)
let issues = validateCapFormat(headers)
issues = issues.concat(validateNoMetadataCoordinates(headers))
let ontologies
// track ontology-based errors already seen, to avoid reporting duplicates
const knownErrors = []
if (fileOptions.use_metadata_convention) {
ontologies = await fetchOntologies()
issues = issues.concat(validateRequiredMetadataColumns(headers))
}

@@ -232,12 +237,87 @@
issues = issues.concat(validateUniqueCellNamesWithinFile(line, isLastLine, dataObj))
issues = issues.concat(validateMetadataLabelMatches(headers, line, isLastLine, dataObj))
issues = issues.concat(validateGroupColumnCounts(headers, line, isLastLine, dataObj))
if (fileOptions.use_metadata_convention) {
issues = issues.concat(validateConventionTerms(headers, line, ontologies, knownErrors))
}
// add other line-by-line validations here
}
})
return { issues, delimiter, numColumns: headers[0].length }
}

/** validate all ontology-based convention terms in a given line */
export function validateConventionTerms(headers, line, ontologies, knownErrors) {
let issues = []
const metadataHeaders = headers[0]
for (let i = 0; i < metadataHeaders.length; i++) {
const header = metadataHeaders[i]
if (ONTOLOGY_PROPS.includes(header)) {
const ontologyId = line[i]
const labelHeader = `${header}${getLabelSuffixForOntology(ontologyId)}`
const labelIdx = metadataHeaders.indexOf(labelHeader)
const label = line[labelIdx]
issues = issues.concat(validateOntologyTerm(header, ontologyId, label, ontologies, knownErrors))
}
}
return issues
}

/** validate a single ontology ID against stored ontologies and return issues */
export function validateOntologyTerm(prop, ontologyId, label, ontologies, knownErrors) {
const issues = []
const ontologyShortNameLc = getOntologyShortNameLc(ontologyId)
const ontology = ontologies[ontologyShortNameLc]

if (ontologyId.includes(':')) {
// Convert colon to underscore for ontology lookup
const idParts = ontologyId.split(':')
ontologyId = `${idParts[0]}_${idParts[1]}`
}

let errorIdentifier
let issue

if (!ontology) {
errorIdentifier = `${ontologyId}-label-lookup-error`
const accepted = Object.keys(ontologies).join(', ')
const msg =
`Ontology ID "${ontologyId}" ` +
`is not among accepted ontologies (${accepted}) ` +
`for key "${prop}"`

issue = ['error', 'ontology:label-lookup-error', msg]
} else if (!(ontologyId in ontology)) {
// Register invalid ontology ID
const msg = `Invalid ontology ID: ${ontologyId}`
errorIdentifier = `${ontologyId}-invalid-id`
issue = [
'error', 'ontology:label-lookup-error', msg,
{ subtype: 'ontology:invalid-id' }
]
} else {
const validLabels = ontology[ontologyId]

if (!(validLabels.includes(label))) {
errorIdentifier = `${ontologyId}-label-lookup-error`
// Register invalid ontology label
const prettyLabels = validLabels.join(', ')
const validLabelsClause = `Valid labels for ${ontologyId}: ${prettyLabels}`
const msg = `Invalid ${prop} label "${label}". ${validLabelsClause}`
issue = [
'error', 'ontology:label-not-match-id', msg,
{ subtype: 'ontology:invalid-label' }
]
}
}
// only store unique instances of errors since we're validating line by line
if (issue && knownErrors.indexOf(errorIdentifier) < 0) {
issues.push(issue)
knownErrors.push(errorIdentifier)
}
return issues
}

/** parse a cluster file, and return an array of issues, along with file parsing info */
export async function parseClusterFile(chunker, mimeType) {
const { headers, delimiter } = await getParsedHeaderLines(chunker, mimeType)
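Because validateOntologyTerm runs once per data line, the knownErrors array is what keeps a file with thousands of cells from reporting the same bad term thousands of times. A sketch of the dedup behavior, assuming the ontologies map shape that fetchOntologies returns (lowercase shortname mapped to valid labels per ID):

const ontologies = { mondo: { MONDO_0005109: ['COVID-19'] } } // assumed shape
const knownErrors = []

// The first row with a bad label registers one issue...
let issues = validateOntologyTerm('disease', 'MONDO:0005109', 'flu', ontologies, knownErrors)
// issues.length === 1; knownErrors now holds 'MONDO_0005109-label-lookup-error'

// ...and every later row repeating the same bad term is skipped
issues = validateOntologyTerm('disease', 'MONDO:0005109', 'flu', ontologies, knownErrors)
// issues.length === 0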
16 changes: 13 additions & 3 deletions app/models/ingest_job.rb
@@ -1044,7 +1044,7 @@ def log_to_mixpanel
mixpanel_log_props = get_job_analytics
# log job properties to Mixpanel
MetricsService.log(mixpanel_event_name, mixpanel_log_props, user)
report_anndata_summary if study_file.is_viz_anndata? && !%i[ingest_anndata differential_expression].include?(action)
report_anndata_summary if study_file.is_viz_anndata?
end

# set a mixpanel event name based on action
@@ -1087,7 +1087,7 @@ def anndata_summary_props
commands = client.get_job_command_line(job:)
commands.detect { |c| c == '--extract' } || client.job_error(job.name).present?
end.count
num_files_extracted += 1 if extracted_raw_counts?(initial_extract)
num_files_extracted += 1 if extracted_raw_counts?(initial_extract) && job_status == 'success'
# event properties for Mixpanel summary event
{
perfTime: job_perftime,
@@ -1115,10 +1115,20 @@ def extracted_raw_counts?(job)
extract_params.include?('raw_counts')
end

# determine if this job should skip sending an ingestSummary event
# returns true if a summary already exists, this is a DE job, a retry is pending,
# or this is a successful AnnData extract (meaning downstream jobs are still running)
def skip_anndata_summary?
study_file.has_anndata_summary? ||
action == :differential_expression ||
should_retry? ||
(!failed? && action == :ingest_anndata)
end

# report a summary of all AnnData extraction for this file to Mixpanel, if this is the last job
def report_anndata_summary
study_file.reload
return false if study_file.has_anndata_summary? # don't bother checking if summary is already sent
return false if skip_anndata_summary?

file_identifier = "#{study_file.upload_file_name} (#{study_file.id})"
Rails.logger.info "Checking AnnData summary for #{file_identifier} after #{action}"
2 changes: 1 addition & 1 deletion package.json
@@ -105,7 +105,7 @@
"prop-types": "^15.7.2",
"react-select-event": "^5.3.0",
"stylelint-prettier": "^1.1.2",
"vite": "^4.5.10",
"vite": "^4.5.13",
"vite-plugin-ruby": "^3.0.5"
}
}