Merged

Commits (19)
3cd7ae8
Extending CSFV ontology validation to classic metadata files
bistline Apr 15, 2025
ae887a6
adding tests
bistline Apr 15, 2025
1e4f7e3
Bump vite from 4.5.10 to 4.5.13
dependabot[bot] Apr 15, 2025
d0abca5
addressing test regressions
bistline Apr 15, 2025
cee65c9
fixing issue with column names for AnnData validation
bistline Apr 15, 2025
e9c1f00
final remaining test regressions
bistline Apr 15, 2025
ae65080
Handle optional metadata, consolidating & removing debugs
bistline Apr 16, 2025
64dd749
Deal with timeout issue re: validation taking too long
bistline Apr 16, 2025
078bf2b
Simplifying knownErrors usage, fixing declaration timeout error
bistline Apr 16, 2025
92c8ce9
Fix error label
bistline Apr 16, 2025
b792ae9
still working on timeout error, making metadata file smaller
bistline Apr 16, 2025
894d411
Merge remote-tracking branch 'origin/jb-csfv-consolidation' into jb-c…
bistline Apr 16, 2025
9381539
Merge pull request #2242 from broadinstitute/jb-csfv-consolidation
bistline Apr 17, 2025
07ce597
Report ingestSummary even if AnnData extraction fails
bistline Apr 17, 2025
a464021
Adding tests
bistline Apr 17, 2025
171403a
Merge pull request #2240 from broadinstitute/dependabot/npm_and_yarn/…
eweitz Apr 17, 2025
81f177c
Bump nokogiri from 1.18.4 to 1.18.8
dependabot[bot] Apr 22, 2025
f216444
Merge pull request #2244 from broadinstitute/dependabot/bundler/nokog…
eweitz Apr 22, 2025
d74fe21
Merge pull request #2243 from broadinstitute/jb-anndata-summary-fail
bistline Apr 22, 2025
6 changes: 3 additions & 3 deletions Gemfile.lock
@@ -339,11 +339,11 @@ GEM
net-protocol
netrc (0.11.0)
nio4r (2.7.4)
nokogiri (1.18.4-arm64-darwin)
nokogiri (1.18.8-arm64-darwin)
racc (~> 1.4)
nokogiri (1.18.4-x86_64-darwin)
nokogiri (1.18.8-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.18.4-x86_64-linux-gnu)
nokogiri (1.18.8-x86_64-linux-gnu)
racc (~> 1.4)
oauth2 (1.4.7)
faraday (>= 0.8, < 2.0)
46 changes: 26 additions & 20 deletions app/javascript/lib/validation/ontology-validation.js
@@ -12,9 +12,7 @@

import { decompressSync, strFromU8 } from 'fflate'

import {
metadataSchema, REQUIRED_CONVENTION_COLUMNS
} from './shared-validation'
import { metadataSchema } from './shared-validation'

// TODO: Replace "development" with "main" after next ingest release
const ONTOLOGY_BASE_URL =
@@ -136,24 +134,32 @@ export async function fetchOntologies() {
return ontologies
}

/** Get lowercase shortnames for all required ontologies */
function getOntologyShortNames() {
let requiredOntologies = []

// Validate IDs for species, organ, disease, and library preparation protocol
for (let i = 0; i < REQUIRED_CONVENTION_COLUMNS.length; i++) {
const column = REQUIRED_CONVENTION_COLUMNS[i]
if (!column.endsWith('__ontology_label')) {continue}
const key = column.split('__ontology_label')[0]
const ontologies = getAcceptedOntologies(key, metadataSchema)
requiredOntologies = requiredOntologies.concat(ontologies)
}
/** Get lowercase shortnames for all supported ontologies */
export function getOntologyShortNames() {
let supportedOntologies = []

requiredOntologies = Array.from(
new Set(requiredOntologies.map(o => o.toLowerCase()))
)
// get all ontology-based properties, ignoring organ_region as it isn't minified
const properties = getOntologyBasedProps()
for (let i = 0; i < properties.length; i++) {
const prop = properties[i]
const ontologies = getAcceptedOntologies(prop, metadataSchema)
supportedOntologies = supportedOntologies.concat(ontologies)
}
return Array.from(new Set(supportedOntologies.map(o => o.toLowerCase())))
}

return requiredOntologies
/** get all metadata properties that are ontology-based */
export function getOntologyBasedProps() {
const ontologyProps = []
// ignore organ_region as it isn't a supported minified ontology
const properties = Object.keys(metadataSchema.properties).filter(p => p !== 'organ_region')
for (let i = 0; i < properties.length; i++) {
const prop = properties[i]
if (metadataSchema.properties[prop].ontology) {
ontologyProps.push(prop)
}
}
return ontologyProps
}

/**
@@ -168,7 +174,7 @@ export function getAcceptedOntologies(key, metadataSchema) {
const acceptedOntologies =
olsUrls?.split(',').map(url => url.split('/').slice(-1)[0].toUpperCase())

if (acceptedOntologies.includes('NCBITAXON')) {
if (acceptedOntologies && acceptedOntologies.includes('NCBITAXON')) {
acceptedOntologies.push('NCBITaxon')
}

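Together, these changes widen shortname collection from the four required ontology columns to every ontology-backed property in the convention schema, skipping organ_region since its ontology is not shipped in minified form. A minimal sketch of the resulting flow, assuming (as getAcceptedOntologies implies) that each ontology-backed property stores comma-separated OLS URLs under an `ontology` key; the schema fixture is illustrative, not the real convention file:

// Illustrative fixture; the real metadataSchema comes from the SCP metadata convention
const schema = {
  properties: {
    disease: { ontology: 'https://www.ebi.ac.uk/ols/api/ontologies/mondo,https://www.ebi.ac.uk/ols/api/ontologies/pato' },
    species: { ontology: 'https://www.ebi.ac.uk/ols/api/ontologies/ncbitaxon' },
    organ_region: { ontology: 'https://www.ebi.ac.uk/ols/api/ontologies/mba' }, // skipped: not minified
    sex: {} // no `ontology` key, so not ontology-based
  }
}

/** Sketch of what getOntologyShortNames computes over the fixture above */
function shortNamesFromSchema(schema) {
  const names = new Set()
  for (const [prop, def] of Object.entries(schema.properties)) {
    if (prop === 'organ_region' || !def.ontology) { continue }
    for (const url of def.ontology.split(',')) {
      names.add(url.split('/').slice(-1)[0].toLowerCase())
    }
  }
  return Array.from(names)
}

shortNamesFromSchema(schema) // => ['mondo', 'pato', 'ncbitaxon']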
10 changes: 10 additions & 0 deletions app/javascript/lib/validation/shared-validation.js
@@ -313,3 +313,13 @@ export function timeOutCSFV(chunker) {
}
return issues
}

/** get ontology shortname from an identifier */
export function getOntologyShortNameLc(identifier) {
return identifier.split(/[_:]/)[0].toLowerCase()
}

/** get the label column suffix for a given ontology identifier */
export function getLabelSuffixForOntology(identifier) {
const shortName = getOntologyShortNameLc(identifier)
return shortName === 'uo' ? '_label' : '__ontology_label'
}
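
These helpers centralize identifier parsing that validate-anndata.js and validate-file-content.js previously each did inline. Expected behavior, with illustrative identifiers; the UO case reflects that unit-ontology columns appear to use the shorter `_label` suffix rather than `__ontology_label`:

getOntologyShortNameLc('NCBITaxon_9606')   // => 'ncbitaxon'
getOntologyShortNameLc('MONDO:0005109')    // => 'mondo' (handles ':' and '_' delimiters)
getLabelSuffixForOntology('MONDO_0005109') // => '__ontology_label'
getLabelSuffixForOntology('UO_0000036')    // => '_label'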
52 changes: 29 additions & 23 deletions app/javascript/lib/validation/validate-anndata.js
@@ -3,10 +3,12 @@ import { openH5File } from 'hdf5-indexed-reader'
import { getOAuthToken } from '~/lib/scp-api'
import {
validateUnique, validateRequiredMetadataColumns,
validateAlphanumericAndUnderscores,
validateAlphanumericAndUnderscores, getOntologyShortNameLc,
metadataSchema, REQUIRED_CONVENTION_COLUMNS
} from './shared-validation'
import { getAcceptedOntologies, fetchOntologies } from './ontology-validation'
import { fetchOntologies, getOntologyBasedProps, getAcceptedOntologies } from './ontology-validation'

const ONTOLOGY_PROPS = getOntologyBasedProps()

/** Get ontology ID values for key in AnnData file */
async function getOntologyIds(key, hdf5File) {
@@ -26,7 +28,7 @@ async function getOntologyIds(key, hdf5File) {
if (internalCategories) {
resolvedCategories = await Promise.all(internalCategories.values)
}
const group = resolvedCategories.find(o => o.name.endsWith(key))
const group = resolvedCategories.find(o => findMatchingGroup(o, key))
if (group) {
let categories
if (internalCategories) {
@@ -40,6 +42,11 @@
return ontologyIds
}

/** find a group in /obs based on exact name match */
export function findMatchingGroup(category, key) {
return category.name.split('/').slice(-1)[0] === key
}

/** Get annotation headers for a key (e.g. obs) from an HDF5 file */
async function getAnnotationHeaders(key, hdf5File) {
const obsGroup = await hdf5File.get(key)
@@ -122,7 +129,7 @@ export function checkOntologyIdFormat(key, ontologyIds) {
}

/** Validate author's annotation labels and IDs match those in ontologies */
async function checkOntologyLabelsAndIds(key, ontologies, groups) {
export async function checkOntologyLabelsAndIds(key, ontologies, groups) {
const [ids, idIndexes, labels, labelIndexes] = groups

const issues = []
@@ -138,15 +145,14 @@

rawUniques.map(r => {
let [id, label] = r.split(' || ')
const ontologyShortNameLc = id.split(/[_:]/)[0].toLowerCase()
const ontologyShortNameLc = getOntologyShortNameLc(id)
const ontology = ontologies[ontologyShortNameLc]

if (id.includes(':')) {
// Convert colon to underscore for ontology lookup
const idParts = id.split(':')
id = `${idParts[0]}_${idParts[1]}`
}

if (!(id in ontology)) {
// Register invalid ontology ID
const msg = `Invalid ontology ID: ${id}`
@@ -174,9 +180,10 @@ }
}

/** Get ontology ID values for key in AnnData file */
async function getOntologyIdsAndLabels(requiredName, hdf5File) {
export async function getOntologyIdsAndLabels(columnName, hdf5File) {
const obs = await hdf5File.get('obs')
const obsValues = await Promise.all(obs.values)
const isRequired = REQUIRED_CONVENTION_COLUMNS.includes(columnName)

// Old versions of the AnnData spec used __categories as an obs.
// However, in new versions (since before 2023-01-23) of AnnData spec,
@@ -192,11 +199,14 @@
return null
}

const idKey = requiredName
const labelKey = `${requiredName}__ontology_label`
const idKey = columnName
const labelKey = `${columnName}__ontology_label`

const idGroup = obsValues.find(o => findMatchingGroup(o, idKey))
const labelGroup = obsValues.find(o => findMatchingGroup(o, labelKey))

const idGroup = obsValues.find(o => o.name.endsWith(idKey))
const labelGroup = obsValues.find(o => o.name.endsWith(labelKey))
// exit when optional metadata isn't found, like cell_type
if (!idGroup && !isRequired) { return }

// AnnData organizes each "obs" annotation (e.g. disease__ontology_label,
// sex) into a container with a `categories` frame and a `code` frame.
@@ -231,15 +241,13 @@ async function validateOntologyLabelsAndIds(hdf5File) {
const ontologies = await fetchOntologies()

// Validate IDs and labels for all ontology-based properties
for (let i = 0; i < REQUIRED_CONVENTION_COLUMNS.length; i++) {
const column = REQUIRED_CONVENTION_COLUMNS[i]
if (!column.endsWith('__ontology_label')) {continue}
const key = column.split('__ontology_label')[0]
const groups = await getOntologyIdsAndLabels(key, hdf5File)
for (let i = 0; i < ONTOLOGY_PROPS.length; i++) {
const column = ONTOLOGY_PROPS[i]
const groups = await getOntologyIdsAndLabels(column, hdf5File)

if (groups) {
issues = issues.concat(
await checkOntologyLabelsAndIds(key, ontologies, groups)
await checkOntologyLabelsAndIds(column, ontologies, groups)
)
}
}
@@ -253,14 +261,12 @@ async function validateOntologyIdFormat(hdf5File) {
let issues = []

// Validate ID formats for all ontology-based properties
for (let i = 0; i < REQUIRED_CONVENTION_COLUMNS.length; i++) {
const column = REQUIRED_CONVENTION_COLUMNS[i]
if (!column.endsWith('__ontology_label')) {continue}
const key = column.split('__ontology_label')[0]
const ontologyIds = await getOntologyIds(key, hdf5File)
for (let i = 0; i < ONTOLOGY_PROPS.length; i++) {
const column = ONTOLOGY_PROPS[i]
const ontologyIds = await getOntologyIds(column, hdf5File)

issues = issues.concat(
checkOntologyIdFormat(key, ontologyIds)
checkOntologyIdFormat(column, ontologyIds)
)
}

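Switching from `name.endsWith(key)` to an exact match on the final path segment keeps one obs column from shadowing another whose name it happens to end with. A sketch of the failure mode the old lookup allowed (the group names here are hypothetical):

const groups = [{ name: '/obs/organ' }, { name: '/obs/donor_organ' }]

// Old lookup: 'donor_organ'.endsWith('organ') is also true, so the result is ambiguous
groups.filter(o => o.name.endsWith('organ')).length // => 2

// New lookup: only the exact final path segment matches
groups.find(o => findMatchingGroup(o, 'organ')) // => { name: '/obs/organ' }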
84 changes: 82 additions & 2 deletions app/javascript/lib/validation/validate-file-content.js
@@ -22,7 +22,8 @@ import {
} from './shared-validation'
import { parseDifferentialExpressionFile } from './validate-differential-expression'
import { parseAnnDataFile } from './validate-anndata'

import { fetchOntologies, getOntologyBasedProps } from '~/lib/validation/ontology-validation'
import { getOntologyShortNameLc, getLabelSuffixForOntology } from './shared-validation'

/**
* Gzip decompression requires reading the whole file, given the current
Expand All @@ -36,7 +37,7 @@ const MAX_GZIP_FILESIZE = 50 * oneMiB

/** File extensions / suffixes that indicate content must be gzipped */
const EXTENSIONS_MUST_GZIP = ['gz', 'bam', 'tbi', 'csi']

const ONTOLOGY_PROPS = getOntologyBasedProps()

/**
* Helper function to verify first pair of headers is NAME or TYPE
@@ -217,7 +218,11 @@ export async function parseMetadataFile(chunker, mimeType, fileOptions) {
const { headers, delimiter } = await getParsedHeaderLines(chunker, mimeType)
let issues = validateCapFormat(headers)
issues = issues.concat(validateNoMetadataCoordinates(headers))
let ontologies
// track ontology-based errors already seen, to avoid reporting duplicates
const knownErrors = []
if (fileOptions.use_metadata_convention) {
ontologies = await fetchOntologies()
issues = issues.concat(validateRequiredMetadataColumns(headers))
}

@@ -232,12 +237,87 @@
issues = issues.concat(validateUniqueCellNamesWithinFile(line, isLastLine, dataObj))
issues = issues.concat(validateMetadataLabelMatches(headers, line, isLastLine, dataObj))
issues = issues.concat(validateGroupColumnCounts(headers, line, isLastLine, dataObj))
if (fileOptions.use_metadata_convention) {
issues = issues.concat(validateConventionTerms(headers, line, ontologies, knownErrors))
}
// add other line-by-line validations here
}
})
return { issues, delimiter, numColumns: headers[0].length }
}

/** validate all ontology-based convention terms in a given line */
export function validateConventionTerms(headers, line, ontologies, knownErrors) {
let issues = []
const metadataHeaders = headers[0]
for (let i = 0; i < metadataHeaders.length; i++) {
const header = metadataHeaders[i]
if (ONTOLOGY_PROPS.includes(header)) {
const ontologyId = line[i]
const labelHeader = `${header}${getLabelSuffixForOntology(ontologyId)}`
const labelIdx = metadataHeaders.indexOf(labelHeader)
const label = line[labelIdx]
issues = issues.concat(validateOntologyTerm(header, ontologyId, label, ontologies, knownErrors))
}
}
return issues
}

/** validate a single ontology ID against stored ontologies and return issues */
export function validateOntologyTerm(prop, ontologyId, label, ontologies, knownErrors) {
const issues = []
const ontologyShortNameLc = getOntologyShortNameLc(ontologyId)
const ontology = ontologies[ontologyShortNameLc]

if (ontologyId.includes(':')) {
// Convert colon to underscore for ontology lookup
const idParts = ontologyId.split(':')
ontologyId = `${idParts[0]}_${idParts[1]}`
}

let errorIdentifier
let issue

if (!ontology) {
errorIdentifier = `${ontologyId}-label-lookup-error`
const accepted = Object.keys(ontologies).join(', ')
const msg =
`Ontology ID "${ontologyId}" ` +
`is not among accepted ontologies (${accepted}) ` +
`for key "${prop}"`

issue = ['error', 'ontology:label-lookup-error', msg]
} else if (!(ontologyId in ontology)) {
// Register invalid ontology ID
const msg = `Invalid ontology ID: ${ontologyId}`
errorIdentifier = `${ontologyId}-invalid-id`
issue = [
'error', 'ontology:label-lookup-error', msg,
{ subtype: 'ontology:invalid-id' }
]
} else {
const validLabels = ontology[ontologyId]

if (!(validLabels.includes(label))) {
errorIdentifier = `${ontologyId}-label-lookup-error`
// Register invalid ontology label
const prettyLabels = validLabels.join(', ')
const validLabelsClause = `Valid labels for ${ontologyId}: ${prettyLabels}`
const msg = `Invalid ${prop} label "${label}". ${validLabelsClause}`
issue = [
'error', 'ontology:label-not-match-id', msg,
{ subtype: 'ontology:invalid-label' }
]
}
}
// only store unique instances of errors since we're validating line by line
if (issue && knownErrors.indexOf(errorIdentifier) < 0) {
issues.push(issue)
knownErrors.push(errorIdentifier)
}
return issues
}

/** parse a cluster file, and return an array of issues, along with file parsing info */
export async function parseClusterFile(chunker, mimeType) {
const { headers, delimiter } = await getParsedHeaderLines(chunker, mimeType)
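Because validateOntologyTerm runs once per data line, the knownErrors array is what keeps a file with thousands of cells from reporting the same bad term thousands of times. A sketch of the dedup behavior, assuming the ontologies map shape that fetchOntologies returns (lowercase shortname mapped to valid labels per ID):

const ontologies = { mondo: { MONDO_0005109: ['COVID-19'] } } // assumed shape
const knownErrors = []

// The first row with a bad label registers one issue...
let issues = validateOntologyTerm('disease', 'MONDO:0005109', 'flu', ontologies, knownErrors)
// issues.length === 1; knownErrors now holds 'MONDO_0005109-label-lookup-error'

// ...and every later row repeating the same bad term is skipped
issues = validateOntologyTerm('disease', 'MONDO:0005109', 'flu', ontologies, knownErrors)
// issues.length === 0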
16 changes: 13 additions & 3 deletions app/models/ingest_job.rb
@@ -1044,7 +1044,7 @@ def log_to_mixpanel
mixpanel_log_props = get_job_analytics
# log job properties to Mixpanel
MetricsService.log(mixpanel_event_name, mixpanel_log_props, user)
report_anndata_summary if study_file.is_viz_anndata? && !%i[ingest_anndata differential_expression].include?(action)
report_anndata_summary if study_file.is_viz_anndata?
end

# set a mixpanel event name based on action
@@ -1087,7 +1087,7 @@ def anndata_summary_props
commands = client.get_job_command_line(job:)
commands.detect { |c| c == '--extract' } || client.job_error(job.name).present?
end.count
num_files_extracted += 1 if extracted_raw_counts?(initial_extract)
num_files_extracted += 1 if extracted_raw_counts?(initial_extract) && job_status == 'success'
# event properties for Mixpanel summary event
{
perfTime: job_perftime,
@@ -1115,10 +1115,20 @@ def extracted_raw_counts?(job)
extract_params.include?('raw_counts')
end

# determine if this job should skip sending an ingestSummary event
# returns true if a summary already exists, this is a DE job, a retry is pending,
# or this is a successful AnnData extract (meaning downstream jobs are still running)
def skip_anndata_summary?
study_file.has_anndata_summary? ||
action == :differential_expression ||
should_retry? ||
(!failed? && action == :ingest_anndata)
end

# report a summary of all AnnData extraction for this file to Mixpanel, if this is the last job
def report_anndata_summary
study_file.reload
return false if study_file.has_anndata_summary? # don't bother checking if summary is already sent
return false if skip_anndata_summary?

file_identifier = "#{study_file.upload_file_name} (#{study_file.id})"
Rails.logger.info "Checking AnnData summary for #{file_identifier} after #{action}"
2 changes: 1 addition & 1 deletion package.json
@@ -105,7 +105,7 @@
"prop-types": "^15.7.2",
"react-select-event": "^5.3.0",
"stylelint-prettier": "^1.1.2",
"vite": "^4.5.10",
"vite": "^4.5.13",
"vite-plugin-ruby": "^3.0.5"
}
}