diff --git a/.github/workflows/minify_ontologies.yml b/.github/workflows/minify_ontologies.yml
index 36a64cb8..0aa0c69e 100644
--- a/.github/workflows/minify_ontologies.yml
+++ b/.github/workflows/minify_ontologies.yml
@@ -3,9 +3,9 @@ name: Minify ontologies
 on:
   pull_request:
     types: [opened] # Only trigger on PR "opened" event
-#  push: # Uncomment, update branches to develop / debug
-#    branches:
-#      jb-anndata-mixpanel-props
+  push: # Debug-only trigger: re-comment before merging (update branches to develop / debug)
+    branches:
+      jb-metadata-boolean
 
 jobs:
   build:
@@ -86,7 +86,7 @@ jobs:
 
             # Commit changes
             git commit -m "Update minified ontologies via GitHub Actions"
-            git push origin ${{ github.ref_name }}
+            git push origin ${{ github.head_ref || github.ref_name }}
           else
             echo "No changes to commit."
           fi
diff --git a/ingest/anndata_.py b/ingest/anndata_.py
index 5c1c357c..c502958d 100644
--- a/ingest/anndata_.py
+++ b/ingest/anndata_.py
@@ -147,7 +147,9 @@ def generate_metadata_file(adata, output_name):
         headers = adata.obs.columns.tolist()
         types = []
         for header in headers:
-            if pd.api.types.is_numeric_dtype(adata.obs[header]):
+            if pd.api.types.is_bool_dtype(adata.obs[header]):
+                types.append("GROUP")
+            elif pd.api.types.is_numeric_dtype(adata.obs[header]):
                 types.append("NUMERIC")
             else:
                 types.append("GROUP")
diff --git a/ingest/validation/ontologies/cl.min.tsv.gz b/ingest/validation/ontologies/cl.min.tsv.gz
index b249bd8b..73feba2b 100644
Binary files a/ingest/validation/ontologies/cl.min.tsv.gz and b/ingest/validation/ontologies/cl.min.tsv.gz differ
diff --git a/ingest/validation/ontologies/efo.min.tsv.gz b/ingest/validation/ontologies/efo.min.tsv.gz
index 34d07843..3aaba05d 100644
Binary files a/ingest/validation/ontologies/efo.min.tsv.gz and b/ingest/validation/ontologies/efo.min.tsv.gz differ
diff --git a/ingest/validation/ontologies/mondo.min.tsv.gz b/ingest/validation/ontologies/mondo.min.tsv.gz
index 867c58d1..0d74da3e 100644
Binary files a/ingest/validation/ontologies/mondo.min.tsv.gz and
b/ingest/validation/ontologies/mondo.min.tsv.gz differ
diff --git a/ingest/validation/ontologies/version.txt b/ingest/validation/ontologies/version.txt
index 8db27dc3..aa13a28a 100644
--- a/ingest/validation/ontologies/version.txt
+++ b/ingest/validation/ontologies/version.txt
@@ -1 +1 @@
-1726600528 # validation cache key
+1729700083 # validation cache key
diff --git a/tests/data/anndata/anndata_boolean_test.h5ad b/tests/data/anndata/anndata_boolean_test.h5ad
new file mode 100644
index 00000000..3ac902b1
Binary files /dev/null and b/tests/data/anndata/anndata_boolean_test.h5ad differ
diff --git a/tests/test_anndata.py b/tests/test_anndata.py
index c669ce4d..ce412b41 100644
--- a/tests/test_anndata.py
+++ b/tests/test_anndata.py
@@ -29,6 +29,7 @@ def setup_class(self):
         filepath_dup_cell = "../tests/data/anndata/dup_cell.h5ad"
         filepath_nan = "../tests/data/anndata/nan_value.h5ad"
         filepath_synthetic = "../tests/data/anndata/anndata_test.h5ad"
+        filepath_boolean = "../tests/data/anndata/anndata_boolean_test.h5ad"
         self.study_id = "addedfeed000000000000000"
         self.study_file_id = "dec0dedfeed0000000000000"
         self.valid_args = [filepath_valid, self.study_id, self.study_file_id]
@@ -41,6 +42,7 @@ def setup_class(self):
         self.dup_cell_args = [filepath_dup_cell, self.study_id, self.study_file_id]
         self.nan_value_args = [filepath_nan, self.study_id, self.study_file_id]
         self.synthetic_args = [filepath_synthetic, self.study_id, self.study_file_id]
+        self.boolean_args = [filepath_boolean, self.study_id, self.study_file_id]
         self.cluster_name = 'X_tsne'
         self.valid_kwargs = {'obsm_keys': [self.cluster_name]}
         self.anndata_ingest = AnnDataIngestor(*self.valid_args, **self.valid_kwargs)
@@ -181,6 +183,45 @@ def test_generate_metadata_file(self):
             expected_types, type_line, 'did not get expected types from metadata body'
         )
 
+    def test_generate_metadata_with_boolean(self):
+        """Tests that a boolean obs column is written as a GROUP annotation
+        whose values appear as strings in the generated metadata file
+        """
+        boolean_ingest = AnnDataIngestor(*self.boolean_args, **self.valid_kwargs)
+        adata = boolean_ingest.obtain_adata()
+        boolean_filename = "h5ad_frag.metadata_boolean.tsv"
+        boolean_ingest.generate_metadata_file(adata, boolean_filename)
+        self.assertEqual(
+            'bool', adata.obs['is_primary_data'].dtype.name,
+            'did not correctly get "bool" dtype for "is_primary_data"'
+        )
+        compressed_file = boolean_filename + ".gz"
+        with gzip.open(compressed_file, "rt", encoding="utf-8-sig") as metadata_body:
+            name_line = metadata_body.readline().split("\t")
+            expected_headers = [
+                'NAME', 'donor_id', 'biosample_id', 'sex', 'species', 'species__ontology_label',
+                'library_preparation_protocol', 'library_preparation_protocol__ontology_label', 'organ',
+                'organ__ontology_label', 'disease', 'disease__ontology_label', "is_primary_data\n"
+            ]
+            self.assertEqual(
+                expected_headers, name_line, 'did not get expected headers from metadata body'
+            )
+            # every annotation column, including the boolean one, is expected to be typed GROUP
+            expected_types = ['TYPE'] + ['GROUP'] * (len(expected_headers) - 2) + ["GROUP\n"]
+            type_line = metadata_body.readline().split("\t")
+            self.assertEqual(
+                expected_types, type_line, 'did not get expected types from metadata body'
+            )
+            # locate the boolean column from the header instead of a hard-coded position
+            bool_index = expected_headers.index("is_primary_data\n")
+            bool_values = [line.split("\t")[bool_index].strip() for line in metadata_body]
+        # without this, a file with no data rows would pass the per-row check vacuously
+        self.assertTrue(bool_values, 'did not find any data rows in metadata body')
+        for is_primary_data in bool_values:
+            self.assertEqual(
+                "False", is_primary_data, 'did not correctly read boolean value as string from data'
+            )
+
     def test_gene_id_indexed_generate_processed_matrix(self):
         """Tests creating matrix when indexed by Ensembl ID, not gene name