Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions project/jsonld/data_sheets_schema.jsonld
Original file line number Diff line number Diff line change
Expand Up @@ -2067,15 +2067,15 @@
},
{
"name": "dataset__file_collections",
"description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart.",
"description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.",
"from_schema": "https://w3id.org/bridge2ai/data-sheets-schema",
"mappings": [
"http://schema.org/hasPart"
"https://w3id.org/bridge2ai/data-sheets-schema/fileCollections"
],
"exact_mappings": [
"http://www.w3.org/ns/dcat#distribution"
],
"slot_uri": "http://schema.org/hasPart",
"slot_uri": "https://w3id.org/bridge2ai/data-sheets-schema/fileCollections",
"alias": "file_collections",
"owner": "Dataset",
"domain_of": [
Expand Down Expand Up @@ -6865,11 +6865,11 @@
"attributes": [
{
"name": "file_collections",
"description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart.",
"description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.",
"exact_mappings": [
"dcat:distribution"
],
"slot_uri": "schema:hasPart",
"slot_uri": "d4d:fileCollections",
"range": "FileCollection",
"multivalued": true,
"inlined_as_list": true,
Expand Down Expand Up @@ -10713,9 +10713,9 @@
],
"metamodel_version": "1.7.0",
"source_file": "data_sheets_schema.yaml",
"source_file_date": "2026-03-26T22:44:02",
"source_file_size": 18530,
"generation_date": "2026-04-06T21:13:16",
"source_file_date": "2026-04-07T13:01:39",
"source_file_size": 18558,
"generation_date": "2026-04-07T13:03:27",
"@type": "SchemaDefinition",
"@context": [
"project/jsonld/data_sheets_schema.context.jsonld",
Expand Down
4 changes: 2 additions & 2 deletions project/jsonschema/data_sheets_schema.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -1207,7 +1207,7 @@
]
},
"file_collections": {
"description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart.",
"description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.",
"items": {
"$ref": "#/$defs/FileCollection"
},
Expand Down Expand Up @@ -2005,7 +2005,7 @@
]
},
"file_collections": {
"description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart.",
"description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.",
"items": {
"$ref": "#/$defs/FileCollection"
},
Expand Down
1,888 changes: 944 additions & 944 deletions project/owl/data_sheets_schema.owl.ttl

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/data_sheets_schema/datamodel/data_sheets_schema.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Auto generated from data_sheets_schema.yaml by pythongen.py version: 0.0.1
# Generation date: 2026-04-06T21:13:18
# Generation date: 2026-04-07T13:03:28
# Schema: data-sheets-schema
#
# id: https://w3id.org/bridge2ai/data-sheets-schema
Expand Down Expand Up @@ -3788,7 +3788,7 @@ class slots:
slots.resources = Slot(uri=SCHEMA.hasPart, name="resources", curie=SCHEMA.curie('hasPart'),
model_uri=DATA_SHEETS_SCHEMA.resources, domain=None, range=Optional[Union[Union[str, DatasetId], list[Union[str, DatasetId]]]])

slots.dataset__file_collections = Slot(uri=SCHEMA.hasPart, name="dataset__file_collections", curie=SCHEMA.curie('hasPart'),
slots.dataset__file_collections = Slot(uri=D4D.fileCollections, name="dataset__file_collections", curie=D4D.curie('fileCollections'),
model_uri=DATA_SHEETS_SCHEMA.dataset__file_collections, domain=None, range=Optional[Union[dict[Union[str, FileCollectionId], Union[dict, FileCollection]], list[Union[dict, FileCollection]]]])

slots.dataset__total_file_count = Slot(uri=D4D.totalFileCount, name="dataset__total_file_count", curie=D4D.curie('totalFileCount'),
Expand Down
4 changes: 4 additions & 0 deletions src/data_sheets_schema/schema/D4D_FileCollection.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ classes:
- range: FileCollection
multivalued: true
inlined_as_list: true
# NOTE: LinkML generator limitation. The generated artifacts (Python datamodel,
# JSON Schema) do not fully reflect this union-type constraint: the generated
# code still types `resources` as Dataset rather than (File | FileCollection).
# This is a known limitation of LinkML's union-type handling.
attributes:
collection_type:
description: >-
Expand Down
4 changes: 2 additions & 2 deletions src/data_sheets_schema/schema/data_sheets_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ classes:
Collections of files within this dataset. Each collection represents
a logical grouping of files with shared characteristics (e.g., all
training data, all image files, all raw data files). Maps to nested
RO-Crate Dataset entities via schema:hasPart.
slot_uri: schema:hasPart
RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.
slot_uri: d4d:fileCollections
range: FileCollection
multivalued: true
inlined_as_list: true
Expand Down
8 changes: 4 additions & 4 deletions src/data_sheets_schema/schema/data_sheets_schema_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2171,11 +2171,11 @@ classes:
description: Collections of files within this dataset. Each collection represents
a logical grouping of files with shared characteristics (e.g., all training
data, all image files, all raw data files). Maps to nested RO-Crate Dataset
entities via schema:hasPart.
entities via schema:hasPart in RO-Crate converters.
from_schema: https://w3id.org/bridge2ai/data-sheets-schema
exact_mappings:
- dcat:distribution
slot_uri: schema:hasPart
slot_uri: d4d:fileCollections
alias: file_collections
owner: Dataset
domain_of:
Expand Down Expand Up @@ -3567,11 +3567,11 @@ classes:
description: Collections of files within this dataset. Each collection represents
a logical grouping of files with shared characteristics (e.g., all training
data, all image files, all raw data files). Maps to nested RO-Crate Dataset
entities via schema:hasPart.
entities via schema:hasPart in RO-Crate converters.
from_schema: https://w3id.org/bridge2ai/data-sheets-schema
exact_mappings:
- dcat:distribution
slot_uri: schema:hasPart
slot_uri: d4d:fileCollections
alias: file_collections
owner: DataSubset
domain_of:
Expand Down
16 changes: 15 additions & 1 deletion src/fairscape_integration/d4d_to_fairscape.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,15 @@ def _build_dataset(self, d4d_dict: Dict[str, Any], hasPart_ids: List[str] = None
author_str = str(authors)

# Collect all hasPart references first: file collections + other resources.
# The dataset params built afterwards use JSON-LD field names (aliases).
all_hasPart_ids = list(hasPart_ids or [])

# Include Dataset.resources (non-file-collection nested datasets) in hasPart
if "resources" in d4d_dict and d4d_dict["resources"]:
for resource in d4d_dict["resources"]:
if isinstance(resource, dict) and "id" in resource:
all_hasPart_ids.append(resource["id"])

dataset_params = {
"@id": "./",
"@type": ["Dataset", "https://w3id.org/EVI#ROCrate"],
Expand All @@ -130,7 +139,7 @@ def _build_dataset(self, d4d_dict: Dict[str, Any], hasPart_ids: List[str] = None
"version": d4d_dict.get("version", "1.0"),
"author": author_str,
"license": d4d_dict.get("license", "No license specified"), # Required field
"hasPart": [{"@id": id} for id in (hasPart_ids or [])] # Add file collection references
"hasPart": [{"@id": id} for id in all_hasPart_ids]
}

# Add optional Schema.org fields
Expand Down Expand Up @@ -280,6 +289,11 @@ def _build_file_collections(self, d4d_dict: Dict[str, Any]) -> tuple[List[ROCrat
if "file_count" in fc:
collection_params["d4d:fileCount"] = fc["file_count"]

# TODO: Convert FileCollection.resources (File objects) to RO-Crate File entities
# Currently, file-level metadata in resources is not converted to RO-Crate.
# Future work: iterate fc.get('resources', []), create RO-Crate File entities,
# and add their @ids to hasPart.

# Create nested Dataset element
collection_elem = ROCrateMetadataElem(**collection_params)
file_collections.append(collection_elem)
Expand Down
5 changes: 5 additions & 0 deletions src/fairscape_integration/fairscape_to_d4d.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,11 @@ def _build_file_collections(self, nested_datasets: List[Dict]) -> List[Dict[str,
if 'd4d:fileCount' in dataset:
collection['file_count'] = dataset['d4d:fileCount']

# TODO: Parse nested Dataset's hasPart to build FileCollection.resources
# Currently, file-level information in RO-Crate File entities is not converted
# to FileCollection.resources (File objects). Future work: parse dataset['hasPart'],
# fetch referenced File entities, and convert to D4D File objects in resources.

# Only add non-empty collections
if collection:
file_collections.append(collection)
Expand Down
2 changes: 1 addition & 1 deletion src/validation/unified_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def migrate_legacy_file_properties(data: Dict[str, Any]) -> tuple[Dict[str, Any]
# Check if migration needed
all_legacy_props = file_level_props + collection_props + ['bytes']
has_file_props = any(k in data for k in all_legacy_props)
has_collections = 'file_collections' in data and data['file_collections']
has_collections = 'file_collections' in data

if has_file_props and not has_collections:
# Create default file collection
Expand Down
29 changes: 19 additions & 10 deletions tests/test_file_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ def test_filecollection_basic_validation(self):
'id': 'test-collection-1',
'name': 'Training Data',
'description': 'Training dataset files',
'collection_type': 'training_split',
'collection_type': ['training_split'],
'total_bytes': 1048576,
'file_count': 100
}

# This should validate without errors when using linkml-validate
# For now, just test the data structure is correct
self.assertIn('id', filecollection_data)
self.assertEqual(filecollection_data['collection_type'], 'training_split')
self.assertEqual(filecollection_data['collection_type'], ['training_split'])

def test_dataset_with_file_collections(self):
"""Test Dataset containing FileCollections."""
Expand All @@ -41,13 +41,13 @@ def test_dataset_with_file_collections(self):
{
'id': 'collection-1',
'name': 'Training Files',
'collection_type': 'training_split',
'collection_type': ['training_split'],
'total_bytes': 1048576
},
{
'id': 'collection-2',
'name': 'Test Files',
'collection_type': 'test_split',
'collection_type': ['test_split'],
'total_bytes': 524288
}
],
Expand Down Expand Up @@ -77,17 +77,17 @@ def test_filecollection_enum_values(self):
collection = {
'id': f'collection-{collection_type}',
'name': f'{collection_type} files',
'collection_type': collection_type
'collection_type': [collection_type]
}
self.assertEqual(collection['collection_type'], collection_type)
self.assertEqual(collection['collection_type'], [collection_type])

def test_filecollection_properties_complete(self):
"""Test FileCollection with all collection-level properties."""
complete_collection = {
'id': 'complete-collection',
'name': 'Complete File Collection',
'description': 'A collection with all properties',
'collection_type': 'processed_data',
'collection_type': ['processed_data'],
'total_bytes': 2097152,
'file_count': 50,
'path': '/data/processed/',
Expand Down Expand Up @@ -231,9 +231,17 @@ def test_write_and_read_filecollection_yaml(self):
{
'id': 'test-collection',
'name': 'Test Files',
'collection_type': 'test_split',
'format': 'CSV',
'bytes': 1024
'collection_type': ['test_split'],
'total_bytes': 1024,
'file_count': 1,
'resources': [
{
'id': 'test001.csv',
'file_type': 'data_file',
'format': 'CSV',
'bytes': 1024
}
]
}
]
}
Expand All @@ -250,6 +258,7 @@ def test_write_and_read_filecollection_yaml(self):

self.assertEqual(loaded_data['id'], 'test-dataset')
self.assertEqual(loaded_data['file_collections'][0]['name'], 'Test Files')
self.assertEqual(loaded_data['file_collections'][0]['resources'][0]['format'], 'CSV')
finally:
Path(temp_path).unlink()

Expand Down
Loading