bridge2ai · realmarcin · Apr 8, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
diff --git a/project/jsonld/data_sheets_schema.jsonld b/project/jsonld/data_sheets_schema.jsonld
@@ -2067,15 +2067,15 @@
     },
     {
       "name": "dataset__file_collections",
-      "description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart.",
+      "description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.",
       "from_schema": "https://w3id.org/bridge2ai/data-sheets-schema",
       "mappings": [
-        "http://schema.org/hasPart"
+        "https://w3id.org/bridge2ai/data-sheets-schema/fileCollections"
       ],
       "exact_mappings": [
         "http://www.w3.org/ns/dcat#distribution"
       ],
-      "slot_uri": "http://schema.org/hasPart",
+      "slot_uri": "https://w3id.org/bridge2ai/data-sheets-schema/fileCollections",
       "alias": "file_collections",
       "owner": "Dataset",
       "domain_of": [
@@ -6865,11 +6865,11 @@
       "attributes": [
         {
           "name": "file_collections",
-          "description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart.",
+          "description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.",
           "exact_mappings": [
             "dcat:distribution"
           ],
-          "slot_uri": "schema:hasPart",
+          "slot_uri": "d4d:fileCollections",
           "range": "FileCollection",
           "multivalued": true,
           "inlined_as_list": true,
@@ -10713,9 +10713,9 @@
   ],
   "metamodel_version": "1.7.0",
   "source_file": "data_sheets_schema.yaml",
-  "source_file_date": "2026-03-26T22:44:02",
-  "source_file_size": 18530,
-  "generation_date": "2026-04-06T21:13:16",
+  "source_file_date": "2026-04-07T13:01:39",
+  "source_file_size": 18558,
+  "generation_date": "2026-04-07T13:03:27",
   "@type": "SchemaDefinition",
   "@context": [
     "project/jsonld/data_sheets_schema.context.jsonld",

diff --git a/project/jsonschema/data_sheets_schema.schema.json b/project/jsonschema/data_sheets_schema.schema.json
@@ -1207,7 +1207,7 @@
                     ]
                 },
                 "file_collections": {
-                    "description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart.",
+                    "description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.",
                     "items": {
                         "$ref": "#/$defs/FileCollection"
                     },
@@ -2005,7 +2005,7 @@
                     ]
                 },
                 "file_collections": {
-                    "description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart.",
+                    "description": "Collections of files within this dataset. Each collection represents a logical grouping of files with shared characteristics (e.g., all training data, all image files, all raw data files). Maps to nested RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.",
                     "items": {
                         "$ref": "#/$defs/FileCollection"
                     },

diff --git a/project/owl/data_sheets_schema.owl.ttl b/project/owl/data_sheets_schema.owl.ttl
diff --git a/src/data_sheets_schema/datamodel/data_sheets_schema.py b/src/data_sheets_schema/datamodel/data_sheets_schema.py
@@ -1,5 +1,5 @@
 # Auto generated from data_sheets_schema.yaml by pythongen.py version: 0.0.1
-# Generation date: 2026-04-06T21:13:18
+# Generation date: 2026-04-07T13:03:28
 # Schema: data-sheets-schema
 #
 # id: https://w3id.org/bridge2ai/data-sheets-schema
@@ -3788,7 +3788,7 @@ class slots:
 slots.resources = Slot(uri=SCHEMA.hasPart, name="resources", curie=SCHEMA.curie('hasPart'),
                    model_uri=DATA_SHEETS_SCHEMA.resources, domain=None, range=Optional[Union[Union[str, DatasetId], list[Union[str, DatasetId]]]])
 
-slots.dataset__file_collections = Slot(uri=SCHEMA.hasPart, name="dataset__file_collections", curie=SCHEMA.curie('hasPart'),
+slots.dataset__file_collections = Slot(uri=D4D.fileCollections, name="dataset__file_collections", curie=D4D.curie('fileCollections'),
                    model_uri=DATA_SHEETS_SCHEMA.dataset__file_collections, domain=None, range=Optional[Union[dict[Union[str, FileCollectionId], Union[dict, FileCollection]], list[Union[dict, FileCollection]]]])
 
 slots.dataset__total_file_count = Slot(uri=D4D.totalFileCount, name="dataset__total_file_count", curie=D4D.curie('totalFileCount'),

diff --git a/src/data_sheets_schema/schema/D4D_FileCollection.yaml b/src/data_sheets_schema/schema/D4D_FileCollection.yaml
@@ -103,6 +103,10 @@ classes:
           - range: FileCollection
         multivalued: true
         inlined_as_list: true
+        # NOTE: LinkML generator limitation. The generated artifacts (Python
+        # datamodel, JSON Schema) do not fully reflect the union range declared
+        # above: they still type resources as Dataset rather than
+        # (File | FileCollection). This is a known gap in LinkML's union handling.
     attributes:
       collection_type:
         description: >-

diff --git a/src/data_sheets_schema/schema/data_sheets_schema.yaml b/src/data_sheets_schema/schema/data_sheets_schema.yaml
@@ -120,8 +120,8 @@ classes:
           Collections of files within this dataset. Each collection represents
           a logical grouping of files with shared characteristics (e.g., all
           training data, all image files, all raw data files). Maps to nested
-          RO-Crate Dataset entities via schema:hasPart.
-        slot_uri: schema:hasPart
+          RO-Crate Dataset entities via schema:hasPart in RO-Crate converters.
+        slot_uri: d4d:fileCollections
         range: FileCollection
         multivalued: true
         inlined_as_list: true

diff --git a/src/data_sheets_schema/schema/data_sheets_schema_all.yaml b/src/data_sheets_schema/schema/data_sheets_schema_all.yaml
@@ -2171,11 +2171,11 @@ classes:
         description: Collections of files within this dataset. Each collection represents
           a logical grouping of files with shared characteristics (e.g., all training
           data, all image files, all raw data files). Maps to nested RO-Crate Dataset
-          entities via schema:hasPart.
+          entities via schema:hasPart in RO-Crate converters.
         from_schema: https://w3id.org/bridge2ai/data-sheets-schema
         exact_mappings:
         - dcat:distribution
-        slot_uri: schema:hasPart
+        slot_uri: d4d:fileCollections
         alias: file_collections
         owner: Dataset
         domain_of:
@@ -3567,11 +3567,11 @@ classes:
         description: Collections of files within this dataset. Each collection represents
           a logical grouping of files with shared characteristics (e.g., all training
           data, all image files, all raw data files). Maps to nested RO-Crate Dataset
-          entities via schema:hasPart.
+          entities via schema:hasPart in RO-Crate converters.
         from_schema: https://w3id.org/bridge2ai/data-sheets-schema
         exact_mappings:
         - dcat:distribution
-        slot_uri: schema:hasPart
+        slot_uri: d4d:fileCollections
         alias: file_collections
         owner: DataSubset
         domain_of:

diff --git a/src/fairscape_integration/d4d_to_fairscape.py b/src/fairscape_integration/d4d_to_fairscape.py
@@ -121,6 +121,15 @@ def _build_dataset(self, d4d_dict: Dict[str, Any], hasPart_ids: List[str] = None
                 author_str = str(authors)
 
         # Build dataset params using JSON-LD field names (aliases)
+        # Collect all hasPart references: file collections + other resources
+        all_hasPart_ids = list(hasPart_ids or [])
+
+        # Add inlined Dataset.resources (non-file-collection nested datasets) to hasPart; only dicts with an "id" are added, bare string IDs are skipped
+        if "resources" in d4d_dict and d4d_dict["resources"]:
+            for resource in d4d_dict["resources"]:
+                if isinstance(resource, dict) and "id" in resource:
+                    all_hasPart_ids.append(resource["id"])
+
         dataset_params = {
             "@id": "./",
             "@type": ["Dataset", "https://w3id.org/EVI#ROCrate"],
@@ -130,7 +139,7 @@ def _build_dataset(self, d4d_dict: Dict[str, Any], hasPart_ids: List[str] = None
             "version": d4d_dict.get("version", "1.0"),
             "author": author_str,
             "license": d4d_dict.get("license", "No license specified"),  # Required field
-            "hasPart": [{"@id": id} for id in (hasPart_ids or [])]  # Add file collection references
+            "hasPart": [{"@id": id} for id in all_hasPart_ids]
         }
 
         # Add optional Schema.org fields
@@ -280,6 +289,11 @@ def _build_file_collections(self, d4d_dict: Dict[str, Any]) -> tuple[List[ROCrat
             if "file_count" in fc:
                 collection_params["d4d:fileCount"] = fc["file_count"]
 
+            # TODO: Convert FileCollection.resources (File objects) to RO-Crate File entities.
+            # Currently, file-level metadata in resources is not converted to RO-Crate.
+            # Future work: iterate fc.get('resources', []), create an RO-Crate File entity
+            # for each, and add their @ids to this collection's hasPart.
+
             # Create nested Dataset element
             collection_elem = ROCrateMetadataElem(**collection_params)
             file_collections.append(collection_elem)

diff --git a/src/fairscape_integration/fairscape_to_d4d.py b/src/fairscape_integration/fairscape_to_d4d.py
@@ -286,6 +286,11 @@ def _build_file_collections(self, nested_datasets: List[Dict]) -> List[Dict[str,
             if 'd4d:fileCount' in dataset:
                 collection['file_count'] = dataset['d4d:fileCount']
 
+            # TODO: Parse nested Dataset's hasPart to build FileCollection.resources
+            # Currently, file-level information in RO-Crate File entities is not converted
+            # to FileCollection.resources (File objects). Future work: parse dataset['hasPart'],
+            # fetch referenced File entities, and convert to D4D File objects in resources.
+
             # Only add non-empty collections
             if collection:
                 file_collections.append(collection)

diff --git a/src/validation/unified_validator.py b/src/validation/unified_validator.py
@@ -187,7 +187,7 @@ def migrate_legacy_file_properties(data: Dict[str, Any]) -> tuple[Dict[str, Any]
         # Check if migration needed
         all_legacy_props = file_level_props + collection_props + ['bytes']
         has_file_props = any(k in data for k in all_legacy_props)
-        has_collections = 'file_collections' in data and data['file_collections']
+        has_collections = 'file_collections' in data
 
         if has_file_props and not has_collections:
             # Create default file collection

diff --git a/tests/test_file_collection.py b/tests/test_file_collection.py
@@ -21,15 +21,15 @@ def test_filecollection_basic_validation(self):
             'id': 'test-collection-1',
             'name': 'Training Data',
             'description': 'Training dataset files',
-            'collection_type': 'training_split',
+            'collection_type': ['training_split'],
             'total_bytes': 1048576,
             'file_count': 100
         }
 
         # This should validate without errors when using linkml-validate
         # For now, just test the data structure is correct
         self.assertIn('id', filecollection_data)
-        self.assertEqual(filecollection_data['collection_type'], 'training_split')
+        self.assertEqual(filecollection_data['collection_type'], ['training_split'])
 
     def test_dataset_with_file_collections(self):
         """Test Dataset containing FileCollections."""
@@ -41,13 +41,13 @@ def test_dataset_with_file_collections(self):
                 {
                     'id': 'collection-1',
                     'name': 'Training Files',
-                    'collection_type': 'training_split',
+                    'collection_type': ['training_split'],
                     'total_bytes': 1048576
                 },
                 {
                     'id': 'collection-2',
                     'name': 'Test Files',
-                    'collection_type': 'test_split',
+                    'collection_type': ['test_split'],
                     'total_bytes': 524288
                 }
             ],
@@ -77,17 +77,17 @@ def test_filecollection_enum_values(self):
             collection = {
                 'id': f'collection-{collection_type}',
                 'name': f'{collection_type} files',
-                'collection_type': collection_type
+                'collection_type': [collection_type]
             }
-            self.assertEqual(collection['collection_type'], collection_type)
+            self.assertEqual(collection['collection_type'], [collection_type])
 
     def test_filecollection_properties_complete(self):
         """Test FileCollection with all collection-level properties."""
         complete_collection = {
             'id': 'complete-collection',
             'name': 'Complete File Collection',
             'description': 'A collection with all properties',
-            'collection_type': 'processed_data',
+            'collection_type': ['processed_data'],
             'total_bytes': 2097152,
             'file_count': 50,
             'path': '/data/processed/',
@@ -231,9 +231,17 @@ def test_write_and_read_filecollection_yaml(self):
                 {
                     'id': 'test-collection',
                     'name': 'Test Files',
-                    'collection_type': 'test_split',
-                    'format': 'CSV',
-                    'bytes': 1024
+                    'collection_type': ['test_split'],
+                    'total_bytes': 1024,
+                    'file_count': 1,
+                    'resources': [
+                        {
+                            'id': 'test001.csv',
+                            'file_type': 'data_file',
+                            'format': 'CSV',
+                            'bytes': 1024
+                        }
+                    ]
                 }
             ]
         }
@@ -250,6 +258,7 @@ def test_write_and_read_filecollection_yaml(self):
 
             self.assertEqual(loaded_data['id'], 'test-dataset')
             self.assertEqual(loaded_data['file_collections'][0]['name'], 'Test Files')
+            self.assertEqual(loaded_data['file_collections'][0]['resources'][0]['format'], 'CSV')
         finally:
             Path(temp_path).unlink()