Skip to content

Commit 0faa201

Browse files
Merge pull request #56 from AustralianBioCommons/fix-array-should-have-items
Fix array should have items
2 parents 64d6cb2 + 10ea086 commit 0faa201

File tree

8 files changed

+166
-16
lines changed

8 files changed

+166
-16
lines changed

docs/gen3schemadev/first_dictionary.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,9 +176,7 @@ Properties are found within nodes. Properties can be thought of as the columns i
176176
| integer | Whole numbers | 42 |
177177
| number | Numeric values (integer or floating point) | 3.14 |
178178
| boolean | True or false values | true |
179-
| object | Key-value pairs (dictionary/map) | {"age": 30} |
180179
| array | Ordered list of values | [1, 2, 3] |
181-
| null | Null value (no value) | null |
182180
| datetime | ISO 8601 date/time string (special type recognised by Gen3SchemaDev) | "2024-01-01T12:00:00Z" |
183181

184182
*These are the standard [JSON Schema data types](https://json-schema.org/understanding-json-schema/reference/type.html) used to define the kind of data a property can hold.*

jupyter/protoyping.ipynb

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,27 +12,25 @@
1212
},
1313
{
1414
"cell_type": "code",
15-
"execution_count": 2,
15+
"execution_count": 3,
1616
"id": "7bda6e32",
1717
"metadata": {},
1818
"outputs": [
1919
{
20-
"ename": "ValueError",
21-
"evalue": "Schema 'lipidomics_file' with category 'data_file' must include properties 'data_type', 'data_format', and 'data_category'. Please add these properties to the 'properties' section.",
22-
"output_type": "error",
23-
"traceback": [
24-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
25-
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
26-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m lipid_schema = g.utils.load_yaml(\u001b[33m\"\u001b[39m\u001b[33m../output/lipidomics_file.yaml\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 2\u001b[39m rule_val = g.validators.rule_validator.RuleValidator(lipid_schema)\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43mrule_val\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdata_file_props_need_data_props\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
27-
"\u001b[36mFile \u001b[39m\u001b[32m~/projects/gen3schemadev/src/gen3schemadev/validators/rule_validator.py:208\u001b[39m, in \u001b[36mRuleValidator.data_file_props_need_data_props\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 206\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m need_props.issubset(prop_keys):\n\u001b[32m 207\u001b[39m schema_id = \u001b[38;5;28mself\u001b[39m.schema.get(\u001b[33m\"\u001b[39m\u001b[33mid\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m<unknown id>\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m208\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 209\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSchema \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mschema_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m with category \u001b[39m\u001b[33m'\u001b[39m\u001b[33mdata_file\u001b[39m\u001b[33m'\u001b[39m\u001b[33m must include properties \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 210\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m'\u001b[39m\u001b[33mdata_type\u001b[39m\u001b[33m'\u001b[39m\u001b[33m, \u001b[39m\u001b[33m'\u001b[39m\u001b[33mdata_format\u001b[39m\u001b[33m'\u001b[39m\u001b[33m, and \u001b[39m\u001b[33m'\u001b[39m\u001b[33mdata_category\u001b[39m\u001b[33m'\u001b[39m\u001b[33m. Please add these properties \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 211\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mto the \u001b[39m\u001b[33m'\u001b[39m\u001b[33mproperties\u001b[39m\u001b[33m'\u001b[39m\u001b[33m section.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 212\u001b[39m )\n\u001b[32m 213\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
28-
"\u001b[31mValueError\u001b[39m: Schema 'lipidomics_file' with category 'data_file' must include properties 'data_type', 'data_format', and 'data_category'. Please add these properties to the 'properties' section."
29-
]
20+
"data": {
21+
"text/plain": [
22+
"True"
23+
]
24+
},
25+
"execution_count": 3,
26+
"metadata": {},
27+
"output_type": "execute_result"
3028
}
3129
],
3230
"source": [
3331
"lipid_schema = g.utils.load_yaml(\"../output/lipidomics_file.yaml\")\n",
3432
"rule_val = g.validators.rule_validator.RuleValidator(lipid_schema)\n",
35-
"rule_val.data_file_props_need_data_props()"
33+
"rule_val.type_array_needs_items()"
3634
]
3735
},
3836
{

src/gen3schemadev/converter.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,65 @@ def format_datetime(prop_dict: dict) -> dict:
524524
raise RuntimeError(f"Error formatting datetime property: {e}") from e
525525

526526

527+
def format_array(prop_dict: dict) -> dict:
528+
"""
529+
Formats a property dictionary with a 'type' of 'array' to use a Gen3 'array' schema definition.
530+
531+
If the property is not of type 'array', returns the property unchanged.
532+
533+
Example input:
534+
{
535+
"sample_ids": {
536+
"type": "array",
537+
"description": "Sample IDs (array)"
538+
}
539+
}
540+
541+
Example output:
542+
{
543+
"sample_ids": {
544+
"type": "array",
545+
"items": {
546+
"type": "string"
547+
}
548+
}
549+
}
550+
551+
Args:
552+
prop_dict (dict): A dictionary with a single property as key and its attributes as value.
553+
554+
Returns:
555+
dict: The formatted property dictionary suitable for Gen3 schema usage.
556+
557+
Raises:
558+
ValueError: If the input dictionary does not contain exactly one property.
559+
RuntimeError: If an error occurs during formatting.
560+
"""
561+
try:
562+
if len(prop_dict) != 1:
563+
raise ValueError("Expected a single property dictionary")
564+
565+
key, value = next(iter(prop_dict.items()))
566+
567+
if isinstance(value, dict) and value.get('type') == 'array':
568+
# Safely preserve description and required if present
569+
formatted = {
570+
"type": "array",
571+
"items": {
572+
"type": "string"
573+
}
574+
}
575+
if "description" in value:
576+
formatted["description"] = value["description"]
577+
if "required" in value:
578+
formatted["required"] = value["required"]
579+
return {key: formatted}
580+
else:
581+
return {key: value}
582+
except Exception as e:
583+
raise RuntimeError(f"Error formatting array property: {e}") from e
584+
585+
527586
def construct_props(node_name: str, data: DataSourceProtocol) -> dict:
528587
"""
529588
Construct the 'properties' section for a Gen3 schema node.
@@ -554,6 +613,7 @@ def construct_props(node_name: str, data: DataSourceProtocol) -> dict:
554613
if isinstance(prop, dict):
555614
prop = format_enum(prop)
556615
prop = format_datetime(prop)
616+
prop = format_array(prop)
557617
props_dict.update(prop)
558618

559619
# Add link properties

src/gen3schemadev/schema/input_schema.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class Property(BaseModel):
1212
"""Schema for gen3 property"""
1313
name: str = Field(description="The name of the property.")
1414
type: Literal[
15-
'string', 'integer', 'number', 'boolean', 'datetime', 'enum'
15+
'string', 'integer', 'number', 'boolean', 'datetime', 'enum', 'array'
1616
] = Field(description="The data type of the property.")
1717
description: str = Field(description="A human-readable description of the property.")
1818
required: bool = Field(default=False, description="Whether this property is required for the node.")

src/gen3schemadev/validators/rule_validator.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def validate(self):
1414
self.link_props_exist()
1515
self.props_cannot_be_system_props()
1616
self.props_must_have_type()
17+
self.type_array_needs_items()
1718

1819
def _get_links(self):
1920
links = self.schema.get("links", [])
@@ -211,3 +212,17 @@ def data_file_props_need_data_props(self):
211212
f"to the 'properties' section."
212213
)
213214
return True
215+
216+
217+
def type_array_needs_items(self):
218+
"""Ensure that any property with 'type': 'array' includes an 'items' property."""
219+
props = self._get_props()
220+
221+
for key, value in props.items():
222+
if isinstance(value, dict) and value.get("type") == "array":
223+
if "items" not in value:
224+
raise ValueError(
225+
f"Schema '{self.schema.get('id')}' property '{key}' with type 'array' must include an 'items' property. "
226+
"Please add an 'items' property to the 'properties' section."
227+
)
228+
return True

tests/input_example.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ nodes:
5858
type: number
5959
description: Coefficient of variation (%)
6060
required: true
61+
- name: lipid_species
62+
type: array
63+
description: List of lipid species
6164

6265

6366
links:

tests/test_converter.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -464,8 +464,14 @@ def test_construct_prop_lipidomics_file(fixture_input_yaml_pass):
464464
"data_type": {
465465
"description": "The type of data in this data file",
466466
"enum": ['data_type_1', 'data_type_2', 'data_type_3']
467+
},
468+
"lipid_species": {
469+
"description": "List of lipid species",
470+
"items": {
471+
"type": "string"
472+
},
473+
"type": "array"
467474
}
468-
469475
}
470476
assert result == expected
471477

@@ -614,6 +620,13 @@ def fixture_expected_output_lipid():
614620
"data_type": {
615621
"description": "The type of data in this data file",
616622
"enum": ['data_type_1', 'data_type_2', 'data_type_3']
623+
},
624+
"lipid_species": {
625+
"description": "List of lipid species",
626+
"items": {
627+
"type": "string"
628+
},
629+
"type": "array"
617630
}
618631
},
619632
'additionalProperties': False

tests/test_rule_validator.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,66 @@ def test_data_file_props_need_data_props():
166166
assert "data_format" in str(excinfo2.value)
167167
assert "data_category" in str(excinfo2.value)
168168

169+
170+
def test_type_array_needs_items_passes_on_arrays_with_items():
171+
schema = {
172+
"id": "array_test_schema",
173+
"properties": {
174+
"string_list": {
175+
"type": "array",
176+
"items": {"type": "string"}
177+
},
178+
"int_list": {
179+
"type": "array",
180+
"items": {"type": "integer"}
181+
},
182+
"foo": {"type": "string"}
183+
}
184+
}
185+
rv = RuleValidator(schema)
186+
assert rv.type_array_needs_items() is True
187+
188+
def test_type_array_needs_items_fails_when_missing_items():
189+
schema = {
190+
"id": "array_bad_schema",
191+
"properties": {
192+
"string_list": {
193+
"type": "array"
194+
# missing "items"
195+
},
196+
"foo": {"type": "string"}
197+
}
198+
}
199+
rv = RuleValidator(schema)
200+
with pytest.raises(ValueError) as excinfo:
201+
rv.type_array_needs_items()
202+
assert "must include an 'items' property" in str(excinfo.value)
203+
assert "string_list" in str(excinfo.value)
204+
assert "array_bad_schema" in str(excinfo.value)
205+
206+
def test_type_array_needs_items_ignores_non_array_properties():
207+
schema = {
208+
"id": "array_ok_schema",
209+
"properties": {
210+
"foo": {"type": "string"},
211+
"bar": {"type": "integer"},
212+
"flag": {"type": "boolean"}
213+
}
214+
}
215+
rv = RuleValidator(schema)
216+
assert rv.type_array_needs_items() is True
217+
218+
def test_type_array_needs_items_skips_refs():
219+
schema = {
220+
"id": "array_ref_schema",
221+
"properties": {
222+
"referenced_prop": {"$ref": "#/definitions/foo"},
223+
"another_array": {
224+
"type": "array",
225+
"items": {"type": "number"}
226+
}
227+
}
228+
}
229+
rv = RuleValidator(schema)
230+
# The $ref property should be ignored, should not raise
231+
assert rv.type_array_needs_items() is True

0 commit comments

Comments
 (0)